From 1644c72accb59c325c7e17bb1bb46e03391a4c27 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Wed, 11 Oct 2017 16:07:30 +0800 Subject: [PATCH 001/275] Add framework of the factorization machine layer --- doc/api/v2/config/layer.rst | 15 +++-- .../layers/FactorizationMachineLayer.cpp | 65 +++++++++++++++++++ .../layers/FactorizationMachineLayer.h | 59 +++++++++++++++++ paddle/gserver/tests/test_LayerGrad.cpp | 19 ++++++ proto/ModelConfig.proto | 3 + python/paddle/trainer/config_parser.py | 15 +++++ .../paddle/trainer_config_helpers/layers.py | 65 +++++++++++++++++++ .../tests/configs/file_list.sh | 3 +- .../test_factorization_machine.protostr | 39 +++++++++++ .../configs/test_factorization_machine.py | 9 +++ 10 files changed, 287 insertions(+), 5 deletions(-) create mode 100644 paddle/gserver/layers/FactorizationMachineLayer.cpp create mode 100644 paddle/gserver/layers/FactorizationMachineLayer.h create mode 100644 python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr create mode 100644 python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index d4e9d53e5c..89d6953c33 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -54,7 +54,7 @@ img_conv .. _api_v2.layer_context_projection: -context_projection +context_projection ------------------ .. autoclass:: paddle.v2.layer.context_projection :noindex: @@ -70,7 +70,7 @@ Image Pooling Layer img_pool -------- .. autoclass:: paddle.v2.layer.img_pool - :noindex: + :noindex: spp --- @@ -99,7 +99,7 @@ sum_to_one_norm --------------- .. autoclass:: paddle.v2.layer.sum_to_one_norm :noindex: - + cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm @@ -109,7 +109,7 @@ row_l2_norm ----------- .. autoclass:: paddle.v2.layer.row_l2_norm :noindex: - + Recurrent Layers ================ @@ -395,6 +395,13 @@ multiplex .. 
autoclass:: paddle.v2.layer.multiplex :noindex: +Factorization Machine Layer +============================ + +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: Slicing and Joining Layers ========================== diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp new file mode 100644 index 0000000000..5456bf2601 --- /dev/null +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -0,0 +1,65 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "FactorizationMachineLayer.h" +#include +#include +#include "paddle/math/SparseMatrix.h" +#include "paddle/utils/Logging.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_LAYER(factorization_machine, FactorizationMachineLayer); + +bool FactorizationMachineLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + /* Initialize the basic parent class */ + Layer::init(layerMap, parameterMap); + + factorSize_ = config_.factor_size(); + + /* initialize the latentVectors_ */ + CHECK_EQ(inputLayers_.size(), 1UL); + size_t height = inputLayers_[0]->getSize(); + latentVectors_.reset(new Weight(height, factorSize_, parameters_[0])); + + return true; +} + +void FactorizationMachineLayer::forward(PassType passType) { + Layer::forward(passType); + + auto input = getInput(0); + + int batchSize = input.getBatchSize(); + int size = getSize(); + reserveOutput(batchSize, size); + + MatrixPtr outV = getOutputValue(); + + /* activation */ { + REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + forwardActivation(); + } +} + +void FactorizationMachineLayer::backward(const UpdateCallback& callback) { + /* Do derivation */ { + REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); + backwardActivation(); + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h new file mode 100644 index 0000000000..e7807c8986 --- /dev/null +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Layer.h" +#include "paddle/math/Matrix.h" +#include "paddle/utils/ThreadLocal.h" + +namespace paddle { +/** + * @brief The Factorization Machine models pairwise (order-2) feature + * interactions as inner product of the learned latent vectors corresponding + * to each input feature. + * + * The Factorization Machine can effectively capture feature interactions + * especially when the input is sparse. While in principle FM can model higher + * order feature interaction, in practice usually only order-2 feature + * interactions are considered. The Factorization Machine Layer here only + * computes the order-2 interactions with the formula: + * + * \f[ + * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j + * \f] + * + * The config file api is factorization_machine. 
+ */ + +class FactorizationMachineLayer : public Layer { +protected: + /// The latent vectors, shape: (size, factorSize_) + std::unique_ptr latentVectors_; + /// The hyperparameter that defines the dimensionality of the factorization + size_t factorSize_; + +public: + explicit FactorizationMachineLayer(const LayerConfig& config) + : Layer(config) {} + ~FactorizationMachineLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forward(PassType passType) override; + void backward(const UpdateCallback& callback = nullptr) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 90a3352898..542db5ee5b 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2359,6 +2359,25 @@ TEST(Layer, ScaleShiftLayer) { } } +void testFactorizationMachineLayer(InputType type, bool useGpu) { + const int FACTOR_SIZE = 10; + TestConfig config; + config.layerConfig.set_type("factorization_machine"); + config.layerConfig.set_factor_size(FACTOR_SIZE); + config.biasSize = 1; + config.inputDefs.push_back({type, "layer_0", 8192, 0}); + config.layerConfig.add_inputs(); + testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); +} + +TEST(Layer, FactorizationMachineLayer) { + testFactorizationMachineLayer(INPUT_DATA, false); + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); +#ifdef PADDLE_WITH_CUDA + testFactorizationMachineLayer(INPUT_DATA, true); +#endif +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index ebf0911d6e..0d2140ccf9 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -525,6 +525,9 @@ message LayerConfig { // for switch order layer optional ReshapeConfig reshape_conf = 59; + + // for factorization machine layer + optional uint32 
factor_size = 60; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 098a51ab87..07b3ff66dc 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3780,6 +3780,21 @@ class SwitchOrderLayer(LayerBase): self.config.reshape_conf.width_axis.extend(reshape['width']) +@config_layer('factorization_machine') +class FactorizationMachineLayer(LayerBase): + def __init__(self, name, inputs, factor_size, **xargs): + super(FactorizationMachineLayer, self).__init__( + name, 'factorization_machine', size=1, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, + 'factorization machine layer must have one and only one input.') + self.config.factor_size = factor_size + input_layer = self.get_input_layer(0) + psize = input_layer.size * factor_size + dims = [input_layer.size, 1] + self.create_input_parameter(0, psize, dims) + + # Deprecated, use a new layer specific class instead @config_func def Layer(name, type, **xargs): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index d37f29d2c4..e6348dca2a 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -143,6 +143,7 @@ __all__ = [ 'scale_shift_layer', 'img_conv3d_layer', 'resize_layer', + 'factorization_machine', ] @@ -253,6 +254,8 @@ class LayerType(object): RESIZE = 'resize' + FACTORIZATION_MACHINE = 'factorization_machine' + @staticmethod def is_layer_type(type_name): """ @@ -6955,3 +6958,65 @@ def resize_layer(input, size, name=None): """ Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size) return LayerOutput(name, LayerType.RESIZE, parents=[input], size=input.size) + + +@wrap_name_default() +@wrap_act_default(act=LinearActivation()) +@wrap_param_attr_default() +@layer_support() +def factorization_machine(input, + factor_size, + act=None, + name=None, + 
param_attr=None, + layer_attr=None): + """ + The Factorization Machine models pairwise feature interactions as inner + product of the learned latent vectors corresponding to each input feature. + + The Factorization Machine can effectively capture feature interactions + especially when the input is sparse. In practice, usually order 2 feature + interactions are considered using Factorization Machine with the formula: + + .. math:: + + y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j + + Note: + X is the input vector with size n. V is the factor matrix. Each row of V + is the latent vector corresponding to each input dimension. The size of + each latent vector is k. + + .. code-block:: python + + factor_machine = factorization_machine(input=input_layer, factor_size=10) + + :param input: The input layer. + :type input: LayerOutput + :param factor_size: The hyperparameter that defines the dimensionality of + the latent vector size + :type factor_size: int + :param act: Activation Type. Default is linear activation. + :type act: BaseActivation + :param param_attr: The Parameter Attribute. If None, the latent vectors will + be initialized smartly. It's better to set it by + yourself. + :type param_attr: ParameterAttribute + :param layer_attr: Extra Layer config. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + + """ + assert isinstance(input, LayerOutput) + assert factor_size > 0, "the factor_size must be greater than 0." 
+ + Layer( + inputs=[Input(input.name, **param_attr.attr)], + name=name, + factor_size=factor_size, + type=LayerType.FACTORIZATION_MACHINE, + active_type=act.name, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index 6a4550c209..40bbb04bd4 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -10,6 +10,7 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer -test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer) +test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer +test_factorization_machine) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr new file mode 100644 index 0000000000..585a5c7b23 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr @@ -0,0 +1,39 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 1024 + active_type: "" +} +layers { + name: "__factorization_machine_0__" + type: "factorization_machine" + size: 1 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___factorization_machine_0__.w0" + } + factor_size: 10 +} +parameters { + name: "___factorization_machine_0__.w0" + size: 10240 + initial_mean: 0.0 + initial_std: 
0.03125 + dims: 1024 + dims: 1 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "data" +output_layer_names: "__factorization_machine_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__factorization_machine_0__" + input_layer_names: "data" + output_layer_names: "__factorization_machine_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py new file mode 100644 index 0000000000..62ceb359cf --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py @@ -0,0 +1,9 @@ +from paddle.trainer_config_helpers import * + +settings(batch_size=1000, learning_rate=1e-5) + +data = data_layer(name='data', size=1024) + +fm = factorization_machine(input=data, factor_size=10) + +outputs(fm) From f504c8a83d641b573ef0765227246460dea2f764 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Wed, 11 Oct 2017 21:47:27 +0800 Subject: [PATCH 002/275] Remove unnecessary configs --- paddle/gserver/tests/test_LayerGrad.cpp | 4 +--- .../tests/configs/test_factorization_machine.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index f63c93c943..eea884cb50 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2371,10 +2371,8 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { TEST(Layer, FactorizationMachineLayer) { testFactorizationMachineLayer(INPUT_DATA, false); - testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); -#ifdef PADDLE_WITH_CUDA testFactorizationMachineLayer(INPUT_DATA, true); -#endif + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); } int main(int argc, char** argv) { diff --git 
a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py index 62ceb359cf..b249de0fee 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py @@ -1,7 +1,5 @@ from paddle.trainer_config_helpers import * -settings(batch_size=1000, learning_rate=1e-5) - data = data_layer(name='data', size=1024) fm = factorization_machine(input=data, factor_size=10) From 947b6a77ce08c1ca2dc386514f0e97eb75ade91a Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 17 Oct 2017 00:26:53 +0800 Subject: [PATCH 003/275] Implement factorization machine layer --- .../layers/FactorizationMachineLayer.cpp | 62 +++++++++++++++++-- .../layers/FactorizationMachineLayer.h | 12 ++++ paddle/gserver/tests/test_LayerGrad.cpp | 5 +- 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 5456bf2601..09128eeeef 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -33,7 +33,10 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap, /* initialize the latentVectors_ */ CHECK_EQ(inputLayers_.size(), 1UL); size_t height = inputLayers_[0]->getSize(); - latentVectors_.reset(new Weight(height, factorSize_, parameters_[0])); + latentVectors_ = + std::unique_ptr(new Weight(height, factorSize_, parameters_[0])); + + v2_ = latentVectors_->getW()->clone(0, 0, useGpu_); return true; } @@ -41,14 +44,28 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap, void FactorizationMachineLayer::forward(PassType passType) { Layer::forward(passType); - auto input = getInput(0); + const MatrixPtr& inputV = getInputValue(0); - int batchSize = input.getBatchSize(); - int size = getSize(); + 
size_t batchSize = inputV->getHeight(); + size_t size = getSize(); reserveOutput(batchSize, size); MatrixPtr outV = getOutputValue(); + Matrix::resizeOrCreate(tmpMul_, batchSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); + + REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); + tmpMul_->mul(*inputV, *latentVectors_->getW()); + tmpOut_->pow2(*tmpMul_, 2); + outV->sumRows(*tmpOut_, 0.5, 0); + + x2_ = inputV->clone(0, 0, useGpu_); + x2_->pow2(*inputV, 2); + v2_->pow2(*latentVectors_->getW(), 2); + tmpOut_->mul(*x2_, *v2_); + outV->sumRows(*tmpOut_, -0.5, 1.0); + /* activation */ { REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); forwardActivation(); @@ -60,6 +77,43 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); backwardActivation(); } + + const MatrixPtr& inputV = getInputValue(0); + const MatrixPtr& oGrad = getOutputGrad(); + + MatrixPtr tmpSum = + Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_); + MatrixPtr tmpSum_T = Matrix::create(tmpSum->getRowBuf(0), + latentVectors_->getW()->getHeight(), + 1, + false, + useGpu_); + + /* Calculate the gradients of the latentVectors_ matrix */ + if (latentVectors_->getWGrad()) { + MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_); + tmpIn->rowScale(0, *inputV, *oGrad); + + latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + + tmpIn->rowScale(0, *x2_, *oGrad); + tmpSum->sumCols(*tmpIn, -1, 0); + latentVectors_->getWGrad()->addRowScale( + 0, *latentVectors_->getW(), *tmpSum_T); + + /* Increasing the number of gradient */ + latentVectors_->getParameterPtr()->incUpdate(callback); + } + + /* Calculate the input layers gradient */ + MatrixPtr inGrad = getInputGrad(0); + if (inGrad != NULL) { + MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose(); + inGrad->mul(*tmpMul_, *latentVectors_T, 1, 1); + tmpSum_T->sumRows(*v2_, -1, 0); 
+ inGrad->addColScale(0, *inputV, *tmpSum); + inGrad->rowScale(0, *inGrad, *oGrad); + } } } // namespace paddle diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index e7807c8986..7cf064690f 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -40,10 +40,22 @@ namespace paddle { class FactorizationMachineLayer : public Layer { protected: /// The latent vectors, shape: (size, factorSize_) + /// Each row of the latentVectors_ matrix is the latent vector + /// corresponding to one input feature dimension std::unique_ptr latentVectors_; /// The hyperparameter that defines the dimensionality of the factorization size_t factorSize_; +private: + /// The result of input matrix * letent vector matrix that will be used in + /// both forward and backward step + MatrixPtr tmpMul_; + MatrixPtr tmpOut_; + /// Store the square values of the letent vectors matrix + MatrixPtr v2_; + /// Store the square values of input matrix + MatrixPtr x2_; + public: explicit FactorizationMachineLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index eea884cb50..21e8fb7eed 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2363,8 +2363,9 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { TestConfig config; config.layerConfig.set_type("factorization_machine"); config.layerConfig.set_factor_size(FACTOR_SIZE); - config.biasSize = 1; - config.inputDefs.push_back({type, "layer_0", 8192, 0}); + config.layerConfig.set_size(1); + config.biasSize = 0; + config.inputDefs.push_back({type, "layer_0", 1024, 10240}); config.layerConfig.add_inputs(); testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); } From 2ce8f1875bb6f69bdc48eb16e78a2c163316ca2b Mon Sep 17 00:00:00 2001 From: wangmeng28 
Date: Tue, 17 Oct 2017 11:09:41 +0800 Subject: [PATCH 004/275] Fix tests for factorization machine layer --- paddle/gserver/tests/test_LayerGrad.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 21e8fb7eed..54053b751b 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2373,7 +2373,6 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { TEST(Layer, FactorizationMachineLayer) { testFactorizationMachineLayer(INPUT_DATA, false); testFactorizationMachineLayer(INPUT_DATA, true); - testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); } int main(int argc, char** argv) { From 86053e7766a93ee0130131c20f262c58a4cbc86d Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 17 Oct 2017 12:20:43 +0800 Subject: [PATCH 005/275] Reduce the input size in testing factorization machine --- paddle/gserver/tests/test_LayerGrad.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 54053b751b..6c604b1e67 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2365,14 +2365,15 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { config.layerConfig.set_factor_size(FACTOR_SIZE); config.layerConfig.set_size(1); config.biasSize = 0; - config.inputDefs.push_back({type, "layer_0", 1024, 10240}); + config.inputDefs.push_back({type, "layer_0", 128, 1280}); config.layerConfig.add_inputs(); testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); } TEST(Layer, FactorizationMachineLayer) { - testFactorizationMachineLayer(INPUT_DATA, false); - testFactorizationMachineLayer(INPUT_DATA, true); + for (auto useGpu : {false, true}) { + testFactorizationMachineLayer(INPUT_DATA, useGpu); + } } int main(int argc, char** argv) { From 
9741ade8ee761f78291e249ea17ad5e3e2c904d2 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 17 Oct 2017 16:53:54 +0800 Subject: [PATCH 006/275] Change pow to square in factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 09128eeeef..8d9dcbaea7 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -57,12 +57,12 @@ void FactorizationMachineLayer::forward(PassType passType) { REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); tmpMul_->mul(*inputV, *latentVectors_->getW()); - tmpOut_->pow2(*tmpMul_, 2); + tmpMul_->square2(*tmpOut_); outV->sumRows(*tmpOut_, 0.5, 0); x2_ = inputV->clone(0, 0, useGpu_); - x2_->pow2(*inputV, 2); - v2_->pow2(*latentVectors_->getW(), 2); + inputV->square2(*x2_); + latentVectors_->getW()->square2(*v2_); tmpOut_->mul(*x2_, *v2_); outV->sumRows(*tmpOut_, -0.5, 1.0); From 8654e8a5203c62ca7b69c1778ff0b71f7c5f8223 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 17 Oct 2017 23:42:51 +0800 Subject: [PATCH 007/275] Fix dims in config parser for factorization machine layer --- python/paddle/trainer/config_parser.py | 2 +- .../tests/configs/protostr/test_factorization_machine.protostr | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 9aba0b49ad..557a91ca7b 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3794,7 +3794,7 @@ class FactorizationMachineLayer(LayerBase): self.config.factor_size = factor_size input_layer = self.get_input_layer(0) psize = input_layer.size * factor_size - dims = [input_layer.size, 1] + dims = [input_layer.size, factor_size] self.create_input_parameter(0, psize, dims) diff --git 
a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr index 585a5c7b23..4f3002b199 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr @@ -22,7 +22,7 @@ parameters { initial_mean: 0.0 initial_std: 0.03125 dims: 1024 - dims: 1 + dims: 10 initial_strategy: 0 initial_smart: true } From 4c72b0634cc2c280f0edcc84a0ece00511fdd6cd Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Wed, 18 Oct 2017 15:36:36 +0800 Subject: [PATCH 008/275] Fix creation of tmp variable in factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 8d9dcbaea7..e5c9d1a90d 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -33,10 +33,11 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap, /* initialize the latentVectors_ */ CHECK_EQ(inputLayers_.size(), 1UL); size_t height = inputLayers_[0]->getSize(); + CHECK_EQ(parameters_[0]->getSize(), height * factorSize_); latentVectors_ = std::unique_ptr(new Weight(height, factorSize_, parameters_[0])); - v2_ = latentVectors_->getW()->clone(0, 0, useGpu_); + v2_ = Matrix::create(height, factorSize_, false, useGpu_); return true; } From e4733224c9e6c6c2eede669e9cdbf17e7be86501 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Fri, 20 Oct 2017 14:08:51 -0700 Subject: [PATCH 009/275] initial commit for float16 --- paddle/math/float16.h | 142 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 paddle/math/float16.h diff --git 
a/paddle/math/float16.h b/paddle/math/float16.h new file mode 100644 index 0000000000..84e533d1fc --- /dev/null +++ b/paddle/math/float16.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include + +#ifdef __arm__ +#define PADDLE_ARM_32 +#endif + +#ifdef __aarch64__ +#define PADDLE_ARM_64 +#endif + +#if defined(PADDLE_ARM_32) || defined(PADDLE_ARM_64) +#define PADDLE_ARM +#endif + +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#define PADDLE_NEON +#endif + +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_32) +#define PADDLE_NEON_32 +#endif + +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_64) +#define PADDLE_NEON_64 +#endif + +#ifdef __CUDA_ARCH__ // use __CUDACC__ instead +#define PADDLE_HOSTDEVICE __host__ __device__ +#if CUDA_VERSION >= 7050 +#define PADDLE_CUDA_FP16 +#include +#endif // CUDA_VERSION >= 7050 +#else +#define PADDLE_HOSTDEVICE +#endif // __CUDA_ARCH__ + +#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM) +#include +#else +#ifdef __F16C__ +#undef __F16C__ +#endif +#endif + +#define PADDLE_ALIGNED(x) __attribute__((aligned(x))) + +// https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h +template +To convert(From f) { + return static_cast(f); +} + +namespace paddle { + +class float16; + +// convert from float to half precision in round-to-nearest-even mode +float16 float2half_rn(float f); +float 
half2float(float16 h); + +class float16 { +public: + uint16_t val_; + + PADDLE_HOSTDEVICE inline explicit float16() : x(0) {} + + PADDLE_HOSTDEVICE inline explicit float16(float val) { + float16 res = float2half_rn(val); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int val) { + float16 res = cpu_float2half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(double val) { + float16 res = cpu_float2half_rn(static_cast(val)); + x = res.x; + } + + // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated + // and aligned at least on a 2-byte boundary, which leads to efficient + // memory access of float16 struct. +} PADDLE_ALIGNED(2); + +namespace fp16_impl { + +// Conversion routine adapted from +// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion +union Bits { + float f; + int32_t si; + uint32_t ui; +}; + +static const int shift = 13; +static const int shiftSign = 16; + +static const int32_t infN = 0x7F800000; +static const int32_t maxN = 0x477FE000; // max flt16 as flt32 +static const int32_t minN = 0x38800000; // min flt16 normal as flt32 +static const int32_t sigN = 0x80000000; // sign bit + +static constexpr int32_t infC = infN >> shift; +static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 +static constexpr int32_t maxC = maxN >> shift; +static constexpr int32_t minC = minN >> shift; +static constexpr int32_t sigC = sigN >> shiftSign; + +static const int32_t mulN = 0x52000000; //(1 << 23) / minN +static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) +static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted +static const int32_t norC = 0x00400; // min flt32 normal downshifted + +static constexpr int32_t maxD = infC - maxC - 1; +static constexpr int32_t minD = minC - subC - 1; + +} // namespace fp16_impl + +} // namespace paddle From d9062cd9ee1297547c16d57c0d5024ceb3555d2f Mon Sep 17 00:00:00 2001 
From: wangmeng28 Date: Thu, 26 Oct 2017 00:43:47 +0800 Subject: [PATCH 010/275] Add sparse matrix support in factorization machine layer --- .../layers/FactorizationMachineLayer.cpp | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index e5c9d1a90d..06658a2841 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -62,7 +62,12 @@ void FactorizationMachineLayer::forward(PassType passType) { outV->sumRows(*tmpOut_, 0.5, 0); x2_ = inputV->clone(0, 0, useGpu_); - inputV->square2(*x2_); + if (dynamic_cast(x2_.get())) { + x2_->copyFrom(*inputV); + (dynamic_cast(x2_.get()))->square2(); + } else { + inputV->square2(*x2_); + } latentVectors_->getW()->square2(*v2_); tmpOut_->mul(*x2_, *v2_); outV->sumRows(*tmpOut_, -0.5, 1.0); @@ -93,11 +98,20 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { /* Calculate the gradients of the latentVectors_ matrix */ if (latentVectors_->getWGrad()) { MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_); - tmpIn->rowScale(0, *inputV, *oGrad); - - latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + if (dynamic_cast(inputV.get())) { + CpuSparseMatrix* inputV_s = dynamic_cast(inputV.get()); + CpuSparseMatrix* x2_s = dynamic_cast(x2_.get()); + CpuSparseMatrix* tmpIn_s = dynamic_cast(tmpIn.get()); + tmpIn_s->copyFrom(*inputV_s); + tmpIn_s->rowScale(0, *inputV_s, *oGrad); + latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + tmpIn_s->rowScale(0, *x2_s, *oGrad); + } else { + tmpIn->rowScale(0, *inputV, *oGrad); + latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + tmpIn->rowScale(0, *x2_, *oGrad); + } - tmpIn->rowScale(0, *x2_, *oGrad); tmpSum->sumCols(*tmpIn, -1, 0); latentVectors_->getWGrad()->addRowScale( 0, *latentVectors_->getW(), *tmpSum_T); 
From 509ae79a5de846dfd38bd85618b2467066413a97 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Thu, 26 Oct 2017 00:47:06 +0800 Subject: [PATCH 011/275] Add rowScale for CpuSparseMatrix --- paddle/math/CpuSparseMatrix.cpp | 17 +++++++++++++++++ paddle/math/CpuSparseMatrix.h | 9 +++++++++ 2 files changed, 26 insertions(+) diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp index bf62229c03..e211c23a7e 100644 --- a/paddle/math/CpuSparseMatrix.cpp +++ b/paddle/math/CpuSparseMatrix.cpp @@ -260,6 +260,23 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { os << ";"; } +void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { + CHECK(getFormat() != SPARSE_CSC) << "Not supported"; + CHECK(height_ == b.getHeight()); + CHECK(width_ == b.getWidth()); + real* A = getValue(); + real* B = b.getValue(); + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK(start == b.getRowStartIdx(i)); + CHECK(end == b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = B[j] * c.getElement(i, cCol); + } + } +} + void CpuSparseMatrix::randomizeUniform() { CHECK_LE(elementCnt_, height_ * width_); if (valueType_ == FLOAT_VALUE) { diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h index 36d57bbb65..8f9ad67215 100644 --- a/paddle/math/CpuSparseMatrix.h +++ b/paddle/math/CpuSparseMatrix.h @@ -236,6 +236,15 @@ public: const unsigned int* cols, const real* values); + /** + * @brief this_row = b_row * c_row[cCol] + * + * @param[in] cCol the column of matrix c used to scale each row of b + * @param[in] b CpuSparseMatrix + * @param[in] c Matrix + */ + void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c); + void randomizeUniform(); void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream); From a208dd64ae1d7a5662a1bf4728162d8123aa89bf Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 30 Oct 2017 17:39:18 -0700 
Subject: [PATCH 012/275] add float16 data type --- paddle/math/float16.h | 365 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 320 insertions(+), 45 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 84e533d1fc..84fe613d51 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -18,7 +18,21 @@ limitations under the License. */ #include #include -#include +#include // seems need to delete it + +#ifdef USE_EIGEN // delete this #if macro +#include "Eigen/src/Core/arch/CUDA/Half.h" +#endif + +#ifdef __CUDACC__ +#define PADDLE_HOSTDEVICE __host__ __device__ +#if CUDA_VERSION >= 7050 +#define PADDLE_CUDA_FP16 +#include +#endif // CUDA_VERSION >= 7050 +#else +#define PADDLE_HOSTDEVICE +#endif // __CUDA_ARCH__ #ifdef __arm__ #define PADDLE_ARM_32 @@ -44,15 +58,9 @@ limitations under the License. */ #define PADDLE_NEON_64 #endif -#ifdef __CUDA_ARCH__ // use __CUDACC__ instead -#define PADDLE_HOSTDEVICE __host__ __device__ -#if CUDA_VERSION >= 7050 -#define PADDLE_CUDA_FP16 -#include -#endif // CUDA_VERSION >= 7050 -#else -#define PADDLE_HOSTDEVICE -#endif // __CUDA_ARCH__ +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) +#include +#endif #if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM) #include @@ -62,7 +70,7 @@ limitations under the License. 
*/ #endif #endif -#define PADDLE_ALIGNED(x) __attribute__((aligned(x))) +#define PADDLE_ALIGN(x) __attribute__((aligned(x))) // https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h template @@ -72,70 +80,337 @@ To convert(From f) { namespace paddle { -class float16; +struct float16; +namespace fp16_impl { // convert from float to half precision in round-to-nearest-even mode -float16 float2half_rn(float f); -float half2float(float16 h); +PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); +PADDLE_HOSTDEVICE inline float half_to_float(float16 h); +PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x); +} // namespace fp16_impl -class float16 { -public: - uint16_t val_; +// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated +// and aligned at least on a 2-byte boundary, which leads to efficient +// memory access of float16 struct and also makes float16 compatible +// with CUDA half and Eigen::half data types. +struct PADDLE_ALIGN(2) float16 { + uint16_t x; - PADDLE_HOSTDEVICE inline explicit float16() : x(0) {} + // explicit for different types, implicit for half and Eigen::half + + PADDLE_HOSTDEVICE inline float16() {} + + PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} + +#ifdef PADDLE_CUDA_FP16 + PADDLE_HOSTDEVICE inline float16(const half h) { +#if CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(&h)->x; +#else + x = h.x; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 +/* +#ifdef PADDLE_CUDA_FP16 + #if CUDA_VERSION < 9000 + PADDLE_HOSTDEVICE inline float16(const half& h) : x(h.x) {} + #else + PADDLE_HOSTDEVICE inline float16(const __half_raw& h) : x(h.x) {} + PADDLE_HOSTDEVICE inline float16(const half& h) + : x(*reinterpret_cast(&h)) {} + #endif // CUDA_VERSION < 9000 +#endif // PADDLE_CUDA_FP16 +*/ + +#ifdef USE_EIGEN + PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} +#endif // USE_EIGEN + +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) + // __fp16 is a 
native half precision data type for arm cpu, + // float16_t is an alias for __fp16 in arm_fp16.h + // which is included in arm_neon.h + PADDLE_HOSTDEVICE inline float16(const float16_t h) { + x = *reinterpret_cast(&h); + } +#endif + + PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {} PADDLE_HOSTDEVICE inline explicit float16(float val) { - float16 res = float2half_rn(val); + float16 res = fp16_impl::float_to_half_rn(val); + x = res.x; + } + + template + PADDLE_HOSTDEVICE inline explicit float16(const T& val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } + PADDLE_HOSTDEVICE inline float16& operator=(const float16& rhs) { + x = rhs.x; + return *this; + } + +#ifdef PADDLE_CUDA_FP16 + PADDLE_HOSTDEVICE inline float16& operator=(const half rhs) { +#if CUDA_VERSION >= 9000 + x = reinterpret_cast<__half_raw*>(&rhs)->x; +#else + x = rhs.x; +#endif + return *this; + } +#endif + +#ifdef USE_EIGEN + PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { + x = rhs.x; + return *this; + } +#endif // USE_EIGEN + +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) + PADDLE_HOSTDEVICE inline float16& operator=(const float16_t rhs) { + x = *reinterpret_cast(&rhs); + return *this; + } +#endif + +/* PADDLE_HOSTDEVICE inline explicit float16(int val) { - float16 res = cpu_float2half_rn(static_cast(val)); + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } PADDLE_HOSTDEVICE inline explicit float16(double val) { - float16 res = cpu_float2half_rn(static_cast(val)); + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } +*/ + +#ifdef PADDLE_CUDA_FP16 + PADDLE_HOSTDEVICE inline operator half() { +#if CUDA_VERSION >= 9000 + __half_raw h; + h.x = x; + return half(h); +#else + half h; + h.x = x; + return h; +#endif // CUDA_VERSION >= 9000 + } +#endif // PADDLE_CUDA_FP16 - // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated - // and aligned at least on a 2-byte 
boundary, which leads to efficient - // memory access of float16 struct. -} PADDLE_ALIGNED(2); +#ifdef USE_EIGEN + PADDLE_HOSTDEVICE inline operator Eigen::half() { + Eigen::half h; + h.x = x; + return h; + } +#endif // USE_EIGEN + +#if defined(PADDLE_ARM) && defined(PADDLE_NEON) + PADDLE_HOSTDEVICE inline operator float16_t() { + float16 h = *this; + return *reinterpret_cast(&h); + } +#endif + + PADDLE_HOSTDEVICE inline explicit operator bool() { + return (x & 0x7fff) != 0; + } + + PADDLE_HOSTDEVICE inline explicit operator int8_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint8_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator int16_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint16_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator int32_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint32_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator int64_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator uint64_t() { + return static_cat(fp16_impl::half_to_float(*this)); + } + + PADDLE_HOSTDEVICE inline explicit operator float() { + return fp16_impl::half_to_float(*this); + } + + PADDLE_HOSTDEVICE inline explicit operator double() { + return static_cat(fp16_impl::half_to_float(*this)); + } +}; + +// arithmetic operators +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 +__device__ inline float16 operator+(const float16& a, const float16& b) { + return float16(__hadd(a, b)); +} + +__device__ inline float16 operator-(const float16& a, const float16& b) { + return __hsub(a, b); +} + +__device__ inline float16 
operator*(const float16& a, const float16& b) { + return __hmul(a, b); +} + +#elif // on arm cpu + +#else + +#endif namespace fp16_impl { -// Conversion routine adapted from -// http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion Union Bits { float f; int32_t si; uint32_t ui; }; -static const int shift = 13; -static const int shiftSign = 16; +const int shift = 13; +const int shiftSign = 16; + +const int32_t infN = 0x7F800000; +const int32_t maxN = 0x477FE000; // max flt16 as flt32 +const int32_t minN = 0x38800000; // min flt16 normal as flt32 +const int32_t sigN = 0x80000000; // sign bit + +constexpr int32_t infC = infN >> shift; +constexpr int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as float32 +constexpr int32_t maxC = maxN >> shift; +constexpr int32_t minC = minN >> shift; +constexpr int32_t sigC = sigN >> shiftSign; + +const int32_t mulN = 0x52000000; //(1 << 23) / minN +const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) +const int32_t subC = 0x003FF; // max flt32 subnormal downshifted +const int32_t norC = 0x00400; // min flt32 normal downshifted + +constexpr int32_t maxD = infC - maxC - 1; +constexpr int32_t minD = minC - subC - 1; + +PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = __float2half(f); + return *reinterpret_cast(&(tmp)); + +#elif defined(__F16C__) + float16 res; + res.x = _cvtss_sh(f, 0); + return res; + +#elif defined(PADDLE_ARM_64) // test on RPI + float16 res; + asm volatile( + "ld1 {v0.s}[0], [%[float_ptr]]\n" + "FCVT h0, s0\n" + "st1 {v0.h}[0], [%[half_ptr]]\n" + : // outputs + : // inputs + [float_ptr] "r"(&f), + [half_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; -static const int32_t infN = 0x7F800000; -static const int32_t maxN = 0x477FE000; // max flt16 as flt32 -static const int32_t minN = 0x38800000; // min flt16 normal as flt32 -static const int32_t 
sigN = 0x80000000; // sign bit +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = f; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); + float16 res; + res.x = v.ui | sign; + return res; -static constexpr int32_t infC = infN >> shift; -static constexpr int32_t nanN = (infC + 1) - << shift; // minimum flt16 nan as float32 -static constexpr int32_t maxC = maxN >> shift; -static constexpr int32_t minC = minN >> shift; -static constexpr int32_t sigC = sigN >> shiftSign; +#endif +} -static const int32_t mulN = 0x52000000; //(1 << 23) / minN -static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) -static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted -static const int32_t norC = 0x00400; // min flt32 normal downshifted +PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = *reinterpret_cast(&h); + return __half2float(h); + +#elif defined(__F16C__) + return _cvtsh_ss(h.x); + +#elif defined(PADDLE_ARM_64) // test on RPI + float res; + asm volatile( + "ld1 {v0.h}[0], [%[half_ptr]]\n" + "FCVT s0, h0\n" + "st1 {v0.s}[0], [%[float_ptr]]\n" + : // outputs + : // inputs + [half_ptr] "r"(&(h.x)), + [float_ptr] "r"(&res) + : // clobbers + "memory", "v0"); + return res; -static constexpr int32_t maxD = infC - maxC - 1; -static constexpr int32_t minD = minC - subC - 1; +#else + // Conversion routine adapted from + // 
http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif +} + +PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x) { + float16 res; + res.x = x; + return res; +} } // namespace half_impl From aeeb77de1d40ea71df2d18de9969980aab4fc631 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 1 Nov 2017 20:53:43 +0800 Subject: [PATCH 013/275] simple pipe reader for hdfs or other service --- python/paddle/v2/reader/decorator.py | 98 ++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 45a4288751..0695542690 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -323,3 +323,101 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): yield sample return xreader + + +def _buf2lines(buf, line_break="\n"): + # FIXME: line_break should be automatically configured. + lines = buf.split(line_break) + return lines[:-1], lines[-1] + + +def pipe_reader(left_cmd, + parser, + bufsize=8192, + file_type="plain", + cut_lines=True, + line_break="\n"): + """ + pipe_reader read data by stream from a command, take it's + stdout into a pipe buffer and redirect it to the parser to + parse, then yield data as your desired format. 
+ + You can using standard linux command or call another program + to read data, from HDFS, Ceph, URL, AWS S3 etc: + + cmd = "hadoop fs -cat /path/to/some/file" + cmd = "cat sample_file.tar.gz" + cmd = "curl http://someurl" + cmd = "python print_s3_bucket.py" + + A sample parser: + + def sample_parser(lines): + # parse each line as one sample data, + # return a list of samples as batches. + ret = [] + for l in lines: + ret.append(l.split(" ")[1:5]) + return ret + + :param left_cmd: command to excute to get stdout from. + :type left_cmd: string + :param parser: parser function to parse lines of data. + if cut_lines is True, parser will receive list + of lines. + if cut_lines is False, parser will receive a + raw buffer each time. + parser should return a list of parsed values. + :type parser: callable + :param bufsize: the buffer size used for the stdout pipe. + :type bufsize: int + :param file_type: can be plain/gzip, stream buffer data type. + :type file_type: string + :param cut_lines: whether to pass lines instead of raw buffer + to the parser + :type cut_lines: bool + :param line_break: line break of the file, like \n or \r + :type line_break: string + + :return: the reader generator. + :rtype: callable + """ + if not isinstance(left_cmd, str): + raise TypeError("left_cmd must be a string") + if not callable(parser): + raise TypeError("parser must be a callable object") + + process = subprocess.Popen( + left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + # TODO(typhoonzero): add a thread to read stderr + + # Always init a decompress object is better than + # create in the loop. 
+ dec = zlib.decompressobj( + 32 + zlib.MAX_WBITS) # offset 32 to skip the header + + def reader(): + remained = "" + while True: + buff = process.stdout.read(bufsize) + if buff: + if file_type == "gzip": + decomp_buff = dec.decompress(buff) + elif file_type == "plain": + decomp_buff = buff + else: + raise TypeError("file_type %s is not allowed" % file_type) + + if cut_lines: + lines, remained = _buf2lines(''.join( + [remained, decomp_buff]), line_break) + parsed_list = parser(lines) + for ret in parsed_list: + yield ret + else: + for ret in parser(decomp_buff): + yield ret + else: + break + + return reader From 4172fc09c39b61c3cb1933687680bab15153b59f Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Wed, 1 Nov 2017 21:51:23 +0800 Subject: [PATCH 014/275] Add sparse input support for factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.cpp | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 06658a2841..3bd8d7cb4c 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -104,15 +104,21 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { CpuSparseMatrix* tmpIn_s = dynamic_cast(tmpIn.get()); tmpIn_s->copyFrom(*inputV_s); tmpIn_s->rowScale(0, *inputV_s, *oGrad); - latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); + latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1); tmpIn_s->rowScale(0, *x2_s, *oGrad); + + MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_); + ones->zeroMem(); + ones->add(-1); + tmpSum->mul(*ones, *tmpIn_s, 1, 0); } else { tmpIn->rowScale(0, *inputV, *oGrad); latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); tmpIn->rowScale(0, *x2_, *oGrad); + + tmpSum->sumCols(*tmpIn, -1, 0); } - tmpSum->sumCols(*tmpIn, -1, 0); 
latentVectors_->getWGrad()->addRowScale( 0, *latentVectors_->getW(), *tmpSum_T); From 9d8b30596491930c6137e56d7883370bff24d2c8 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Tue, 7 Nov 2017 13:19:59 -0800 Subject: [PATCH 015/275] small fix --- paddle/math/float16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 84fe613d51..5fe2854969 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -18,7 +18,7 @@ limitations under the License. */ #include #include -#include // seems need to delete it +#include #ifdef USE_EIGEN // delete this #if macro #include "Eigen/src/Core/arch/CUDA/Half.h" @@ -32,7 +32,7 @@ limitations under the License. */ #endif // CUDA_VERSION >= 7050 #else #define PADDLE_HOSTDEVICE -#endif // __CUDA_ARCH__ +#endif // __CUDACC__ #ifdef __arm__ #define PADDLE_ARM_32 From 3d276277df1b1f8b216cae246d5cdc4f6dd02028 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 8 Nov 2017 14:17:38 +0800 Subject: [PATCH 016/275] Add nce op 1. Add nce forward and backward kernel for CPU --- paddle/operators/nce_op.cc | 120 +++++++++++++++++++++ paddle/operators/nce_op.h | 210 +++++++++++++++++++++++++++++++++++++ 2 files changed, 330 insertions(+) create mode 100644 paddle/operators/nce_op.cc create mode 100644 paddle/operators/nce_op.h diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc new file mode 100644 index 0000000000..afd61b8851 --- /dev/null +++ b/paddle/operators/nce_op.cc @@ -0,0 +1,120 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/nce_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class NCEOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Label")); + PADDLE_ENFORCE(ctx->HasInput("W")); + PADDLE_ENFORCE(ctx->HasOutput("Out")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLogits")); + PADDLE_ENFORCE(ctx->HasOutput("SampleLabels")); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); + if (ctx->HasInput("B")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("W")[0], ctx->GetInputDim("B")[0]); + } + int num_sampled_classes = ctx->Attrs().Get("num_sampled_classes"); + int num_classes = ctx->Attrs().Get("num_classes"); + PADDLE_ENFORCE_EQ(num_classes, ctx->GetInputDim("W")[0]); + PADDLE_ENFORCE_LT(num_sampled_classes, num_classes); + + // set dims of output(Out) + std::vector out_dims(1); + out_dims.push_back(x_dims[0]); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + + // set dims of output(SampleOut) + std::vector sample_out_dims(2); + sample_out_dims.push_back(x_dims[0]); + sample_out_dims.push_back(num_sampled_classes + 1); + ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims)); + ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims)); + } +}; + +class NCEOpMaker : public 
framework::OpProtoAndCheckerMaker { + public: + NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", ""); + AddInput("Label", ""); + AddInput("W", ""); + AddInput("B", ""); + AddInput("SampleWeight", ""); + AddOutput("Out", ""); + AddOutput("SampleLogits", ""); + AddOutput("SampleLabels", ""); + AddAttr("num_classes", ""); + AddAttr("num_sampled_classes", "").SetDefault(10); + AddComment(R"DOC( +Expand input(X) according to LOD of input(Y). + +)DOC"); + } +}; + +class NCEOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("W")); + PADDLE_ENFORCE(ctx->HasInput("Out")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The input(Out@GRAD) should not be null"); + + auto x_dims = ctx->GetInputDim("X"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto w_dims = ctx->GetInputDim("W"); + auto w_grad_name = framework::GradVarName("W"); + if (ctx->HasOutput(w_grad_name)) { + ctx->SetOutputDim(w_grad_name, w_dims); + } + + auto bias_grad_name = framework::GradVarName("B"); + if (ctx->HasOutput(bias_grad_name)) { + auto bias_dims = ctx->GetInputDim("B"); + ctx->SetOutputDim(bias_grad_name, bias_dims); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad); +REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel); +REGISTER_OP_CPU_KERNEL(nce_grad, + ops::NCEGradKernel); diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h new file mode 100644 index 0000000000..ce1717c9b0 --- /dev/null +++ b/paddle/operators/nce_op.h @@ -0,0 +1,210 @@ +/* Copyright (c) 
2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" +#include "paddle/memory/memcpy.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +template +void PrepareSamples(const framework::ExecutionContext& context) { + auto label = context.Input("Label"); + const T* label_data = label->data(); + auto label_dims = label->dims(); + int num_classes = context.Attr("num_classes"); + // random machine + std::random_device rd; + std::mt19937 rng(rd()); + std::uniform_int_distribution rand(0, num_classes - 1); + + auto sample_labels = context.Output("SampleLabels"); + auto sample_labels_dims = sample_labels->dims(); + int* sample_labels_data = + sample_labels->mutable_data(context.GetPlace()); + + int num_label = label_dims.size() == 2 ? 
label_dims[1] : 1; + for (size_t i = 0; i < label_dims[0]; ++i) { + int j = 0; + for (; j < num_label; ++j) { + sample_labels_data[sample_labels_dims[1] * i + j] = + label_data[i * num_label + j]; + } + for (; j < sample_labels_dims[1]; ++j) { + int id = rand(rng); + sample_labels_data[sample_labels_dims[1] * i + j] = id; + } + } +} + +template +class NCEKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PrepareSamples(context); + auto sample_labels = context.Output("SampleLabels"); + const int* sample_labels_data = sample_labels->data(); + auto sample_out = context.Output("SampleLogits"); + T* sample_out_data = sample_out->mutable_data(context.GetPlace()); + auto label = context.Input("Label"); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight != nullptr) { + sample_weight_data = sample_weight->data(); + } + auto out = context.Output("Out"); + T* out_data = out->mutable_data(context.GetPlace()); + int num_smalped_classes = context.Attr("num_sampled_classes"); + int num_classes = context.Attr("num_classes"); + int num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. 
/ num_classes * num_smalped_classes; + + // forward bias + auto bias = context.Input("B"); + if (bias != nullptr) { + const T* bias_data = bias->data(); + for (size_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = bias_data[sample_labels_data[i]]; + } + } else { + for (size_t i = 0; i < sample_labels->numel(); ++i) { + sample_out_data[i] = 0; + } + } + + // forward mul + auto input_mat = EigenMatrix::From(*(context.Input("X"))); + auto weight_mat = EigenMatrix::From(*(context.Input("W"))); + for (size_t i = 0; i < sample_labels->numel(); ++i) { + // sample_out_data[i] += (input_mat.chip((int)(i / + // sample_labels->dims()[1]), 0) * weight_mat.chip(sample_labels_data[i], + // 0)).sum(); + Eigen::Tensor result = + (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) * + weight_mat.chip(sample_labels_data[i], 0)) + .sum(); + sample_out_data[i] += result(0); + // activation_->forward + sample_out_data[i] = (1 / 1 + (sample_out_data[i])); + } + + // forward cost + for (size_t i = 0; i < sample_labels->dims()[0]; ++i) { + size_t j = 0; + T w = sample_weight == nullptr ? 
1 : sample_weight_data[i]; + // for true classes + for (; j < num_true_class; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(o / (o + b)); + out_data[i] += w * cost; + } + // for sampled neg classes + for (; j < sample_labels->dims()[1]; ++j) { + T o = sample_out_data[i * sample_out->dims()[1] + j]; + T cost = -log(b / (o + b)); + out_data[i] += w * cost; + } + } + } +}; + +template +class NCEGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto label = context.Input("Label"); + auto sample_out = context.Input("SampleLogits"); + const T* sample_out_data = sample_out->data(); + auto sample_labels = context.Input("SampleLabels"); + const int* sample_labels_data = sample_labels->data(); + auto sample_weight = context.Input("SampleWeight"); + const T* sample_weight_data = nullptr; + if (sample_weight != nullptr) { + sample_weight_data = sample_weight->data(); + } + int num_smalped_classes = context.Attr("num_sampled_classes"); + int num_classes = context.Attr("num_classes"); + int num_true_class = 1; + if (label != nullptr) { + num_true_class = label->dims()[1]; + } + T b = 1. / num_classes * num_smalped_classes; + + Tensor sample_grad; // tmp tensor + T* sample_grad_data = + sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); + + // backward cost + for (size_t i = 0; i < sample_labels->numel(); ++i) { + T o = sample_out_data[i]; + T w = sample_weight == nullptr + ? 1 + : sample_weight_data[i / sample_labels->dims()[1]]; + sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class + ? -w * b / (o * (o + b)) + : w / (o + b); + // sigmoid->backward + sample_grad_data[i] = + (o > 0) ? sample_grad_data[i] : ((o < 0) ? 
-sample_grad_data[i] : 0); + } + + // get d_bias + auto d_bias = context.Output(framework::GradVarName("B")); + if (d_bias != nullptr) { + T* d_bias_data = d_bias->mutable_data(context.GetPlace()); + for (size_t i = 0; i < sample_labels->numel(); ++i) { + d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; + } + } + // get d_w + auto d_w = context.Output(framework::GradVarName("W")); + if (d_w != nullptr) { + auto d_w_matrix = EigenMatrix::From(*d_w); + auto x_matrix = EigenMatrix::From(*(context.Input("X"))); + for (size_t i = 0; i < sample_labels->numel(); ++i) { + d_w_matrix.chip(sample_labels_data[i], 0) = + x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) * + sample_grad_data[i]; + } + } + + // get d_x + auto d_x = context.Output(framework::GradVarName("X")); + if (d_x != nullptr) { + auto d_x_matrix = EigenMatrix::From(*d_x); + auto w_matrix = EigenMatrix::From(*(context.Input("W"))); + for (size_t i = 0; i < sample_labels->numel(); ++i) { + d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) += + w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; + } + } + } +}; + +} // namespace operators +} // namespace paddle From fef617ae072856bae17edd98cbddf88d198c95d0 Mon Sep 17 00:00:00 2001 From: wanghaox Date: Sat, 11 Nov 2017 19:59:20 +0800 Subject: [PATCH 017/275] for resolve conflicts --- paddle/operators/math/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index b39a64c0f3..d55aed19cb 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -8,22 +8,22 @@ if(WITH_GPU) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) - nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(vol2col SRCS vol2col.cc 
vol2col.cu DEPS device_context) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) + nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(pooling SRCS pooling.cc DEPS device_context) - cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(context_project SRCS context_project.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) + cc_library(maxouting SRCS maxouting.cc DEPS device_context) endif() cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) From 4748073dc6793539d318fb7bc437c50fc8826373 Mon Sep 17 00:00:00 2001 From: wanghaox Date: Sat, 11 Nov 2017 20:10:54 +0800 Subject: [PATCH 018/275] paddle/operators/math/CMakeLists.txt maybe del sequence_pooling and add it --- paddle/operators/math/CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index d55aed19cb..b330f30d21 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -8,6 +8,7 @@ if(WITH_GPU) nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator) nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator) nv_library(pooling SRCS pooling.cc pooling.cu DEPS 
device_context) + nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) @@ -19,6 +20,7 @@ else() cc_library(softmax SRCS softmax.cc DEPS operator) cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator) cc_library(pooling SRCS pooling.cc DEPS device_context) + cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) cc_library(vol2col SRCS vol2col.cc DEPS device_context) cc_library(context_project SRCS context_project.cc DEPS device_context) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) From f3631a42dff4e1ad54b1c1fc8e5549a488158e02 Mon Sep 17 00:00:00 2001 From: Kavya Srinet Date: Mon, 13 Nov 2017 12:03:03 -0800 Subject: [PATCH 019/275] Updating the writeup of RNN doc --- doc/design/ops/rnn.md | 66 +++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/doc/design/ops/rnn.md b/doc/design/ops/rnn.md index a78eea7d45..2f4854793f 100644 --- a/doc/design/ops/rnn.md +++ b/doc/design/ops/rnn.md @@ -1,62 +1,62 @@ # RNNOp design -This document is about an RNN operator which requires that instances in a mini-batch have the same length. We will have a more flexible RNN operator. +This document describes the RNN (Recurrent Neural Network) operator and how it is implemented in PaddlePaddle. The RNN op requires that all instances in a mini-batch have the same length. We will have a more flexible dynamic RNN operator in the future. ## RNN Algorithm Implementation -

+

The above diagram shows an RNN unrolled into a full network. -There are several important concepts: +There are several important concepts here: -- *step-net*: the sub-graph to run at each step, -- *memory*, $h_t$, the state of the current step, -- *ex-memory*, $h_{t-1}$, the state of the previous step, -- *initial memory value*, the ex-memory of the first step. +- *step-net*: the sub-graph that runs at each step. +- *memory*, $h_t$, the state of the current step. +- *ex-memory*, $h_{t-1}$, the state of the previous step. +- *initial memory value*, the memory of the first (initial) step. ### Step-scope -There could be local variables defined in step-nets. PaddlePaddle runtime realizes these variables in *step-scopes* -- scopes created for each step. +There could be local variables defined in each step-net. PaddlePaddle runtime realizes these variables in *step-scopes* which are created for each step. -

+


-Figure 2 the RNN's data flow +Figure 2 illustrates the RNN's data flow

-Please be aware that all steps run the same step-net. Each step +Please be aware that every step runs the same step-net. Each step does the following: -1. creates the step-scope, -2. realizes local variables, including step-outputs, in the step-scope, and -3. runs the step-net, which could use these variables. +1. Creates the step-scope. +2. Initializes the local variables including step-outputs, in the step-scope. +3. Runs the step-net, which uses the above mentioned variables. -The RNN operator will compose its output from step outputs in step scopes. +The RNN operator will compose its output from step outputs in each of the step scopes. ### Memory and Ex-memory -Let's give more details about memory and ex-memory via a simply example: +Let's give more details about memory and ex-memory using a simple example: $$ h_t = U h_{t-1} + W x_t $$, -where $h_t$ and $h_{t-1}$ are the memory and ex-memory of step $t$'s respectively. +where $h_t$ and $h_{t-1}$ are the memory and ex-memory (previous memory) of step $t$ respectively. -In the implementation, we can make an ex-memory variable either "refers to" the memory variable of the previous step, -or copy the value of the previous memory value to the current ex-memory variable. +In the implementation, we can make an ex-memory variable either "refer to" the memory variable of the previous step, +or copy the memory value of the previous step to the current ex-memory variable. ### Usage in Python For more information on Block, please refer to the [design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md). 
-We can define an RNN's step-net using Block: +We can define an RNN's step-net using a Block: ```python import paddle as pd -X = some_op() # x is some operator's output, and is a LoDTensor +X = some_op() # x is some operator's output and is a LoDTensor a = some_op() # declare parameters @@ -68,7 +68,7 @@ with rnn.stepnet(): x = rnn.add_input(X) # declare a memory (rnn's step) h = rnn.add_memory(init=a) - # h.pre_state() means previous memory of rnn + # h.pre_state(), the previous memory of rnn new_state = pd.add_two( pd.matmul(W, x) + pd.matmul(U, h.pre_state())) # update current memory h.update(new_state) @@ -80,19 +80,19 @@ out = rnn() Python API functions in above example: -- `rnn.add_input` indicates the parameter is a variable that will be segmented into step-inputs. -- `rnn.add_memory` creates a variable used as the memory. -- `rnn.add_outputs` mark the variables that will be concatenated across steps into the RNN output. +- `rnn.add_input`: indicates that the parameter is a variable that will be segmented into step-inputs. +- `rnn.add_memory`: creates a variable used as the memory. +- `rnn.add_outputs`: marks the variables that will be concatenated across steps into the RNN output. ### Nested RNN and LoDTensor An RNN whose step-net includes other RNN operators is known as an *nested RNN*. -For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. +For example, we could have a 2-level RNN, where the top level corresponds to paragraphs, and the lower level corresponds to sentences. Each step of the higher level RNN also receives an input from the corresponding step of the lower level, and additionally the output from the previous time step at the same level. -The following figure illustrates the feeding of text into the lower level, one sentence each step, and the feeding of step outputs to the top level. The final top level output is about the whole text. 
+The following figure illustrates feeding in text into the lower level, one sentence at a step, and the feeding in step outputs to the top level. The final top level output is about the whole text. -

+

@@ -110,7 +110,7 @@ a = some_op() # chapter_data is a set of 128-dim word vectors # the first level of LoD is sentence -# the second level of LoD is chapter +# the second level of LoD is a chapter chapter_data = pd.Variable(shape=[None, 128], type=pd.lod_tensor, level=2) def lower_level_rnn(paragraph): @@ -138,14 +138,14 @@ with top_level_rnn.stepnet(): pd.matmul(W0, paragraph_data) + pd.matmul(U0, h.pre_state())) top_level_rnn.add_outputs(h) -# just output the last step +# output the last step chapter_out = top_level_rnn(output_all_steps=False) ``` -in above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is a LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. +In the above example, the construction of the `top_level_rnn` calls `lower_level_rnn`. The input is an LoD Tensor. The top level RNN segments input text data into paragraphs, and the lower level RNN segments each paragraph into sentences. -By default, the `RNNOp` will concatenate the outputs from all the time steps, -if the `output_all_steps` set to False, it will only output the final time step. +By default, the `RNNOp` will concatenate the outputs from all the time steps. +If the `output_all_steps` is set to False, it will only output the final time step.

From 4eb5b39cb2453c77a156f4f76f8436b574772afa Mon Sep 17 00:00:00 2001 From: Kavya Srinet Date: Mon, 13 Nov 2017 14:49:15 -0800 Subject: [PATCH 020/275] Editing the documentation for seq_decoder, and fixing typos --- doc/design/ops/sequence_decoder.md | 112 +++++++++++++---------------- 1 file changed, 48 insertions(+), 64 deletions(-) diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md index 9007aae7a8..bb945ae48b 100644 --- a/doc/design/ops/sequence_decoder.md +++ b/doc/design/ops/sequence_decoder.md @@ -1,35 +1,28 @@ # Design: Sequence Decoder Generating LoDTensors -In tasks such as machine translation and image to text, -a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences. +In tasks such as machine translation and visual captioning, +a [sequence decoder](https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/README.md) is necessary to generate sequences, one word at a time. This documentation describes how to implement the sequence decoder as an operator. ## Beam Search based Decoder -The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences, -it is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set. +The [beam search algorithm](https://en.wikipedia.org/wiki/Beam_search) is necessary when generating sequences. It is a heuristic search algorithm that explores the paths by expanding the most promising node in a limited set. -In the old version of PaddlePaddle, a C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, -due to the complexity, the implementation relays on a lot of special data structures, -quite trivial and hard to be customized by users. 
+In the old version of PaddlePaddle, the C++ class `RecurrentGradientMachine` implements the general sequence decoder based on beam search, due to the complexity involved, the implementation relies on a lot of special data structures that are quite trivial and hard to be customized by users. -There are a lot of heuristic tricks in the sequence generation tasks, -so the flexibility of sequence decoder is very important to users. +There are a lot of heuristic tricks in the sequence generation tasks, so the flexibility of sequence decoder is very important to users. -During PaddlePaddle's refactoring work, -some new concept is proposed such as [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support sequence usage, -and they can help to make the implementation of beam search based sequence decoder **more transparent and modular** . +During the refactoring of PaddlePaddle, some new concepts are proposed such as: [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md) and [TensorArray](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/tensor_array.md) that can better support the sequence usage, and they can also help make the implementation of beam search based sequence decoder **more transparent and modular** . -For example, the RNN sates, candidates IDs and probabilities of beam search can be represented as `LoDTensors`; +For example, the RNN states, candidates IDs and probabilities of beam search can be represented all as `LoDTensors`; the selected candidate's IDs in each time step can be stored in a `TensorArray`, and `Packed` to the sentences translated. ## Changing LoD's absolute offset to relative offsets -The current `LoDTensor` is designed to store levels of variable-length sequences, -it stores several arrays of integers each represents a level. 
+The current `LoDTensor` is designed to store levels of variable-length sequences. It stores several arrays of integers where each represents a level. -The integers in each level represents the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, -let's call this format the **absolute-offset LoD** for clear. +The integers in each level represent the begin and end (not inclusive) offset of a sequence **in the underlying tensor**, +let's call this format the **absolute-offset LoD** for clarity. -The relative-offset LoD can fast retrieve any sequence but fails to represent empty sequences, for example, a two-level LoD is as follows +The relative-offset LoD can retrieve any sequence very quickly but fails to represent empty sequences, for example, a two-level LoD is as follows ```python [[0, 3, 9] [0, 2, 3, 3, 3, 9]] @@ -41,10 +34,9 @@ The first level tells that there are two sequences: while on the second level, there are several empty sequences that both begin and end at `3`. It is impossible to tell how many empty second-level sequences exist in the first-level sequences. -There are many scenarios that relay on empty sequence representation, -such as machine translation or image to text, one instance has no translations or the empty candidate set for a prefix. +There are many scenarios that rely on empty sequence representation, for example in machine translation or visual captioning, one instance has no translation or the empty candidate set for a prefix. -So let's introduce another format of LoD, +So let's introduce another format of LoD, it stores **the offsets of the lower level sequences** and is called **relative-offset** LoD. 
For example, to represent the same sequences of the above data @@ -54,19 +46,18 @@ For example, to represent the same sequences of the above data [0, 2, 3, 3, 3, 9]] ``` -the first level represents that there are two sequences, +the first level represents that there are two sequences, their offsets in the second-level LoD is `[0, 3)` and `[3, 5)`. The second level is the same with the relative offset example because the lower level is a tensor. It is easy to find out the second sequence in the first-level LoD has two empty sequences. -The following demos are based on relative-offset LoD. +The following examples are based on relative-offset LoD. ## Usage in a simple machine translation model -Let's start from a simple machine translation model that is simplified from [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a simple blueprint of what a sequence decoder can do and how to use it. +Let's start from a simple machine translation model that is simplified from the [machine translation chapter](https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation) to draw a blueprint of what a sequence decoder can do and how to use it. -The model has an encoder that learns the semantic vector from a sequence, -and a decoder which uses the sequence decoder to generate new sentences. +The model has an encoder that learns the semantic vector from a sequence, and a decoder which uses the sequence encoder to generate new sentences. 
**Encoder** ```python @@ -117,7 +108,7 @@ def generate(): # which means there are 2 sentences to translate # - the first sentence has 1 translation prefixes, the offsets are [0, 1) # - the second sentence has 2 translation prefixes, the offsets are [1, 3) and [3, 6) - # the target_word.lod is + # the target_word.lod is # [[0, 1, 6] # [0, 2, 4, 7, 9 12]] # which means 2 sentences to translate, each has 1 and 5 prefixes @@ -154,37 +145,36 @@ def generate(): translation_ids, translation_scores = decoder() ``` -The `decoder.beam_search` is a operator that given the candidates and the scores of translations including the candidates, -return the result of the beam search algorithm. +The `decoder.beam_search` is an operator that, given the candidates and the scores of translations including the candidates, +returns the result of the beam search algorithm. -In this way, users can customize anything on the inputs or outputs of beam search, for example, two ways to prune some translation prefixes +In this way, users can customize anything on the input or output of beam search, for example: -1. meke the correspondind elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. -2. remove some specific candidate in `selected_ids` -3. get the final `translation_ids`, remove the translation sequence in it. +1. Make the corresponding elements in `topk_generated_scores` zero or some small values, beam_search will discard this candidate. +2. Remove some specific candidate in `selected_ids`. +3. Get the final `translation_ids`, remove the translation sequence in it. 
The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), -so the python syntax is quite similar to a [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). +so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). -Both of them are two-level `LoDTensors` +Both of them are two-level `LoDTensors`: -- the first level represents `batch_size` of (source) sentences; -- the second level represents the candidate ID sets for translation prefix. +- The first level represents `batch_size` of (source) sentences. +- The second level represents the candidate ID sets for translation prefix. -for example, 3 source sentences to translate, and has 2, 3, 1 candidates. +For example, 3 source sentences to translate, and has 2, 3, 1 candidates. -Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, -a `lod_expand` operator is used to expand the LoD of the previous state to fit the current state. +Unlike an RNN, in sequence decoder, the previous state and the current state have different LoD and shape, and an `lod_expand` operator is used to expand the LoD of the previous state to fit the current state. 
-For example, the previous state +For example, the previous state: * LoD is `[0, 1, 3][0, 2, 5, 6]` * content of tensor is `a1 a2 b1 b2 b3 c1` -the current state stored in `encoder_ctx_expanded` +the current state is stored in `encoder_ctx_expanded`: * LoD is `[0, 2, 7][0 3 5 8 9 11 11]` -* the content is +* the content is - a1 a1 a1 (a1 has 3 candidates, so the state should be copied 3 times for each candidates) - a2 a2 - b1 b1 b1 @@ -192,54 +182,48 @@ the current state stored in `encoder_ctx_expanded` - b3 b3 - None (c1 has 0 candidates, so c1 is dropped) -Benefit from the relative offset LoD, empty candidate set can be represented naturally. +The benefit from the relative offset LoD is that the empty candidate set can be represented naturally. -the status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor, the corresponding syntax is +The status in each time step can be stored in `TensorArray`, and `Pack`ed to a final LoDTensor. The corresponding syntax is: ```python decoder.output(selected_ids) decoder.output(selected_generation_scores) ``` -the `selected_ids` is the candidate ids for the prefixes, -it will be `Packed` by `TensorArray` to a two-level `LoDTensor`, -the first level represents the source sequences, -the second level represents generated sequences. +The `selected_ids` are the candidate ids for the prefixes, and will be `Packed` by `TensorArray` to a two-level `LoDTensor`, where the first level represents the source sequences and the second level represents generated sequences. -Pack the `selected_scores` will get a `LoDTensor` that stores scores of each candidate of translations. +Packing the `selected_scores` will get a `LoDTensor` that stores scores of each translation candidate. -Pack the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. +Packing the `selected_generation_scores` will get a `LoDTensor`, and each tail is the probability of the translation. 
## LoD and shape changes during decoding

-According the image above, the only phrase to change LoD is beam search. +According to the image above, the only phase that changes the LoD is beam search. ## Beam search design -The beam search algorthm will be implemented as one method of the sequence decoder, it has 3 inputs +The beam search algorithm will be implemented as one method of the sequence decoder and has 3 inputs: -1. `topk_ids`, top K candidate ids for each prefix. +1. `topk_ids`, the top K candidate ids for each prefix. 2. `topk_scores`, the corresponding scores for `topk_ids` 3. `generated_scores`, the score of the prefixes. -All of the are LoDTensors, so that the sequence affilication is clear. -Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix. +All of these are LoDTensors, so that the sequence affiliation is clear. Beam search will keep a beam for each prefix and select a smaller candidate set for each prefix. -It will return three variables +It will return three variables: 1. `selected_ids`, the final candidate beam search function selected for the next step. 2. `selected_scores`, the scores for the candidates. -3. `generated_scores`, the updated scores for each prefixes (with the new candidates appended). +3. `generated_scores`, the updated scores for each prefix (with the new candidates appended). ## Introducing the LoD-based `Pack` and `Unpack` methods in `TensorArray` -The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors, -and they exist in each time step, +The `selected_ids`, `selected_scores` and `generated_scores` are LoDTensors that exist at each time step, so it is natural to store them in arrays. -Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors, -the results of beam search are better to store in a `TensorArray`. +Currently, PaddlePaddle has a module called `TensorArray` which can store an array of tensors. 
It is better to store the results of beam search in a `TensorArray`. -The `Pack` and `UnPack` in `TensorArray` are used to package tensors in the array to a `LoDTensor` or split the `LoDTensor` to an array of tensors. -It needs some extensions to support pack or unpack an array of `LoDTensors`. +The `Pack` and `UnPack` in `TensorArray` are used to pack tensors in the array to an `LoDTensor` or split the `LoDTensor` to an array of tensors. +It needs some extensions to support the packing or unpacking an array of `LoDTensors`. From e877cdb8f930cbcd4112a9224232efd898a780b5 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 13 Nov 2017 23:06:07 -0800 Subject: [PATCH 021/275] add float16 arithmetic on arm cpu --- paddle/math/float16.h | 479 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 389 insertions(+), 90 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 5fe2854969..ae7d9754aa 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +// need to define PADDLE_ARM_FP16 + #pragma once #include @@ -24,6 +26,18 @@ limitations under the License. */ #include "Eigen/src/Core/arch/CUDA/Half.h" #endif +#ifdef __GNUC__ +#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) +#else +#define PADDLE_GNUC_VER 0 +#endif // __GNUC__ + +#ifdef __clang__ +#define PADDLE_CLANG_VER (__clang_major__ * 10 + __clang_minor__) +#else +#define PADDLE_CLANG_VER 0 +#endif // __clang__ + #ifdef __CUDACC__ #define PADDLE_HOSTDEVICE __host__ __device__ #if CUDA_VERSION >= 7050 @@ -48,6 +62,7 @@ limitations under the License. */ #if defined(__ARM_NEON) || defined(__ARM_NEON__) #define PADDLE_NEON +#include #endif #if defined(PADDLE_NEON) && defined(PADDLE_ARM_32) @@ -58,26 +73,16 @@ limitations under the License. 
*/ #define PADDLE_NEON_64 #endif -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) -#include -#endif - -#if !defined(__ANDROID__) && !defined(__APPLE__) && !defined(PADDLE_ARM) -#include -#else +#ifdef PADDLE_ARM #ifdef __F16C__ #undef __F16C__ -#endif -#endif +#endif // __F16C__ +#else +#include +#endif // PADDLE_ARM #define PADDLE_ALIGN(x) __attribute__((aligned(x))) -// https://github.com/pytorch/pytorch/blob/master/torch/lib/ATen/Half.h -template -To convert(From f) { - return static_cast(f); -} - namespace paddle { struct float16; @@ -86,13 +91,12 @@ namespace fp16_impl { // convert from float to half precision in round-to-nearest-even mode PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); PADDLE_HOSTDEVICE inline float half_to_float(float16 h); -PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x); } // namespace fp16_impl // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated // and aligned at least on a 2-byte boundary, which leads to efficient // memory access of float16 struct and also makes float16 compatible -// with CUDA half and Eigen::half data types. +// with CUDA half, ARM float16_t, and Eigen::half data types. 
struct PADDLE_ALIGN(2) float16 { uint16_t x; @@ -103,7 +107,7 @@ struct PADDLE_ALIGN(2) float16 { PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16(const half h) { + PADDLE_HOSTDEVICE inline float16(const half& h) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&h)->x; #else @@ -111,40 +115,72 @@ struct PADDLE_ALIGN(2) float16 { #endif // CUDA_VERSION >= 9000 } #endif // PADDLE_CUDA_FP16 -/* -#ifdef PADDLE_CUDA_FP16 - #if CUDA_VERSION < 9000 - PADDLE_HOSTDEVICE inline float16(const half& h) : x(h.x) {} - #else - PADDLE_HOSTDEVICE inline float16(const __half_raw& h) : x(h.x) {} - PADDLE_HOSTDEVICE inline float16(const half& h) - : x(*reinterpret_cast(&h)) {} - #endif // CUDA_VERSION < 9000 -#endif // PADDLE_CUDA_FP16 -*/ #ifdef USE_EIGEN PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} #endif // USE_EIGEN -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) +#ifdef PADDLE_NEON // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 in arm_fp16.h - // which is included in arm_neon.h - PADDLE_HOSTDEVICE inline float16(const float16_t h) { - x = *reinterpret_cast(&h); + // float16_t is an alias for __fp16 in arm_fp16.h, + // which is included in arm_neon.h. + // According to gcc, __fp16 can only be used as an argument to fp16 + // intrinsic defined in arm_neon.h or as a storage type. It cannot + // be used as a formal function argument. + // TODO (kexinzhao): test it on RPI + PADDLE_HOSTDEVICE inline float16(const float16_t* h) { + x = *reinterpret_cast(h); } #endif PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} + PADDLE_HOSTDEVICE inline explicit float16(int8_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint8_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(int64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + + PADDLE_HOSTDEVICE inline explicit float16(uint64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + } + PADDLE_HOSTDEVICE inline explicit float16(float val) { float16 res = fp16_impl::float_to_half_rn(val); x = res.x; } - template - PADDLE_HOSTDEVICE inline explicit float16(const T& val) { + PADDLE_HOSTDEVICE inline explicit float16(double val) { float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; } @@ -155,7 +191,7 @@ struct PADDLE_ALIGN(2) float16 { } #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16& operator=(const half rhs) { + PADDLE_HOSTDEVICE inline float16& operator=(const half& rhs) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&rhs)->x; #else @@ -172,27 +208,80 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) - PADDLE_HOSTDEVICE inline float16& operator=(const float16_t rhs) { - x = *reinterpret_cast(&rhs); +#ifdef PADDLE_NEON + 
PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { + x = *reinterpret_cast(rhs); return *this; } #endif -/* - PADDLE_HOSTDEVICE inline explicit float16(int val) { + PADDLE_HOSTDEVICE inline float16& operator=(bool b) { + x = b ? 0x3c00 : 0; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int8_t val) { float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; + return *this; } - PADDLE_HOSTDEVICE inline explicit float16(double val) { + PADDLE_HOSTDEVICE inline float16& operator=(uint8_t val) { float16 res = fp16_impl::float_to_half_rn(static_cast(val)); x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(uint16_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(uint32_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(int64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(uint64_t val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(float val) { + float16 res = fp16_impl::float_to_half_rn(val); + x = res.x; + return *this; + } + + PADDLE_HOSTDEVICE inline float16& operator=(double val) { + float16 res = fp16_impl::float_to_half_rn(static_cast(val)); + x = res.x; + return *this; } -*/ #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline operator half() { + PADDLE_HOSTDEVICE inline 
operator half() const { #if CUDA_VERSION >= 9000 __half_raw h; h.x = x; @@ -206,82 +295,270 @@ struct PADDLE_ALIGN(2) float16 { #endif // PADDLE_CUDA_FP16 #ifdef USE_EIGEN - PADDLE_HOSTDEVICE inline operator Eigen::half() { + PADDLE_HOSTDEVICE inline operator Eigen::half() const { Eigen::half h; h.x = x; return h; } #endif // USE_EIGEN -#if defined(PADDLE_ARM) && defined(PADDLE_NEON) - PADDLE_HOSTDEVICE inline operator float16_t() { +#ifdef PADDLE_NEON + // check whether it works or not + PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; return *reinterpret_cast(&h); } #endif - PADDLE_HOSTDEVICE inline explicit operator bool() { + PADDLE_HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } - PADDLE_HOSTDEVICE inline explicit operator int8_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint8_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int16_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint16_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int32_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint32_t() { - return 
static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int64_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint64_t() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(fp16_impl::half_to_float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator float() { + PADDLE_HOSTDEVICE inline explicit operator float() const { return fp16_impl::half_to_float(*this); } - PADDLE_HOSTDEVICE inline explicit operator double() { - return static_cat(fp16_impl::half_to_float(*this)); + PADDLE_HOSTDEVICE inline explicit operator double() const { + return static_cast(fp16_impl::half_to_float(*this)); } }; // arithmetic operators #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ inline float16 operator+(const float16& a, const float16& b) { - return float16(__hadd(a, b)); + return float16(__hadd(half(a), half(b))); } __device__ inline float16 operator-(const float16& a, const float16& b) { - return __hsub(a, b); + return float16(__hsub(half(a), half(b))); } __device__ inline float16 operator*(const float16& a, const float16& b) { - return __hmul(a, b); + return float16(__hmul(half(a), half(b))); } -#elif // on arm cpu +__device__ inline float16 operator/(const float16& a, const float16& b) { + // TODO(kexinzhao): check the cuda version that starts to support __hdiv + // instinsic + float num = __half2float(half(a)); + float denom = __half2float(half(b)); + return float16(num / denom); +} -#else +__device__ inline float16 operator-(const float16& a) { + return float16(__hneg(half(a))); +} + +__device__ inline float16& 
operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +__device__ inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +__device__ inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +__device__ inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +__device__ inline bool operator==(const float16& a, const float16& b) { + return __heq(half(a), half(b)); +} + +__device__ inline bool operator!=(const float16& a, const float16& b) { + return __hne(half(a), half(b)); +} + +__device__ inline bool operator<(const float16& a, const float16& b) { + return __hlt(half(a), half(b)); +} + +__device__ inline bool operator<=(const float16& a, const float16& b) { + return __hle(half(a), half(b)); +} + +__device__ inline bool operator>(const float16& a, const float16& b) { + return __hgt(half(a), half(b)); +} + +__device__ inline bool operator>=(const float16& a, const float16& b) { + return __hge(half(a), half(b)); +} + +// On ARMv8.2-A CPU +#elif (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) && \ + defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) +__host__ inline float16 operator+(const float16& a, const float16& b) { + return float16(vaddh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator-(const float16& a, const float16& b) { + return float16(vsubh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator*(const float16& a, const float16& b) { + return float16(vmulh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator/(const float16& a, const float16& b) { + return float16(vdivh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline float16 operator-(const float16& a) { + return float16(vnegh_f16(float16_t(a))); +} + +__host__ inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +__host__ inline float16& operator-=(float16& a, const float16& b) { + a = 
a - b; + return a; +} + +__host__ inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +__host__ inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +__host__ inline bool operator==(const float16& a, const float16& b) { + return static_cast(vceqh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator!=(const float16& a, const float16& b) { + return !(a == b); +} + +// compare only available in NEON_64 +__host__ inline bool operator<(const float16& a, const float16& b) { + return static_cast(vclth_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator<=(const float16& a, const float16& b) { + return static_cast(vcleh_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator>(const float16& a, const float16& b) { + return static_cast(vcgth_f16(float16_t(a), float16_t(b))); +} + +__host__ inline bool operator>=(const float16& a, const float16& b) { + return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); +} + +#else // software emulation on other cpu +PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { + return float16(float(a) + float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { + return float16(float(a) - float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { + return float16(float(a) * float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { + return float16(float(a) / float(b)); +} + +PADDLE_HOSTDEVICE inline float16 operator-(const float16& a) { + float16 res; + res.x = a.x ^ 0x8000; + return res; +} + +PADDLE_HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { + a = float16(float(a) + float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { + a = float16(float(a) - float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline 
float16& operator*=(float16& a, const float16& b) { + a = float16(float(a) * float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { + a = float16(float(a) / float(b)); + return a; +} + +PADDLE_HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { + return float(a) == float(b); +} + +PADDLE_HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { + return float(a) != float(b); +} + +PADDLE_HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { + return float(a) < float(b); +} + +PADDLE_HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { + return float(a) <= float(b); +} + +PADDLE_HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { + return float(a) > float(b); +} + +PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { + return float(a) >= float(b); +} #endif @@ -320,16 +597,11 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { half tmp = __float2half(f); return *reinterpret_cast(&(tmp)); -#elif defined(__F16C__) - float16 res; - res.x = _cvtss_sh(f, 0); - return res; - -#elif defined(PADDLE_ARM_64) // test on RPI +#elif defined(PADDLE_NEON_64) // test on RPI float16 res; asm volatile( "ld1 {v0.s}[0], [%[float_ptr]]\n" - "FCVT h0, s0\n" + "fcvt h0, s0\n" "st1 {v0.h}[0], [%[half_ptr]]\n" : // outputs : // inputs @@ -339,6 +611,25 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { "memory", "v0"); return res; +#elif defined(PADDLE_NEON_32) // test on RPI + float16 res; + asm volatile( + "vld1.32 {d0[0]}, [%[float_ptr]]\n" + "vcvt.f16.f32 d0, q0\n" + "vst1.16 {d0[0]}, [%[half_ptr]]\n" + : // outputs + : // inputs + [float_ptr] "r"(&f), + [half_ptr] "r"(&(res.x)) + : // clobbers + "memory", "d0"); + return res; + +#elif defined(__F16C__) + float16 res; + res.x = _cvtss_sh(f, 0); + return res; + #else // Conversion routine adapted from // 
http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion @@ -367,10 +658,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { half tmp = *reinterpret_cast(&h); return __half2float(h); -#elif defined(__F16C__) - return _cvtsh_ss(h.x); - -#elif defined(PADDLE_ARM_64) // test on RPI +#elif defined(PADDLE_NEON_64) float res; asm volatile( "ld1 {v0.h}[0], [%[half_ptr]]\n" @@ -384,6 +672,23 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { "memory", "v0"); return res; +#elif defined(PADDLE_NEON_32) + float res; + asm volatile( + "vld1.16 {d0[0]}, [%[half_ptr]]\n" + "vcvt.f32.f16 q0, d0\n" + "vst1.32 {d0[0]}, [%[float_ptr]]\n" + : // outputs + : // inputs + [half_ptr] "r"(&(h.x)), + [float_ptr] "r"(&res) + : // clobbers + "memory", "v0"); + return res; + +#elif defined(__F16C__) + return _cvtsh_ss(h.x); + #else // Conversion routine adapted from // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion @@ -406,12 +711,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { #endif } -PADDLE_HOSTDEVICE inline float16 uint16_to_half(uint16_t x) { - float16 res; - res.x = x; - return res; -} - } // namespace half_impl } // namespace paddle From b341636f7e3ac8a8d2062e63c86c63063bd2f206 Mon Sep 17 00:00:00 2001 From: Kavya Srinet Date: Tue, 14 Nov 2017 10:02:18 -0800 Subject: [PATCH 022/275] Fixing the captioning on 2 level RNN --- doc/design/ops/images/2_level_rnn.dot | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/design/ops/images/2_level_rnn.dot b/doc/design/ops/images/2_level_rnn.dot index a498e882a3..5d77865061 100644 --- a/doc/design/ops/images/2_level_rnn.dot +++ b/doc/design/ops/images/2_level_rnn.dot @@ -1,6 +1,6 @@ digraph G { - rnn [label="1-th level RNN" shape=box] + rnn [label="1st level RNN" shape=box] subgraph cluster0 { label = "time step 0" @@ -8,7 +8,7 @@ digraph G { sent0 [label="sentence"] sent1 [label="sentence"] - rnn1 [label="2-th level 
RNN" shape=box] + rnn1 [label="2nd level RNN" shape=box] sent0 -> rnn1 sent1 -> rnn1 @@ -20,7 +20,7 @@ digraph G { sent2 [label="sentence"] sent3 [label="sentence"] - rnn2 [label="2-th level RNN" shape=box] + rnn2 [label="2nd level RNN" shape=box] sent2 -> rnn2 sent3 -> rnn2 @@ -32,7 +32,7 @@ digraph G { sent4 [label="sentence"] sent5 [label="sentence"] - rnn3 [label="2-th level RNN" shape=box] + rnn3 [label="2nd level RNN" shape=box] sent4 -> rnn3 sent5 -> rnn3 From 9f2dbc4b5ab45eff990a3c3a6a21664798fe3680 Mon Sep 17 00:00:00 2001 From: Kavya Srinet Date: Tue, 14 Nov 2017 10:11:18 -0800 Subject: [PATCH 023/275] pushing after a pull --- doc/design/ops/sequence_decoder.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/design/ops/sequence_decoder.md b/doc/design/ops/sequence_decoder.md index bb945ae48b..9db5fb8e9a 100644 --- a/doc/design/ops/sequence_decoder.md +++ b/doc/design/ops/sequence_decoder.md @@ -154,7 +154,7 @@ In this way, users can customize anything on the input or output of beam search, 2. Remove some specific candidate in `selected_ids`. 3. Get the final `translation_ids`, remove the translation sequence in it. -The implementation of sequence decoder can reuse the C++ class [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), +The implementation of sequence decoder can reuse the C++ class: [RNNAlgorithm](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/paddle/operators/dynamic_recurrent_op.h#L30), so the python syntax is quite similar to that of an [RNN](https://github.com/Superjom/Paddle/blob/68cac3c0f8451fe62a4cdf156747d6dc0ee000b3/doc/design/block.md#blocks-with-for-and-rnnop). Both of them are two-level `LoDTensors`: From 09d32b068cbdf65f93e98f7b357dbc7e90f11734 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 16 Nov 2017 00:01:55 +0800 Subject: [PATCH 024/275] Add unitest and comments. 
--- paddle/operators/nce_op.cc | 115 +++++++++++++------ paddle/operators/nce_op.h | 79 +++++++------ python/paddle/v2/framework/tests/test_nce.py | 96 ++++++++++++++++ 3 files changed, 212 insertions(+), 78 deletions(-) create mode 100644 python/paddle/v2/framework/tests/test_nce.py diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index afd61b8851..c365d5d922 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -23,57 +23,87 @@ class NCEOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X")); + PADDLE_ENFORCE(ctx->HasInput("Input")); PADDLE_ENFORCE(ctx->HasInput("Label")); - PADDLE_ENFORCE(ctx->HasInput("W")); - PADDLE_ENFORCE(ctx->HasOutput("Out")); + PADDLE_ENFORCE(ctx->HasInput("Weight")); + PADDLE_ENFORCE(ctx->HasOutput("Cost")); PADDLE_ENFORCE(ctx->HasOutput("SampleLogits")); PADDLE_ENFORCE(ctx->HasOutput("SampleLabels")); - auto x_dims = ctx->GetInputDim("X"); + auto x_dims = ctx->GetInputDim("Input"); auto label_dims = ctx->GetInputDim("Label"); PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]); - if (ctx->HasInput("B")) { - PADDLE_ENFORCE_EQ(ctx->GetInputDim("W")[0], ctx->GetInputDim("B")[0]); + int num_true_classes = label_dims.size() == 2 ? 
label_dims[1] : 1; + if (ctx->HasInput("Bias")) { + PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0], + ctx->GetInputDim("Bias")[0]); } - int num_sampled_classes = ctx->Attrs().Get("num_sampled_classes"); - int num_classes = ctx->Attrs().Get("num_classes"); - PADDLE_ENFORCE_EQ(num_classes, ctx->GetInputDim("W")[0]); + auto num_sampled_classes = ctx->Attrs().Get("num_sampled_classes"); + auto num_classes = ctx->Attrs().Get("num_classes"); + std::vector sampled_labels = + ctx->Attrs().Get>("sampled_labels"); + PADDLE_ENFORCE_EQ(num_classes, ctx->GetInputDim("Weight")[0]); PADDLE_ENFORCE_LT(num_sampled_classes, num_classes); - + if (sampled_labels.size() > 0) { + PADDLE_ENFORCE_EQ(sampled_labels.size(), + static_cast(num_sampled_classes)); + } // set dims of output(Out) - std::vector out_dims(1); + std::vector out_dims; out_dims.push_back(x_dims[0]); - ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + ctx->SetOutputDim("Cost", framework::make_ddim(out_dims)); // set dims of output(SampleOut) - std::vector sample_out_dims(2); + std::vector sample_out_dims; sample_out_dims.push_back(x_dims[0]); - sample_out_dims.push_back(num_sampled_classes + 1); + sample_out_dims.push_back(num_sampled_classes + num_true_classes); ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims)); ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims)); } + + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } }; class NCEOpMaker : public framework::OpProtoAndCheckerMaker { public: NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", ""); - AddInput("Label", ""); - AddInput("W", ""); - AddInput("B", ""); - AddInput("SampleWeight", ""); - AddOutput("Out", ""); - AddOutput("SampleLogits", ""); - 
AddOutput("SampleLabels", ""); - AddAttr("num_classes", ""); - AddAttr("num_sampled_classes", "").SetDefault(10); + AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim]."); + AddInput("Label", + "(Tensor) A tensor of shape [batch_size, num_true_class]. " + "'num_true_class' is the number of target class in each sample."); + AddInput("Weight", + "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the " + "total number of class."); + AddInput("Bias", + "(Tensor) A tensor of shape [num_class]. 'num_class' is the total " + "number of class. It is a dispensable input.") + .AsDispensable(); + AddInput("SampleWeight", + "(Tensor) A tensor of shape [batch_size] storing a weight for " + "each sample. And it is a dispensable input. The default value of " + "sample is 1.") + .AsDispensable(); + AddOutput("Cost", + "(Tensor) A tensor of shape [batch_size]. Cost of samples."); + AddOutput("SampleLogits", "An intermediate tensor.").AsIntermediate(); + AddOutput("SampleLabels", "An intermediate tensor.").AsIntermediate(); + AddAttr("num_classes", "Total number of classes."); + AddAttr("num_sampled_classes", "The number of negative classes.") + .SetDefault(10); + AddAttr>("sampled_labels", ""); AddComment(R"DOC( -Expand input(X) according to LOD of input(Y). - +Computes and returns the noise-contrastive estimation training loss. +See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). +By default this uses a uniform distribution for sampling. +The number of target classes per example should be same. If you have a variable number of target classes, you can pad them out to a constant number by either repeating them or by padding with an otherwise unused class. 
)DOC"); } }; @@ -82,32 +112,41 @@ class NCEOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - protected: void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X")); - PADDLE_ENFORCE(ctx->HasInput("W")); - PADDLE_ENFORCE(ctx->HasInput("Out")); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + PADDLE_ENFORCE(ctx->HasInput("Input")); + PADDLE_ENFORCE(ctx->HasInput("Weight")); + PADDLE_ENFORCE(ctx->HasInput("Cost")); + PADDLE_ENFORCE(ctx->HasInput("SampleLogits")); + PADDLE_ENFORCE(ctx->HasInput("SampleLabels")); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")), "The input(Out@GRAD) should not be null"); - auto x_dims = ctx->GetInputDim("X"); - auto x_grad_name = framework::GradVarName("X"); + auto x_dims = ctx->GetInputDim("Input"); + auto x_grad_name = framework::GradVarName("Input"); if (ctx->HasOutput(x_grad_name)) { ctx->SetOutputDim(x_grad_name, x_dims); } - auto w_dims = ctx->GetInputDim("W"); - auto w_grad_name = framework::GradVarName("W"); + auto w_dims = ctx->GetInputDim("Weight"); + auto w_grad_name = framework::GradVarName("Weight"); if (ctx->HasOutput(w_grad_name)) { ctx->SetOutputDim(w_grad_name, w_dims); } - auto bias_grad_name = framework::GradVarName("B"); + auto bias_grad_name = framework::GradVarName("Bias"); if (ctx->HasOutput(bias_grad_name)) { - auto bias_dims = ctx->GetInputDim("B"); + auto bias_dims = ctx->GetInputDim("Bias"); ctx->SetOutputDim(bias_grad_name, bias_dims); } } + + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index ce1717c9b0..3017bccdca 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -14,12 
+14,11 @@ #pragma once +#include #include #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" -#include "paddle/memory/memcpy.h" #include "unsupported/Eigen/CXX11/Tensor" - namespace paddle { namespace operators { @@ -32,9 +31,12 @@ using EigenMatrix = framework::EigenMatrix; template void PrepareSamples(const framework::ExecutionContext& context) { auto label = context.Input("Label"); - const T* label_data = label->data(); + const int64_t* label_data = label->data(); auto label_dims = label->dims(); int num_classes = context.Attr("num_classes"); + // for unitest + std::vector sampled_labels = + context.Attr>("sampled_labels"); // random machine std::random_device rd; std::mt19937 rng(rd()); @@ -42,19 +44,24 @@ void PrepareSamples(const framework::ExecutionContext& context) { auto sample_labels = context.Output("SampleLabels"); auto sample_labels_dims = sample_labels->dims(); - int* sample_labels_data = - sample_labels->mutable_data(context.GetPlace()); + int64_t* sample_labels_data = + sample_labels->mutable_data(context.GetPlace()); int num_label = label_dims.size() == 2 ? 
label_dims[1] : 1; + int index = 0; for (size_t i = 0; i < label_dims[0]; ++i) { int j = 0; for (; j < num_label; ++j) { - sample_labels_data[sample_labels_dims[1] * i + j] = - label_data[i * num_label + j]; + sample_labels_data[index++] = label_data[i * num_label + j]; } - for (; j < sample_labels_dims[1]; ++j) { - int id = rand(rng); - sample_labels_data[sample_labels_dims[1] * i + j] = id; + if (sampled_labels.size() > 0) { + for (auto label : sampled_labels) { + sample_labels_data[index++] = label; + } + } else { + for (; j < sample_labels_dims[1]; ++j) { + sample_labels_data[index++] = rand(rng); + } } } } @@ -65,7 +72,7 @@ class NCEKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { PrepareSamples(context); auto sample_labels = context.Output("SampleLabels"); - const int* sample_labels_data = sample_labels->data(); + const int64_t* sample_labels_data = sample_labels->data(); auto sample_out = context.Output("SampleLogits"); T* sample_out_data = sample_out->mutable_data(context.GetPlace()); auto label = context.Input("Label"); @@ -74,7 +81,7 @@ class NCEKernel : public framework::OpKernel { if (sample_weight != nullptr) { sample_weight_data = sample_weight->data(); } - auto out = context.Output("Out"); + auto out = context.Output("Cost"); T* out_data = out->mutable_data(context.GetPlace()); int num_smalped_classes = context.Attr("num_sampled_classes"); int num_classes = context.Attr("num_classes"); @@ -83,9 +90,8 @@ class NCEKernel : public framework::OpKernel { num_true_class = label->dims()[1]; } T b = 1. 
/ num_classes * num_smalped_classes; - // forward bias - auto bias = context.Input("B"); + auto bias = context.Input("Bias"); if (bias != nullptr) { const T* bias_data = bias->data(); for (size_t i = 0; i < sample_labels->numel(); ++i) { @@ -96,27 +102,23 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] = 0; } } - // forward mul - auto input_mat = EigenMatrix::From(*(context.Input("X"))); - auto weight_mat = EigenMatrix::From(*(context.Input("W"))); + auto input_mat = EigenMatrix::From(*(context.Input("Input"))); + auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); for (size_t i = 0; i < sample_labels->numel(); ++i) { - // sample_out_data[i] += (input_mat.chip((int)(i / - // sample_labels->dims()[1]), 0) * weight_mat.chip(sample_labels_data[i], - // 0)).sum(); Eigen::Tensor result = (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) * weight_mat.chip(sample_labels_data[i], 0)) .sum(); sample_out_data[i] += result(0); // activation_->forward - sample_out_data[i] = (1 / 1 + (sample_out_data[i])); + sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } - // forward cost for (size_t i = 0; i < sample_labels->dims()[0]; ++i) { size_t j = 0; - T w = sample_weight == nullptr ? 1 : sample_weight_data[i]; + out_data[i] = 0; + T w = sample_weight == nullptr ? 1. 
: sample_weight_data[i]; // for true classes for (; j < num_true_class; ++j) { T o = sample_out_data[i * sample_out->dims()[1] + j]; @@ -137,11 +139,13 @@ template class NCEGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto d_out = context.Input(framework::GradVarName("Cost")); + const T* d_out_data = d_out->data(); auto label = context.Input("Label"); auto sample_out = context.Input("SampleLogits"); const T* sample_out_data = sample_out->data(); auto sample_labels = context.Input("SampleLabels"); - const int* sample_labels_data = sample_labels->data(); + const int64_t* sample_labels_data = sample_labels->data(); auto sample_weight = context.Input("SampleWeight"); const T* sample_weight_data = nullptr; if (sample_weight != nullptr) { @@ -154,11 +158,9 @@ class NCEGradKernel : public framework::OpKernel { num_true_class = label->dims()[1]; } T b = 1. / num_classes * num_smalped_classes; - Tensor sample_grad; // tmp tensor T* sample_grad_data = sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); - // backward cost for (size_t i = 0; i < sample_labels->numel(); ++i) { T o = sample_out_data[i]; @@ -166,15 +168,12 @@ class NCEGradKernel : public framework::OpKernel { ? 1 : sample_weight_data[i / sample_labels->dims()[1]]; sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class - ? -w * b / (o * (o + b)) - : w / (o + b); - // sigmoid->backward - sample_grad_data[i] = - (o > 0) ? sample_grad_data[i] : ((o < 0) ? -sample_grad_data[i] : 0); + ? 
w * (b / (o + b)) * (o - 1) + : w * (o * (1 - o) / (o + b)); + sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]]; } - // get d_bias - auto d_bias = context.Output(framework::GradVarName("B")); + auto d_bias = context.Output(framework::GradVarName("Bias")); if (d_bias != nullptr) { T* d_bias_data = d_bias->mutable_data(context.GetPlace()); for (size_t i = 0; i < sample_labels->numel(); ++i) { @@ -182,22 +181,23 @@ class NCEGradKernel : public framework::OpKernel { } } // get d_w - auto d_w = context.Output(framework::GradVarName("W")); + auto d_w = context.Output(framework::GradVarName("Weight")); if (d_w != nullptr) { + d_w->mutable_data(context.GetPlace()); auto d_w_matrix = EigenMatrix::From(*d_w); - auto x_matrix = EigenMatrix::From(*(context.Input("X"))); + auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); for (size_t i = 0; i < sample_labels->numel(); ++i) { - d_w_matrix.chip(sample_labels_data[i], 0) = + d_w_matrix.chip(sample_labels_data[i], 0) += x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) * sample_grad_data[i]; } } - // get d_x - auto d_x = context.Output(framework::GradVarName("X")); + auto d_x = context.Output(framework::GradVarName("Input")); if (d_x != nullptr) { + d_x->mutable_data(context.GetPlace()); auto d_x_matrix = EigenMatrix::From(*d_x); - auto w_matrix = EigenMatrix::From(*(context.Input("W"))); + auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); for (size_t i = 0; i < sample_labels->numel(); ++i) { d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) += w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; @@ -205,6 +205,5 @@ class NCEGradKernel : public framework::OpKernel { } } }; - } // namespace operators } // namespace paddle diff --git a/python/paddle/v2/framework/tests/test_nce.py b/python/paddle/v2/framework/tests/test_nce.py new file mode 100644 index 0000000000..8b1e7a6bb5 --- /dev/null +++ b/python/paddle/v2/framework/tests/test_nce.py @@ -0,0 +1,96 @@ +import unittest 
+import numpy as np +from op_test import OpTest + + +def nce(input, weight, bias, sample_weight, labels, num_classes, + num_sample_class): + samples = [] + sample_labels = [] + batch_size = input.shape[0] + num_true_class = labels.shape[1] + for i in range(batch_size): + w = 1 if sample_weight is None else sample_weight[i] + for label in labels[i]: + samples.append((i, label, True, w)) + sample_labels.append(label) + for num in range(num_sample_class): + samples.append((i, num, False, w)) + sample_labels.append(num) + # forward bias + sampleOut = np.zeros(len(samples)).astype(np.float32) + if bias is not None: + for i in range(len(samples)): + sampleOut[i] = bias[samples[i][1]] + # forward weight + for i in range(len(samples)): + sampleOut[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) + + # forward activation + sampleOut = 1.0 / (1.0 + np.exp(-sampleOut)) + # forward cost + out = np.zeros(batch_size).astype(np.float32) + b = 1.0 / num_classes * num_sample_class + for i in range(len(samples)): + o = sampleOut[i] + cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) + out[samples[i][0]] += cost * samples[i][3] + return (out, np.array(sampleOut).reshape(batch_size, + num_sample_class + num_true_class), + np.array(sample_labels).reshape(batch_size, + num_sample_class + num_true_class)) + + +class TestNCE(OpTest): + def generate_data(self, dim, batch_size, num_classes, num_true_class, + num_sampled_classes): + input = np.random.randn(batch_size, dim).astype(np.float32) + weight = np.random.randn(num_classes, dim).astype(np.float32) + bias = np.random.randn(num_classes).astype(np.float32) + sample_weight = np.random.randn(batch_size).astype(np.float32) + labels = np.random.randint(0, num_classes, (batch_size, num_true_class)) + self.attrs = { + 'num_classes': num_classes, + 'num_sampled_classes': num_sampled_classes, + 'sampled_labels': range(num_sampled_classes) + } + self.inputs = { + 'X': input, + 'Label': labels, + 'W': weight, + 'B': 
bias, + 'SampleWeight': sample_weight + } + + def set_data(self): + self.generate_data(5, 5, 4, 1, 2) + + def compute(self): + out = nce(self.inputs['X'], self.inputs['W'], self.inputs['B'], + self.inputs['SampleWeight'], self.inputs['Label'], + self.attrs['num_classes'], self.attrs['num_sampled_classes']) + self.outputs = { + 'Out': out[0], + 'SampleLogits': out[1], + 'SampleLabels': out[2] + } + + def setUp(self): + self.op_type = 'nce' + self.set_data() + self.compute() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X", "W", "B"], "Out", max_relative_error=0.02) + + +class TestNCECase1(TestNCE): + def set_data(self): + self.generate_data(10, 20, 10, 2, 5) + + +if __name__ == '__main__': + unittest.main() From e60eb1eacdac476b52cbd029660249fe709b7196 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 16 Nov 2017 00:45:36 +0800 Subject: [PATCH 025/275] fix unitest --- .../v2/{framework => fluid}/tests/test_nce.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) rename python/paddle/v2/{framework => fluid}/tests/test_nce.py (86%) diff --git a/python/paddle/v2/framework/tests/test_nce.py b/python/paddle/v2/fluid/tests/test_nce.py similarity index 86% rename from python/paddle/v2/framework/tests/test_nce.py rename to python/paddle/v2/fluid/tests/test_nce.py index 8b1e7a6bb5..82978f2d23 100644 --- a/python/paddle/v2/framework/tests/test_nce.py +++ b/python/paddle/v2/fluid/tests/test_nce.py @@ -55,10 +55,10 @@ class TestNCE(OpTest): 'sampled_labels': range(num_sampled_classes) } self.inputs = { - 'X': input, + 'Input': input, 'Label': labels, - 'W': weight, - 'B': bias, + 'Weight': weight, + 'Bias': bias, 'SampleWeight': sample_weight } @@ -66,11 +66,12 @@ class TestNCE(OpTest): self.generate_data(5, 5, 4, 1, 2) def compute(self): - out = nce(self.inputs['X'], self.inputs['W'], self.inputs['B'], - self.inputs['SampleWeight'], self.inputs['Label'], - self.attrs['num_classes'], 
self.attrs['num_sampled_classes']) + out = nce(self.inputs['Input'], self.inputs['Weight'], + self.inputs['Bias'], self.inputs['SampleWeight'], + self.inputs['Label'], self.attrs['num_classes'], + self.attrs['num_sampled_classes']) self.outputs = { - 'Out': out[0], + 'Cost': out[0], 'SampleLogits': out[1], 'SampleLabels': out[2] } @@ -84,7 +85,8 @@ class TestNCE(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(["X", "W", "B"], "Out", max_relative_error=0.02) + self.check_grad( + ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02) class TestNCECase1(TestNCE): From af37838edf4a3ad3c1f098d4026218c130258ac2 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Wed, 15 Nov 2017 22:48:01 -0800 Subject: [PATCH 026/275] add test for float16 --- paddle/math/float16.h | 16 ++++++++-------- paddle/math/tests/CMakeLists.txt | 3 ++- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index ae7d9754aa..e9d4e6737d 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include +#define USE_EIGEN #ifdef USE_EIGEN // delete this #if macro #include "Eigen/src/Core/arch/CUDA/Half.h" @@ -100,8 +100,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h); struct PADDLE_ALIGN(2) float16 { uint16_t x; - // explicit for different types, implicit for half and Eigen::half - PADDLE_HOSTDEVICE inline float16() {} PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} @@ -120,7 +118,8 @@ struct PADDLE_ALIGN(2) float16 { PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} #endif // USE_EIGEN -#ifdef PADDLE_NEON +#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ + defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. 
@@ -208,7 +207,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#ifdef PADDLE_NEON +#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ + defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { x = *reinterpret_cast(rhs); return *this; @@ -302,7 +302,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#ifdef PADDLE_NEON +#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ + defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) // check whether it works or not PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; @@ -371,7 +372,6 @@ __device__ inline float16 operator*(const float16& a, const float16& b) { __device__ inline float16 operator/(const float16& a, const float16& b) { // TODO(kexinzhao): check the cuda version that starts to support __hdiv - // instinsic float num = __half2float(half(a)); float denom = __half2float(half(b)); return float16(num / denom); @@ -595,7 +595,7 @@ constexpr int32_t minD = minC - subC - 1; PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 half tmp = __float2half(f); - return *reinterpret_cast(&(tmp)); + return *reinterpret_cast(&tmp); #elif defined(PADDLE_NEON_64) // test on RPI float16 res; diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index d8b7f9e3fc..ab4ac38b3c 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -21,7 +21,7 @@ if(WITH_GPU) CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) - link_paddle_test(test_lazyAssign) + link_paddle_test(test_lazyAssign) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) @@ -33,3 +33,4 @@ add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) 
add_simple_unittest(test_Matrix) +add_simple_unittest(test_float16) From 7a1a586355844eb18fb6c87304cee5bbf70d078d Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Thu, 16 Nov 2017 17:15:03 +0800 Subject: [PATCH 027/275] Update variable names and docs for factorization machine layer --- .../layers/FactorizationMachineLayer.cpp | 110 +++++++++--------- .../layers/FactorizationMachineLayer.h | 31 +++-- paddle/gserver/tests/test_LayerGrad.cpp | 1 + paddle/math/CpuSparseMatrix.cpp | 8 +- .../paddle/trainer_config_helpers/layers.py | 14 ++- 5 files changed, 94 insertions(+), 70 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index 3bd8d7cb4c..f0f1738f30 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -32,12 +32,10 @@ bool FactorizationMachineLayer::init(const LayerMap& layerMap, /* initialize the latentVectors_ */ CHECK_EQ(inputLayers_.size(), 1UL); - size_t height = inputLayers_[0]->getSize(); - CHECK_EQ(parameters_[0]->getSize(), height * factorSize_); - latentVectors_ = - std::unique_ptr(new Weight(height, factorSize_, parameters_[0])); - - v2_ = Matrix::create(height, factorSize_, false, useGpu_); + size_t inputSize = inputLayers_[0]->getSize(); + CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_); + latentVectors_ = std::unique_ptr( + new Weight(inputSize, factorSize_, parameters_[0])); return true; } @@ -48,79 +46,85 @@ void FactorizationMachineLayer::forward(PassType passType) { const MatrixPtr& inputV = getInputValue(0); size_t batchSize = inputV->getHeight(); - size_t size = getSize(); - reserveOutput(batchSize, size); + size_t outputSize = getSize(); + size_t inputSize = inputLayers_[0]->getSize(); + reserveOutput(batchSize, outputSize); MatrixPtr outV = getOutputValue(); - Matrix::resizeOrCreate(tmpMul_, batchSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate( + 
latentVectorsSquare_, inputSize, factorSize_, false, useGpu_); + Matrix::resizeOrCreate( + inputMulFactor_, batchSize, factorSize_, false, useGpu_); Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); - REGISTER_TIMER_INFO("FwMulTimer", getName().c_str()); - tmpMul_->mul(*inputV, *latentVectors_->getW()); - tmpMul_->square2(*tmpOut_); + REGISTER_TIMER_INFO("InputMulFactorTimer", getName().c_str()); + inputMulFactor_->mul(*inputV, *latentVectors_->getW()); + inputMulFactor_->square2(*tmpOut_); outV->sumRows(*tmpOut_, 0.5, 0); - x2_ = inputV->clone(0, 0, useGpu_); - if (dynamic_cast(x2_.get())) { - x2_->copyFrom(*inputV); - (dynamic_cast(x2_.get()))->square2(); + inputSquare_ = inputV->clone(0, 0, useGpu_); + if (dynamic_cast(inputSquare_.get())) { + inputSquare_->copyFrom(*inputV); + (dynamic_cast(inputSquare_.get()))->square2(); } else { - inputV->square2(*x2_); + inputV->square2(*inputSquare_); } - latentVectors_->getW()->square2(*v2_); - tmpOut_->mul(*x2_, *v2_); + latentVectors_->getW()->square2(*latentVectorsSquare_); + tmpOut_->mul(*inputSquare_, *latentVectorsSquare_); outV->sumRows(*tmpOut_, -0.5, 1.0); /* activation */ { - REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); + REGISTER_TIMER_INFO("FmAtvTimer", getName().c_str()); forwardActivation(); } } void FactorizationMachineLayer::backward(const UpdateCallback& callback) { - /* Do derivation */ { - REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str()); - backwardActivation(); - } + /* Do derivation */ { backwardActivation(); } const MatrixPtr& inputV = getInputValue(0); const MatrixPtr& oGrad = getOutputGrad(); - MatrixPtr tmpSum = - Matrix::create(1, latentVectors_->getW()->getHeight(), false, useGpu_); - MatrixPtr tmpSum_T = Matrix::create(tmpSum->getRowBuf(0), - latentVectors_->getW()->getHeight(), - 1, - false, - useGpu_); + Matrix::resizeOrCreate( + tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_); + MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0), 
+ latentVectors_->getW()->getHeight(), + 1, + false, + useGpu_); /* Calculate the gradients of the latentVectors_ matrix */ if (latentVectors_->getWGrad()) { - MatrixPtr tmpIn = inputV->clone(0, 0, useGpu_); + MatrixPtr tmpInput = inputV->clone(0, 0, useGpu_); if (dynamic_cast(inputV.get())) { - CpuSparseMatrix* inputV_s = dynamic_cast(inputV.get()); - CpuSparseMatrix* x2_s = dynamic_cast(x2_.get()); - CpuSparseMatrix* tmpIn_s = dynamic_cast(tmpIn.get()); - tmpIn_s->copyFrom(*inputV_s); - tmpIn_s->rowScale(0, *inputV_s, *oGrad); - latentVectors_->getWGrad()->mul(*tmpIn_s->getTranspose(), *tmpMul_, 1, 1); - tmpIn_s->rowScale(0, *x2_s, *oGrad); - - MatrixPtr ones = Matrix::create(1, inputV->getHeight(), false, useGpu_); - ones->zeroMem(); - ones->add(-1); - tmpSum->mul(*ones, *tmpIn_s, 1, 0); + CpuSparseMatrix* sparseInputV = + dynamic_cast(inputV.get()); + CpuSparseMatrix* sparseInputSquare = + dynamic_cast(inputSquare_.get()); + CpuSparseMatrix* sparseTmpInput = + dynamic_cast(tmpInput.get()); + sparseTmpInput->copyFrom(*sparseInputV); + sparseTmpInput->rowScale(0, *sparseInputV, *oGrad); + latentVectors_->getWGrad()->mul( + *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1); + sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad); + + Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_); + negOnes_->zeroMem(); + negOnes_->add(-1); + tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0); } else { - tmpIn->rowScale(0, *inputV, *oGrad); - latentVectors_->getWGrad()->mul(*tmpIn->getTranspose(), *tmpMul_, 1, 1); - tmpIn->rowScale(0, *x2_, *oGrad); + tmpInput->rowScale(0, *inputV, *oGrad); + latentVectors_->getWGrad()->mul( + *tmpInput->getTranspose(), *inputMulFactor_, 1, 1); + tmpInput->rowScale(0, *inputSquare_, *oGrad); - tmpSum->sumCols(*tmpIn, -1, 0); + tmpSum_->sumCols(*tmpInput, -1, 0); } latentVectors_->getWGrad()->addRowScale( - 0, *latentVectors_->getW(), *tmpSum_T); + 0, *latentVectors_->getW(), *tmpSumTrans); /* Increasing the number of 
gradient */ latentVectors_->getParameterPtr()->incUpdate(callback); @@ -129,10 +133,10 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { /* Calculate the input layers gradient */ MatrixPtr inGrad = getInputGrad(0); if (inGrad != NULL) { - MatrixPtr latentVectors_T = latentVectors_->getW()->getTranspose(); - inGrad->mul(*tmpMul_, *latentVectors_T, 1, 1); - tmpSum_T->sumRows(*v2_, -1, 0); - inGrad->addColScale(0, *inputV, *tmpSum); + inGrad->mul( + *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1); + tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0); + inGrad->addColScale(0, *inputV, *tmpSum_); inGrad->rowScale(0, *inGrad, *oGrad); } } diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index 7cf064690f..85d40fdb1e 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -34,27 +34,36 @@ namespace paddle { * y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j * \f] * + * The detailed calculation for forward and backward can be found at this paper: + * + * Rendle, Steffen. Factorization machines. IEEE 10th International + * Conference on Data Mining (ICDM). IEEE, 2010. + * * The config file api is factorization_machine. 
*/ class FactorizationMachineLayer : public Layer { protected: - /// The latent vectors, shape: (size, factorSize_) - /// Each row of the latentVectors_ matrix is the latent vector - /// corresponding to one input feature dimension + // The latent vectors, shape: (size, factorSize_) + // Each row of the latentVectors_ matrix is the latent vector + // corresponding to one input feature dimension std::unique_ptr latentVectors_; - /// The hyperparameter that defines the dimensionality of the factorization + // The hyperparameter that defines the dimensionality of the factorization size_t factorSize_; private: - /// The result of input matrix * letent vector matrix that will be used in - /// both forward and backward step - MatrixPtr tmpMul_; + // Store the square values of the letent vectors matrix + MatrixPtr latentVectorsSquare_; + // Store the square values of input matrix + MatrixPtr inputSquare_; + // The result of input matrix * latent vector matrix that will be used in + // both forward and backward step + MatrixPtr inputMulFactor_; + // Temporary calculation result store MatrixPtr tmpOut_; - /// Store the square values of the letent vectors matrix - MatrixPtr v2_; - /// Store the square values of input matrix - MatrixPtr x2_; + MatrixPrt tmpSum_; + // Negative identity matrix + MatrixPtr negOnes_; public: explicit FactorizationMachineLayer(const LayerConfig& config) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 072d75c23d..04ff618c21 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2442,6 +2442,7 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { TEST(Layer, FactorizationMachineLayer) { for (auto useGpu : {false, true}) { testFactorizationMachineLayer(INPUT_DATA, useGpu); + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, useGpu); } } diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp index 
e211c23a7e..6a432cd16b 100644 --- a/paddle/math/CpuSparseMatrix.cpp +++ b/paddle/math/CpuSparseMatrix.cpp @@ -262,15 +262,15 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { CHECK(getFormat() != SPARSE_CSC) << "Not supported"; - CHECK(height_ == b.getHeight()); - CHECK(width_ == b.getWidth()); + CHECK_EQ(height_, b.getHeight()); + CHECK_EQ(width_, b.getWidth()); real* A = getValue(); real* B = b.getValue(); for (size_t i = 0; i < height_; i++) { size_t start = getRowStartIdx(i); size_t end = getRowStartIdx(i + 1); - CHECK(start == b.getRowStartIdx(i)); - CHECK(end == b.getRowStartIdx(i + 1)); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); for (size_t j = start; j < end; j++) { A[j] = B[j] * c.getElement(i, cCol); } diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 30e334e7c8..7e38383bd6 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -7161,16 +7161,26 @@ def factorization_machine(input, The Factorization Machine models pairwise feature interactions as inner product of the learned latent vectors corresponding to each input feature. The Factorization Machine can effectively capture feature interactions - especially when the input is sparse. In practice, usually order 2 feature - interactions are considered using Factorization Machine with the formula: + especially when the input is sparse. + + This implementation only consider the 2-order feature interactions using + Factorization Machine with the formula: + .. math:: y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j + Note: X is the input vector with size n. V is the factor matrix. Each row of V is the latent vector corresponding to each input dimesion. The size of each latent vector is k. 
+ + For details of Factorization Machine, please refer to the paper: + Rendle, Steffen. Factorization machines. IEEE 10th International + Conference on Data Mining (ICDM). IEEE, 2010. + .. code-block:: python factor_machine = factorization_machine(input=input_layer, factor_size=10) + :param input: The input layer. :type input: LayerOutput :param factor_size: The hyperparameter that defines the dimensionality of From 0b6afb589cb74c4cb24b8ee5461f1d8b12674143 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Thu, 16 Nov 2017 19:11:40 +0800 Subject: [PATCH 028/275] Fix typo in factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index 85d40fdb1e..85ac175657 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -61,7 +61,7 @@ private: MatrixPtr inputMulFactor_; // Temporary calculation result store MatrixPtr tmpOut_; - MatrixPrt tmpSum_; + MatrixPtr tmpSum_; // Negative identity matrix MatrixPtr negOnes_; From 09f4f9257981dc3744e9131dabcebebaa5eb7f91 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Thu, 16 Nov 2017 20:33:25 +0800 Subject: [PATCH 029/275] Add unitest for factorization machine layer with sparse input --- paddle/gserver/tests/test_LayerGrad.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 589db0bd6c..7ad9866ecf 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -2444,8 +2444,8 @@ void testFactorizationMachineLayer(InputType type, bool useGpu) { TEST(Layer, FactorizationMachineLayer) { for (auto useGpu : {false, true}) { testFactorizationMachineLayer(INPUT_DATA, useGpu); - testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, useGpu); } 
+ testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); } int main(int argc, char** argv) { From 4f1aa5bc0ee3c00fa792cfe188fabaab290938b1 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 09:17:09 -0800 Subject: [PATCH 030/275] add test cases --- paddle/math/float16.h | 12 ++--- paddle/math/tests/test_float16.cpp | 78 ++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 6 deletions(-) create mode 100644 paddle/math/tests/test_float16.cpp diff --git a/paddle/math/float16.h b/paddle/math/float16.h index e9d4e6737d..9c06b423ef 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -23,7 +23,7 @@ limitations under the License. */ #define USE_EIGEN #ifdef USE_EIGEN // delete this #if macro -#include "Eigen/src/Core/arch/CUDA/Half.h" +#include "unsupported/Eigen/CXX11/Tensor" #endif #ifdef __GNUC__ @@ -126,7 +126,7 @@ struct PADDLE_ALIGN(2) float16 { // According to gcc, __fp16 can only be used as an argument to fp16 // intrinsic defined in arm_neon.h or as a storage type. It cannot // be used as a formal function argument. 
- // TODO (kexinzhao): test it on RPI + // TODO(kexinzhao): test it on RPI PADDLE_HOSTDEVICE inline float16(const float16_t* h) { x = *reinterpret_cast(h); } @@ -564,7 +564,7 @@ PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { namespace fp16_impl { -Union Bits { +union Bits { float f; int32_t si; uint32_t ui; @@ -584,7 +584,7 @@ constexpr int32_t maxC = maxN >> shift; constexpr int32_t minC = minN >> shift; constexpr int32_t sigC = sigN >> shiftSign; -const int32_t mulN = 0x52000000; //(1 << 23) / minN +const int32_t mulN = 0x52000000; // (1 << 23) / minN const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) const int32_t subC = 0x003FF; // max flt32 subnormal downshifted const int32_t norC = 0x00400; // min flt32 normal downshifted @@ -693,7 +693,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { // Conversion routine adapted from // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion Bits v; - v.ui = x; + v.ui = h.x; int32_t sign = v.si & sigC; v.si ^= sign; sign <<= shiftSign; @@ -711,6 +711,6 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { #endif } -} // namespace half_impl +} // namespace fp16_impl } // namespace paddle diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp new file mode 100644 index 0000000000..79f63d3a80 --- /dev/null +++ b/paddle/math/tests/test_float16.cpp @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/math/float16.h" + +namespace paddle { + +#ifdef PADDLE_CUDA_FP16 +TEST(float16, gpu) { + // Conversion to and from cuda half + float16 v1 = half(float16(1.0f)); + EXPECT_EQ(v1.x, 0x3c00); + + // Conversion to and from Eigen::half + float16 v2 = Eigen::half(float16(0.5f)); + EXPECT_EQ(v2.x, 0x3800); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + + // Conversion from int + + // Conversion from bool +} + +TEST(float16, arithmetic_gpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } + +TEST(float16, comparison_gpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } +#endif + +TEST(float16, conversion_cpu) { + // Conversion to and from Eigen::half + EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + + // Conversion from int + + // Conversion from bool +} + +TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } + 
+TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } + +} // namespace paddle From 979d2e0b092a1378290ddae421f8793d00fd0938 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 10:05:30 -0800 Subject: [PATCH 031/275] small fix --- paddle/math/float16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 9c06b423ef..3275546e69 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -426,8 +426,8 @@ __device__ inline bool operator>=(const float16& a, const float16& b) { } // On ARMv8.2-A CPU -#elif (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) && \ - defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) +#elif defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) __host__ inline float16 operator+(const float16& a, const float16& b) { return float16(vaddh_f16(float16_t(a), float16_t(b))); } From 22dfa5fa8aaec63753c73848813e280560a8152f Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 14:39:49 -0800 Subject: [PATCH 032/275] fix GPU compiling --- paddle/math/float16.h | 12 ++++++------ paddle/math/tests/CMakeLists.txt | 5 ++++- .../math/tests/{test_float16.cpp => test_float16.cu} | 2 +- 3 files changed, 11 insertions(+), 8 deletions(-) rename paddle/math/tests/{test_float16.cpp => test_float16.cu} (98%) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 3275546e69..6799a83bd3 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -118,8 +118,8 @@ struct PADDLE_ALIGN(2) float16 { PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} #endif // USE_EIGEN -#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ - defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) // __fp16 is a native half precision data type for arm cpu, // float16_t is 
an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. @@ -207,8 +207,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ - defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { x = *reinterpret_cast(rhs); return *this; @@ -302,8 +302,8 @@ struct PADDLE_ALIGN(2) float16 { } #endif // USE_EIGEN -#if (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) && \ - defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) // check whether it works or not PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index ab4ac38b3c..dc06f99090 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -22,15 +22,18 @@ if(WITH_GPU) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) link_paddle_test(test_lazyAssign) + CUDA_ADD_EXECUTABLE(test_float16 test_float16.cu) + link_paddle_test(test_float16) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) compile_cu_as_cpp(test_lazyAssign.cu) add_unittest(test_lazyAssign test_lazyAssign.cu) + compile_cu_as_cpp(test_float16.cu) + add_unittest(test_float16 test_float16.cu) endif(WITH_GPU) add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) -add_simple_unittest(test_float16) diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cu similarity index 98% rename from paddle/math/tests/test_float16.cpp rename to paddle/math/tests/test_float16.cu index 79f63d3a80..40bc54f5b4 100644 --- 
a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cu @@ -15,7 +15,7 @@ limitations under the License. */ namespace paddle { #ifdef PADDLE_CUDA_FP16 -TEST(float16, gpu) { +TEST(float16, conversion_gpu) { // Conversion to and from cuda half float16 v1 = half(float16(1.0f)); EXPECT_EQ(v1.x, 0x3c00); From 080ff0c83200a229fb032cd03d4d900b634b1b02 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Thu, 16 Nov 2017 16:28:33 -0800 Subject: [PATCH 033/275] two tests for cpu and gpu separately --- paddle/math/tests/CMakeLists.txt | 6 ++-- paddle/math/tests/test_float16.cpp | 47 ++++++++++++++++++++++++++++++ paddle/math/tests/test_float16.cu | 32 +------------------- 3 files changed, 50 insertions(+), 35 deletions(-) create mode 100644 paddle/math/tests/test_float16.cpp diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index dc06f99090..c131544515 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -18,21 +18,19 @@ add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_Allocator) if(WITH_GPU) + nv_test(test_float16_gpu SRCS test_float16.cu) CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) link_paddle_test(test_lazyAssign) - CUDA_ADD_EXECUTABLE(test_float16 test_float16.cu) - link_paddle_test(test_float16) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) compile_cu_as_cpp(test_lazyAssign.cu) add_unittest(test_lazyAssign test_lazyAssign.cu) - compile_cu_as_cpp(test_float16.cu) - add_unittest(test_float16 test_float16.cu) endif(WITH_GPU) +cc_test(test_float16 SRCS test_float16.cpp) add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp new file mode 100644 index 0000000000..8d4279b413 --- /dev/null +++ 
b/paddle/math/tests/test_float16.cpp @@ -0,0 +1,47 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/math/float16.h" + +namespace paddle { + +TEST(float16, conversion_cpu) { + // Conversion to and from Eigen::half + EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); + + // Conversion from float + EXPECT_EQ(float16(1.0f).x, 0x3c00); + EXPECT_EQ(float16(0.5f).x, 0x3800); + EXPECT_EQ(float16(0.33333f).x, 0x3555); + EXPECT_EQ(float16(0.0f).x, 0x0000); + EXPECT_EQ(float16(-0.0f).x, 0x8000); + EXPECT_EQ(float16(65504.0f).x, 0x7bff); + EXPECT_EQ(float16(65536.0f).x, 0x7c00); + + // Conversion from double + + // Conversion from int + + // Conversion from bool +} + +TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } + +TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } + +} // namespace paddle diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 40bc54f5b4..6c0a1c351c 100644 --- a/paddle/math/tests/test_float16.cu +++ 
b/paddle/math/tests/test_float16.cu @@ -39,40 +39,10 @@ TEST(float16, conversion_gpu) { // Conversion from bool } +#endif TEST(float16, arithmetic_gpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } TEST(float16, comparison_gpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } -#endif - -TEST(float16, conversion_cpu) { - // Conversion to and from Eigen::half - EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); - EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); - EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); - EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); - EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); - EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); - EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); - - // Conversion from float - EXPECT_EQ(float16(1.0f).x, 0x3c00); - EXPECT_EQ(float16(0.5f).x, 0x3800); - EXPECT_EQ(float16(0.33333f).x, 0x3555); - EXPECT_EQ(float16(0.0f).x, 0x0000); - EXPECT_EQ(float16(-0.0f).x, 0x8000); - EXPECT_EQ(float16(65504.0f).x, 0x7bff); - EXPECT_EQ(float16(65536.0f).x, 0x7c00); - - // Conversion from double - - // Conversion from int - - // Conversion from bool -} - -TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } - -TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } } // namespace paddle From 734cac1a53b904c7d3f76fe66cee1b2d19632dcf Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Fri, 17 Nov 2017 00:04:58 -0800 Subject: [PATCH 034/275] fix CUDA_VERSION issue --- paddle/math/float16.h | 29 ++++++++++++++++++++++++++++- paddle/math/tests/test_float16.cpp | 2 ++ paddle/math/tests/test_float16.cu | 2 ++ 3 files changed, 32 insertions(+), 1 deletion(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 6799a83bd3..1922192f7b 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -20,6 +20,10 @@ limitations under the License. 
*/ #include #include +#include + +#include "paddle/utils/Logging.h" + #define USE_EIGEN #ifdef USE_EIGEN // delete this #if macro @@ -48,6 +52,27 @@ limitations under the License. */ #define PADDLE_HOSTDEVICE #endif // __CUDACC__ +#define STR(x) #x +#define XSTR(x) STR(x) + +#ifndef __CUDACC__ +#pragma message "__CUDACC__ not defined" +#else +#pragma message "__CUDACC__ defined" +#endif + +#ifndef CUDA_VERSION +#pragma message "CUDA_VERSION not defined" +#else +#pragma message "CUDA_VERSION defined: " XSTR(CUDA_VERSION) +#endif + +#ifdef __CUDA_ARCH__ +#pragma message "The value of CUDA_ARCH: " XSTR(__CUDA_ARCH__) +#else +#pragma message "CUDA ARCH NOT DEFINED!" +#endif + #ifdef __arm__ #define PADDLE_ARM_32 #endif @@ -359,6 +384,7 @@ struct PADDLE_ALIGN(2) float16 { // arithmetic operators #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ inline float16 operator+(const float16& a, const float16& b) { + printf("GPU Intrinsic used!"); return float16(__hadd(half(a), half(b))); } @@ -495,6 +521,7 @@ __host__ inline bool operator>=(const float16& a, const float16& b) { #else // software emulation on other cpu PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { + LOG(INFO) << "CPU emulation used"; return float16(float(a) + float(b)); } @@ -656,7 +683,7 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 half tmp = *reinterpret_cast(&h); - return __half2float(h); + return __half2float(tmp); #elif defined(PADDLE_NEON_64) float res; diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index 8d4279b413..1a20d0e925 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -15,6 +15,8 @@ limitations under the License. 
*/ namespace paddle { TEST(float16, conversion_cpu) { + LOG(INFO) << "cpu test started!"; + // Conversion to and from Eigen::half EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 6c0a1c351c..9ca77cf86c 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -16,6 +16,8 @@ namespace paddle { #ifdef PADDLE_CUDA_FP16 TEST(float16, conversion_gpu) { + LOG(INFO) << "GPU tests started"; + // Conversion to and from cuda half float16 v1 = half(float16(1.0f)); EXPECT_EQ(v1.x, 0x3c00); From d5a6c81dc55057ba437efe417992c0521e87c754 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Mon, 20 Nov 2017 11:48:52 +0800 Subject: [PATCH 035/275] Update docs for factorization machine layer --- paddle/gserver/layers/FactorizationMachineLayer.h | 5 ++--- python/paddle/trainer_config_helpers/layers.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index 85ac175657..3bc36daaab 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -36,8 +36,7 @@ namespace paddle { * * The detailed calculation for forward and backward can be found at this paper: * - * Rendle, Steffen. Factorization machines. IEEE 10th International - * Conference on Data Mining (ICDM). IEEE, 2010. + * Factorization machines. * * The config file api is factorization_machine. 
*/ @@ -59,7 +58,7 @@ private: // The result of input matrix * latent vector matrix that will be used in // both forward and backward step MatrixPtr inputMulFactor_; - // Temporary calculation result store + // Store temporary calculation result MatrixPtr tmpOut_; MatrixPtr tmpSum_; // Negative identity matrix diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index cc1bf923dd..37214a53d3 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -3876,7 +3876,7 @@ def recurrent_layer(input, :type input: LayerOutput :param act: Activation type. TanhActivation is the default activation. :type act: BaseActivation - :param bias_attr: The parameter attribute for bias. If this parameter is set to + :param bias_attr: The parameter attribute for bias. If this parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. @@ -7307,8 +7307,7 @@ def factorization_machine(input, each latent vector is k. For details of Factorization Machine, please refer to the paper: - Rendle, Steffen. Factorization machines. IEEE 10th International - Conference on Data Mining (ICDM). IEEE, 2010. + Factorization machines. .. 
code-block:: python factor_machine = factorization_machine(input=input_layer, factor_size=10) From 0f4bf1c939cea4bd3c7516eb5a9787b05563cea0 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Sun, 19 Nov 2017 03:00:38 -0800 Subject: [PATCH 036/275] Add GPU device code for testing --- paddle/math/float16.h | 71 ++-------- paddle/math/tests/test_float16.cpp | 102 ++++++++++++-- paddle/math/tests/test_float16.cu | 217 +++++++++++++++++++++++++---- 3 files changed, 296 insertions(+), 94 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 1922192f7b..a1c341113f 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// need to define PADDLE_ARM_FP16 - #pragma once #include @@ -21,14 +19,7 @@ limitations under the License. */ #include #include - -#include "paddle/utils/Logging.h" - -#define USE_EIGEN - -#ifdef USE_EIGEN // delete this #if macro #include "unsupported/Eigen/CXX11/Tensor" -#endif #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) @@ -52,27 +43,6 @@ limitations under the License. */ #define PADDLE_HOSTDEVICE #endif // __CUDACC__ -#define STR(x) #x -#define XSTR(x) STR(x) - -#ifndef __CUDACC__ -#pragma message "__CUDACC__ not defined" -#else -#pragma message "__CUDACC__ defined" -#endif - -#ifndef CUDA_VERSION -#pragma message "CUDA_VERSION not defined" -#else -#pragma message "CUDA_VERSION defined: " XSTR(CUDA_VERSION) -#endif - -#ifdef __CUDA_ARCH__ -#pragma message "The value of CUDA_ARCH: " XSTR(__CUDA_ARCH__) -#else -#pragma message "CUDA ARCH NOT DEFINED!" 
-#endif - #ifdef __arm__ #define PADDLE_ARM_32 #endif @@ -113,7 +83,7 @@ namespace paddle { struct float16; namespace fp16_impl { -// convert from float to half precision in round-to-nearest-even mode +// Convert from float to half precision in round-to-nearest-even mode PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); PADDLE_HOSTDEVICE inline float half_to_float(float16 h); } // namespace fp16_impl @@ -125,7 +95,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h); struct PADDLE_ALIGN(2) float16 { uint16_t x; - PADDLE_HOSTDEVICE inline float16() {} + PADDLE_HOSTDEVICE inline float16() : x(0) {} PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} @@ -139,21 +109,15 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 -#ifdef USE_EIGEN PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} -#endif // USE_EIGEN #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. - // According to gcc, __fp16 can only be used as an argument to fp16 - // intrinsic defined in arm_neon.h or as a storage type. It cannot - // be used as a formal function argument. 
- // TODO(kexinzhao): test it on RPI - PADDLE_HOSTDEVICE inline float16(const float16_t* h) { - x = *reinterpret_cast(h); + PADDLE_HOSTDEVICE inline float16(const float16_t& h) { + x = *reinterpret_cast(&h); } #endif @@ -225,17 +189,15 @@ struct PADDLE_ALIGN(2) float16 { } #endif -#ifdef USE_EIGEN PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { x = rhs.x; return *this; } -#endif // USE_EIGEN #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - PADDLE_HOSTDEVICE inline float16& operator=(const float16_t* rhs) { - x = *reinterpret_cast(rhs); + PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); return *this; } #endif @@ -319,17 +281,14 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 -#ifdef USE_EIGEN PADDLE_HOSTDEVICE inline operator Eigen::half() const { Eigen::half h; h.x = x; return h; } -#endif // USE_EIGEN #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - // check whether it works or not PADDLE_HOSTDEVICE inline operator float16_t() const { float16 h = *this; return *reinterpret_cast(&h); @@ -381,10 +340,9 @@ struct PADDLE_ALIGN(2) float16 { } }; -// arithmetic operators +// Arithmetic operators #if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 __device__ inline float16 operator+(const float16& a, const float16& b) { - printf("GPU Intrinsic used!"); return float16(__hadd(half(a), half(b))); } @@ -452,7 +410,7 @@ __device__ inline bool operator>=(const float16& a, const float16& b) { } // On ARMv8.2-A CPU -#elif defined(PADDLE_NEON_64) && defined(PADDLE_ARM_FP16) && \ +#elif defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) __host__ inline float16 operator+(const float16& a, const float16& b) { return float16(vaddh_f16(float16_t(a), float16_t(b))); @@ -502,7 +460,7 @@ __host__ 
inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } -// compare only available in NEON_64 +#ifdef PADDLE_NEON_64 __host__ inline bool operator<(const float16& a, const float16& b) { return static_cast(vclth_f16(float16_t(a), float16_t(b))); } @@ -518,10 +476,10 @@ __host__ inline bool operator>(const float16& a, const float16& b) { __host__ inline bool operator>=(const float16& a, const float16& b) { return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); } +#endif // PADDLE_NEON_64 -#else // software emulation on other cpu +#else // Software emulation on other cpu PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { - LOG(INFO) << "CPU emulation used"; return float16(float(a) + float(b)); } @@ -624,7 +582,7 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { half tmp = __float2half(f); return *reinterpret_cast(&tmp); -#elif defined(PADDLE_NEON_64) // test on RPI +#elif defined(PADDLE_NEON_64) float16 res; asm volatile( "ld1 {v0.s}[0], [%[float_ptr]]\n" @@ -638,7 +596,7 @@ PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { "memory", "v0"); return res; -#elif defined(PADDLE_NEON_32) // test on RPI +#elif defined(PADDLE_NEON_32) float16 res; asm volatile( "vld1.32 {d0[0]}, [%[float_ptr]]\n" @@ -689,7 +647,7 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { float res; asm volatile( "ld1 {v0.h}[0], [%[half_ptr]]\n" - "FCVT s0, h0\n" + "fcvt s0, h0\n" "st1 {v0.s}[0], [%[float_ptr]]\n" : // outputs : // inputs @@ -739,5 +697,4 @@ PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { } } // namespace fp16_impl - } // namespace paddle diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index 1a20d0e925..8c74bcc039 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -9,22 +9,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include #include "paddle/math/float16.h" +#include + namespace paddle { TEST(float16, conversion_cpu) { - LOG(INFO) << "cpu test started!"; - - // Conversion to and from Eigen::half - EXPECT_EQ(float16(Eigen::half(float16(1.0f))).x, 0x3c00); - EXPECT_EQ(float16(Eigen::half(float16(0.5f))).x, 0x3800); - EXPECT_EQ(float16(Eigen::half(float16(0.33333f))).x, 0x3555); - EXPECT_EQ(float16(Eigen::half(float16(0.0f))).x, 0x0000); - EXPECT_EQ(float16(Eigen::half(float16(-0.0f))).x, 0x8000); - EXPECT_EQ(float16(Eigen::half(float16(65504.0f))).x, 0x7bff); - EXPECT_EQ(float16(Eigen::half(float16(65536.0f))).x, 0x7c00); + // Explicit conversion from Eigen::half + EXPECT_EQ(float16(Eigen::half(1.0f)).x, 0x3c00); + EXPECT_EQ(float16(Eigen::half(0.5f)).x, 0x3800); + EXPECT_EQ(float16(Eigen::half(0.33333f)).x, 0x3555); + EXPECT_EQ(float16(Eigen::half(0.0f)).x, 0x0000); + EXPECT_EQ(float16(Eigen::half(-0.0f)).x, 0x8000); + EXPECT_EQ(float16(Eigen::half(65504.0f)).x, 0x7bff); + EXPECT_EQ(float16(Eigen::half(65536.0f)).x, 0x7c00); // Conversion from float EXPECT_EQ(float16(1.0f).x, 0x3c00); @@ -36,14 +35,91 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(float16(65536.0f).x, 0x7c00); // Conversion from double + EXPECT_EQ(float16(1.0).x, 0x3c00); + EXPECT_EQ(float16(0.5).x, 0x3800); + EXPECT_EQ(float16(0.33333).x, 0x3555); + EXPECT_EQ(float16(0.0).x, 0x0000); + EXPECT_EQ(float16(-0.0).x, 0x8000); + EXPECT_EQ(float16(65504.0).x, 0x7bff); + EXPECT_EQ(float16(65536.0).x, 0x7c00); // Conversion from int + EXPECT_EQ(float16(-1).x, 0xbc00); + EXPECT_EQ(float16(0).x, 0x0000); + EXPECT_EQ(float16(1).x, 0x3c00); + EXPECT_EQ(float16(2).x, 0x4000); + EXPECT_EQ(float16(3).x, 0x4200); // Conversion from bool + EXPECT_EQ(float16(true).x, 0x3c00); + EXPECT_EQ(float16(false).x, 0x0000); + + // Implicit conversion to and from Eigen::half + Eigen::half tmp = float16(1.0f); + float16 v_conv = tmp; + 
EXPECT_EQ(tmp.x, 0x3c00); + EXPECT_EQ(v_conv.x, 0x3c00); + + // Default constructor + float16 v_def; + EXPECT_EQ(v_def.x, 0x0000); + + // Assignment operator + float16 v_assign; + v_assign = v_def; + EXPECT_EQ(v_assign.x, 0x0000); + v_assign = Eigen::half(1.0f); + EXPECT_EQ(v_assign.x, 0x3c00); + v_assign = 0.5f; + EXPECT_EQ(v_assign.x, 0x3800); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3555); + v_assign = -1; + EXPECT_EQ(v_assign.x, 0xbc00); + v_assign = true; + EXPECT_EQ(v_assign.x, 0x3c00); + + // Conversion operator + EXPECT_EQ(Eigen::half(float16(1.0f)).x, 0x3c00); + EXPECT_EQ(float(float16(0.5f)), 0.5f); + EXPECT_NEAR(double(float16(0.33333)), 0.33333, 0.0001); + EXPECT_EQ(int(float16(-1)), -1); + EXPECT_EQ(bool(float16(true)), true); } -TEST(float16, arithmetic_cpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } +TEST(float16, arithmetic_cpu) { + EXPECT_EQ(float(float16(1) + float16(1)), 2); + EXPECT_EQ(float(float16(5) + float16(-5)), 0); + EXPECT_NEAR(float(float16(0.33333f) + float16(0.66667f)), 1.0f, 0.001); + EXPECT_EQ(float(float16(3) - float16(5)), -2); + EXPECT_NEAR(float(float16(0.66667f) - float16(0.33333f)), 0.33334f, 0.001); + EXPECT_NEAR(float(float16(3.3f) * float16(2.0f)), 6.6f, 0.01); + EXPECT_NEAR(float(float16(-2.1f) * float16(-3.0f)), 6.3f, 0.01); + EXPECT_NEAR(float(float16(2.0f) / float16(3.0f)), 0.66667f, 0.001); + EXPECT_EQ(float(float16(1.0f) / float16(2.0f)), 0.5f); + EXPECT_EQ(float(-float16(512.0f)), -512.0f); + EXPECT_EQ(float(-float16(-512.0f)), 512.0f); +} -TEST(float16, comparison_cpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } +TEST(float16, comparison_cpu) { + EXPECT_TRUE(float16(1.0f) == float16(1.0f)); + EXPECT_FALSE(float16(-1.0f) == float16(-0.5f)); + EXPECT_TRUE(float16(1.0f) != float16(0.5f)); + EXPECT_FALSE(float16(-1.0f) != float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) < float16(2.0f)); + EXPECT_FALSE(float16(-1.0f) < float16(-1.0f)); + EXPECT_TRUE(float16(1.0f) <= float16(1.0f)); + 
EXPECT_TRUE(float16(2.0f) > float16(1.0f)); + EXPECT_FALSE(float16(-2.0f) > float16(-2.0f)); + EXPECT_TRUE(float16(2.0f) >= float16(2.0f)); + + EXPECT_TRUE(float16(0.0f) == float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) <= float16(-0.0f)); + EXPECT_TRUE(float16(0.0f) >= float16(-0.0f)); + EXPECT_FALSE(float16(0.0f) < float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) < float16(0.0f)); + EXPECT_FALSE(float16(0.0f) > float16(-0.0f)); + EXPECT_FALSE(float16(-0.0f) > float16(0.0f)); +} } // namespace paddle diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 9ca77cf86c..941f266603 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -9,42 +9,211 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include "paddle/math/float16.h" -namespace paddle { +#include + +#include "paddle/utils/Logging.h" + +#define ARITHMETIC_KERNEL(op_type, sign) \ + __global__ void op_type( \ + const float16* in1, const float16* in2, float16* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define COMPOUND_KERNEL(op_type, sign) \ + __global__ void op_type(float16* in1, const float16* in2) { \ + in1[0] sign in2[0]; \ + } + +#define COMPARISON_KERNEL(op_type, sign) \ + __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \ + out[0] = in1[0] sign in2[0]; \ + } + +#define ARITHMETIC_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + float16 *in1, *in2, *out; \ + float16 *d_in1, *d_in2, *d_out; \ + int size = sizeof(float16); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, size); \ + in1 = (float16*)malloc(size); \ + in2 = (float16*)malloc(size); \ + out = (float16*)malloc(size); \ + in1[0] = float16(v_in1); \ + in2[0] = 
float16(v_in2); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(out[0]), v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } + +#define COMPOUND_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, float v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + float16 *in1, *in2; \ + float16 *d_in1, *d_in2; \ + int size = sizeof(float16); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + in1 = (float16*)malloc(size); \ + in2 = (float16*)malloc(size); \ + in1[0] = float16(v_in1); \ + in2[0] = float16(v_in2); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2); \ + cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(float(in1[0]), v_out); \ + free(in1); \ + free(in2); \ + cudaFree(d_in1); \ + cudaFree(d_in2); \ + } + +#define COMPARISON_KERNEL_LAUNCH(op_type) \ + void Test##op_type(float v_in1, float v_in2, bool v_out) { \ + LOG(INFO) << "Test " << #op_type << " on GPU!"; \ + float16 *in1, *in2; \ + float16 *d_in1, *d_in2; \ + bool *out, *d_out; \ + int size = sizeof(float16); \ + cudaMalloc((void**)&d_in1, size); \ + cudaMalloc((void**)&d_in2, size); \ + cudaMalloc((void**)&d_out, 1); \ + in1 = (float16*)malloc(size); \ + in2 = (float16*)malloc(size); \ + out = (bool*)malloc(1); \ + in1[0] = float16(v_in1); \ + in2[0] = float16(v_in2); \ + cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ + cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ + op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ + cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost); \ + EXPECT_EQ(out[0], v_out); \ + free(in1); \ + free(in2); \ + free(out); \ + cudaFree(d_in1); 
\ + cudaFree(d_in2); \ + cudaFree(d_out); \ + } #ifdef PADDLE_CUDA_FP16 -TEST(float16, conversion_gpu) { - LOG(INFO) << "GPU tests started"; +namespace paddle { - // Conversion to and from cuda half - float16 v1 = half(float16(1.0f)); - EXPECT_EQ(v1.x, 0x3c00); +ARITHMETIC_KERNEL(Add, +) +ARITHMETIC_KERNEL(Sub, -) +ARITHMETIC_KERNEL(Mul, *) +ARITHMETIC_KERNEL(Div, /) - // Conversion to and from Eigen::half - float16 v2 = Eigen::half(float16(0.5f)); - EXPECT_EQ(v2.x, 0x3800); +ARITHMETIC_KERNEL_LAUNCH(Add) +ARITHMETIC_KERNEL_LAUNCH(Sub) +ARITHMETIC_KERNEL_LAUNCH(Mul) +ARITHMETIC_KERNEL_LAUNCH(Div) - // Conversion from float - EXPECT_EQ(float16(1.0f).x, 0x3c00); - EXPECT_EQ(float16(0.5f).x, 0x3800); - EXPECT_EQ(float16(0.33333f).x, 0x3555); - EXPECT_EQ(float16(0.0f).x, 0x0000); - EXPECT_EQ(float16(-0.0f).x, 0x8000); - EXPECT_EQ(float16(65504.0f).x, 0x7bff); - EXPECT_EQ(float16(65536.0f).x, 0x7c00); +// Negative sign kernel +__global__ void Neg(float16* in) { in[0] = -in[0]; } - // Conversion from double +void TestNeg(float v_in, float v_out) { + LOG(INFO) << "Test Neg on GPU!"; + float16 *in, *d_in; + int size = sizeof(float16); + cudaMalloc((void**)&d_in, size); + in = (float16*)malloc(size); + in[0] = float16(v_in); + cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); + Neg<<<1, 1>>>(d_in); + cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); + EXPECT_EQ(float(in[0]), v_out); + free(in); + cudaFree(d_in); +} - // Conversion from int +COMPOUND_KERNEL(AddAssign, +=) +COMPOUND_KERNEL(SubAssign, -=) +COMPOUND_KERNEL(MulAssign, *=) +COMPOUND_KERNEL(DivAssign, /=) - // Conversion from bool +COMPOUND_KERNEL_LAUNCH(AddAssign) +COMPOUND_KERNEL_LAUNCH(SubAssign) +COMPOUND_KERNEL_LAUNCH(MulAssign) +COMPOUND_KERNEL_LAUNCH(DivAssign) + +COMPARISON_KERNEL(Equal, ==) +COMPARISON_KERNEL(NotEqual, !=) +COMPARISON_KERNEL(Less, <) +COMPARISON_KERNEL(LessEqual, <=) +COMPARISON_KERNEL(Greater, >) +COMPARISON_KERNEL(GreaterEqual, >=) + +COMPARISON_KERNEL_LAUNCH(Equal) 
+COMPARISON_KERNEL_LAUNCH(NotEqual) +COMPARISON_KERNEL_LAUNCH(Less) +COMPARISON_KERNEL_LAUNCH(LessEqual) +COMPARISON_KERNEL_LAUNCH(Greater) +COMPARISON_KERNEL_LAUNCH(GreaterEqual) + +TEST(float16, arithmetic_on_gpu) { + TestAdd(1, 2, 3); + TestSub(2, 1, 1); + TestMul(2, 3, 6); + TestDiv(6, 2, 3); + TestNeg(1, -1); } -#endif -TEST(float16, arithmetic_gpu) { EXPECT_EQ(float(float16(2) + float16(2)), 4); } +TEST(float16, compound_on_gpu) { + TestAddAssign(1, 2, 3); + TestSubAssign(2, 1, 1); + TestMulAssign(2, 3, 6); + TestDivAssign(6, 2, 3); +} -TEST(float16, comparison_gpu) { EXPECT_TRUE(float16(1.0f) > float16(0.5f)); } +TEST(float16, comparision_on_gpu) { + TestEqual(1, 1, true); + TestEqual(1, 2, false); + TestNotEqual(2, 3, true); + TestNotEqual(2, 2, false); + TestLess(3, 4, true); + TestLess(3, 3, false); + TestLessEqual(3, 3, true); + TestLessEqual(3, 2, false); + TestGreater(4, 3, true); + TestGreater(4, 4, false); + TestGreaterEqual(4, 4, true); + TestGreaterEqual(4, 5, false); +} + +TEST(float16, conversion_on_gpu) { + // Explicit conversion to and from cuda half + EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00); + EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800); + EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555); + EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000); + EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000); + EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff); + EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00); + + // Implicit conversion to and from cuda half + half tmp = float16(1.0f); + float16 val = tmp; + EXPECT_EQ(val.x, 0x3c00); + + // Assignment operator + float16 v_assign; + v_assign = tmp; + EXPECT_EQ(v_assign.x, 0x3c00); +} } // namespace paddle +#endif From d646e4768fc0049e172f59f8786d9aeeec50491e Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 20 Nov 2017 00:33:27 -0800 Subject: [PATCH 037/275] fix cmake --- paddle/math/tests/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index c131544515..215bac1271 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -18,11 +18,11 @@ add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_Allocator) if(WITH_GPU) - nv_test(test_float16_gpu SRCS test_float16.cu) CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu) link_paddle_test(test_Tensor) CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu) - link_paddle_test(test_lazyAssign) + link_paddle_test(test_lazyAssign) + nv_test(test_float16_gpu SRCS test_float16.cu) else() compile_cu_as_cpp(test_Tensor.cu) add_unittest(test_Tensor test_Tensor.cu) @@ -30,8 +30,8 @@ else() add_unittest(test_lazyAssign test_lazyAssign.cu) endif(WITH_GPU) -cc_test(test_float16 SRCS test_float16.cpp) add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) +cc_test(test_float16 SRCS test_float16.cpp) From 6fed6f2079902c86c43161f916c3450094fde6d0 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Mon, 20 Nov 2017 20:44:52 +0800 Subject: [PATCH 038/275] Add support of sparse_binary_vector as input for fm layer --- .../layers/FactorizationMachineLayer.cpp | 20 +++++++++----- .../layers/FactorizationMachineLayer.h | 1 + paddle/math/CpuSparseMatrix.cpp | 26 ++++++++++++++----- 3 files changed, 34 insertions(+), 13 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index f0f1738f30..b665fb6dfc 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -96,15 +96,20 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { /* Calculate the gradients of the latentVectors_ matrix */ if (latentVectors_->getWGrad()) { - MatrixPtr tmpInput = inputV->clone(0, 0, useGpu_); if (dynamic_cast(inputV.get())) { + 
Matrix::resizeOrCreateSparseMatrix(tmpInput_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt()); + CpuSparseMatrix* sparseInputV = dynamic_cast(inputV.get()); CpuSparseMatrix* sparseInputSquare = dynamic_cast(inputSquare_.get()); CpuSparseMatrix* sparseTmpInput = - dynamic_cast(tmpInput.get()); + dynamic_cast(tmpInput_.get()); sparseTmpInput->copyFrom(*sparseInputV); + sparseTmpInput->rowScale(0, *sparseInputV, *oGrad); latentVectors_->getWGrad()->mul( *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1); @@ -115,12 +120,15 @@ void FactorizationMachineLayer::backward(const UpdateCallback& callback) { negOnes_->add(-1); tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0); } else { - tmpInput->rowScale(0, *inputV, *oGrad); + Matrix::resizeOrCreate( + tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); + + tmpInput_->rowScale(0, *inputV, *oGrad); latentVectors_->getWGrad()->mul( - *tmpInput->getTranspose(), *inputMulFactor_, 1, 1); - tmpInput->rowScale(0, *inputSquare_, *oGrad); + *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1); + tmpInput_->rowScale(0, *inputSquare_, *oGrad); - tmpSum_->sumCols(*tmpInput, -1, 0); + tmpSum_->sumCols(*tmpInput_, -1, 0); } latentVectors_->getWGrad()->addRowScale( diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h index 3bc36daaab..df20a49934 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.h +++ b/paddle/gserver/layers/FactorizationMachineLayer.h @@ -61,6 +61,7 @@ private: // Store temporary calculation result MatrixPtr tmpOut_; MatrixPtr tmpSum_; + MatrixPtr tmpInput_; // Negative identity matrix MatrixPtr negOnes_; diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp index 6a432cd16b..dc6979cf5a 100644 --- a/paddle/math/CpuSparseMatrix.cpp +++ b/paddle/math/CpuSparseMatrix.cpp @@ -266,13 +266,25 @@ void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { CHECK_EQ(width_, 
b.getWidth()); real* A = getValue(); real* B = b.getValue(); - for (size_t i = 0; i < height_; i++) { - size_t start = getRowStartIdx(i); - size_t end = getRowStartIdx(i + 1); - CHECK_EQ(start, b.getRowStartIdx(i)); - CHECK_EQ(end, b.getRowStartIdx(i + 1)); - for (size_t j = start; j < end; j++) { - A[j] = B[j] * c.getElement(i, cCol); + if (b.getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = B[j] * c.getElement(i, cCol); + } + } + } else if (b.getValueType() == NO_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = c.getElement(i, cCol); + } } } } From 74a699a72ef9046a7f302e339c8e20a8152ae9d8 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Mon, 20 Nov 2017 22:14:24 +0800 Subject: [PATCH 039/275] change clone to resizeOrCreate in fm layer --- .../gserver/layers/FactorizationMachineLayer.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp index b665fb6dfc..be26b9ba88 100644 --- a/paddle/gserver/layers/FactorizationMachineLayer.cpp +++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp @@ -58,16 +58,22 @@ void FactorizationMachineLayer::forward(PassType passType) { inputMulFactor_, batchSize, factorSize_, false, useGpu_); Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_); - REGISTER_TIMER_INFO("InputMulFactorTimer", getName().c_str()); + REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str()); inputMulFactor_->mul(*inputV, *latentVectors_->getW()); 
inputMulFactor_->square2(*tmpOut_); outV->sumRows(*tmpOut_, 0.5, 0); - inputSquare_ = inputV->clone(0, 0, useGpu_); - if (dynamic_cast(inputSquare_.get())) { + if (dynamic_cast(inputV.get())) { + Matrix::resizeOrCreateSparseMatrix(inputSquare_, + inputV->getHeight(), + inputV->getWidth(), + inputV->getElementCnt(), + inputV->getValueType()); inputSquare_->copyFrom(*inputV); (dynamic_cast(inputSquare_.get()))->square2(); } else { + Matrix::resizeOrCreate( + inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_); inputV->square2(*inputSquare_); } latentVectors_->getW()->square2(*latentVectorsSquare_); @@ -75,7 +81,7 @@ void FactorizationMachineLayer::forward(PassType passType) { outV->sumRows(*tmpOut_, -0.5, 1.0); /* activation */ { - REGISTER_TIMER_INFO("FmAtvTimer", getName().c_str()); + REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str()); forwardActivation(); } } From 19e5c24f00fac22da84387510e94596fb577637b Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 20 Nov 2017 17:23:04 -0800 Subject: [PATCH 040/275] fix bug --- paddle/math/float16.h | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index a1c341113f..3b22174148 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -15,8 +15,6 @@ limitations under the License. */ #pragma once #include -#include -#include #include #include "unsupported/Eigen/CXX11/Tensor" @@ -117,7 +115,8 @@ struct PADDLE_ALIGN(2) float16 { // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. 
PADDLE_HOSTDEVICE inline float16(const float16_t& h) { - x = *reinterpret_cast(&h); + float16_t tmp = h; + x = *reinterpret_cast(&tmp); } #endif @@ -197,7 +196,8 @@ struct PADDLE_ALIGN(2) float16 { #if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) { - x = *reinterpret_cast(&rhs); + float16_t tmp = rhs; + x = *reinterpret_cast(&tmp); return *this; } #endif @@ -460,23 +460,37 @@ __host__ inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } -#ifdef PADDLE_NEON_64 __host__ inline bool operator<(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vclth_f16(float16_t(a), float16_t(b))); +#else + return float(a) < float(b); +#endif // PADDLE_NEON_64 } __host__ inline bool operator<=(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vcleh_f16(float16_t(a), float16_t(b))); +#else + return float(a) <= float(b); +#endif // PADDLE_NEON_64 } __host__ inline bool operator>(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vcgth_f16(float16_t(a), float16_t(b))); +#else + return float(a) > float(b); +#endif // PADDLE_NEON_64 } __host__ inline bool operator>=(const float16& a, const float16& b) { +#ifdef PADDLE_NEON_64 return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); -} +#else + return float(a) >= float(b); #endif // PADDLE_NEON_64 +} #else // Software emulation on other cpu PADDLE_HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { From bc45335e552b90f1119a8eeec33da216f3cfada8 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 21 Nov 2017 14:52:54 +0800 Subject: [PATCH 041/275] add unpool --- paddle/operators/CMakeLists.txt | 2 + paddle/operators/math/CMakeLists.txt | 2 + paddle/operators/math/unpooling.cc | 110 +++++++++++++++++++++ paddle/operators/math/unpooling.cu | 143 +++++++++++++++++++++++++++ 
paddle/operators/math/unpooling.h | 48 +++++++++ paddle/operators/unpool_op.cc | 116 ++++++++++++++++++++++ paddle/operators/unpool_op.cu.cc | 22 +++++ paddle/operators/unpool_op.h | 85 ++++++++++++++++ 8 files changed, 528 insertions(+) create mode 100644 paddle/operators/math/unpooling.cc create mode 100644 paddle/operators/math/unpooling.cu create mode 100644 paddle/operators/math/unpooling.h create mode 100644 paddle/operators/unpool_op.cc create mode 100644 paddle/operators/unpool_op.cu.cc create mode 100644 paddle/operators/unpool_op.h diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index d39f7bf452..c720cce182 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -139,6 +139,7 @@ set(DEPS_OPS sum_op pool_op maxout_op + unpool_op pool_with_index_op nccl_op sequence_conv_op @@ -151,6 +152,7 @@ op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(sum_op DEPS net_op selected_rows_functor) op_library(pool_op DEPS pooling) op_library(maxout_op DEPS maxouting) +op_library(unpool_op DEPS unpooling) op_library(pool_with_index_op DEPS pooling) op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) if(WITH_GPU) diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt index b330f30d21..cd7e33cd7c 100644 --- a/paddle/operators/math/CMakeLists.txt +++ b/paddle/operators/math/CMakeLists.txt @@ -14,6 +14,7 @@ if(WITH_GPU) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) + nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) else() cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator) cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows 
math_function) @@ -26,6 +27,7 @@ else() cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(maxouting SRCS maxouting.cc DEPS device_context) + cc_library(unpooling SRCS unpooling.cc DEPS device_context) endif() cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc new file mode 100644 index 0000000000..36506b903e --- /dev/null +++ b/paddle/operators/math/unpooling.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/maxouting.h" + +namespace paddle { +namespace operators { +namespace math { + +// All tensors are in NCHW format +template +class Unpool2d_Max_Functor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor * output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const T* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + if(index > output_feasize) { + //抛一个异常! 
+ } + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; + + + +template +class Unpool2d_MaxGradFunctor { +public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor * input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const T* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int f = 0; f < input_feasize; ++f) { + int index = indices_data[i]; + if(index > output_feasize) { + //抛一个异常! + } + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; + +template class Unpool2d_MaxGradFunctor; +template class Unpool2d_MaxGradFunctor; +template class Unpool2d_MaxFunctor; +template class Unpool2d_MaxFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu new file mode 100644 index 0000000000..53e88a57c1 --- /dev/null +++ b/paddle/operators/math/unpooling.cu @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/math/maxouting.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__global__ void KernelUnpool2dMax(const int nthreads, + const T* input_data, + const T* indices_data, + const int input_height, + const int input_width, + T* output_data, + const int output_height, + const int output_width) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int out_offset = i / (input_height * input_width) \ + * output_height * output_width; + int out_index = indices_data[i]; + output_data[out_offset + out_index] = input_data[i]; + } +} +template +__global__ void KernelUnpool2dMaxGrad(const int nthreads, + const T* input_data, + const int input_height, + const int input_width, + const T* output_data, + const T* output_grad, + const int output_height, + const int output_width, + T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int out_offset = i / (input_height * input_width) \ + * output_height * output_width; + int out_index = indices_data[i]; + input_grad[i] = output_grad[out_offset + out_index]; + } +} +/* + * All tensors are in NCHW format. 
+ */ +template +class Unpool2d_MaxFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor * output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const T* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + + int nthreads = output->numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelUnpool2dMax< + T><<(context) + .stream()>>>(nthreads, input_data, indices_data, + input_height, input_width, + output_data, output_height, output_width); + } +}; +/* + * All tensors are in NCHW format. 
+ */ +template +class Unpool2d_MaxGradFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + framework::Tensor * input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad, + int groups) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + + const T* input_data = input.data(); + const T* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int nthreads = output.numel(); + int blocks = (nthreads + 1024 - 1) / 1024; + dim3 threads(1024, 1); + dim3 grid(blocks, 1); + + KernelUnpool2dMaxGrad< + T><<(context) + .stream()>>>( + nthreads, input_data, indices_data, + input_height, input_width, + output_data, output_grad_data, + output_height, output_width, + input_grad_data); + } +}; + +template class Unpool2d_MaxGradFunctor; +template class Unpool2d_MaxGradFunctor; + +template class Unpool2d_MaxFunctor; +template class Unpool2d_MaxFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h new file mode 100644 index 0000000000..bb0e0d08f0 --- /dev/null +++ b/paddle/operators/math/unpooling.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/tensor.h" +#include "paddle/platform/device_context.h" +#include "paddle/platform/hostdevice.h" + +namespace paddle { +namespace operators { +namespace math { + +#define FLT_MAX \ + __FLT_MAX__ + +template + +class Unpool2d_Max_Functor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor * output); +}; + +template +class Unpool2d_Max_GradFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + framework::Tensor * input_grad, + const framework::Tensor& output, + const framework::Tensor& output_grad); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc new file mode 100644 index 0000000000..d81428e802 --- /dev/null +++ b/paddle/operators/unpool_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/operators/unpool_op.h" +namespace paddle { +namespace operators { + +using framework::Tensor; + +class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + UnpoolOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(Tensor) The input tensor of unpool operator. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddInput("Y", + "(Tensor) The input tensor of the indices given out by MaxPool2d. " + "The format of input tensor is NCHW. Where N is batch size, C is the " + "number of channels, H and W is the height and width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCHW." + "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); + AddAttr>("ksize", + "(vector ), the unpooling window size(height, width) " + "of unpooling operator."); + AddAttr>("strides", "(vector, default:{1, 1}), " + "strides(height, width) of unpooling operator.") + .SetDefault({1, 1}); + AddAttr>("paddings", "(vector defalut:{0,0}), " + "paddings(height, width) of unpooling operator.") + .SetDefault({0, 0}); + AddAttr("unpoolingType", + "(string), unpooling type, can be \"max\" for max-unpooling " + "and \"avg\" for average-unpooling.") + .InEnum({"max", "avg"}); + AddComment(R"DOC( + + )DOC"); + } +}; + +int OutputSize(int input_size, int ksize, int padding, int stride) { + int output_size = (input_size -1) * stride - 2 * padding + ksize; + return output_size; +} + +class UnpoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) 
const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UnpoolOp should not be null."); + + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Y"); + std::string unpooling_type = ctx->Attrs().Get("unpooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); + + PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, + "Unpooling intput should be 4-D or 5-D tensor."); + + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class UnpoolOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Input(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(unpool2d, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool2d_grad, + ops::UnpoolOpGrad); +REGISTER_OP_CPU_KERNEL(unpool2d, ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL(unpool2d_grad, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc new 
file mode 100644 index 0000000000..8aeef8b3cf --- /dev/null +++ b/paddle/operators/unpool_op.cu.cc @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/unpool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(unpool2d, + ops::UnpoolKernel); +REGISTER_OP_GPU_KERNEL(unpool2d_grad, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h new file mode 100644 index 0000000000..38903dee17 --- /dev/null +++ b/paddle/operators/unpool_op.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/unpooling.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class UnpoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* in_y = context.Input("Y"); + Tensor* out = context.Output("Out"); + std::string pooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + math::Unpool2d_Max_Functor unpool2d_max_forward; + unpool2d_max_forward(context.device_context(), *in_x, *in_y, + ksize, strides, paddings, out); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D input."); } + } + } +}; + +template +class UnpoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* in_x = context.Input("X"); + const Tensor* in_y = context.Input("Y"); + const Tensor* out = context.Input("Out"); + const Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + std::string pooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& device_ctx = context.device_context(); + math::SetConstant zero; + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0.0)); + } + switch (ksize.size()) { + case 2: { + if (pooling_type == "max") { + math::UnpoolGradFunctor maxout_backward; + 
maxout_backward(context.device_context(), *in_x, *in_y, in_x_grad, *out, + *out_grad, ksize, strides, paddings); + } + } break; + default: { PADDLE_THROW("Pool op only supports 2D input."); } + } + } +}; + +} // namespace operators +} // namespace paddle From 45a8c9ddaf5d16fdeeb6a424988d23c121d207b4 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 21 Nov 2017 16:28:51 +0800 Subject: [PATCH 042/275] add unpool2d make ok --- paddle/operators/CMakeLists.txt | 7 +++++++ paddle/operators/math/unpooling.cc | 26 ++++++++++---------------- paddle/operators/math/unpooling.cu | 21 ++++++++++++--------- paddle/operators/math/unpooling.h | 5 +++-- paddle/operators/unpool_op.cc | 25 ++++++++++++++++--------- paddle/operators/unpool_op.cu.cc | 7 +++++-- paddle/operators/unpool_op.h | 13 ++++++------- 7 files changed, 59 insertions(+), 45 deletions(-) diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index ee25abd6cb..d53bca277d 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -80,6 +80,13 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(pool2d);\n") endif() + # unpool_op contains several operators + if ("${TARGET}" STREQUAL "unpool_op") + set(pybind_flag 1) + # It's enough to just adding one operator to pybind + file(APPEND ${pybind_file} "USE_OP(unpool2d);\n") + endif() + # pool_cudnn_op contains several operators if ("${TARGET}" STREQUAL "pool_cudnn_op") set(pybind_flag 1) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 36506b903e..8cfdb4bb60 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/operators/math/maxouting.h" +#include "paddle/operators/math/unpooling.h" namespace paddle { namespace operators { @@ -20,7 +20,7 @@ namespace math { // All tensors are in NCHW format template -class Unpool2d_Max_Functor { +class Unpool2d_MaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -36,16 +36,14 @@ class Unpool2d_Max_Functor { int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const T* indices_data = indices.data(); + const int * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - if(index > output_feasize) { - //抛一个异常! - } + // PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); output_data[index] = input_data[i]; } input_data += input_feasize; @@ -70,26 +68,22 @@ public: const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; - const int output_channels = output->dims()[1]; - const int output_height = output->dims()[2]; - const int output_width = output->dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const T* input_data = input.data(); - const T* indices_data = indices.data(); - const T* output_data = output.data(); + const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { - for (int f = 0; f < input_feasize; ++f) { + for (int i = 0; i < 
input_feasize; ++i) { int index = indices_data[i]; - if(index > output_feasize) { - //抛一个异常! - } + // PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 53e88a57c1..c8e7b25234 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/math/maxouting.h" +#include "paddle/operators/math/unpooling.h" #include "paddle/platform/cuda_helper.h" namespace paddle { @@ -22,7 +22,7 @@ namespace math { template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const T* indices_data, + const int* indices_data, const int input_height, const int input_width, T* output_data, @@ -30,16 +30,19 @@ __global__ void KernelUnpool2dMax(const int nthreads, const int output_width) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + // int output_feasize = output_height * output_width; for (int i = index; i < nthreads; i += offset) { int out_offset = i / (input_height * input_width) \ * output_height * output_width; int out_index = indices_data[i]; + // PADDLE_ENFORCE(out_index < output_feasize, "err index in unpooling!"); output_data[out_offset + out_index] = input_data[i]; } } template __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, + const int* indices_data, const int input_height, const int input_width, const T* output_data, @@ -49,10 +52,13 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, T* input_grad) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; + // int output_feasize = output_height * output_width; 
for (int i = index; i < nthreads; i += offset) { int out_offset = i / (input_height * input_width) \ * output_height * output_width; int out_index = indices_data[i]; + // PADDLE_ENFORCE(out_index < output_feasize, + // "err index in unpooling!"); input_grad[i] = output_grad[out_offset + out_index]; } } @@ -72,10 +78,8 @@ class Unpool2d_MaxFunctor { const int output_channels = output->dims()[1]; const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; - int input_feasize = input_height * input_width; - int output_feasize = output_height * output_width; const T* input_data = input.data(); - const T* indices_data = indices.data(); + const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int nthreads = output->numel(); @@ -99,19 +103,18 @@ class Unpool2d_MaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor * input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad, - int groups) { + const framework::Tensor& output_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; const int output_channels = output.dims()[1]; const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; - const T* input_data = input.data(); - const T* indices_data = indices.data(); + const int* indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index bb0e0d08f0..ba4be89746 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -26,7 +26,7 @@ namespace math { template -class Unpool2d_Max_Functor { +class Unpool2d_MaxFunctor { public: void 
operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -35,10 +35,11 @@ class Unpool2d_Max_Functor { }; template -class Unpool2d_Max_GradFunctor { +class Unpool2d_MaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor * input_grad, const framework::Tensor& output, const framework::Tensor& output_grad); diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index d81428e802..9d6e69dffb 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -20,7 +20,8 @@ using framework::Tensor; class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { public: - UnpoolOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) + Unpool2dOpMaker(framework::OpProto* proto, \ + framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor) The input tensor of unpool operator. 
" @@ -39,10 +40,12 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>("ksize", "(vector ), the unpooling window size(height, width) " "of unpooling operator."); - AddAttr>("strides", "(vector, default:{1, 1}), " + AddAttr>("strides", + "(vector, default:{1, 1}), " "strides(height, width) of unpooling operator.") .SetDefault({1, 1}); - AddAttr>("paddings", "(vector defalut:{0,0}), " + AddAttr>("paddings", + "(vector defalut:{0,0}), " "paddings(height, width) of unpooling operator.") .SetDefault({0, 0}); AddAttr("unpoolingType", @@ -73,7 +76,8 @@ class UnpoolOp : public framework::OperatorWithKernel { auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Y"); - std::string unpooling_type = ctx->Attrs().Get("unpooling_type"); + std::string unpooling_type = \ + ctx->Attrs().Get("unpooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); @@ -95,7 +99,7 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), @@ -109,8 +113,11 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(unpool2d, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool2d_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool2d, ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL(unpool2d, + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL(unpool2d_grad, - ops::UnpoolGradKernel); + 
ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 8aeef8b3cf..96fb9e40c3 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -16,7 +16,10 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(unpool2d, - ops::UnpoolKernel); + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_GPU_KERNEL(unpool2d_grad, ops::UnpoolGradKernel); + float>, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 38903dee17..47dd8da6f7 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -37,9 +37,8 @@ class UnpoolKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { if (pooling_type == "max") { - math::Unpool2d_Max_Functor unpool2d_max_forward; - unpool2d_max_forward(context.device_context(), *in_x, *in_y, - ksize, strides, paddings, out); + math::Unpool2d_MaxFunctor unpool2d_max_forward; + unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } } break; default: { PADDLE_THROW("Pool op only supports 2D input."); } @@ -71,12 +70,12 @@ class UnpoolGradKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { if (pooling_type == "max") { - math::UnpoolGradFunctor maxout_backward; - maxout_backward(context.device_context(), *in_x, *in_y, in_x_grad, *out, - *out_grad, ksize, strides, paddings); + math::Unpool2d_MaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, + *out, *out_grad); } } break; - default: { PADDLE_THROW("Pool op only supports 2D input."); } + default: { PADDLE_THROW("Unpool op only supports 2D input."); } } } }; From 200f07c2197bb3e35cfcbfcc7dbb201d241a4069 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 21 Nov 2017 17:44:44 +0800 Subject: [PATCH 043/275] add test --- paddle/operators/math/unpooling.cc | 16 +++---- paddle/operators/math/unpooling.cu | 19 ++++---- 
paddle/operators/math/unpooling.h | 4 +- paddle/operators/unpool_op.cc | 21 ++++++--- paddle/operators/unpool_op.h | 4 +- .../paddle/v2/fluid/tests/test_unpool2d_op.py | 47 +++++++++++++++++++ 6 files changed, 82 insertions(+), 29 deletions(-) create mode 100644 python/paddle/v2/fluid/tests/test_unpool2d_op.py diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 8cfdb4bb60..a1747e76e7 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -20,7 +20,7 @@ namespace math { // All tensors are in NCHW format template -class Unpool2d_MaxFunctor { +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -43,7 +43,7 @@ class Unpool2d_MaxFunctor { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - // PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); output_data[index] = input_data[i]; } input_data += input_feasize; @@ -57,7 +57,7 @@ class Unpool2d_MaxFunctor { template -class Unpool2d_MaxGradFunctor { +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -83,7 +83,7 @@ public: for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { int index = indices_data[i]; - // PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); input_grad_data[i] = output_grad_data[index]; } input_grad_data += input_feasize; @@ -94,10 +94,10 @@ public: } }; -template class Unpool2d_MaxGradFunctor; -template class Unpool2d_MaxGradFunctor; -template class Unpool2d_MaxFunctor; -template class Unpool2d_MaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; 
+template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index c8e7b25234..f14dd0626f 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -30,12 +30,11 @@ __global__ void KernelUnpool2dMax(const int nthreads, const int output_width) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - // int output_feasize = output_height * output_width; for (int i = index; i < nthreads; i += offset) { int out_offset = i / (input_height * input_width) \ * output_height * output_width; int out_index = indices_data[i]; - // PADDLE_ENFORCE(out_index < output_feasize, "err index in unpooling!"); + PADDLE_ASSERT(out_index < (output_height * output_width)); output_data[out_offset + out_index] = input_data[i]; } } @@ -52,13 +51,11 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, T* input_grad) { int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; - // int output_feasize = output_height * output_width; for (int i = index; i < nthreads; i += offset) { int out_offset = i / (input_height * input_width) \ * output_height * output_width; int out_index = indices_data[i]; - // PADDLE_ENFORCE(out_index < output_feasize, - // "err index in unpooling!"); + PADDLE_ASSERT(out_index < (output_height * output_width)); input_grad[i] = output_grad[out_offset + out_index]; } } @@ -66,7 +63,7 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, * All tensors are in NCHW format. */ template -class Unpool2d_MaxFunctor { +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -99,7 +96,7 @@ class Unpool2d_MaxFunctor { * All tensors are in NCHW format. 
*/ template -class Unpool2d_MaxGradFunctor { +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -135,11 +132,11 @@ class Unpool2d_MaxGradFunctor { } }; -template class Unpool2d_MaxGradFunctor; -template class Unpool2d_MaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; -template class Unpool2d_MaxFunctor; -template class Unpool2d_MaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index ba4be89746..93a77bf53e 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -26,7 +26,7 @@ namespace math { template -class Unpool2d_MaxFunctor { +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -35,7 +35,7 @@ class Unpool2d_MaxFunctor { }; template -class Unpool2d_MaxGradFunctor { +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 9d6e69dffb..d450d9f62a 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -49,11 +49,15 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "paddings(height, width) of unpooling operator.") .SetDefault({0, 0}); AddAttr("unpoolingType", - "(string), unpooling type, can be \"max\" for max-unpooling " - "and \"avg\" for average-unpooling.") - .InEnum({"max", "avg"}); + "(string), unpooling type, can be \"max\" for max-unpooling ") + .InEnum({"max"}); AddComment(R"DOC( - + "input: the input Tensor to invert" + "indices: the indices given out by MaxPool2d" + "ksize – Size of the max pooling window." + "stride – Stride of the max pooling window." 
+ "It is set to kernel_size by default." + "padding – Padding that was added to the input" )DOC"); } }; @@ -82,8 +86,13 @@ class UnpoolOp : public framework::OperatorWithKernel { std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, - "Unpooling intput should be 4-D or 5-D tensor."); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling intput should be 4-D."); + for (int i = 0; i < 4; ++i) { + PADDLE_ENFORCE(in_x_dims[i] == in_y_dims[i], + "X size must be eq Y size!"); + } + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); for (size_t i = 0; i < ksize.size(); ++i) { diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 47dd8da6f7..44115b0726 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -37,7 +37,7 @@ class UnpoolKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { if (pooling_type == "max") { - math::Unpool2d_MaxFunctor unpool2d_max_forward; + math::Unpool2dMaxFunctor unpool2d_max_forward; unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } } break; @@ -70,7 +70,7 @@ class UnpoolGradKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { if (pooling_type == "max") { - math::Unpool2d_MaxGradFunctor unpool2d_max_backward; + math::Unpool2dMaxGradFunctor unpool2d_max_backward; unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, *out, *out_grad); } diff --git a/python/paddle/v2/fluid/tests/test_unpool2d_op.py b/python/paddle/v2/fluid/tests/test_unpool2d_op.py new file mode 100644 index 0000000000..08f734a264 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_unpool2d_op.py @@ -0,0 +1,47 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def maxout_forward_naive(input, groups): + s0, s1, s2, s3 = input.shape + return np.ndarray([s0, s1 / groups, groups, s2, s3], \ + buffer = input, 
dtype=input.dtype).max(axis=(2)) + + +class TestUnpool2dOp(OpTest): + def setUp(self): + self.op_type = "unpool2d" + self.init_test_case() + input = np.random.random(self.shape).astype("float32") + output = self.MaxOut_forward_naive(input, self.groups).astype("float32") + + self.inputs = {'X': input} + self.attrs = { + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'unpooling_type': self.pool_type, + } + + self.outputs = {'Out': output.astype('float32')} + + def init_pool_type(self): + self.pool_type = "max" + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + def init_test_case(self): + self.MaxOut_forward_naive = maxout_forward_naive + self.shape = [100, 6, 2, 2] + self.groups=2 + + + + +if __name__ == '__main__': + unittest.main() From 90f664d0b0eb4cb0f13a5ac5c434ed9cb6544687 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 12:52:43 +0800 Subject: [PATCH 044/275] test unpool ok cpu --- paddle/operators/CMakeLists.txt | 7 -- paddle/operators/math/unpooling.cc | 9 +-- paddle/operators/math/unpooling.cu | 4 +- paddle/operators/unpool_op.cc | 25 +++---- paddle/operators/unpool_op.cu.cc | 4 +- paddle/operators/unpool_op.h | 8 +- .../paddle/v2/fluid/tests/test_unpool2d_op.py | 47 ------------ .../paddle/v2/fluid/tests/test_unpool_op.py | 74 +++++++++++++++++++ 8 files changed, 98 insertions(+), 80 deletions(-) delete mode 100644 python/paddle/v2/fluid/tests/test_unpool2d_op.py create mode 100644 python/paddle/v2/fluid/tests/test_unpool_op.py diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index d53bca277d..ee25abd6cb 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -80,13 +80,6 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(pool2d);\n") endif() - # unpool_op contains several operators - if ("${TARGET}" STREQUAL "unpool_op") - set(pybind_flag 1) - # It's enough to 
just adding one operator to pybind - file(APPEND ${pybind_file} "USE_OP(unpool2d);\n") - endif() - # pool_cudnn_op contains several operators if ("${TARGET}" STREQUAL "pool_cudnn_op") set(pybind_flag 1) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index a1747e76e7..0becab721e 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -32,13 +32,13 @@ class Unpool2dMaxFunctor { const int output_channels = output->dims()[1]; const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; - int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const int * indices_data = indices.data(); + const T * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); - + memset(output_data, 0, \ + sizeof(T) * output_feasize * output_channels * batch_size); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { @@ -74,9 +74,8 @@ public: int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const int* indices_data = indices.data(); + const T* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); - T* input_grad_data = input_grad->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index f14dd0626f..cd313770ab 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -76,7 +76,7 @@ class Unpool2dMaxFunctor { const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const T* input_data = input.data(); - const int* indices_data = indices.data(); + const T* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int nthreads = 
output->numel(); @@ -111,7 +111,7 @@ class Unpool2dMaxGradFunctor { const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; const T* input_data = input.data(); - const int* indices_data = indices.data(); + const T* indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index d450d9f62a..9036005a4d 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -48,7 +48,7 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(vector defalut:{0,0}), " "paddings(height, width) of unpooling operator.") .SetDefault({0, 0}); - AddAttr("unpoolingType", + AddAttr("unpoolingtype", "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( @@ -80,8 +80,8 @@ class UnpoolOp : public framework::OperatorWithKernel { auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Y"); - std::string unpooling_type = \ - ctx->Attrs().Get("unpooling_type"); + std::string unpoolingtype = \ + ctx->Attrs().Get("unpoolingtype"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); @@ -108,9 +108,9 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); + // PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); + // PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + // "Input(Out@GRAD) 
should not be null"); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); @@ -120,13 +120,12 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(unpool2d, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool2d_grad, +REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool2d, +REGISTER_OP_CPU_KERNEL(unpool, ops::UnpoolKernel, ops::UnpoolKernel); -REGISTER_OP_CPU_KERNEL(unpool2d_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CPU_KERNEL(unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); + diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 96fb9e40c3..4949fc467e 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -15,10 +15,10 @@ #include "paddle/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(unpool2d, +REGISTER_OP_GPU_KERNEL(unpool, ops::UnpoolKernel, ops::UnpoolKernel); -REGISTER_OP_GPU_KERNEL(unpool2d_grad, +REGISTER_OP_GPU_KERNEL(unpool_grad, ops::UnpoolGradKernel, ops::UnpoolGradKernel { const Tensor* in_x = context.Input("X"); const Tensor* in_y = context.Input("Y"); Tensor* out = context.Output("Out"); - std::string pooling_type = context.Attr("unpooling_type"); + std::string unpoolingtype = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); switch (ksize.size()) { case 2: { - if (pooling_type == "max") { + if (unpoolingtype == "max") { math::Unpool2dMaxFunctor unpool2d_max_forward; unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } @@ -56,7 +56,7 @@ class UnpoolGradKernel : public framework::OpKernel { const Tensor* out_grad = 
context.Input(framework::GradVarName("Out")); Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - std::string pooling_type = context.Attr("unpooling_type"); + std::string unpoolingtype = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); @@ -69,7 +69,7 @@ class UnpoolGradKernel : public framework::OpKernel { } switch (ksize.size()) { case 2: { - if (pooling_type == "max") { + if (unpoolingtype == "max") { math::Unpool2dMaxGradFunctor unpool2d_max_backward; unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, *out, *out_grad); diff --git a/python/paddle/v2/fluid/tests/test_unpool2d_op.py b/python/paddle/v2/fluid/tests/test_unpool2d_op.py deleted file mode 100644 index 08f734a264..0000000000 --- a/python/paddle/v2/fluid/tests/test_unpool2d_op.py +++ /dev/null @@ -1,47 +0,0 @@ -import unittest -import numpy as np -from op_test import OpTest - - -def maxout_forward_naive(input, groups): - s0, s1, s2, s3 = input.shape - return np.ndarray([s0, s1 / groups, groups, s2, s3], \ - buffer = input, dtype=input.dtype).max(axis=(2)) - - -class TestUnpool2dOp(OpTest): - def setUp(self): - self.op_type = "unpool2d" - self.init_test_case() - input = np.random.random(self.shape).astype("float32") - output = self.MaxOut_forward_naive(input, self.groups).astype("float32") - - self.inputs = {'X': input} - self.attrs = { - 'strides': self.strides, - 'paddings': self.paddings, - 'ksize': self.ksize, - 'unpooling_type': self.pool_type, - } - - self.outputs = {'Out': output.astype('float32')} - - def init_pool_type(self): - self.pool_type = "max" - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out') - - def init_test_case(self): - self.MaxOut_forward_naive = maxout_forward_naive - self.shape = [100, 6, 2, 2] - self.groups=2 - - - - -if __name__ == '__main__': - 
unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py new file mode 100644 index 0000000000..566da6e26e --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -0,0 +1,74 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): + s0, s1, s2, s3 = input.shape + out_H=(s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] + out_W=(s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1] + out = np.zeros((s0, s1, out_H, out_W)) + for nidx in xrange(s0): + for cidx in xrange(s1): + for h in xrange(s2): + for w in xrange(s3): + index = indices[nidx, cidx, h, w] + hidx = (index - index % out_W) / out_W + widx = index % out_W + out[nidx, cidx, int(hidx), int(widx)] = input[nidx, cidx, h, w] + + return out + + +class TestUnpoolOp(OpTest): + def setUp(self): + self.op_type = "unpool" + self.init_test_case() + pre_input = np.random.random(self.shape).astype("float32") + N, C, H, W = pre_input.shape + H_out = (H - self.ksize[0] + 2 * self.paddings[0]) / self.strides[0] + 1 + W_out = (W - self.ksize[1] + 2 * self.paddings[1]) / self.strides[1] + 1 + input = np.zeros((N, C, H_out, W_out)) + indices = np.zeros((N, C, H_out, W_out)) + for i in xrange(H_out): + for j in xrange(W_out): + r_start = np.max((i * self.strides[0] - self.paddings[0], 0)) + r_end = np.min((i * self.strides[0] + self.ksize[0] - self.paddings[0], H)) + c_start = np.max((j * self.strides[1] - self.paddings[1], 0)) + c_end = np.min((j * self.strides[1] + self.ksize[1] - self.paddings[1], W)) + for nidx in xrange(N): + for cidx in xrange(C): + x_masked = pre_input[nidx, cidx, r_start:r_end, c_start:c_end] + input[nidx, cidx, i, j] = x_masked.max() + arg = x_masked.argmax() + indices[nidx, cidx, i, j] = (r_start + arg / self.ksize[1]) * W + c_start + arg % self.ksize[1] + output = self.Unpool2d_forward_naive(input, indices, self.ksize, self.strides, 
self.paddings).astype("float32") + self.inputs = {'X': input.astype('float32'), + 'Y': indices.astype('int16')} + self.attrs = { + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'unpoolingtype': self.unpoolingtype, + } + self.outputs = {'Out': output.astype('float32')} + + def test_check_output(self): + print self.outputs['Out'] + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.5) + + def init_test_case(self): + self.Unpool2d_forward_naive = unpool2dmax_forward_naive + self.unpoolingtype = "max" + self.shape = [10, 2, 5, 5] + self.ksize = [3, 3] + self.strides = [2, 2] + self.paddings = [0, 0] + + + +if __name__ == '__main__': + unittest.main() From e7cbde80c3fa3de277e74c3a7e80a2046ea9edf5 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 22 Nov 2017 13:57:28 +0800 Subject: [PATCH 045/275] simplify the CMakeLists.txt of trainer/tests --- paddle/trainer/tests/CMakeLists.txt | 51 +++++++++-------------------- 1 file changed, 15 insertions(+), 36 deletions(-) diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt index 2739878b7f..9d33e20656 100644 --- a/paddle/trainer/tests/CMakeLists.txt +++ b/paddle/trainer/tests/CMakeLists.txt @@ -1,19 +1,17 @@ -################# test_Compare ############################ -add_unittest_without_exec(test_Compare - test_Compare.cpp) -add_test(NAME test_Compare - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python - ${CMAKE_CURRENT_BINARY_DIR}/test_Compare - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +set(PYTHON_PATH + ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests) +function(trainer_test TARGET) + add_unittest_without_exec(${TARGET} ${TARGET}.cpp) + add_test(NAME ${TARGET} + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +endfunction() 
-################# test_Trainer ########################### -add_unittest_without_exec(test_Trainer - test_Trainer.cpp) -add_test(NAME test_Trainer - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +trainer_test(test_Compare) +trainer_test(test_PyDataProviderWrapper) +trainer_test(test_recurrent_machine_generation) +trainer_test(test_Trainer) ############### test_TrainerOnePass ########################## if(WITH_PYTHON) @@ -22,32 +20,13 @@ if(WITH_PYTHON) add_unittest_without_exec(test_TrainerOnePass test_TrainerOnePass.cpp) add_test(NAME test_TrainerOnePass - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) endif() -################# test_recurrent_machine_generation ############### -add_unittest_without_exec(test_recurrent_machine_generation - test_recurrent_machine_generation.cpp) -add_test(NAME test_recurrent_machine_generation - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ - ${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) - -#################### test_PyDataProviderWrapper ######################### -add_unittest_without_exec(test_PyDataProviderWrapper - test_PyDataProviderWrapper.cpp) - -add_test(NAME test_PyDataProviderWrapper - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper - WORKING_DIRECTORY 
${PADDLE_SOURCE_DIR}/paddle/) - #################### test_config_parser ######################### add_test(NAME test_config_parser - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/ + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) From e2a5905eaec4bafa2d469c94f9da5c01f9aae328 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 15:38:17 +0800 Subject: [PATCH 046/275] gpu test ok unpool2dmax --- paddle/operators/math/unpooling.cc | 2 - paddle/operators/math/unpooling.cu | 42 ++++++++++++------- paddle/operators/math/unpooling.h | 3 -- paddle/operators/unpool_op.cc | 3 -- paddle/operators/unpool_op.h | 9 +++- .../paddle/v2/fluid/tests/test_unpool_op.py | 4 +- 6 files changed, 38 insertions(+), 25 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 0becab721e..1622dcca87 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -37,8 +37,6 @@ class Unpool2dMaxFunctor { const T* input_data = input.data(); const T * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); - memset(output_data, 0, \ - sizeof(T) * output_feasize * output_channels * batch_size); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index cd313770ab..d26ceed6ad 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -22,41 +22,56 @@ namespace math { template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const int* indices_data, + const T* indices_data, const int input_height, const int input_width, + const int channels, T* output_data, const int 
output_height, const int output_width) { + int bsize = input_height * input_width * channels; + int csize = input_height * input_width; + int out_bsize = output_height * output_width * channels; + int out_csize = output_height * output_width; int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (int i = index; i < nthreads; i += offset) { - int out_offset = i / (input_height * input_width) \ - * output_height * output_width; + int bidx = i / bsize; + int boffset = i % bsize; + int cidx = boffset / csize; + int out_offset = bidx * out_bsize + cidx * out_csize; int out_index = indices_data[i]; PADDLE_ASSERT(out_index < (output_height * output_width)); + printf("-------%d------[%f]\n", out_offset + out_index, input_data[i]); output_data[out_offset + out_index] = input_data[i]; } } template __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, - const int* indices_data, + const T* indices_data, const int input_height, const int input_width, + const int channels, const T* output_data, const T* output_grad, const int output_height, const int output_width, T* input_grad) { + int bsize = input_height * input_width * channels; + int csize = input_height * input_width; + int out_bsize = output_height * output_width * channels; + int out_csize = output_height * output_width; int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (int i = index; i < nthreads; i += offset) { - int out_offset = i / (input_height * input_width) \ - * output_height * output_width; - int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < (output_height * output_width)); - input_grad[i] = output_grad[out_offset + out_index]; + int bidx = i / bsize; + int boffset = i % bsize; + int cidx = boffset / csize; + int out_offset = bidx * out_bsize + cidx * out_csize; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < (output_height * output_width)); + input_grad[i] = output_grad[out_offset 
+ out_index]; } } /* @@ -78,8 +93,7 @@ class Unpool2dMaxFunctor { const T* input_data = input.data(); const T* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); - - int nthreads = output->numel(); + int nthreads = batch_size * output_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); @@ -88,7 +102,7 @@ class Unpool2dMaxFunctor { T><<(context) .stream()>>>(nthreads, input_data, indices_data, - input_height, input_width, + input_height, input_width, output_channels, output_data, output_height, output_width); } }; @@ -115,7 +129,7 @@ class Unpool2dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); + int nthreads = batch_size * output_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); @@ -125,7 +139,7 @@ class Unpool2dMaxGradFunctor { reinterpret_cast(context) .stream()>>>( nthreads, input_data, indices_data, - input_height, input_width, + input_height, input_width, output_channels, output_data, output_grad_data, output_height, output_width, input_grad_data); diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 93a77bf53e..88e88ba117 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -21,9 +21,6 @@ namespace paddle { namespace operators { namespace math { -#define FLT_MAX \ - __FLT_MAX__ - template class Unpool2dMaxFunctor { diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 9036005a4d..add8f15736 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -108,9 +108,6 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void 
InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - // PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null."); - // PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - // "Input(Out@GRAD) should not be null"); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 452a328eee..e3a45ff9a7 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -29,11 +29,16 @@ class UnpoolKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { const Tensor* in_x = context.Input("X"); const Tensor* in_y = context.Input("Y"); - Tensor* out = context.Output("Out"); + auto * out = context.Output("Out"); std::string unpoolingtype = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + T* output_data = out->mutable_data(context.GetPlace()); + if (output_data) { + math::SetConstant set_zero; + set_zero(context.device_context(), out, static_cast(0)); + } switch (ksize.size()) { case 2: { if (unpoolingtype == "max") { @@ -66,7 +71,7 @@ class UnpoolGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0.0)); - } + } switch (ksize.size()) { case 2: { if (unpoolingtype == "max") { diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 566da6e26e..7984743e6f 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -54,6 +54,8 @@ class TestUnpoolOp(OpTest): self.outputs = {'Out': output.astype('float32')} 
def test_check_output(self): + print self.inputs['X'] + print self.inputs['Y'] print self.outputs['Out'] self.check_output() @@ -63,7 +65,7 @@ class TestUnpoolOp(OpTest): def init_test_case(self): self.Unpool2d_forward_naive = unpool2dmax_forward_naive self.unpoolingtype = "max" - self.shape = [10, 2, 5, 5] + self.shape = [6, 4, 5, 5] self.ksize = [3, 3] self.strides = [2, 2] self.paddings = [0, 0] From 47bd0bb6787d049f094f2c883f54b7d314eedec1 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 15:45:43 +0800 Subject: [PATCH 047/275] del printf --- paddle/operators/math/unpooling.cu | 1 - python/paddle/v2/fluid/tests/test_unpool_op.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index d26ceed6ad..bb8489fb95 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -42,7 +42,6 @@ __global__ void KernelUnpool2dMax(const int nthreads, int out_offset = bidx * out_bsize + cidx * out_csize; int out_index = indices_data[i]; PADDLE_ASSERT(out_index < (output_height * output_width)); - printf("-------%d------[%f]\n", out_offset + out_index, input_data[i]); output_data[out_offset + out_index] = input_data[i]; } } diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 7984743e6f..b1ddf95acc 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -54,9 +54,6 @@ class TestUnpoolOp(OpTest): self.outputs = {'Out': output.astype('float32')} def test_check_output(self): - print self.inputs['X'] - print self.inputs['Y'] - print self.outputs['Out'] self.check_output() def test_check_grad(self): From 0112c5d640d7e311f99fab553d7da9ee6653865c Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 15:51:52 +0800 Subject: [PATCH 048/275] format code --- paddle/operators/math/unpooling.cc | 1 - paddle/operators/math/unpooling.cu | 30 
+++++++++++++++--------------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 1622dcca87..a3a24a6892 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -69,7 +69,6 @@ public: const int output_channels = output.dims()[1]; const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; - int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* indices_data = indices.data(); diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index bb8489fb95..358847b315 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -29,21 +29,21 @@ __global__ void KernelUnpool2dMax(const int nthreads, T* output_data, const int output_height, const int output_width) { - int bsize = input_height * input_width * channels; - int csize = input_height * input_width; - int out_bsize = output_height * output_width * channels; - int out_csize = output_height * output_width; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int bidx = i / bsize; - int boffset = i % bsize; - int cidx = boffset / csize; - int out_offset = bidx * out_bsize + cidx * out_csize; - int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < (output_height * output_width)); - output_data[out_offset + out_index] = input_data[i]; - } + int bsize = input_height * input_width * channels; + int csize = input_height * input_width; + int out_bsize = output_height * output_width * channels; + int out_csize = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / bsize; + int boffset = i % bsize; + int cidx = boffset / csize; + int 
out_offset = bidx * out_bsize + cidx * out_csize; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < (output_height * output_width)); + output_data[out_offset + out_index] = input_data[i]; + } } template __global__ void KernelUnpool2dMaxGrad(const int nthreads, From e553d5728d52f4dd2ebc11228053ed31da05a62c Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 22 Nov 2017 15:59:02 +0800 Subject: [PATCH 049/275] format test code --- .../paddle/v2/fluid/tests/test_unpool_op.py | 27 ++++++++++++------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index b1ddf95acc..106af9f5d9 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -15,7 +15,8 @@ def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): index = indices[nidx, cidx, h, w] hidx = (index - index % out_W) / out_W widx = index % out_W - out[nidx, cidx, int(hidx), int(widx)] = input[nidx, cidx, h, w] + out[nidx, cidx, int(hidx), int(widx)] = \ + input[nidx, cidx, h, w] return out @@ -26,23 +27,31 @@ class TestUnpoolOp(OpTest): self.init_test_case() pre_input = np.random.random(self.shape).astype("float32") N, C, H, W = pre_input.shape - H_out = (H - self.ksize[0] + 2 * self.paddings[0]) / self.strides[0] + 1 - W_out = (W - self.ksize[1] + 2 * self.paddings[1]) / self.strides[1] + 1 + H_out = (H - self.ksize[0] + 2 * self.paddings[0]) / \ + self.strides[0] + 1 + W_out = (W - self.ksize[1] + 2 * self.paddings[1]) / \ + self.strides[1] + 1 input = np.zeros((N, C, H_out, W_out)) indices = np.zeros((N, C, H_out, W_out)) for i in xrange(H_out): for j in xrange(W_out): r_start = np.max((i * self.strides[0] - self.paddings[0], 0)) - r_end = np.min((i * self.strides[0] + self.ksize[0] - self.paddings[0], H)) + r_end = np.min((i * self.strides[0] + self.ksize[0] - \ + self.paddings[0], H)) c_start = np.max((j * 
self.strides[1] - self.paddings[1], 0)) - c_end = np.min((j * self.strides[1] + self.ksize[1] - self.paddings[1], W)) + c_end = np.min((j * self.strides[1] + self.ksize[1] - \ + self.paddings[1], W)) for nidx in xrange(N): for cidx in xrange(C): - x_masked = pre_input[nidx, cidx, r_start:r_end, c_start:c_end] + x_masked = pre_input[nidx, cidx, r_start:r_end, \ + c_start:c_end] input[nidx, cidx, i, j] = x_masked.max() arg = x_masked.argmax() - indices[nidx, cidx, i, j] = (r_start + arg / self.ksize[1]) * W + c_start + arg % self.ksize[1] - output = self.Unpool2d_forward_naive(input, indices, self.ksize, self.strides, self.paddings).astype("float32") + indices[nidx, cidx, i, j] = \ + (r_start + arg / self.ksize[1]) * W + \ + c_start + arg % self.ksize[1] + output = self.Unpool2d_forward_naive(input, indices, self.ksize, \ + self.strides, self.paddings).astype("float32") self.inputs = {'X': input.astype('float32'), 'Y': indices.astype('int16')} self.attrs = { @@ -57,7 +66,7 @@ class TestUnpoolOp(OpTest): self.check_output() def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.5) + self.check_grad(['X'], 'Out') def init_test_case(self): self.Unpool2d_forward_naive = unpool2dmax_forward_naive From b3e7c4bcf967f1927c6c5df9c3bf2526080df265 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 22 Nov 2017 16:29:11 +0800 Subject: [PATCH 050/275] simplify the CMakeLists.txt of gserver/tests --- paddle/gserver/tests/CMakeLists.txt | 81 ++++++++++------------------- 1 file changed, 28 insertions(+), 53 deletions(-) diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index c295ea19c9..f68abc1b9f 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -1,5 +1,4 @@ # gserver pacakge unittests - add_simple_unittest(test_LinearChainCRF) add_simple_unittest(test_RecurrentLayer) @@ -29,6 +28,26 @@ gserver_test(test_KmaxSeqScore) gserver_test(test_Expand) 
gserver_test(test_MaxPoolingWithMaskOutput) +set(PYTHON_PATH + ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests) +function(gserver_test2 TARGET) + add_unittest_without_exec(${TARGET} ${TARGET}.cpp) + add_test(NAME ${TARGET} + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +endfunction() + +gserver_test2(test_CompareTwoNets) +gserver_test2(test_PyDataProvider2) +if(WITH_PYTHON) + gserver_test2(test_PyDataProvider) +endif() +if(NOT MOBILE_INFERENCE) + # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it. + gserver_test2(test_RecurrentGradientMachine) +endif() + ########## test_MKLDNN layers and activations ########## if(WITH_MKLDNN) add_unittest_without_exec(test_MKLDNN @@ -36,26 +55,14 @@ if(WITH_MKLDNN) MKLDNNTester.cpp LayerGradUtil.cpp) add_test(NAME test_MKLDNN - COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python - ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() -############## test_PyDataProvider ######################## -if(WITH_PYTHON) - add_unittest_without_exec(test_PyDataProvider - test_PyDataProvider.cpp) - - add_test(NAME test_PyDataProvider - COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) -endif() - ############### test_WarpCTCLayer ####################### if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) add_unittest_without_exec(test_WarpCTCLayer test_WarpCTCLayer.cpp) - add_test(NAME test_WarpCTCLayer COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) @@ -66,57 +73,25 @@ if(NOT MOBILE_INFERENCE) add_unittest(test_Evaluator test_Evaluator.cpp) -############### 
test_RecurrentGradientMachine ############### - # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine - # I will fix it. - add_unittest_without_exec(test_RecurrentGradientMachine - test_RecurrentGradientMachine.cpp) - add_test(NAME test_RecurrentGradientMachine - COMMAND .set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) - ############### test_NetworkCompare ############### add_unittest_without_exec(test_NetworkCompare test_NetworkCompare.cpp) if(WITH_GPU) - add_test(NAME test_NetworkCompare - COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) + set(use_gpu true) else() - add_test(NAME test_NetworkCompare - COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) + set(use_gpu false) endif() + add_test(NAME test_NetworkCompare + COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu} + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() - -add_unittest_without_exec(test_PyDataProvider2 - test_PyDataProvider2.cpp) - -add_test(NAME test_PyDataProvider2 - COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle -) - ################# test_CompareSparse ################## add_unittest_without_exec(test_CompareSparse test_CompareSparse.cpp) if(NOT ON_TRAVIS) add_test(NAME test_CompareSparse - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ./.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse 
+ COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6 + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) endif() - -################ test_CompareTwoNets ###################### -add_unittest_without_exec(test_CompareTwoNets - test_CompareTwoNets.cpp) -add_test(NAME test_CompareTwoNets - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) From 7fe61a7fa823e2b611ca42aacad76f5ca02a7217 Mon Sep 17 00:00:00 2001 From: Kavya Srinet Date: Wed, 22 Nov 2017 10:55:28 -0800 Subject: [PATCH 051/275] Editing and re-writing parts of Data Reader design doc --- doc/design/reader/README.md | 70 ++++++++++++++++++++----------------- 1 file changed, 37 insertions(+), 33 deletions(-) diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md index 320dccec3d..2cd4b6225b 100644 --- a/doc/design/reader/README.md +++ b/doc/design/reader/README.md @@ -1,25 +1,25 @@ # Python Data Reader Design Doc -At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that +During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following: -- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items. -- A *reader creator* is a function that returns a reader function. -- A *reader decorator* is a function, which accepts one or more readers, and returns a reader. -- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. +- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items. 
+- A *reader creator*: A function that returns a reader function. +- A *reader decorator*: A function, which takes in one or more readers, and returns a reader. +- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items. -and provide function which converts reader to batch reader, frequently used reader creators and reader decorators. +and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators. ## Data Reader Interface -Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`): +*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows: ``` iterable = data_reader() ``` -Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int) +The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.) 
-An example implementation for single item data reader creator: +An example implementation for single item data reader creator is as follows: ```python def reader_creator_random_image(width, height): @@ -29,7 +29,7 @@ def reader_creator_random_image(width, height): return reader ``` -An example implementation for multiple item data reader creator: +An example implementation for multiple item data reader creator is as follows: ```python def reader_creator_random_image_and_label(width, height, label): def reader(): @@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label): ## Batch Reader Interface -*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple. +*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple. + +Here are some valid outputs: -Here are valid outputs: ```python # a mini batch of three data items. Each data item consist three columns of data, each of which is 1. [(1, 1, 1), @@ -58,20 +59,22 @@ Here are valid outputs: Please note that each item inside the list must be a tuple, below is an invalid output: ```python # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],). - # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1], - # or three column of datas, each of which is 1. + # Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1], + # or three columns of data, each of which is 1. 
[[1,1,1], [2,2,2], [3,3,3]] ``` -It's easy to convert from reader to batch reader: +It is easy to convert from a reader to a batch reader: + ```python mnist_train = paddle.dataset.mnist.train() mnist_train_batch_reader = paddle.batch(mnist_train, 128) ``` -Also easy to create custom batch reader: +It is also straight forward to create a custom batch reader: + ```python def custom_batch_reader(): while True: @@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader ## Usage -batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`: +Following is how we can use the reader with PaddlePaddle: +The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows: ```python # two data layer is created: @@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...) ## Data Reader Decorator -*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax. +The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax. -Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples: +Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples: ### Prefetch Data -Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data. 
+Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data. Use `paddle.reader.buffered` to prefetch data: @@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100) ### Compose Multiple Data Readers -For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). +For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661). -We can do: +We can do the following : ```python def reader_creator_random_image(width, height): @@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False) reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader) # Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry. -# And we don't care second item at this time. +# And we don't care about the second item at this time. paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...) ``` ### Shuffle -Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read. +Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read. Example: ```python @@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512) ## Q & A -### Why reader return only a single entry, but not a mini batch? +### Why does a reader return only a single entry, and not a mini batch? 
-Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2). +Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2). -We provide function `paddle.batch` to turn (single entry) reader into batch reader. +We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader. -### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient? +### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ? -In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically. +In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful. -### Why use a dictionary but not a list to provide mapping? +### Why use a dictionary instead of a list to provide mapping? -We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`). 
+Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`). -### How to create custom data reader creator +### How to create a custom data reader creator ? ```python def image_reader_creator(image_path, label_path, n): @@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...) ### How is `paddle.train` implemented -An example implementation of paddle.train could be: +An example implementation of paddle.train is: ```python def train(batch_reader, mapping, batch_size, total_pass): From 7046e0249a45b00729c551d0d1ecd64af2c06af5 Mon Sep 17 00:00:00 2001 From: Kavya Srinet Date: Wed, 22 Nov 2017 18:35:05 -0800 Subject: [PATCH 052/275] Updated the design doc for distributed training architecture --- .../refactor/distributed_architecture.md | 168 +++++------------- 1 file changed, 45 insertions(+), 123 deletions(-) diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md index ac7e98ccf1..2b4f921ae9 100644 --- a/doc/design/refactor/distributed_architecture.md +++ b/doc/design/refactor/distributed_architecture.md @@ -2,106 +2,70 @@ ## Abstract -PaddlePaddle v0.10.0 uses the "trainer-parameter server" -architecture. We run multiple replicated instances of trainers (runs -the same code written by the user) and parameter servers for -distributed training. This architecture served us well, but has some -limitations: +PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations: -1. Need to write special code to handle tasks which should only be run - by a single trainer. 
E.g., initializing model and saving model. +1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc. -2. Model parallelism is hard: need to write if-else branches conditioned - on the trainer ID to partition model onto each trainer, and manually - write the inter-model-shard communication code. +2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers. -3. The user can not directly specify the parameter update rule: need - to modify the parameter server C++ code and compile a new - binary. This adds complication for researchers: A lot of extra - effort is required. Besides, the training job submission program - may not allow running arbitrary binaries. +3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries. -This design doc discusses PaddlePaddle's new distributed training -architecture that addresses the above limitations. +This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations. ## Analysis -We will assume the user writes the trainer program by Python, the same -analysis holds if the trainer program is written in C++. +The assumption is that the user writes the trainer program in either Python or C++. 
### Limitation 1 -If we look at the Python code that the user writes, there are two -kinds of functionalities: +There are two basic functionalities in the trainer program: -- The training logic such as load / save model and print log. -- The neural network definition such as the definition of the data - layer, the fully connected layer, the cost function and the +1. The training logic such as loading / saving the model and printing out the logs. +2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the optimizer. -When we training with PaddlePaddle v0.10.0 distributedly, multiple -replicated Python instances are running on different nodes: both the -training logic and the neural network computation is replicated. +When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the +training logic as well as the neural network computation logic, is replicated. -The tasks that should only run once all belong to the training logic, -if we only replicate the neural network computation, but do **not** -replicate the training logic, the limitation could be solved. +The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not** +replicate the training logic, the limitation mentioned above can be avoided. ### Limitation 2 -Model parallelism means running a single model on multiple nodes by -partitioning the model onto different nodes and managing the -inter-model-shard communications. +Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the +inter-model-shard communication between nodes. -PaddlePaddle should be able to modify the nerual network computation -definition to support model parallelism automatically. 
However, the -computation is only specified in Python code, and PaddlePaddle can not -modify Python code. +PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the +computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup. -Just like compiler uses a intermediate representation (IR) so that -programmer does not need to manually optimize their code in most of -the cases - the compiler will optimize the IR: +Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows: -We can have our own IR too: PaddlePaddle can support model parallel by -converting the IR so the user no longer need to manually do it in -Python: +PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component: -The IR for PaddlePaddle after refactor is called `Block`, it specifies -the computation dependency graph and the variables used in the -computation. +The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation. ### Limitation 3 -The user can not directly specify the parameter update rule for the -parameter server because the parameter server does not use the same -computation definition as the trainer. Instead, the update rule is -baked in the parameter server. The user can not specify the update -rule in the same way of specifying the trainer computation. 
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly. -This could be fixed by making the parameter server run the same -computation definition as the trainer. For a detailed explanation, -please -see +This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document - [Design Doc: Operation Graph Based Parameter Server](./dist_train.md) ## Distributed Training Architecture -The new distributed training architecture can address the above -limitations. Below is the illustration: +The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so: -The architecture includes major components: *PaddlePaddle Python*, -*PaddlePaddle converter* and *PaddlePaddle runtime*: +The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*. ### PaddlePaddle Python -PaddlePaddle Python is the Python library that user's Python trainer -invoke to build the neural network topology, start training, etc. +PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc. ```Python paddle.init() @@ -117,102 +81,60 @@ for i in range(1000): print cost_val ``` -The code above is a typical Python trainer code, the neural network -topology is built using helper functions such as -`paddle.layer.fc`. The training is done by calling `session.eval` -iteratively. +The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. 
Training is done by calling `session.eval` iteratively. #### session.eval -As shown in the graph, `session.eval` sends the IR and the evaluation -inputs/targets to the PaddlePaddle cluster for evaluation. The -targets can be any variable in the computation graph. When the target -is the `optimizer` variable, the neural network will be optimized -once. When the target is the `cost` variable, `session.eval` returns -the cost value. +As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation. +The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken. -The Python `session` is a wrapper of the C++ `Session` class. For more -information about `Session`, please -see [Design Doc: Session](./session.md). +The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md). ### PaddlePaddle Converter -PaddlePaddle converter automatically converts the IR in the request -(IR and evaluation inputs/targets) from PaddlePaddle Python to new -partitioned IRs and dispatch the new IRs and evaluation inputs/targets -to different PaddlePaddle runtimes. Below are the steps: +The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed : -1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that - fetches the eval targets to the IR. +1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR. -1. 
Extract a new computation (sub)graph with `feed` and `fetch` OP as - the boundary. The runtime does not need to run the OP that is not - dependent by the `fetch` OP. +2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP. -1. Optimizes the computation graph. +3. Optimize the computation graph. -1. Place the OPs in the graph onto different devices on different - PaddlePaddle runtime according to a placement algorithm and device - constraint specified by the user. +4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user. -1. Partition the graph according to runtime boundaries and add `send` / - `recv` OP pair on the runtime boundaries. +5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries. -1. Dispatch the partitioned graph to different PaddlePaddle runtimes. +6. Dispatch the partitioned graph to different PaddlePaddle runtimes. + +7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python. -1. PaddlePaddle runtimes with the `fetch` OP reports evaluation - results back to the converter, the convert reports the evaluation - results back to the PaddlePaddle Python. - The output IRs will be cached to optimize the conversion latency. #### Placement Algorithm -Our first implementation will only support "trainer-parameter server" -placement: the parameters, initializers, and optimizers are placed on -the PaddlePaddle runtimes with the parameter server role. And -everything else will be placed on the PaddlePaddle runtimes with the -trainer role. 
This has the same functionality of our -"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but -is more general and flexible. +Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible. -In the future, we will implement the general placement algorithm, -which makes placements according to the input IR, and a model of -device computation time and device communication time. Model -parallelism requires the general placement algorithm. +In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm. ### PaddlePaddle Runtime -The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and -runs the IR. The runtime does not need to do OP placement since it's -already done by the converter. +The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter. ### Local Training Architecture -The local training architecture will be the same as the distributed -training architecture, the differences are everything runs locally, -and there is just one PaddlePaddle runtime: +The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime: ### Training Data -In PaddlePaddle v0.10.0, training data is typically read -with [data reader](../reader/README.md) from Python. 
This approach is -no longer efficient when training distributedly since the Python -process no longer runs on the same node with the trainer processes, -the Python reader will need to read from the distributed filesystem -(assuming it has the access) and send to the trainers, doubling the -network traffic. - -When doing distributed training, the user can still use Python data -reader: the training data are sent with `session.eval`. However should -be used for debugging purpose only. The users are encouraged to use -the read data OPs. +In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic. + +When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs. 
## References: From 66b84366f1e09366b28e41dbd0d3521152554115 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Thu, 23 Nov 2017 11:53:30 +0800 Subject: [PATCH 053/275] modify for code review by wangyi --- paddle/operators/unpool_op.cc | 26 +++++++++---------- paddle/operators/unpool_op.h | 47 ++++++++++++----------------------- 2 files changed, 28 insertions(+), 45 deletions(-) diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index add8f15736..b5f3d56e96 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -16,11 +16,9 @@ namespace paddle { namespace operators { -using framework::Tensor; - class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { public: - Unpool2dOpMaker(framework::OpProto* proto, \ + Unpool2dOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", @@ -38,26 +36,26 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "the number of channels, H and W is the height and " "width of feature."); AddAttr>("ksize", - "(vector ), the unpooling window size(height, width) " + "(vector), the unpooling window size(height, width) " "of unpooling operator."); AddAttr>("strides", "(vector, default:{1, 1}), " - "strides(height, width) of unpooling operator.") + "strides (height, width) of unpooling operator.") .SetDefault({1, 1}); AddAttr>("paddings", "(vector defalut:{0,0}), " - "paddings(height, width) of unpooling operator.") + "paddings (height, width) of unpooling operator.") .SetDefault({0, 0}); AddAttr("unpoolingtype", "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( - "input: the input Tensor to invert" - "indices: the indices given out by MaxPool2d" - "ksize – Size of the max pooling window." - "stride – Stride of the max pooling window." - "It is set to kernel_size by default." 
- "padding – Padding that was added to the input" + "input: the input Tensor to invert + indices: the indices given out by MaxPool2d + ksize – Size of the max pooling window. + stride – Stride of the max pooling window. + "It is set to kernel_size by default. + padding – Padding that was added to the input" )DOC"); } }; @@ -80,14 +78,14 @@ class UnpoolOp : public framework::OperatorWithKernel { auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Y"); - std::string unpoolingtype = \ + std::string unpoolingtype = ctx->Attrs().Get("unpoolingtype"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE(in_x_dims.size() == 4, - "Unpooling intput should be 4-D."); + "Unpooling intput must be of 4-dimensional."); for (int i = 0; i < 4; ++i) { PADDLE_ENFORCE(in_x_dims[i] == in_y_dims[i], "X size must be eq Y size!"); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index e3a45ff9a7..e22171649e 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -21,15 +21,13 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -using Tensor = framework::Tensor; - template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* in_y = context.Input("Y"); - auto * out = context.Output("Out"); + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Y"); + auto * out = context.Output("Out"); std::string unpoolingtype = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); @@ -39,15 +37,8 @@ class UnpoolKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(context.device_context(), out, static_cast(0)); } - switch (ksize.size()) { - case 2: { - if (unpoolingtype == "max") { - math::Unpool2dMaxFunctor unpool2d_max_forward; - unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); - } - } break; - default: { PADDLE_THROW("Pool op only supports 2D input."); } - } + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } }; @@ -55,12 +46,13 @@ template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - const Tensor* in_x = context.Input("X"); - const Tensor* in_y = context.Input("Y"); - const Tensor* out = context.Input("Out"); - const Tensor* out_grad = - context.Input(framework::GradVarName("Out")); - Tensor* in_x_grad = context.Output(framework::GradVarName("X")); + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Y"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); std::string unpoolingtype = 
context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); @@ -70,18 +62,11 @@ class UnpoolGradKernel : public framework::OpKernel { math::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - zero(device_ctx, in_x_grad, static_cast(0.0)); - } - switch (ksize.size()) { - case 2: { - if (unpoolingtype == "max") { - math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, - *out, *out_grad); - } - } break; - default: { PADDLE_THROW("Unpool op only supports 2D input."); } + zero(device_ctx, in_x_grad, static_cast(0)); } + math::Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, + *out, *out_grad); } }; From 6b29904bad2e38ea6a717af9bec2d2ac7ffe070e Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Thu, 23 Nov 2017 19:05:31 +0800 Subject: [PATCH 054/275] Add size, height and width for crop layer. 
Add size for switch order layer --- python/paddle/trainer/config_parser.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5ba0e50c6b..9510194576 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2401,6 +2401,15 @@ class CropLayer(LayerBase): image_conf.channels = input_layer.size / (input_layer.width * input_layer.height) + if (len(self.config.inputs) == 2): + self.set_layer_height_width( + self.get_input_layer(1).height, self.get_input_layer(1).width) + self.set_layer_size(self.get_input_layer(1).size) + else: + # NCHW order + self.set_layer_height_width(shape[-2], shape[-1]) + self.set_layer_size(reduce(lambda x, y: x * y, shape)) + @config_layer('batch_norm') class BatchNormLayer(LayerBase): @@ -3850,6 +3859,16 @@ class SwitchOrderLayer(LayerBase): name, 'switch_order', 0, inputs=inputs, **xargs) self.config.reshape_conf.height_axis.extend(reshape['height']) self.config.reshape_conf.width_axis.extend(reshape['width']) + input_layer = self.get_input_layer(0) + if reshape is None: + self.set_layer_size(input_layer.size) + else: + inH = input_layer.height + inW = input_layer.width + inC = input_layer.size / inH / inW + out_dims = [0, inH, inW, inC] + size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:]) + self.set_layer_size(size) @config_layer('scale_sub_region') From 52be2a2a86f4f1cd74dc12a989341f699c67b9ed Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Fri, 24 Nov 2017 15:41:04 +0800 Subject: [PATCH 055/275] Add depth dim --- python/paddle/trainer/config_parser.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 9510194576..b342a90fb6 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3865,9 +3865,18 @@ class 
SwitchOrderLayer(LayerBase): else: inH = input_layer.height inW = input_layer.width - inC = input_layer.size / inH / inW - out_dims = [0, inH, inW, inC] - size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:]) + if input_layer.has_depth(): + inD = input_layer.depth + inC = input_layer.size / inH / inW / inD + out_dims = [0, inD, inH, inW, inC] + size = reduce(lambda x, y: x * y, + out_dims[reshape['width'][0]:]) + else: + inC = input_layer.size / inH / inW + out_dims = [0, inH, inW, inC] + size = reduce(lambda x, y: x * y, + out_dims[reshape['width'][0]:]) + self.set_layer_size(size) From 6ace929c3d330bf427465a2dc720a77e7d6b50ed Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Fri, 24 Nov 2017 18:30:35 +0800 Subject: [PATCH 056/275] Rename variable name. --- python/paddle/trainer/config_parser.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index b342a90fb6..9ec6ba6347 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -3863,17 +3863,17 @@ class SwitchOrderLayer(LayerBase): if reshape is None: self.set_layer_size(input_layer.size) else: - inH = input_layer.height - inW = input_layer.width + in_h = input_layer.height + in_w = input_layer.width if input_layer.has_depth(): - inD = input_layer.depth - inC = input_layer.size / inH / inW / inD - out_dims = [0, inD, inH, inW, inC] + in_d = input_layer.depth + in_c = input_layer.size / in_h / in_w / in_d + out_dims = [0, in_d, in_h, in_w, in_c] size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:]) else: - inC = input_layer.size / inH / inW - out_dims = [0, inH, inW, inC] + in_c = input_layer.size / in_h / in_w + out_dims = [0, in_h, in_w, in_c] size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:]) From 4599aea7c2fe90febb0772cda3ceeb50b1953ce3 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 21 Nov 2017 18:06:31 +0800 
Subject: [PATCH 057/275] polish mkldnn doc --- doc/design/mkldnn/README.MD | 170 +++++++++++++++++++------- doc/design/mkldnn/image/engine.png | Bin 0 -> 36180 bytes doc/design/mkldnn/image/gradients.png | Bin 0 -> 57433 bytes doc/design/mkldnn/image/layers.png | Bin 0 -> 57028 bytes doc/design/mkldnn/image/matrix.png | Bin 0 -> 19755 bytes 5 files changed, 127 insertions(+), 43 deletions(-) create mode 100644 doc/design/mkldnn/image/engine.png create mode 100644 doc/design/mkldnn/image/gradients.png create mode 100644 doc/design/mkldnn/image/layers.png create mode 100644 doc/design/mkldnn/image/matrix.png diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD index ec6d468183..7c863197e7 100644 --- a/doc/design/mkldnn/README.MD +++ b/doc/design/mkldnn/README.MD @@ -1,21 +1,32 @@ # Intel® MKL-DNN on PaddlePaddle: Design Doc -我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle,充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 +我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn) +(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle, +充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。 -我们短期内的基本目标是: +
+
+Figure 1. PaddlePaddle on IA +
-- 完成常用layer的MKL-DNN实现。 +近期目标 + +- 完成常用Layer的MKL-DNN实现。 - 完成常见深度神经网络VGG,GoogLeNet 和 ResNet的MKL-DNN实现。 +目前的优化,主要针对PaddlePaddle在重构之前的代码框架以及V1的API。 +具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。 ## Contents - [Overview](#overview) - [Actions](#actions) - [CMake](#cmake) + - [Matrix](#matrix) - [Layers](#layers) - [Activations](#activations) - - [Weights](#weights) + - [Parameters](#parameters) + - [Gradients](#gradients) - [Unit Tests](#unit-tests) - [Protobuf Messages](#protobuf-messages) - [Python API](#python-api) @@ -26,42 +37,114 @@ ## Overview -我们会把MKL-DNN作为第三方库集成进PaddlePaddle,整体框架图 +我们会把MKL-DNN会作为第三方库集成进PaddlePaddle,与其他第三方库一样,会在编译PaddlePaddle的时候下载并编译MKL-DNN。 + +同时,为了进一步提升PaddlePaddle在基本数学运算的计算速度,我们也将MKLML即(MKL small library\[[1](#references)\]) +作为另一个第三方库集成进PaddlePaddle,它只会包括生成好的动态库和头文件。 +MKLML可以与MKL-DNN共同使用,以此达到最好的性能。 +
-
-Figure 1. PaddlePaddle on IA. +
+Figure 2. PaddlePaddle with MKL Engines
## Actions -我们把集成方案大致分为了如下几个方面。 + +添加的相关文件和目录结构如下: + +```txt +PaddlePaddle/Paddle +├── ... +├── cmake/ +│ ├── external/ +│ │ ├── ... +│ │ ├── mkldnn.cmake +│ │ └── mklml.cmake +└── paddle/ + ├── ... + ├── math/ + │ ├── ... + │ └── MKLDNNMatrix.* + └── gserver/ + ├── ... + ├── layers/ + │ ├── ... + │ └── MKLDNN*Layer.* + ├── activations/ + │ ├── ... + │ └── MKLDNNActivations.* + └── tests/ + ├── ... + ├── MKLDNNTester.* + └── test_MKLDNN.cpp +``` ### CMake -我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关,他是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。 +在`CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN -当打开`WITH_MKL`时,会开启MKLML的功能,作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上,同时会开启MKL-DNN功能。 +- `WITH_MKLML` 控制是否使用MKLML库。 +当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 +- `WITH_MKLDNN` 控制是否使用MKL-DNN。 +当开启`WITH_MKL`时,会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。 -当关闭`WITH_MKL`时,MKLML和MKL-DNN功能会同时关闭。 +### Matrix +目前在PaddlePaddle中数据都是以`nchw`的格式存储,但是在MKL-DNN中的排列方式不止这一种。 +所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。 -所以,我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中。 +
+
+Figure 3. MKLDNNMatrix +
### Layers -所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在 -`paddle/gserver/layers`中,并且文件名都会一以*MKLDNN*开头。 +所有MKL-DNN的Layers都会继承于`MKLDNNLayer`,该类继承于PaddlePaddle的基类`Layer`。 +在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑, +子类只需要使用定义好的接口,实现具体的函数功能即可。 + +
+
+Figure 4. MKLDNNLayer +
+ +每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`的`MKLDNNMatrix`, +分别代表input value, input gradient,output value和output gradient。 +它们会存放MKL-DNN用到的internal memory,同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory)。 +他们主要是当数据格式与PaddlePaddle默认的`nchw`格式不匹配时,用于转换内存的工作。 -所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类,该父类继承于PaddlePaddle的基类`Layer`。 +必要的转换函数也会在`MKLDNNLayer`中提前定义好(具体包括reset input、output的value和grad), +这些函数会根据输入参数重新设置internal和external的memory(当然这两者也可以相等,即表示不需要转换), +每个`MKLDNNlayer`的子类只需要使用internal的memory就可以了,所有external的转换工作都会在reset函数中都准备好。 -在`MKLDNNLayer`中会提供一些必要的接口和函数,并且会写好`forward`和`backward`的基本逻辑。部分函数定义为纯虚函数,子类只需要实现这些函数即可。 +一般来说,每个`MKLDNNLayer`中的`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存, +因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`, +如果不需要external的buffer用于转换,那么internal的buffer也会与它们共享内存。 ### Activations -由于在PaddlePaddle中,激活函数是独立于layer概念的,所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h`和`MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。 +在重构前的PaddlePaddle中,激活函数是独立于`Layer`的概念,并且输入输出都是公用一块内存, +所以添加了对应的`MKLDNNActivation`来实现,方式类似于`MKLDNNLayer`。 + +### Parameters +对于有参数的层,我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer公用一块内存。 +如果存在数据排列格式不一样的情况时,我们会在网络训练之前把格式转换为MKL-DNN希望的格式, +在训练结束的时候再保存为PaddlePaddle的格式,但是整个训练过程中不需要任何转换。 +这样既使得最终保存的参数格式与PaddlePaddle一致,又可以避免不必要的转换。 + +### Gradients +由于MKL-DNN的操作都是直接覆盖的形式,也就是说输出的结果不会在原来的数据上累加, +这样带来的好处就是不需要一直清空memory,节省了不必要的操作。 +但是注意的是,当网络出现分支且在`backward`的时候,需要累加不同Layer传过来的梯度。 +所以在`MKLDNNlayer`中实现了一个merge的方法,此时每个小分支的`Input Gradient` +会先临时保存在`MKLDNNMatrix`中,由分支处的Layer负责求和,并把结果放到当前层的`output_.grad`中。 +所以整体上,在实现每个子类的时候就不需要关心分支的事情了。 -### Weights -由于有些layer是含有参数的,我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。 -同时,由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致,我们会在网络训练的开始和结束时分别转换这个layout,使得最终保存的参数格式与PaddlePaddle一致。 +
+
+Figure 5. Merge Gradients +
### Unit Tests -会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 -测试分为每个layer(或activation)的单元测试和简单网络的整体测试。 +我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 +测试分为每个Layer(或Activation)的单元测试和简单网络的整体测试。 每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 ### Protobuf Messages @@ -80,41 +163,42 @@ if use_mkldnn self.layer_type = mkldnn_* ``` -所有MKL-DNN的layer type会以*mkldnn_*开头,以示区分。 +所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。 -并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py `和`layers.py`里面添加必要的MKL-DNN的接口。 +同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 ### Demos - -会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。 +可能会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。 ### Benchmarking -会添加`benchmark/paddle/image/run_mkldnn.sh`,用于测试使用MKL-DNN之后的性能。 +会添加`benchmark/paddle/image/run_mkldnn.sh`,用于测试和对比,在使用MKL-DNN前后的性能。 ### Others -1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为64。 +1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为4096,具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。 2. 深入PaddlePaddle,寻找有没有其他可以优化的可能,进一步优化。比如可能会用OpenMP改进SGD的更新性能。 ## Design Concerns -为了更好的符合PaddlePaddle的代码风格\[[2](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。 +为了更好的符合PaddlePaddle的代码风格\[[3](#references)\],同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。 我们总结出一些特别需要注意的点: -1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数,我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 -2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 -3. 创建`MKLDNNMatrix`,同时继承`CpuMatrix`和`mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。 -4. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 -5. 
每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表input value, input gradient,output value和output gradient。他们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory),主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时,用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好,每个子类只需要调用定义好的reset buffer函数即可。 -6. 每个`MKLDNNlayer`的resetbuffer相关的函数(包括reset input、output的Value和grad),他们会根据输入参数reset internal和external的memory,当然这两者也可以相等,即表示不需要转换。只需要把握一个原则,每个`MKLDNNlayer`的子类,只需要使用internal的memory就可以了,所有external的转换工作在父类的reset函数中都提前准备好了。 -7. 一般来说,external的memory会尽量与PaddlePaddle中的`value`和`grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_`和`extOutGrad_`)必须分别与`output_.value`和`output_.grad`共享内存,因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`。如果不需要external的buffer用于转换,那么internal的buffer也会与他们共享内存。 -8. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存,同时数据格式就是`nchw`,这样下一个cpu device就能拿到正确的数据。在有cpu device的时候,external的memory的格式始终是`nchw`或者`nc`。 -9. 由于MKL-DNN的输出操作都是覆盖data的,不是在原来的数据上累加,所以当网络出现分支时,在`backward`时会需要merge不同layer的梯度。`MKLDNNlayer`中会实现merge的方法,此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中,由分支处的layer负责求和,并把结果放到这个layer的`output_.grad`中。所以整体上,每个子类并不会需要关心分支的事情,也是在父类都实现好了。 -10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 +1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数, +我们决定使用已有的`deviceId_`变量来区分layer的属性,定义`-2`为`MKLDNNLayer`特有的设备ID。 +2. 重写父类Layer的**init**函数,修改`deviceId_`为`-2`,代表这个layer是用于跑在MKL-DNN的环境下。 +3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。 +包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 +4. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存, +同时数据格式就是`nchw`,这样下一个cpu device就能拿到正确的数据。 +在有cpu device的时候,external的memory的格式始终是`nchw`或者`nc`。 ## References - -1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN") -2. 
[原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 -3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`,所以不存在这个问题),所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 +1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。 +主要包括了深度学习相关的数学原语与操作,一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。 +2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。 +目前在PaddlePaddle中,仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。 +3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。 +但是在PaddlePaddle中,无论是重构前的layer还是重构后的op,都不会想要知道next layer/op的信息。 +4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`,所以不存在这个问题)。 +所以需要引入一个转换方法,并且只需要在必要的时候转换这种格式,才能更好的发挥MKL-DNN的性能。 diff --git a/doc/design/mkldnn/image/engine.png b/doc/design/mkldnn/image/engine.png new file mode 100644 index 0000000000000000000000000000000000000000..65bbb41fbb389ff5f7906b0284ada77ac2dc4ec9 GIT binary patch literal 36180 zcmeFZcT|&G*DvZ;M35qgO6V$z6$l``sVIn2MFB%IA~h5tbP0$+P=TPR^dcLihaNgX ziBhB~giu092t~RfKqz;Gz2E)5=R0HEGtRi@kMG`b$Jm>_Ay3wtYtA*-n)5e*^Lc5Y zr^RuY|M0$j`#7|3-MG7N-+lu4m(0Qp{vyH}SqA>|2lB4g^?d~$f-~S3CcA68*Y@oz zj$+$_9|XU%KDqS(xo;mQ_!cMOk0#gb$NTn0%xK@ZcF)IheyBe|)a3o{HlBYWRAB7) zyz*$aWKgi$!=gW8haRfjo%C(wNm;t_w2@i=yRejv)2N6m%0{x(-M8t?&D%K*OfczTt#12a zf+CZa27Y}!JhFawdvS=88n_eL8Y_7RJf_^r_qL61LuSRE^9xdA`6pEHnu6wYhpQpO zooe)a`kdA70BUn4Yv^36|4L+(`i?31&H$_<=CQ8yxqg}J?jhrl3u^D7gNGZgJZt=3 zbVPlth+e--uV0W1+_*#Ap)Uqb)X|3D0%5(5_DAPyPn~%heXZrs$6Xftm+c|8Hr!|= zmsor^7e95OX3;BfvpcoAV%RT_P7vAoUboyRAW&GpQ_WZ^kNa7^)pq%j!N+ z-4kcu?##Oe(#5#6ASwJW&V`Oaj@yn)6Mkeok<+`rJ0Y@L=`h+VSwFbFL#to3^cf3X z*r22?raWD!inL2b8H1H1h_9UKkB-nLok1Rb-SQ!VKXB>up~S0#qF<0&Vdui&(Q-U7 z+QTBK-#^4yWR7ml<`uf|(`rVx3!bjDoU3UP39Nnk`-fJImauQe)Ag~iv)a)a*f$qi zVxm;nOleAKI1;ZAq-+cdg&{q75Jl 
zo%uCbGSiNzEM4_ptfP5(kD$D1BD=nOW|t{B%l9^4`>NP6nMZ9OE_aK?mA48$D!C}b z2NAM7!>+}Jxp0(Sz4Vl3Xng5s1FDg}D$d~;NA30+xd!Z%-XZad!LC7W?dUTO5_=>0 zcB;}m>Nnb5<${YJ8%`ZVrW^*Ak(PdV`qPmPP#AcRh^;Jv8V(n_j)T$HqNLOSJ zSf)9<+?QjlQw~tOV8b!x#aiUsK8K&UUQ6;e*s`t*FMx5c@B$-jv|8=B%dGn~33+=Q zQT=T|``xYYs%Fr1k5%oKma*!4XxrfjQX{87qx((@*uMG_d_=7hNC>t_&0IlE0lmL7 zfMns>bH82a4H$9iv{|%NOM?916N0Tr+y+7Cz0b_YSh- zP>v6$uDqDx=9Ms?F2HZjE;Om{xFNN`5E)g?F^_dh#ALW7ENd&{Gg}YM#L4SGrZhaB zvJsq?0(WVF@*I`BJ9K%DKjFRR$ab-+-Dxd#$A7P=#e+5T_7Yraieq=JqJAg@j(_Cr z3XeNA&5w5A;z$ot-)PaV7TH+}`yXWXk6R#_xvDCI2OSATKoC8){^*>eZ31%az+oNk zvlD>wVd8rq8Mi_&14(p)RXBb7RPj!~Kq`fIe zD9VtDeBwRexzQRZ;4}$MI2!LNH&*7%gWgI{TBwPd@=UlrP+iF#~op)q-dG&~}Urm+&Xh2!$kqVp| zSMy?OAg%NV^%swGrtSv8cPn+fZiz^5r#l57k!UO)^O<@zaor3?8#+j~8S+hZS{x4- zxe^=e5wIu6oe=nS%K|~YIp-=CLtoWV0inJC4j=2Aqe(nFES|DO7?e!o_kgYi!FBqp z`K8{)6+g-gs1i=05n0|;veBo$OvYHxW_@Th9hq43Zb0m zX{K=tWc`8UV5(_x6mq}g2NYR2)j8^pil&ipVzq=eB@yVX>In=Uw{*o=mc5tiWM(Z| zYkHC0RbaHLGig!Q1?TOe%lZWi)ltq0*(PnCNw#8Wslg!xJ`C#|dpQ~#3?V;`SL!S) z2=i^&!b~Eb{Pae3A6!RSmVd;JY!%>~?C0$`8l04)HcLA2GdxC7GGqATm8`jfy@ER% z>h*|F6Oz>0^)zALaznCy&BXFUD__AZL8LY%oFUrx!=XkRuolBj?3U?Wqb##qO1ljEhIX?M zsv7rC9fhU2I@Mseg=_Rj0`xexus;Th>9~fSq^7Ln8DW`rWtcm}K}u-GYCH7ujJbVi z>c@sBJ7Ukfi34F5tjNZQM5v10&%_^tFl$k)?6kg+E~id-wrp8D9@_TqJ!vJH1pIHt zTmh)dK5_MrVhbl3zv_G7G72fyRgR6zG^kJaoxXY73X)Qq8cq8A_p`N&RUdI{$I)i3 z967SY4r3o-`aW|uk>{=V>%WR3gkp<*8>2}re?L43Te#ArbOc^0i~SQf7&owxmUDtE z$g4f~7!utAqRp&C8y8`%0 z4K^=^)Gj!;U!6LOj)@be9g_FHZM?2{12XIx;DiNI`$`$imF@B;f#u%Y+pu7+wo3(` z!VS>b3dE8#7gDp00v40-ZBMV4;FVO}qg&J}Qptl$EJl>G>=kQK7tUgBV*f*FK=K0~ z*4K(XQ?pl(qWKQmUa340{D~% z*^KGygc&tJH+N3944#HmkVO8|4vmBAO@bo@PlgFl7&+C{4zUxFW^~zP}L1c17sHC+_?@M|=6m0=ZMf!jkDL zWE9>Mw^l)lcSf4;9j(!N+V7cVZ=+8zd*v zWWmyR^8G@*UCKH;9;B(O5nqF|G=)8;pQA+)2CfF@8x`euy~a)+Chp zM}JmoK=o6QGsqvnvG=9;L3YR^J0uYCY7Nya;>mSu1BfAHr)sb2=5%_ZqQDsseyx+X zA(c2$Fj|-S<2%w*k@CZp8~FpOw83uGm1fbQ#Q<8(LXU~n{ID-2x&4{=fr`f@BUXY{ zz{aE)d*Ie`(-3{Jeq?*>NZ_=k|MJ)NrQaGyREa|<+EC(E|5ot|r|WKr{|LaQEWC$s 
ze8+9^OKg(0dpW_-)5W?9zM`{vN6{?>;UVyg`=#LPtDWkLYTLiN+nH>pY@zqZ`|VQa zg~4pPK-#zqJQX7WPyZvV(+pv406}`6?Y+1QN0-N{^wl;pYWz1=#K4)9#RPW*Hm{6V zgh;u@Ii>X;HEmNx;)7kluw zt>p!@TeSI4sf$Tjw=VOeFBqQ?`O|tBBo|D2NA0SaBGU~02~0@&TXP>iGi+9V+9X

|JvuKVG&L2LEIn3UOwIcl1T5!U5}J?Y&k@LyV}ntGxN$ z<2!ic$a?2%3Sv*CZ|j_}sb=c`O{|AS9&ZvEdU!AHOvekTmFs5{%kQ`FY#2$6Daqii zrd{tS5NDVDon|QjcQ5>H6B0LbbXiNje1mJ~Ap~<$a5dEW71OHBu>Wd%`yfY`;ZHC5jGACKRflnkum z^RJu0D9f$B+-T!ZwX)7-3iJ=o_qzy>J6f#Y(s}EC-P^aAo_tDC&4L>`8yZ?m)|eg zcqKectWcgbMk-$Q&w&B;`lQ3jhT#@E{Qc2GM7dPn4WBIvXeN zw0F4Yg6k7l9=7)pHU=aEXSDPFI`~?zz z)-(TW60~jko6*JfDqPY|Bn2)$JZrVthgz(TmFt{?Q5-IQR%?J7Nv*j3+y!Yseg+7T zJ{n#BWPk8^@wgfy`I7QE_Fn7OPVD4w*X8lF`yn+XRQVf97s*jPFk+1xJtvoaR^*(v zl@$9OHfs;>L@x7i&wS@&%}|vHUTs4ra@A-1>=>nhS{A`v5(@L?Z`q%(RGj+x$dAd@ zuon|i)Vf%Sl?{D9LYeHWQH1+McAuFNOpSqgGI_srP};b8d(PZ`hys}xWmnd{K5&Fk zRQ6iMk@sd_oJ=&Kq-r%$zzZ`}0AH*%Q)2H`{8G~~b5@7)2!(^ulr`xrgy=O8n~Fa> z!ro&D)%2?n`kI!0_Lj1=6Qm=Clc1uGTo!a0a=oa=l8}eC&h@Of1_62pV~%G^U*WMQ zYi-eE@H=7f0Fwj1%DZ)a@D$z<|EGuR3-2#_=K~{bz#%bS_tc>`PMXq9GMrnXD!;BY zJV_odb=M zPh;`qlaWmyogDm(EN+}eg*dG~Do`t5ro%SWH0%%1Kf5`qYN~jVF^@zG)kUm#Iu{j0 z6eMA^n*nVX7T&r>QA{l5=WXkd7|FR8-R_nCS_^hQXpj{N#ePOC}bFguFQTHEvSm>~Rlv$LV$4l8W_i zrtt_8zaZ1P{B%ItgY&M+ZYJ{v3~O_}Kyf~8N^L4`Wef9Y8(ARKr3N;s5K-^FVGPMV zE6wXhNNus6essgFkKJSqH27f5DgBJZpLEzrLI(}YY#Us!PwuHE`gxO3Tc z`T9g8d-~>cW9+&rx#g=Ig(-Hka*|H^M$3c12QxEOG(T7%g=4dg4V*&Z7<$sHoHH7Z zxER)|a)(ah)!KQtB)|FBYasiaf5v&Pm-r*)EOB3q*25W(>HG(njxevD^2tZO&u{Kh$myKq&zNItfNhjtOw|`e3X7z?-RPsKx@HR2`{7AV z%v-6b&en301>uSFk)~5AW^SA$n|0r5-_Sk%k(OJ=Y8DTg)iIY5>BZs&A{z=Ni`b@@ zln0iv9w!)ElbF!;=JH48^z58;$;(=Y6hF3me!+a`f=LkfsS?EkMrPmTq!pPkbZPvA zk6$}OwY(m{w&R<%a#2)hK05jLzHc)Qi1{( zustxbGRMnKz1tAKl!}g*#T12EIYA-P18d;p26;Fg((F~bW4^WX!6FOg^;1R%Y=nXm z2bO>2kPLIvisWS>($en{dRi)SvwV(bnM`mZw|99jOhRXfqK2ED_uRHl51|&j94v%U z*DQ2Yg_E?ZeEeap?LQ8`vAzaDlJ9~UvSm+%XmD2E8$nj64O>eNm-x9<#Ju7B3O3pY zV?=zu*RZwhF|XNby{Bb1j*tSQGnrn^ar`b&&whRdN?Dek&@{O*$;(dGz%9b8HPpF+ z2dsCFEv)!Wz2KA080>x5k*~yw;yE`&JiX5b2Wt&s=pIXw7B}j2E|981XMVhAWMz6g z45ZV0D>hdCUd%sRrk@@xB8?;Dx9akVvmw_UW=mv~I%h)WCVDC&73VuD3w zFIHKBU)j>z%0lO#n9rJRU=ub4O;5jQNBt`KdvR$%b1^whb}dTn7c9EW(Qt=Do*9oj z%$CHF4IKraS(xGmxb?&dOZDvz-d-mNP{*bGYZZD~5JzvI8NO4M%=0<2RK@QwTM5FF 
ziw%VI;1NSRBX<{`=K16*!tHt=-qYGgAq_M@|0pgzklMZQD<3_kR9~uAh)_mbGJ%_t zhh@k}^BFyN(FcMKn^$I+c>%S<7kdnV3xw10Lx!lUxO`7ik&IpctM7*4oUd|@qJ+lE zg=`V=YFWj*M&!D1pyVm$T!uYyG1r7PKCXmKtSFzh?qr;fpEl=|#q$#%cDOmB6kKPQ zvpa+E7le>)|3v|)By!gSTOj$>Ll%hGoTFgvNj7<#a#vQbo54jB2jHZuGUt4f8h3hs z(D7c2kMwFvtAZTK^LtwaRXc$wn`Qfi&k+UM#e3{uxqoK?D>x(aSy0478;QBys9y;fav=n5_H96%kKYI^3 zkeoaTH78tyBos5CNV>|YP+eozGDn~3* zsnO}LVZpSrV0+FdC`Bdka#6!KV*7UaVJsHdNiQj!c+Ue)LU!leW^%g`#s)|lx4HEV z$j<+Zjz49Msfw~-Y_x3P{6t6b?vnW`6u;!;hrAu!G&;Z;GoE_P0@uHOga8Ef4`=x%5}0|tW1 zvv>L~e<{IbsvN2mp}5T*92#tr$op$ThD9CX{eI1vuUHHpeA{sptnRgv#q-l4x-%Xc z7EK1(19H#O(1bK@;Q^~)sBD=P4k?uIR_74;a0Br!y}sR__$i=7T_gPiS?xr=a*#c{nw){_iW`9X~qhVdRj+D=#YQlnH4t;oTmTK|g-Ih48YRTfbb;D?*= zO0CKgQ<%XeE+Y?!$3EQKG@dcp?+aCvb$UKJ|fn%Bv)6>0@3~lTL9_KJn9g{oFJ;a4)UzEjD!yo>O1!*A(J*Rh2-B)a3uY z(SXG=Kc)!$GD9f-7Sh5zs1s>jG5@KUuqZ>V5}gM~Ck&D8-Ymz0iAfV(Uu=>P@U_+Jh_-$EjJaj zRudpv$OU=~q9gnqmj@*!5k(o3i?aB8)BI&a9&~ZLTbrznUu2m+8a{K3>)3Q$#S9b@ z`4jJ3mK$p09(ge6l^jX0=>D$*LD^nA)0nY*XjAaEJ@%3l zImUT^&pU&WifyoeE|!}_@uuN$PItiOeIWFHKF`N38u88TT(d69@XN&K6)ZJhkv*95 zb&yxm$?jYAM-<2>vLXz*lw6dalkTwubdo>oq)*TJm`&evzs1Mqgt8YQcRE|wm4(aw zm?<{ryt$=}!-j13?V4}t_`IZs4Pw%`Gc49pol8{c7Qu$Ul${f+qsZl<)^BOvdd)wJ zbDG|v;6l_mbgfEjd5puD>5bx43bG))tgjx)@P>Ee(`&zrB*cs$Q$GZfl z_BZxfcS;Sf6(=Hfd@!2gRZhktBZ|GJgu{I#rY5c}BhAT;mVQ7o@&86y-2d5R zGI<8-zq?HnU?iy?j(d>9R6IKY=OrX79aEdhFf7K;seZFL3!e{({Bi+E#sXTvWxAkdp}O6ilb`%AAaqh6)btksBr)_@ zkNFYbYk-FK9}a=rN+!V|I{@ygVkM-a)*jHt!bH6GO5xuw6*8*kI<&-*t<0L$4&|Ce zSG=!`Q~NYD=s!?LvDHx5>Y2<1jPgPsdou(08v>>NTE1+XPisWN(#XzQ#X@?N+AJtE z4lRIc^Bkk>zwkp#WT>tGWLmeveksu}*ngLbU2B&c+wFQ5swgQ7Z?uGG0E9W>S%}|s zq;)7SIx#!(Rh}(PxeeJ_X_W-1>qM0L!f@d3_QH|Cbq(5hxTFg@OT=$FZ2@o?hMC6A zT&iq%Ui|~(GGK_s>bZjUZ0KsJ6@!KV@RjLd0Ad1g^uc?mj+ez5b|dZ=KofyGzO>k~r&|E-BX%`K8b|M^zG8gM#X zr7yvKZgXXdg~8E@@xh;$WB?AhGnMG-0;=fNkW&G_ikqs#L;`%eZylu{IHGbAKqAhO zI@;xDtff_Bx#1<}M3SsC_jh~8pDu3qX8@rXfMmKv%cu5P2QX`NXSiqTKySB&&BnF0 z$?I~xW>6lS8I*^|I@h5_qRjxu#1n%I;hD>g$6f-|*Vh9es8UL8jjUfHxK@Ld#Gu^@ 
z(Mls5Vqo6+S57TXOeJFRZ@I@<8Xn)(1W+V*2rmcW9w&iay=fY%(d8e98FRpQJ+I%n zHgRi~!8c&$0B#*U^-5(=s3!gn*cLX8OW1i@%j=#f|Fku zw6RD%c&EcZa^{+GF^m_k!#5(;CgL}NAT_5%oOpY4qOem6c}neuw>*IS5ECf0a8JaA{!xHE)5pe*NY@uzasoAj-R2SKc#L!eW5ELE8Db@0Z7S zr#M&JkCL~JS3iKqRY9CyTsYcx+%^Ra2?`t(vEuG#L}6VyeWNpQ_}>(l)!6#2N!ukV z<8){{XS)MMhQ(j}9IO2LzW!PE((A@;y~ahS&tWc+xd#qC&dsju-#sA(==iis4(!r69!lvpdiQS30OP0N=y9;*hlNlPm zRLR)-057jZiX(#0fde3!9*6OPGm(XbURFJ3BHMLUNgn4xh;<7A63+~yhakr?$-_mI zF@rejQ^3-9^3bH@E|nU;e4jfBPao-&ixpYE2=s^ z;Rh^$W&V}lVz*>M>$HDv4@?tw03dJ=eTOXzoS$oR${h>J$0g~FIH1Qi$1l9PItUB6 z1tKe@R1h=2A$@{g7Bb~1s)GFCG?WzuU;m}h4pq5_6j=q4*xJjhB45L*%lYO>SN*bGciX#zkD%97G zBYjg>w;anm@izq4V^O2Djh&#;=AB|4#BP7bS6=nFt)NC!*cfSg>7BJD$cIw>+Y~gb z6wrdZ#gvnsRn8aHF}`G@tQX(c{MFaeaOQUc0(7cGx|4hsmISk-xSbyg2{z?)ivQ$Q z-w=cgB^~k|>2pB{`vNfMef>;u0#fyLhcv^eifxA6!qZm~kq5)^o{^_aiTnCYJW7m6 zto z;ZD=umYFBIpR-c_P;6M3=1Ju{jM0!i!Q#yt%t#!}ovfV#-TOX78Xd5}b`S1K;Z~e| zZ@eB?eCJvSOrJBwRC^wSMBKw4znyiDv)A*B`#1TbvNiZGsjtKzhHE&r6Gn({_DI#< zu;_a2uS#_iD)KV+&I<9JA($jX7u`*{4u^o!LON1R6~)h@ZmT^az9 zgF}RA1TvqF-AdB9C;a}WLibE*p)cP;r&Qsb?S`p$Yg)sTA&DOOi{U_9CH%}60%B9y zoC-bL0DKq+UqC{flzGVx;;}vDhDO)z>#Pug>lUi68}Sq~HNDHd-Y$`KFLG!t^N-ZW zr6^XJ_GK7rte-M_!(TpUogB#NL^NPY{klVqu0g0Pr}S@>g9$N%*T$ z=WY=nDxbJ}eMr-4I{WiWIxYqQJQg5-v-q~JEM^616%*e5z(xSFBLg6gfoni18^>o? 
zy)Qp;Ddiw?BZTuv?%7e+1!?m=bX>V%lqy{rz;;7nX%rn*JwQw{#rm6ZH2l&EXu74O z&oGki%$AvuQC@H|&zY551EyD7rn!*RgPr_`r#y&^4FcPEOTCj=s$cVKHx6iJmP|&G z3wvP9Mp`$`9GeCFEIWKT?Wn$rzg3#kdpw_NNIsc8@2zvQdDczj@Z@od2bLsfKyFtY ztZ+(&Ez?ORpJlNY zr~}+Uw)QTzq7u7?U}RB;?A z$I%7l4yr;s>rmQ>`n*oF;G(_bvya$O?{)W{5NkqpfzpbMH|$yg{s-;mt~ZfEdD+=wHhFT+JaI1Oqs;*YV#$#*1(%tsl~SH zt7bIZ3aWH-aNwifC3aC>I?R=_!T+B8Mr4;{{oYDY7uDg)BbKG+>ZPie-BT^$RBAcX z$G9mH!R^7FJNQ7)qBj~|Sx877$f*FZ6a!(H(Kz2WX79o=-z`B`&jc&W$GB6P@{RFU z#;IoH6KQXMbduztRK>xpj;eMVWx3PJ`M`J++3&RSMWmgCFWUl)BWiM;wK(XlRH zP$%PbO4P%8b^MsovoP7VeI|<@BZa@h>o0^U(#!suAv3o#(x(m4Lk~^k?merE;-uxI z*KcS9`~I%z_==@j_g9M7KYiV6BEB5gi>z539Hc6w8E@eOUSqyBX(+ieT+O*V=Z9Nw z$1CoSIdi$mUL4GK`p!cB)}TX|P1A z$U+IH($$lj=GlXBxeZCm!&pYmo6XHX$i+D^U@@CSsCiVs!6zFlB!{@N39zO(_RS~^ zOAp=jCyPCc%%Nzc)F|H{(kI0SyNs}v`l+Yz_`dV~7T(C8-A^L^7z4N9@1()WYHr5$ zG#1dYbE~NV#6^7)$uPt*+1~Hfu+R3h=TFeiLNKAh{K7M)bfvB|l!Lg)-ulP_vp+Xb zh<8rsM^nB+BS-;19f!^lts5uQJu_ZUe3pU;7#2w0wRc5SNy&bhTy?TfR$MB@Y>hkJ zHuEVU@_PJQHGP&J#Wn}2?v&__1fViI#cwF}Y-(%Q0}vl_I`!&dht+%gsvVa;n2Mx@ z=|Ls$Ov8zJ=!jaK`v*>a-Cng9X(e(OE0Xsh!s9SUvI4VUxZrDdj?m}Ti+7k`fGxgR zwx(<{r}kiOL-X}SWzSNh)I*H3T7Y#^5S2LY%Ue;ZS8kG;dS585UG~a=&QrSMbif0I zhypXU_7%lsF0LfiH~D^DXWxoAF!Ber*qpSg?yh1`%{Ww!$Rf>V|CyPehq={$p}`uF z+;326e|+o%`Zvi>gW*_xq@>d~^fY|N3oK|Q^a>%J+yJr z-acgFhkg?c(E=2OCklnwP(!Ag89Ig0%SrL`GVJNp@kh6nKU(i8}%TppOXWmZ{1 zFsqo`ZwA6Rv|2w)9%`s^932$%=YBQw8=-Y)S-D(_iXDH|k>E*ra@FqkAv2Am>ROSZ zv4hu27f909)8BmsX9G%|2abp?hJ?Xt0tojv?i(Ar^AQTXk{@W7l|0~1sf44iJ!vlf`oeUfd5|xtd9tmqz2R#GFIL%sfC^08M+ zFWsIx!!C0_Ht0bE6p*qAOMnl#tP9!cPTc`t(pRP4qk(MPt76E@%$vX=@6ii5NVXD( zBAx$*GpsORNAG!DuZ#4#1I6sk($H3?gALG9UOjR4-PP1xz{uR|T3R9;sR4w)nx~WR zO{(oud^73ot^wUcfQ~o>I*4ld%0W-a)xm#3l%Tg}qd|Vi!^j)7o;Wc-P7TBiyL8LydSv1O{o<6i#4Tw;RV|~ zm%sAL6S^7wFW$5vZ`b|Hpm(Q)_3M8?l`RQ?nbix3iyIA+wM*A&jRLM7pkrv*_ESv$ zZHQQn#8Gq(%NL*t_1#)J=%JD4c^9Af`5vog*v1)=?z3^tItTcw{{y-K5F9gH1TZBR z0A+GW=go^FohZ=VG&I>7DSlkrf;Xe%s%K&*)CT2AMuph49${@I 
z$#Gb0A=X@!FXC))^iAu6CpvH97^^)7%t4U4|NI{|<`qo1_lF!sc0``J^10biP*tMXczQkHb1ph6A$g6F# z(ca%H4S2k&Ol7hn=g#xaM~t`$v=&{az3j9tk|4EI_cN1UA)NDcqdkVdfLou3`xW1* zDSmCSNZ06-d5z6W#H%V7*h}x;_1YK6^I+6@T^Q+|o8EK=~i!;&oQr&^bAURnZh5KP?1T*bnU`aRtS(DZ&--4bh%l zAyw<$+(jICrxa&D?E5libIQ0fdPBq2zn#DT?H~I~?6;_jwh~X0Ct-ejI4oG_3+M>7 z@oPMrj8#ON74w*6$W~L}@FnfMMl)Y*xQ8%~#Vz?PJ5Q1?IqKRWHE0*C;tW&S&ygMI_2IcbybB2eERbB-^reOMU0M1SyvI@~jw;u&0@Zs&v-umIK`Ds#W z*V_zP_@`$TOK}TVA!5_znwLS}&%XA1S2RP-=3+8rPwlnHfm5o}wk+ zm!D8`$?xVrwSU&i`i!7xo_ohJ@Okh0&7Q()tF3Wf(5{vtJ0`B8dD%x^Ty0S51o{Db z6tq|P9Kz(-V?FP5CA|KhS_21|x|>!$X@H&%TzR2wC|u&Z(|L*SZ9KO3)0JbJVPt2-~C|;`i2!Gac#tPP_5WmiLD=G#}6|| z>>FR&L@+^zn7Przp>uiK3`XlCFu@Cs5_sk<6>vu-o|+SItt|${sk!RZ?O%2bc7G3m zi(x!!MxWnFLTvd8a7^~kuLM=&T)+o^HA;eO$VoFSUcK!eD>DZ~1@e@)pinGq z41#?DZ9_Zk<4C)v6IJ-#f6jSmELRxfblIz`i?0)D3h zaJMUAsYl&^pGz!M=Yz|?hIw4g1kh-x&?D04J}7 zgU5te!4GjjdDPYd{0r??Kx63Q26;-CGy`=F0YAzBiS&%nssI16|5V5c>d_f?k$n%Zrme$Wb@wjkW2*!1gm+pRn6w+3N) zlvKEB_~y<>d3-=feQt%?=2N7ic8wIfqgpMl?O z0&$jTxH#x-nh%5fc-Ow|G2yk^`rw_>6xn0H(BjBoTW`HQnz~*-hJj6I|23%ZVGs^OAGF|e!|`dX0k+M z>GO%bJpnZ#@Cy($?!4r{-WYF3EVCfT^F_bf*&OScb5ti3#V}^5wZB7$Lx$S{jd+d9 zy_viINl^52736};koQP!!I}&^f|Vh4czKx!dTgHCfAvt8<#y|&t4rR9#Ag*gy2-oP1s$Bbt;Cmm{^nI;L zT%;+dXOo$8Gq1Lwcqjc8%0bmwBg#2-^R#$9*M!`FNt6ItSlJf!XJGEUoj`4NpKWr} zH)#INWKD=|GIc-no_QNx;;N@AmhG{Kgl1KxJuc=zYnPPn1(FO-sA{~Agi$pSIy+XW zGxOM?J1M+!trJ>X=Pxu`l|m5sa&787nfXS?R5V81EX>@*ZEc;QXQsfzFH@>cq=cn%If}eyv|d__E?#+258emy~!Ux8QCcU0+C`z7y`QVg2>l{@7cJ?qx%I~pSb<} z(@^p?))`q}JxJuKv%Cu9EhmL0(A`dTI8WiR;e&lIbP5JOKVg3)@QX*iJEhW9E#{c$ z0bh|y$@#XqNV3No1YKcd^{}ErgTn_|XiYVrNHFNs+UfLKDM69@C##hv@ks#2t(G9o zBZuJibDJQ!bW5MzyofIpOPvxdX&3PgO zs>JUP9TKh|*x59VP-UhB0Z-07x0)30#WoiEq)~eWa9B?pUh-B;Lx0CtOh}X7hRfk% z@_1KC&6=d@bqO~+sB#I-0@2`1)&u4-MoV7QO+##hS`WD>H-cn;RpP!$*M%zp4B zmzurFNcfmVL7vg#Qy9bIkfNOnO+|Q=x)(>XfPJYV4cqjFzh+@YC0FtUTDXQJmz_p2 z5F9brn9uFpP1`}Q>GnWm`xL+==sZ zOC-a4OlVkcj8cg-wZ}s!lwZjf*QoT$KqU68lDJR|+z0Q6;Y_A^hIrprND}y&IRH#X 
z!ONCKS`aeLc`FP-c__}W-8J6NYVA_*yFcDV%sCg}dEwOJgrgx+6yI1x4u#^&IGHeK z*?YMo)42ciJN2vP*-y;l+fDI3dLMD@j=CI!*zgS^ebz~uy;GhdIC<02GKF8+F4Hn_ zYQO|ikmfvnbCozZE(g#H)*8Xp9NLg4Q!Tm&PZ^eoLIPAK`@l78BUMrZ@N=O%kOE%! zT-Swr7It&vhP@z!dZ1>4;A7AOvs{A5|I{O!waz)ouy-0|{h`Bp#q$j47M7V+Y@`<9 z2F>Y*39@C&k3Qf88Ea!u0smUt$c663MleKa6x5s+|411jBFJ!-EIu@Ap81oR*QTyQ zIES9^p~aS?n-Wvh_a2w7^llI1HlO=4Fe8>aR8xdCpf9*dfoVHw7m@=xzH5wena8@j zS=fuq_e%8kRMD`#HTY5a$?CoaH%hgR@~rYBR1Zu)j^ExabJ+W36K(-9w^7kLVGe|v zR$kajFj6KrHR*`MVcz{lLDLp!1)ltHNPdN7AM!^( zvW?^;r6PwOoAFVj%sx%5{)m4j==a046Uayg7m*STZWMR%c>2g11tBn28TE*MwXPDL9RzaDs zPzo*j8E}MAyycWY|3V_ZG+y;D=R;>DZhCDIXP;6sm{UG>q*@e6k_RjB1_eR(v!2{o z-qU44o@+v>K*)wcM1UL>Pj#D&D~}Pln;MLzeE(~;D0{Xjt)b3rS*AyXq7%5s=YKM_ z$(~JWuY;E0&CP$(T|^ZqbX(7Ui?yeWss(x62N{7(CI#4gqsJU(Z&8lkx4II>FILW~ zD_&!6Qc>kvw>I@-NwN1Pk#_*=zc65&{ z3U*peQd>{o^o>?Iu;y{C(E```uZ#4b`~u?rj1p)~wvbQ{OfSq1l9Gw1%bXa7%_>#e z>O64Kg9?;@vm%jx)PI(3fKA=T0L_5a%d4isfHP4QuTZ7>Cfy2v`YsC3 zZJtJ#<6=}L84-ZQIpkLqQdg0$q6$i?SRTI~s~miRcX?pAc&Rp$A?`0px=UG2PRcIdz(=%Wr#H}YOZ@UI!MBNwG*iD7=1>BA5Ho)_ci zl=-{9sSUXByt}l?pnPv%OLJbo(l_PFhURT|x8g$HdVIKvv2ugOS{_A zRF-8=$Gp>@;W5x^dT=Dl8`1A9atY`}MoWkz%NaY_0g6*6aIG!oJto~6spGakGrbtc zXhbzuDdEee87*&idsC_6Rk*10?u?^?tqIfg$Vq*6Lf)I{2HhGWY|=TG^G9YV zeQH)&eparjW5$1;zuJ;H88A7I?CAMXk%rJk6?b+P<11sz3cPE~-y-Pl92|O&HwHA^F54r_1J`3GOk-Q` z_r|;J!fH>xq&DHORMTa;Id&oz$86qbTxYu~(^Q~#q!F|Im-N1Y{k+0qTS*ZVLlpNF zPXkIE;{x)N?!@xrXV15q_JsHJeHG8+eU?3tDRsHt*=@OvH!ByM05SZPSwwz`XJA{X zJx8TJxNHv0y7Dfd_3y9y^V1 z4}He@!7Iu7>p|3cmUM;`Rr4>cb4o#bSU!H=8J4n`#Ps90=mkfTkVe5g6tUI{(%Z;T zys^)~*@tPkBgn$=UuGi@H6&u-{rq>$=GyFA`-)wt2R)UIHrdK$ar%EM7#q>kx;Y7D zsBdgx=gZY8ZfdsRfWz?ElV=5-NB6gAn@jpQg|iuPzVW3SX-R%knq^@oJBceFpU*^< zEUxnpbabq2wLjEb?I#{lPCtjKV8UN!zuCJ7i(paImh`gs)$m!zCm z+V*$8YeLjNzB(2LhIs(0ju|$fL~&_0D~RHsYpTTb*D(1#qU%%RdA6`tZM&BM9gl%S zNVChRa6=}zR;VZY|L3yg37xPpM)y9@5C=nT%>Q>UPsq5G61cOG+^_@~ybeHtm;N{3 zqyXb-fTq`wL*Np#A!DFcP+avFF!TTL-3o|?YzcUr;V=sU%l6v|dZ;yre&a<>)==)?|Bi4e)&zEWx`G+OLWqD9;i`Gu{>D 
z30^s7NMpPatq{C`VTy1nN^Nu6wu6~9xsm$?-zf%qk7Zr_pv#A~4;}O2#RP>hFn3+R z0&fF2m)q(C9AGXw^P+seO7|!=oolChHJq;IMy(D zWksSPe2Ps*DDud~dfM>Q#5u8E{etwzy*C@6$9WwZp8Q|!eR(+4-}`rqqKqxojD1U# zVur|AD_KgBXtOpX>llnRhDbwbwAl)UvNlO!QW`P#Wki-TGWM}#9otx+Gd`cs_xs!a z{r&M=&*SQHX_j;DbD#4*@AtW1_x*aMHP$$D${hS}naq}gSw`>x`0$SAmRZFfHr9ZR zEmeB?=-=TjwH^1~1H!q0r9U5AJhF_cA{n1Nyw?wX^wv(@7m-_!Qtu@@_}t^x+>^qs z{ZNvhiP8{W1i%kQ_?&%cUl`}1RB7tRBbG}YQ|cww(=f=*2H4J>!3p91%}M8LPDBQW zQ8B+kX0TW4LwM7-oj(*{H`qjBwPj*;MqIQx%OJHO=7qMgsEK1gq{V~&6gI!*_#kKA z3Sf*c&Vc0M_Fp6q0^EviWs|iaxw-5>-TmhU0vR$ll~vdb4;tDjPGQV<4J6W3!SAZV z;m?Mbh9@-?sT?SSqp)8Ofmh$^!w9YYv2=ad?Hf~+Z#yR>Ed--(oTS}6QRO!Ii@zo8 z&eWB{l-Dfv$N>#wkQ)3%yZ#l(eC@k?Ix>KBiVc1gOL~1osGVmMTyU#0RY$kMj*Nsi z%e~t9Frny0u*oi}s%LLlu5r}M!NpFT743VlXk!56({AtMw_JBZoynIUo@(5-qwC@s z#jZ<=on;RbXEi+>gc5*1>Xdi#MYtK9`nfyKOY0p`ZOmtgDOZKO+EQ^xQ92?1gk6dq z1mb!8FDgHQ2xiVq1}do`lVV+jmAu_90~$*;lsk1fJKbmR31RDRvS>M~jHj(vh*w$| zmhg&0kvHt+V6c1YQ)jgCk798_g68R@m=j$F?;Z!wut^hC-ldjN*l2|wKVd>aM*MAz z+p#!>IB>++7O&$wjGKkANb;WA3`k3Hqh>+RJ?Uk*)qI%}`0FBJNoZzcX8Rr|Jk@SZqGy~lfp^;I_C zzeh?)>iZLX#3jgU!&A}A#LJ_)Za2`yyI@4_4V3!PEaS7VEHi6}(M*}^h<(Mb2vcol2Q+vt*>qdD z>CW@d4^u^`QdC*$5uG9mvZl0VS9pw`PgP*=k#ZS%tgW|~)GaUDxnjGt#8rXXlj0QQ z9L{P2B!ko{QBPAdfmP)LyIfN4`Fvj!VMwjZh#~uvSBIc)7ODxaL-`JQrp-)27#GF2 zMZ?P)wfe2Wxlg14l~nVnEztKuIAh0Yr$+sVY0A)Q-?FVbS%NqCTFUD&b%sYMEZ=!!m)~T3zt~>ro4S^&xfI}Qsh$HQciL>P-W(B zr(h7uWncnJcqxdB;nYL}#F^@R8w29#+s#4U>3E;>%f0%F{MMo{G`Q00raP+w>-Do1 zuHg26j#1fRazf)ka3hDC8k9;ZIZ~1)=g7h%rLZksxGIlt8S`p z$5oqxNij?@h0K3v%vLgA$xtFt`;hf1Pg9GiMhE0H3ev2geCHbbq${e!t~JJ9m_46N zNMs6F9~vmTH6Km#FU=|Hly%WrL9S@7Xj4Cqrc8WMiQUVr(9F8(N%h5~5*m;&)kdPnvFO*XIBu%iW>76G(1CVk4E` zxZu#;23g-WkxI!1r{$WAlrgN5e|NP~u60GaT+(sBLb)W`ckW3|2k@X>ircN$dw3K+ zTj=3{^fxR0CWoP0FNTCZeem0I@NYhjl>1G@fu9}{2TjZZ zn@mXN6Y2rCzUG+7*N#0PtKbFlO4(#Xnmdo`A`<^4~9z4N5RiJQC!FcouG zkD-#d^>`>`qxb}(MD*O6=v<({uoiZ5as& z(i-DUi&R3R;Z18t6d|w72~6e6_ez)F41Z@^=kGMf-Nr>>?Q+R~nt0+JdE1*fnR|d; z>9vzIL`CBODv9vfclJ?HEZ_n~tArk{^bAgtj6`WLxF}=?nd1t*sp<60%|?_uwL;_m 
zBkcQ*dSl^hc6SY6hhv2CZw`RdjKj*4>7Pc8$@}nQA~KL#?zXFPN-oyg3;;qj015^Q z_y`mEb9_klA)CSk(29X*-}P=KKm;cUK7s-tS!eU5{a^A%>I8i4_p~2Kp|D$8xi)@Y zmyVkbnskf=j5%Ej%byi(-IbuU@h@V!%N6B`K?4t-l3Gi0(A~(9uS;axz)a^KRv27| zAHiy3+AJlDG_%zfx1#GYirYJey?P{o`o5{3q^3FE5?NFaObi$FD zm4Ui07&}Ri`=}|JrJB1S9=QRss-K|(%&(Qe3&5=PvnST_BJ6g%vTK9YiX>N!k>Y8D z>kD}B@Xru)oJ>w6gc61$K(|@Y$s;8r!;ziZau9|uuUL#cWhUz#xaQamfb|er{2bo2 zDHFTUe1qt{?eHOkW*!RST7t0f*xO0;@sBY%>EDFCJ<<1F}KHGxrSW!}N2$Lu;5OA>zRgOExJMgE8E_@rKK z31$dq0vlmyY4kk58vftx$%8jd+jmp02e|$|>FNrjN+gLq4;8VbSziW?Kq`YQQk{Og zx|IdIvSfZ>;QK}EZur}Xo8kK}+We|yXa0idSOTw`FG}6?hV!lx{?qWdXoHJ;C>(UI zNIi4T4_AW{_p7Qx7@n4fx2@34zfff+=6APdl7+V4mpzpfu9kTnuf zL|#;s9S3zD6}ov-L8AUgH+29`Gsh|9rkOD!%GmkOwc2@<;<@qu+5~Y4_TVjM0el~| z!-_YAh%TBh_CLZ7O}8#pkkAzPz^>hK7t?z8Il_9| z(PberblpXOoR;M(SS4Kpw#=~i8=@M7QA$L#B_+mfOVq!}L#f+7-7u?N7_52_?tdVj zb~0kophHr(vr|i%%Dp{=SQK2p*W0gO zqQ`0FZ}HSStL-JqM=um^vNY4I8cp_4kgu8QRop!P^%@$7TkE-6q`HY@jNnm2CGVRx zCSNJ4?1z&Ms2;>KgIxjZUUDwFSxJ4a(Brr!a12SQ)8)U;lNkQ-y#pQ44gTVbFazG8 z1*>Cl)tep)c^x_9|g9+TJHN^?=Lz#IQ|>Ew_6P}caB zKZc2$tvE)xxSQff13YSyn<2)KsAm1Ny#0{s3!|Ps8H%$!u)SE-@ylrt7P*Ux;ldw0Foe{=QM2+nJ^noq6Kc4N-2 znPzo8p*u_E&H&N$=fAG=X~Q4;EG{PspB*yi7`&Ei)aM^qxN z9_xaVvkr;{Pi*+Kft}ncV|2)h#MiH5KJoTG$`DTjw5cPrDRXBYdIt38E}8a_b{CPs zhE)}aSV(E04T*Yy9$Zt4;eYBp0WUDP+s^D5<YMr<*^&N!}Zu@*ZufV(!(_gZ~cPeKBjms$6Q-syTrQ&ME z@yX}+tEvs(y=2{znGP5IWM|=a_6Pi}4T!#hXMpLuH-(GattQR%QY}MPNq(JR35+LE zJbB}3|RAU9(Vhw{Zcm45d2EmiTOQ+06W{2xW`D_3*R@A^o$C7 z3;D!q*0X}66J13tCmp^<6u(J3X{<2<7iE6SJu=-^G;~vAez^+!a{LQ= zJn%889{Z-a=}``PN~EhWVD_8Wpud)3ZooT2GYc?nVP9WaNU92Ky)*m4g7)22Vw2%d z)465>m?YRHC2*;mSvOfVy34rcm=BiZIPg+Ri9WaWQsw)Lih0E&k4UK>rJw9J8Kyft zC<$9NsQrB{Vy+X%H+-GqsUt@U)WH|u@9(*q)`)Al7D9(=WNWo#jw2qpisj38;{&{C z*Lp%nwR%CAlkN56lih&}4}>k9ViPX;o|TFGw6-c`Mx;7ln0jkT;;TnLG$GIhTt8`# zU=4~5Ft7vB4NN^IZqJ88>alK<4qAe<6)zXZqE>Vhx^&XqAZZC0F>{$Q)D9O_dbCXH zRa+`k=SHcg<*$iUJYoa|+sJfuUrTu4P=Omv>R2r?2odPexca4p2g%iCc8mHY0MY!p zrGcyPZ%24v7aoT*qQ@uNKRq%m)`;tQcgN~wPOg~$!PE}5rDpUSOBt0?nS2R+#$dFo 
zSb-?qa)MA|zEEOA!(Z1;^@$igsxd$&tnm}T&EaNIV@%9i*_bAYSz_7pwUnXXd)p1? zkWo^K(c$BYAPRJ@?I`O_Yq{4s+rYT2;BHV%=y-=7EQ^u+4Xtr8KSud%+4$%0s~}-+ z7rB5E%k7?DV{pzjyJpu5dbP%?2L506C1~cSyZVNg5{Yo3jB&(^eU=&Hvu8bmzM{KI zhF|sWZIj}B&^Z{$RV$2VI&>}tC)tk%Ry#Zc(9{HA`zW zh<9YMgDEAkw6+TO(fZ42$6wkJ-7ZDdKAUizSY$JCtb)5{+cSc{Jj9&13ce{BTS&;2 zb#<*x9j3bG)Kq%X5ogPGS&|gZ>u5wzA5`bZBZMybW&Pq6Zp!CX1 zC~i*8s-**A4sJpozd^yoyAq|Xz86K~z&f&_9hCC_lbTqF`}xl@8jfM45d8lP^j_n* zIn`KaXvZ0gRvh12Nq<_@mQwFPR!bghR&Yp^G6b9=|xbrdx{Z52E-i~gAVE&>4eActG9ayd} zCC8`emP>3xrNu^g*}dTOv~Cx|-@`Su&kD}a;f%7;XhUm=lt7Cw_Qh^H-RD@&ER>Ca z+j)-eI&DktEpemG4Z;^ny{?M7!FI=Kb}V^(+do^DQG)x!HE3quT{yv%bxp)3x&9W< z7*`vxmYSK{6pbD9*Q=GxOVj&VjAnV&mFrIYHF*C=n$78t)7F)S#a7TLrcS2_H$i^gF- zOsWAFp7vA?)5`M2NwEy{1ZxVu*Vtk!m= z5q1TN=!qdxccvLc(bFT$Y7!(C?L zRtYVqAJFvy3bbP~3L6s|8GfCYN=sWZqp>%|I`;54%-@BAC4_Yb^Ij3wxzYj{QrJ7? zuAx&GYHPJRyC2M9Z3(>Ubn%Rm$^EA}`Na^`epsib$+w}dPTGew^3fW?-TbQiHC9R< z>%ygJzTiV-kEk!LG^=NHPuJf2x=u$q4Kq&_zEv#%jOzv4mQ(M9~T<3 z!Kg)#7D_yQ`4TYLS@6cVeO3gCW(YE$dad^!o!!KuP-*ilKZFFXeiyZhUfX@n( z8e@DP^?vF|*RC{O+HO=+uErfrwoybkbMP&J=m}h=CFA=(v|dOn!otb%{8LmwmR#EM zia^RG`l@Y^Smcw5RXRo~)7%MW%bzNgDp8T;R~3O!tn1NfAJeZbZP!P3Kh?aoAt94R zTx~)uqIpZl7w77JU6RZuTJXg7$Svmkg^N0bUzp->GZaNTiLsWz3Ki)uz^Jx{emUj7EFvbbT{=5wPaZ)05LeOwU6FfcYM<&hI)*aIp9d zEvYVjw)EAOGh=dV<{iDXbGmMK*Z8d%yyL5YYq1a9>&_-Q9}nUSw8$^KZ!N!qP&6!C zE9Nj0i~M2nuD{mlMr2 zwwUlsy#?#E(95j661z3dNW~CXX*5ZH3{dNh9-_R@vbQ|b z{{6(gTt+_G9+p-Qe=8=0_bnPgUyE+-*D8O?e}R$0xSaL|8RnFXx^6!FsF77?lD}rfI5q*EcvL3$0TQ3(adM(k=zc%A6aJhc8ODWc@g*V6Fq1ehszFY zSc>7>tX};1rW=2C_4;%?BT=G91u_veTjI`JrFTWQC^H0)YJO}O85xcBeZY6kGd;LcdZMJrL{kz! zp-V6gRLn5@>Iv(10NqhfREHMs>wN0 zT{1Tf3gSpER=lM|9H04Of670HVUVVl#q~@TVuDchTtqv#&#Q7acTN`C=51}%>)D;( zx#o~yyWfi390Q)fp**8)b~=$C^5qS~lXd%Q%D)8P^{_+zbOG)>2YUVvdS19+aU_9@ zGHX+T)90V!PTp$o7K~cnMFJGYsqE@KkQDIa(j~EvZhb>K`@<6XN~^uw;~_t^H`iu@ zf0qTbH_X*9xIn?%WH&tV9y3mcLtW#B6GGMA?IIDur zlc`YvrThPQTtEn1!2;KFJ0~^d|Gg~Y&TdR#eJwsUWOfm0zL$XO<-n1Hq70FEu-4%i z@w-#`=1N7p*2g1RvPB84r`akje|bIltUVcCJ?fc8Gv>P8i%H9FY+-F^fVkxo6Oc3? 
z=?K%24nC#GIgp+w(fcDAWD@*i$79r#=el{5*&5-g^HKGAJvfMf3w^A1Zhni_$qpMY8pJio8K2Nn@b%^P%RG zpau}g4}coL!G|Z+P|IxYEH;l0#&_lkP_xQZ%115F4L3T;L4LgFq7Y7<%Z?$aNA`o_ zyj3d@E^gF1+F_aanl&zeC88hDk=>Gs><8*(JX7gkuKhpxT*Q*z| z_{&LzV<5n+0omZ)G>c(FEZPh0<}WJ5Y=dhw5h-q=o2zs_DDVqF1QLBr&dU*>cAuLj zvT+bkLTe#;zRegkzK2|9tj7sDSWbBGuLZm@sDU_*bSCtylk|~_j?b&Ak2u6OMTilc>G3N z(8Pe0i%{&m@=?IczlZFFIz|6$C~glgd-V!L2+0rFg%JE*M!(%im}y#18aSJj^^}v5 z{W`lG`8Ns%TSlFiRMHx>fl&*RqzfBQp1Wif*}rVCN5fVFby-7BGgXDh$zW9dNMWvn&b1Q z=Or`AA9HA3gt3dk$=Ji-X7KFPEiSjRw@7*}hCiZKSL2X(Hs#N2-`)?7oJrY6k)*ohOej zTpzKP-H(;%EtV4fQE?ASeA2rylG3-f4fJ?180XAe8TfAIG`mMed^2iVZ1K-5@(!xWq+3p=vobggkEBcp=r zVE^?tKlMoUMD^@}cyA6A^($9ptAL>-LBQVjVr^-%;p>5T^?u7XQZ;g4=9LuP6vL(; z*_l~S3DC?Ge;=wq#2|fCZmbF^Cj&DUu4PpQQdmzpadoeZflx<2{6gXES_8@NGmH4D zhjspWBvy`^IU0qm9jEDAi%LVV9&^}Oqe9VCwWai(jEyi8o6F_FWX}yx@!2y)1dD1U zEKGgH$Wx^8F0XcF;cJu^xeA%k@=-+q@4~^85hj2yJq53_p}pF?2HWMf_@4^BeyX-sh zjC%)^%U1QrjUQerdGU0Q0krKo<`6at8;y;@#+n_~7|~F!-X*OuI2v#RPZdeY-fi!q znWBm0HU9f?xlMUog&Aw&d|-Y%M5J{3+NIUNYy=Mc+Xi9%3+Wnz@5nZ!X(AC zpf+-J?G>RYRXvpLadnbm8W>*wRx~`L{QQ?JK4~-SyODiX(NI?Xo+HUyoV-NaANa zd+pfj6t*=^^3c=gU?+p$ZPo9)tML#|Kyya>sp1vFYyj zyyphh?G8;|o4*$PMhXYRi7TBNqm3@vOUiQ&e8r>G8fwM@T=*Ym7J`(VBFj+nJI zhix48iU&Dt6d|(-`!?ssldt{gsR^vtR84uYR(zif{kDe^ zd$tv(aZAHSxRH*<`1U8Bu`s_!w)}8 zJTzAKJ+d_su`7`rcdE%cWOtao&$1*cDNy*3w}u^_=ps6{4wxv>exN}wP}S~ki(r~S zGRw=(5cou@+M|*_R8SEnYXJ`EBuHr9Y1U^n~beb}yT2~kJU%xsdmEg2PdL)qm* zyktIgPZb;cyB@ePBZ^8gNivfk@%vr}sxI??g&AAcr26-kvOL95iI~K`aQ>IBP8Lbm znwmZi?0g$5Ef|3*9ZvYWhj|po0J{)UWu{@@ikp0W(2O~pH~I2sC+;I>B&p+A;(@t?Clu;CVTuI8!I0fRS^q^H_dbH zKx9_M9%QQRW5_6(V92ZyHg{n)Nm231A+dX#te2Z(9)OSZV$Cghl)r({a{BaO{#lma zN3`x)e*jM9DM3^)h)VchR{w>guS-?Ju@N#5!C0Z=x7(ArcQe2<6cAnkf=~3%pNyXP zK-h$WY-y+$8*vqqyMtluR`q-U@3`YiC7g@0rbzyEpZS=Z3;LvUm%)?Q9|=0KVR+GiUlQ1aP!lDzu+-Iuiv z-}UpCos~cq6J-WMU!QjRBGRm41-D-K;ZqC6X2Ar;zKK9lgb4ZpaH2-Xm zONZ{%3LQ{4cc_hGK&o+;q?ZT5t>a_BBSeK5@>}tZa-JQXSHwD(A@7BtDsVmw0`(4g|=k}#%aV#b1-6tjF>chqN ze$oO2fUT{Tid7C(1p>hnWM=B(2R(oeDcl+eqOaZwQ>fmYn55NJ 
zl=ub#%$TPPk0Rbh1?(^o$mBvIo%h{_^~)SGQ=K6noIKbMEv67FG@>N|-N}`<9u3Fv9$GDi4J?e5TiLV)2JP>}r7}$Gg(gh)TD2 zo!$c9mr1^wwC4V@15KGD5$Xv<)sq};fcVOH=+&>ijkLN+%7}70s~q+lAKlP0*wa~;-0G<#aU+H37V6!&>d^EZicAQ z;POXzy2X?9T6tY^p{S~C6v;#PI3BeSEzF&c^EPTq|5W_vIxGaFYItO)1!R1^(4NAd zNV)Y#4vjuelUqDK?U2voBn#mKRbsm!l$~5!0tq#};?sh~1@Krc&VaYS5w-EYBc~eW zv49&0(P2*;EGo(=984=;&seaR<7``7o+tW`A{v%jPM1y3o`6g=J&@k7|K07P>8RlL z#9Y4FP*))40V6!V5PiuS!)Jy)Bid*%JVTv-&^dR7H?B052r2>IsDLn=BJ@f@Zp}no{GLTa3JSI)d}*uf^&o#nK*BagSU8mj zK@1ac4Cy@#5zLWE`H)t~YWT|+v)D93g-+M-$KVUFD>@1A6uX=4P&Fd%CW~u*f7~w8 zg0?uKQe0x(L4-A1@Wc9g+F)At;H9;)DnG#N_|ob)YnSMR$MH-o+k*yjr1eum8Cek@ z3h$njgUIAzSe=}`bIx80e~Qn3pVTx#w$EkkDIu)OE_Mppj8hJMI0x zAd`Znc!tcH8Ae*sdO0cY@968xN1rM4 zvTy#9521!RyK8&V)x~S~;h8vBO>dDsPg&KciR8yCWg!G7Eog?fe6!S-0>9tXcBuLv zO#4r6DKS;b%-x*a{*SV*9`bIx%Fc?f2$y+^!XMbq`U;>;#>1jzOr%AECAMdl{Wms~F7uRF|bS)Lhi zVW65z3K7{DQ1+p0L)}12ZeT+z&i=*}Uz#avm+5bli2#$|CLz6@W@E9nF9v2Yzu#d& zv7n@@>$3ZU=ldFJcJyBQ?kZ%d6Mj`;W*;GQ+}SZck(l|qPUa<%wt99!j8y1wKy~<& zOrD}d?St`ET<^wr#IzfB1J@fuFEIr-hOZgk{K@-Z_Cl2xK(UM<_%?}&5l?@?^z*e) zP+zDII~oGTl$#?zIR+-f^tIwp>bm$FAzIideOp~s2I4I~4TCKaoam=_PX5ue;BZZ~ z!uM}CGp9HiTkzaCjzb-R>>?(RTa9CJeaEM)vXv|l(LG&R=yFHtf4Lav9O7#y>BvFy z4UQDgOjoO2pBwf3$2u;t*)O7}EW;GzDu+RZct6+i{h2r-sdn}3@FJmpRj}OQ1)|O9 z%iNrlK^$`uX;jkv2!j#(f-dUTy*-L(LjR%p57m}Vx!LN zlh`z@#(B2AxYL+uKqu>dtd9qnnNf zzGmI?M|9kB4vp+ik$4&q1}aXI1>-5anXk-Ot%!_}hyT-U#doWhbXX5`U#BZ|r$(C;LgL}-ib z8`-t}=`x!-JCx!LTfHy6!`x`uLt#0&j?+5Liofc4ziKF#*lQ~A%&kv?`w45&M$dlK zZSzs^%Wgs;u1@;YlrTP!cQ&{`@W6!om9O-k$tq`^%nfjnD0%pJ2;n)0T3gA}<6?rC zH%qvHV~tFVl}rgX+|eV*=30@9>jKXBY>|`mQGJfGk6IU{L+=uM)eaglxp!=UOsT`$ z483{)q=C(Fm6wK}9eyQ@Q6*)a1Q$+V-j0qtZeODFCdHoYyz=#U_zhmuA_bs7a?EIi zmK{Mlg72ow(X&e&+$BD&B;8M6xTf`v&J41H72xK&f6CJZT z+PHpjY@{RG9nc0kBdkrLqbW$1_?lsvF24g3_V^_>0DTdiBm#1eglIzCN6m@N3Txc`a+3lnmtWmTcB~dqVB;%%t`Y~+J3#s>8I1? 
zG(-vC4GW&fdJ`>{N@{uO>n}d1#@0TR-o{)W;oGr@!$wiXzz9T5gP@ywq*3vnhePJ_dn@MX8OlpJ< z<0)8urYGN6g+RTW)45E%DO7!DZu$maW3Atj&aRW^(`c=qwTEhx#)D!rEYZAyFU*L; z>@=3I8C+ruB`c>tlg%pmQHjEzzxr+l)2~_q7)$v;qd~EU6Fp<@193)mYgiOoQsE*A zGHK}HpfD3DD!r$eRZ=*(KdT7moSaLm_x`e{Z+3ZZ4Sl=-TMyUTo_;*;?Bap#IR|dB zYTBX5^ntitL-c+wP4(K3j`bnJeYQF>%S0Idi z*SWw$VI^Jq5`kLOj@xVgZ-a?6tkJ`9k$##ighBd|OYRfw=;}=n&9NjgJYgNs6kxF0 z5ZLTU2!}~b{Zj5^&MNJsucU68mJX{VeZk40v4d5$u$W8BwNH&$Uk;rRIn2H)ouWug zX5FFm`(`!io`To$tBhBa&&xJq)#!Sxk(+?1OYQeN(vLp6Pb?m^<*M4Kk#{S~Bpl?E z>Jxa2{WiSuR>2_JY6e38^IgNgM1| zgT(;(x}z_k0c_3{*k6%REb6z1QYZCrvYqugYTOND%xgiU31s$~`5rnkq{iNrUkdi( zFSvxpgGKe22CH`1XKtrH6;YRSHA>Vnv_>OyIVvPfwM<`4@0&uxHZ)a;OytW>yF)K51LrjgR0@ueNz#zo(`@X|aWA_;T z3swV3CE_njY4vl9_%u3u&w4Ja6Hqcunkv=>=iB_JhAG+PneU~bjPn>mKVxCLb}D$@ zIF~OdGfx~-(^&c&)Uk|Tt1!(rdJ7=3t#fhy6Gp10zJ2^ZZ$DhyUXhMbY%Tp8=WBBv zIQ2%11JzIxFHrDk2&`Xs7DyqEC{dbrH_FamtLNoSaqXmA9 z`12Ah1kjC{Mxdy*A|9XE65|?{5vbjM(hyw$Mr_HvjDd-0+q-bTpDl7Nhz~4ox<3GP zXx4z5&A+23ud;1?tBp@6DO!;kF9(GCw#tI@|r!F*ll9NAHE ze;_?le0-{mNuA1i;W56!2;E>tQdQj7{`_~Lx#@vM~RSDBoqg8#C zQ_)2z#UVIAl6)?FOdusPIAc*Ml+}2czqY3zY8!z1c&wkr76U5FyB)m?OOQRMkw2P0 z`f-$*v}kB&z@LT5jC0_ZH+`&b1lw4pZ<1N_;)01|v4L)jK*YTVVz(9A1tHx>jS3DU zTq@E{fptZ)iV%)LAk)18m-yfiI3mCdk1wsUwg5x;tL-nNl+nh~ANH=2J5NCxl+{Jd z4=oA<6_6t-C#Zm)GjrgcCWpPBq(KI(^YCv?VcBMAK`atP@IRe8%YXRKTtkv1=*3Fk z8pc&^eTb`mmPfGcy5sp>4EcgQVE@GE#3+f~gT3WVj(!`%`?Pe@6xpwW(M+RfnPzRlxPn+2F6?PZE~X-ngygnO0Bu0?kBQLL1?(s_`uZT6cz7lE zuNFst)TP`q4Tpc+UzbTE>@^24nU=-{hDeyYLB?oKawAN}?9l4ffy5I8fMy)A6~Yp# z+oJ*;o4Cyt_oh7E0{gH}-;0{d?^t4o&KqzCao5Xs3gc{Jxu!uWFOlNOVK4l6XQ7N) zoA*GO%U9(Q>%o-49+@>|yS#>qq~T9@K`_*vJvM2;BG@5W)&;)+if4hi&#}0(2uuC& zmRxfDiY!kf(xr5FN_V$3g5N?PpU3z8 zJ=eL;`L64nKg6}!Yt1$17-QZe=C}z~kb8lOM1b_*!2?uD2~nj74<1)Oc<_h?;Sung z**KS6;J=5CN-u;TlnjEmfe(*Og=B>uJSdMu{%!yZd`7gB&~$w80MvH>?_rm1p7DbR ziwBaTLMm>0drk08gkOC=LWn;TK}{o955mR{_WIqX&WolR7L116TsGYHCwDKoo2G6J z;j8Mk=XY{!3gR8UO~DKL&yLJue&J|abUMCvejLyxWUNC@+-Xw>7( zMb|qhG_T%ycR{;D*t=2n;w}`Q%pYACfy_VZBXKX;U)SjWb|na*_;n|`@V{I=qJjSY 
z_TR1^tVjKP3-IILt}?)w_j3Tg&HR_EhjAi*eM|OlS8p67emzU(|1VdM6+no8E$iQ| z9{k%<|M}hjrA7BV_i70mz^Y)yM}Z@khrx5#*}=lQE&3(clMv*Agl7>n0z5hT%=jqc zdT>l9wd2%G*O~!8*SCh#QS||qvVc20wgtGs{Yh>2s#1FGmMPB9dB>=y6tEXJ<;Z%G z`lH9VE)ZPQuTX!fJG`%CRT2=_y;asE=2@#Fu4*F{i8oAoavf8`}B zd~vyb6zJ6=K|vr}AM0Rh<9{|njF7mF^$kP+do6ZsEw{%NjyumH zY4`;wx`tp)x%=Mtzfxx|=Vq)eoYeB-v2}j!-AI)D+DGDeLUH|!qi|r%_M4^kf`^+;6b%_SHqAFn?>Hc8NKmb4h#sLmE#$8eUAJ&pb9R*+Q7~D;Z3Aio7J5 z_8@fvl?^gW`S^RZ>k3sZ7gSz16DTuP`~csRZxH=qA_c}0*CwS*-a>pLc5V_lL+nNx?11& zu~frk(ht~GM74{thTfR*bT=Wa?3kZxYMdtmO5~IUF52`m<_!61rcYOUjn8B=D@S;D zjd_%O%H#t>PN#1qk$XwO`MQk5gy7nCrI-YLE~JS?&ldtUwX3Tz)?DIdf|bk>c!TsQ zC|ul&3&ZTLXtY0!{eOYdy|HHpwh(hEnZF5Ha(Qy#KWj3_wz?Xy(ae95&4WhNJY3;u zk<+DixF>hHyiqula91<)(IQ;qZZK`~ZFY#7w?K>IweZv|!?0rcgHS2%V+>@h!dz=t zw2FPfvX^_h3~(rRV>ol(1Rc|_OHJEM1{%kRvI4R`yv5O(YC(hNXVp@&eQz%u#YS63 z7RFLuFW?$iZ97;H1-DXO!JrDdnBVo)mb%jatX;Z2gW}h~NpZEiDn;S>IvoWzkb_#S z{i~-sT2qUbN{bYv|BWEmsP)-v*9O8akClbVNKobSV;-%E&~FA)`v(E0+~toy9(0LW zFV61}?_@+_{DcSpdnkw?7)Fx_qJfg^+uodJ=r|>XtFuM!Ngu;Km4JcA@>V*Dlm)gLnwjtbof_jrabe2-l(!i^#A ze5Z3OAPEAWboYwegw_NJw#PgCi*Rnb1}pD|ARoEQ$exNo+I4GZNfzus-DJa9D^R&< zjd-_xsrg(%3B~kPSd(k8mdri*XuHWAmScA#Em*vIfYXln9I#Lj7}PuJXdzZg z9V-+mq4tNxiC2JyrPbo{P17;;Nc~w;t(mDYQGJWL_-_4`*xGV8Riapo-*zj2>@Ax& zBkn@ox>i5Jug}_U^_MJt@u@57Bg+{lxe$qiq&fC&Lf2?v6DUw;9z4AJUN)S0z0Cjh zOqcIOQlj9srYNb8X8G@g$~`w(pt1_XocDiKY^`afo9UA>%-H)=4KH*#r~>?@E44XeMvsjZRAAry=qtXd8c z_li8QYTA}*qIHDqIl|j^7fYZ$_opKBR{n9?6BY%2JHyn~%UYG>CYfKLv@TJEPio%b zES@9!gzCUrSF1qkog%piM6|GVE5uS3K)vOKShPx_0%rW{$QSQ*A(fsY4t|@}tuA#F zN=mlE(oYrMjKY0WeU~e=>%UAL(`m>>!>$Wr8ITRA^4!Yj-(edZ760}X?l~P(^JnTu zHHXz#g({%SZ(lt=CE@tAykKZA@I5JIb}MP`bCD)%L_!dFVP1}i6#N{CJ8|?;Bl;u`UJZXOtnEh$pnRT!=u;%`pDPZQT-x}+PN6G;orDkr71cr5K7O(oEkBnDw? 
zcpFtn#OeG=uKKmThA*F?h`&Yv;9MtPP6}e=u7-L`-dZ`zIcF#t^7oKZR)`_5&RuQ^ z&R$Cse0(-Cw1FxbCq5Lsqi4op!M)vbaD>cnNTx>iFeZ3gY_Ri1Ku1rY-^9!+zm9>R zd55*V2Ln%OXMT*t_2sw=gz{?9LpSjiaU5W>d0ZjKvb5AuV7|XYnB_bc1<<_DvZ~+~ zd_q4zC2v^b91lfS`K8PJ^L(UN0RalB#^&w}kakR!tG?{ucW4)=XI#5w>qqbVD~@D7 zPR}Clzf&3yEavByYWHcr6J_QT24;V}I)kT^n4($9U#4bmzAt9AXMi~1_*THj5c z;e}LP3(PEbE}ki@!FrDT4+Fg_^Pdc+G z=Wh7{FAU|t7E0WB^1>{AqfxL{B0C>jPgi`Kcq;OH3S8FQNTyH+giya7!R3Th<-?|I z+vm0Ah^!2UmA17-zVuH?))MA!sKfWK&JPlt6WLJf$2Xbh?~ZCXozmMkd4x7j;9*Ku zYONYdM9xOIfL&LkH?^RmQO@(&=c|)ysG&X%rk86WApz|MT>JG(`CKv*#rs3$bVoQN}Na@!9iq=4i+&{ z`vA9&jsDAC-rw4Ue(O2%c#%0En=LRUgvVlQfhFsX2=2WbzsnBC!Bq(3lcIRv2~ zN6?Z|(`ehzx802U#=s%J9Ul;2Y^1(AIG9`;@gPEei^cl=DbJGmNPVJO#qiQGo4}ZYe`lf@|8^8Y7o! zWcPYDB6)ta=#B8pEeu$PWbHZorh$U(Z<6jSi6#6b71yNvF44@){STgWI+Q~l&pMbY z!#aqt(_gZ4KWckIOm&q4J`E5F_U#G^d-MlP;L|`?PO#WJOCykUEoya}r)*-sGh+uA z^y|@}+pL!t9F^!bdJl<$qG#itF86d^Ae$%@z0K~OmC`=x^Ymlob_hD1Ry5p3kAbs( z11r?u{))D7*614o!M9om1UB47K%-?04C$`*s}6-?I%#4MnW6Fmy)7LnwFUjhsBa%F zZRx+5POa5>3{Mbp)?u(*l;oq6bS9(|A$-u3-9=R)Gqn?8QP&TG=}Ztv&y{`f`Nb& z(jRp3FcYQw4kgXQ0qyVlGW+mK)Ogx4yrX766l!s)a$z!BvxSb)g;j#3aAbOVw~_d}XV0q%AU!DW(k#D=`JkA{)o!;+nfsP7`zf_$n3-dehLj*efM4^A zBul*tDN>x(w%dvo?pPM{)A40MjF&E6car>S+!lYt?nX3_$EJk-<|y`@k?XSDcobv= zIBA&v2(k;$!c6K{U%5WrUhQ5!|J7W5IC-nF3F+XNx| zj;px|-@sd+30LQF@M6EPZ(#K+mD=5kTNmjw^qupepCR#d}OE6u9o(d0#5TnOb^)5Gquc5b(hJ8`J>#!+?n79h)IZ3gK4hR~+Z(UE< zFXk&Qc*Ex0pwoQRLy#F2NeKnw2b%_K7q6vsw2PjEbafuD%vrQFg@FX{PEqS|z|b=W zMQGeL)W0frjss~N&hy`0Q&nVt&lLxUEYkJpvq?AdJeoNI>&!Hx7#@N*Bl_>^@? 
zz0d+7A8yCK2#^)B(vUdJ`UEa^I{h=`BY^mye?e{cw6q5u6F2-&2rxMD$sDsC!?8YT zw}011_IPL?O}zlT;uwf^fiPrB6{?9uP7Td#U>hQ%Ks8LzV5tO$WJt7%o*zTWF6KsD zwQmi{NTfj>zOP*tcJ68=k|tm*M-V8_K3Mf+b`Uj^)s!^l`1-6y1*d+Q=AeWduvlzL zqZMnlW6%4LHMNS2G&Yvan zWbJI4Lduy38pt<%%C@W!82gbOFCPBJ9UzfgGcd}jZq*#ga-|&F{viQaNNi;3Vm#h`_}WWrJIJ*>0D$jf`=YD zkjH6)_|@M!3}EAoq@uyv`2&9aRxH8}X5ah=Ig&bR;0+XjJ>5fc<>-8nJZp*a4wHB_cQ1g3H1|tu3+5Iq~t!R3Pjj-y&cpwz4ml^vZO~D`KkF;OC?0 zD!*IH_^y+E)u26ZMh~W43;gsnng*ohVD43^r1JXKL>AhU{K_p88&H@~(`4{!==YDr zYC2{I(%bLFS%?9@cfm!NpbSn|wI9on1j~ImdK?^#wjKebea$CZWZ4hQ<->-LmaPI|S`a znWrePlBYf$@bnCT^J0atal=raQLQ}?h^y#L01JDP3c#$FZ8bl9m(x=qwXfkJr@*gb zw+*S{vICt*d?UNEH++!Ht!U)W9g>=^Zjum3o)j3JAX3-D$HU)BH%J0LhSU8_E#2`j zfwV{BXvCKTfq5sUd^%?NSGDk0dVA`C;Qo9Xri`Kd8#;@<{O-ypIuqN^HX=vH4ha1v z`u4@!ec4EP7;6@10VOjfKCVZaaUww3MXC^%4q^6cEdwmYoGalrV70Vb6d5N%>Rc$C zgdy=Acv#bVyskYFl3Nt`46422SCrxQT)sNpmNh%I99`g9jmG)hh`hi;#OQ|}Dbq8e zd>P(YNUj7E$o$ii6E%a!QVMzvF|Tnbm99z_w~j%c3Z(rVVL9%APt$cR0c{}U@l_b-ys)CnSQW>4GVn`mGfw^p;NTN`d)5uw*~6acl37N#S6 z{S<5FnsWrd45~gdW2^hR3%^X@z_Pw_B+Zgo(A#V&73*PNJJx8CkI!^+9Rx!}`-FMl zfgdq^ru!rq)xN0$ohgmOnb}bYAgi-bieo>UFi-KdJJnDi{PH%1C(+Exf zCg++#X)kC`6Fs4zwt?#3$Fpx3t-sN2nkOeQY<<>tw0fKOHp0hQrEaB>0Li05oCdR6 z%k6{XwLP%ezBkrO?Mqm3oH%TuiqD`Nm~4)d9N-NysdxcJypaOEa+Uk6NJ3%jtx+#| zd=nQMR&unW@85OX2!f`F2Mod~sfO3b=g-gbFD!g7oxJP{Tvx~BmT+ID%#xO0Ka8h? 
za`jUgE>%A4#+`i%HSdb^8>GqDJ6kHz-3pkUV7)QEx$1;E7KJ(p z35pIW#JGp74?e$TF9t#!*}$EUI5%il8egVqRQXmK&8DYWQ_C zsH>|$3I_~NK#-3TMVC)#FY10Lln=_qVmHZGVIK&CnizeLJ>B^_XE0gfJ<7X z`WAO*9qMx#CY&ckf5e8qJSVeAyFk5YF7+AD$L2W;$)YW^gYD1!3;esrH)%X?-6#^M z9`?Nr@gh#wzts;QGig1xv{tK~94BvOnhC#E3r^B)`6j#Z^`pOGKV-r_YbHqpBUDHN zH*Fi+k(e%{Xp@*15**{@ta5clCMHGjxcAK?tHvyd*4NqR8m1YKz3>YHWl4RM24OI| z#(@f6WQyX6CN+=TIKDE~=`(BQHqUYw#bCOD$GcHSJjbc68A~Oh(khG|>b>3XxVULv z--{EpzTDd$mF(aL-x?@ux2N!+@<)*}k>xXs5X%7J1okt7<DY=d^=50q0Yr zGe)U~;wyk(H$Gak`r#n?6J2Gp3I*F>2?)<2dK+;_^z8U4Uff02_@bbGq(x7*aM38 zOs}VR5Xqvr#(z|?nfX zO-aRT&22D0qJk=7O1J{??~IzlxPP13BQX_g0pexqx_eGIPaBDZ#VYgX^P!yR`CFCp zIBB%jsa_fTybY+5)qkA1gfUuta{X~jY_C5A94Oe*76s#Ehd#rPUeNU_vb%2eZCS9P ztb_UEjaK<;JU~_*>QH+vte)EWPJ^W$z%0q^G=4R!+M*|gtg$HnF+At`Mo~f5SXy_$8pYq*jfyH4Gda<-I-oCko@%@E{5(CyjxGAE zb{Q&<9yM!YS)b3)lk|UleiVce(`H9OfFSPmknB;WnLo^YB*rb8C`Bu|Q1=-8Uu%nI zg!udWw<>Ny$FM98T{xY%4yrUt$CIQFFK`z*z)dC3>WgMw75nf9xSVotrk(j_et6X# zYo*pTPRvX%D-iNB$xHC+If)6U!Tlxvpyh4@HiA&mY~e)}n$fXMTUDOJ8jYcN!kRR- zQr>ZXTi4lf#sQKd^UIEy8TCyyz8mK&f!EIIEbouMpN33rs{U{=pI>thhn>~J-HR{f zoooJ7Sy(iuLHnu7Tt_^4k))jkJA#}&I?C(UAzoCNn<2zs`U@gannLNs;=(?^b0Pe} z@i$uw3(tUML1j`i4Q(#3>~qD^YHjgJ|Q*T2SdMTDNy4le9Yyl*@bS(a25 zmT3NUFe)K#%mh6OP~fttye^%3=bE;EZKeE;-mySQ#*f8kHCHn}-O6l6si;}Udi3JT z%a<7^I_mpxCk|AvJ3u0;k(;MPZI!m#dP$1M{SO-S*MtNy1oWodwn9qs#|&KI^P&E0 zf1Lmui&8n@2X{95mUV8;yI1daAr~k+y^Vf-FRk9TeN}6YM9w%aL;y!0%~l#?sp%b>ZNG1?9Waj$BPdJNOZeF|8kxZ&!Ffq$@Ci8WzQH? 
z8tck7;K5<1<{^ojDn8A1S4R^zP-;;T3ghp;W3QZOz*uKY(dKL_8R%3Rd6PwA_`{D| z`$I0&Il+xtWAD87geixhpEMz-BkK@%P`y8z^-5gKnbP^5=Ru8lZkG z(kJyfa=rr0b(7x1&wyW7s+-I2uk`^n9A*(HkJFLVMULv(kk0_T)RSO0cbE0OL*saB zj>u>IS7?Y&Ui=P}X&q=#gjdnwGOx^ZS3{p)3DRc2{BGuRHu}`$sI z{S?}wP^r4F;~;q4it?U}oIUMo=g!ixP;PFRkfE8z6BwBK_rN_4LCM~w0$#4&eT;(^w1tY zi4048rE0^UyR&oVV2CX2K=RjM!f|b8Y`6&GAxzs;C3*5A58#Z3FP)5(x4q>&CAztO z8*tRxFkVpqEaQ(BHWxxx)(~JYCV@PZqY))xhrgOBSb?GRx#)Hsyvyq#Vh`}cGFRTc zy#YXiU7TN2N}}X9JT65ToQZ`G;P86r{un%|$9||ErX!=H`TBd{om}jCEUGMRVW)Is zO~u__lO7b>1Z)Ymzi_yHn|rKYTI&;ES((D!Ok+V@!;h|z(&}n2^_eQ;A6&_TKL!HW z7x&by7}XAG#||23M3-Ur&tpm^UU2iz51#^Cop$>ZPP(l{f-}T#d24Ro`tow+r~P_q zXNh~&+k0)T?0zc*kl_?^Uk!8sj*Ra6DOgtwKbdq59N2t1Y`V83z9KwOrYcB&%O-aH zY*}(6ujAVoJs`bBx2#X)Z6gc&HfoT<8LN0!ZIOm-*;NZwo=DxP*VX1Z#8tAdm(DceS}ge^8QhC zzc-QDNJD607`4rLBdckxcp|w&?OCOk9m2(g0*r7=Pn|s5UUIli2BwlELJ?qG*x?^GL=(AB$ttS zo?w3F-Q0d9iqP}>Tp|dRT~pb4s+<4n^4ZjS5Exf)`1%n>z$9v)lqO}zx94;lhw(Le zsZ`oaUI7@dxD3WOk0heoFv1gGAvsQ3dNehu{fM@lr?rG)9ZjEg@$h)^M$@D-P*S}3 z9RpP2zSJys9~g;PzEI2UULHyuXJUvuOvs!Ut&Z=WMUf)z~n4-=w&vjur9E z9FKJflm9T3WW-dd{ldbhc6 zPI!l&eFPjbf48vgtI~2L#m-loUeZS;iqOi{&2)Ze<(quw{oJ)T2lbiCdzte$I=kGS zrFcCBGybQ#O$&p>L!GgyMhI=#JGMxuI@^NBk1s)5}s*Ji=f29P-xV6B?z}SS0X*3`M@BTB$;{ zl>Bl@Y7wZ%o&o(KAM%uE`m~Fuk!Fb-GSh3b?%?gM125n0Kg?#gD;^BP6=&T;kdA_+ z{0I-2V4nWA-qhVfh;ZW(uH_o7$^n`Lwa$2frk)s4na<&M;ltclp1|pFHD7LhX8Mo`B^+O!&i3^9$ zH?88yuP5ccqJ4-*ql8Z5_GEC6Vy=M^_*cr6y2&sFNnNR%-dwbOjh|iv@K33c|8RNt zwIP`z9=5}noZz4)wCNG@&_{iRE*u__hP2 z1?y_B%97);aM>3nh~(|U6T|$p4rZ>+yh^_JPxeGtq+dY4-OXMM6 zI`OQDDy02L5*r0I)Yz<^b~l-Hy1P+q|NE=YkIeM|$g&RBvh2Mx${q62<#(4Q4~tV? 
z#`;G%3KiP239gA#B43maZb&%c;@cFsW+rx1KQ*BGJc@qvW3K4XkE=xw*zTc(A#b6^ zQA}G;q#h}!)+v8DID-?$xSV~KF#bv%RE`ufv58>w%$*B=JM9}mT_-GG?4w?4J}0^Z z36d#wtBCrH6f7ivLGR4yJW`+69|YY{4=uvKE`#Rt4zX(Zyt^LsHH{hjN*cCpHZ#2l z2e3H`PKrGa{B%|;mqH>{0#ckR`assfH4yi3+9T3#D-6-L2}U%U5zvHW$Yu zQoy}cCC@mhUm<^A-s``T;0n)}1WKb8?t8 zZ7SqDW>+>0^6f!VYAfz`ZMCC+tayi1Ad&Y=nVty4c2uoIDD1stD)uziCM)kFY@V{h z5LmF-`qbcuM;EU2=yEoUsJMU_ZN`)a;$5kgv;6#c`zgIvaj*cRu*AWyuxwOu(ImOCp=2v!!%gf$nYcss zdLM86vj(e@1463hu7mJ?J6lXHN?e<9i^#oS@7ML4B59r<=k)fpoLn!HT|K#MgO<9W zfj>Aa#1m9TAyg7-*&h1@u#uu>HO&&Ch~>0jcC4VJ)X$S!2jFXDt3bkDx65Vf8CDz4 zSyyYM3S~16%Y%r5BMmbS%pS&CudSB}b~h@$lKyUvA?$$m(}%}I5X4)bq;VJWL|uNz zUOfr0W!GBJR{de+(j`OQ6{#B8wk0<5^vtb2aqe$viPvf!cg1nCF_t-`+b8`YL5#`qiPi z^>Syfod;?}1I;3wqS^B*i2IGEIerc_2F~!MB2m?0<8RpB^%>;$pSPkI5O~-xLoEqP zCf|Gec6ZQ`f0tFt7Yjh@Fj4>!V7&iRN_y5c6uel@uMf1ujfV#NK6D$ zbRqk*xLipcWW4NJWyPvGX)%|2pC+~ug*MwG5%v!H7#Wd@>F14dku9S>jchDB6)YO% z(nbe^k=oW$wp0lHqpd3Fpu<1F=>c?-$3%204zuyks*&Jtkk7fT;GN(PhR(cSkt{ zon*s;50(Z;q1hWKVKQszX8|O_pFc4{^G+%+RdBi3f|WziV~l>{ZIqJ&h_ceVxk779 zLgwv{T8FKZz+E1&Wu)_WW5ofyC`4>ZJ4il*@n624^(Y=h@w3=*!qyx1smR=h zexiGsrhGL~ov?f6t){MEbb8>Q|admaJZwINb67Yd*T~e;; zQ+cSwO-ED0l=k?b4BXj>UP_WwWKI zLeVD*>ZlzvF0XrwS12Q(tMlC5I=l9Y8AhooPhp{8gW}SEjp=8wOhFVU66}&)lQUD> zlw-M6%MeNE*JnWU2>IG-HG3U>K`FaTwd$+f{7#*g<-vFZnF`JH(C2(Hg4*4 z&9aryvwt9;s*lteGdkR#w}}Uu0V9Mw$T0$(+(leTPz31$RSE!C86e=?0H0i$mr6rn zK|y)v=DiA-F%8|ymNVZd!nc`SzIiGVFsm471T?!6yCeeJaYc-yg%<+7#?_rPnDu1D z9e3^e+D_(oXMknyMZf|VFT%#~xGH-#TxKxwB7~@{4Q1JPVl_2S+W#Bih712O3$3#Q zW!28?snb%F1`_k_hm=-!L+%4;rpTI2pX(n!a%>oJIC8k)>D%pK8yd<>`mFAAu6LMO zIRDbd8UruHA#c1+oy7dJhKax4g3Vrof_Yp8=4!wb9C}bydY#c1tsLJzp#+fbHd7(< z0i(1Gm#>?p<~$H?t6TJ6yk~TM&(JVr$*;v(3x2w!j?TR1SU&3XQ89|i%K1~~p;v(Y z4>9W#4R*lh$ijP+Uj~sSAT0zG`&;kcNF6-U4>pPKTx$nX7k)QQ*X#wa(vFb0;ZNRp z&Sp<4ho}kPx3k?xix2e*I3(+X1nD!8G=eI~{43%KIn$Mz$}>EKeUA>-8~IN9CaCUZ z2E?DRL}G1&i?K;EnB{h*-`i!I6b5nWrD8s4vZ5K-#DvzT@xNa2dg{MM{Et1&Fm3kG zkvDFgxr;Psa*ncWVI_yzW0VdQ*Q;4p*S!t?7K|R_`H#4nYP5hJBG=?bfgc~#_JRp+ 
zf!@loFNohhbyRW39=$dB@8O@HZJZ@m0g?MDQSmcszBb5vAqe>6@y=X9UBj%*xuE|i zK%H&>jBnRfwvpAjfpkDSc4rvOeA>4Kzh<|j@hogE(ludEG`DroB8+v ze^PJs@=Y?AbR!qYGXsK>N$QE0cOnDKW|uZ$O{+E=1{krbgd}& z*pFg4SovDohqo7h^QcuN+&@a^R|EMyoRjsE`5*>l5z6QJu)Xf@WH1RUb@SAmUqIG+ z9d$^E8MFJ(s1y*C7g>NgJa_3;0>5d&#|Lt;mufk2&UJZ#{%iby%NtP=WFKo=0qc@F z*2ulNth0mCnQ=ThTEEJ9%493S8;U?y_bXeJgkM(>juAnR5Vd7L#j&!KWC4l-5Gw4r zQvT7A>E)%sl0zoHNtC}xiXjSDgH=ls{_yC~4 z1_Av{L+=L$strOwjGr*eueXRGAXEMb4(52Jzt_2oJD%H>rr$GDF5^aT4TEQiAXhA1$TJh^b@_n?6}vZMNJ0= ziZF^fpj7RDlnNYWMk{bB6;=XC>fKIC0pi$YkU4*s|04tbwtQ`Erm3XwLJ_i3-L=0; zP50u7&ICV-6kwRDKYuxXYh{mm%ly(vSP1g{_5Mi2tN6CQL=?oHKgDcF5g{@!O9mb;ZDdudhTCG|AIz7V#bWE z=+ZEhNH|5je`zPX>NrM!`E=zn+!V>**OaQb;H6{b>R!_VM>suxNT7YvS5t}K#n*c& zx-mfF5CXDW>?)gPy;oih0Z~J~w$_5C-jA6%x zD-~YOrbqQ$23n8?+%CE_T6A9Y7t2@MI9yMk4HS5p$gM|is*7wS?x_P+cp9e?eVVuW zFUiQRkpA8{hFrkeI+k6C0@u!|KSM;u;98+s)js1`iNHdgS$fve>|~E60buljw$u@- zU*KNX<+$`-f7s?RUIZ&x@qTq*({c@;dsTUTvnP~D?QcX4Zf`%P@L&7I3;OBb4FE(# zfx!U1&|QdX9-8{P8tn`LLQ`ic&#F1K5sRk9R(!v_IL*wrTu9-m5gg&tCP$}Qw<}@i z(U)Z8{%gMuuYQhX)nkAZZN`R;`;@;D5y!YcNP4Nw$afKo9)V2$kNt`fbI(qKJo!_4VOjNq2#?o zAliF6Q7Fq{%m2mxdBw1f&S{rUtfax0*=`CviAnBiPA(4!81Wu*eiC~mh=Faa>HqtYd%W)v zp@C9_0#WAu=KIAAatENn7z5D8yZ9c0^$7bcoD(!TQoBSLikuB;O1 z$<=pcDEGvA2W|CROG!aQS_QO3&pOQ!02F?4VKj67lY+eJ7 z{c+(Vg+!ns;3cu5mybk5I`W`X0{gpx5`9BwttA?up&sbyEsd-(`Zk0{+ss|ixbUjV z{o2~6c!2wx@y$3n(q7*DmMnv;u<6-hQ{gkN5294`edC;KlkFiW)v}qYhvH8ZV%*%X zVUg;}M5_Sa%AF_Qw2`4-i`g)wW^wOMsr$vWT_3x3_Fz1)0FzSqfER{)u3ZB*Y!ZfI z;Bi>?kWF&7!DT7GUS4CygX-Byy}b}TRd<^&iZpbb8AxN1HMVsHek4>0G4{wm?c=_f ze7`D|WNIg6gWJ=L@B8zNw)++>LqG#5vNSgtZ!os`F&ueERp&PK%;Nl5$NGEz+DU4~{EAW+q*t?d@(vwVJ5Jm`CPo_L`T zVf%S4wpb9 z>i=Z?q%S!Kpc60uL;3M15Q=h{+q>HmB#c9ZQ+4|M@#ANZFVG_CFM#bk$VjW&QW1qB z;JWx1-zU!fp6}x(Rf&eQikj2Bmk8jJW@bQQLcu!R#TR1f-eqNgV+jD?Z^OP_L)WUv zlHt3FJ9m;`BF;yUE9=VUHvBWg#1J1nFQuCd2?`{0gD+- zEC}%`_=RrwJg7j#b!Jr%r``x)rl~44ybg66hU&$g?<6=M#6DM;F~e0lu+VhQw48OG zH|indJ@U%F0(v!rJ+(e!R~a}KcW`cfN6}0a!APcljCJ$0#NtSsP56W~3yrc^0nSg? 
zM-SB-hy6rJIz9KtY!YxrRPc3fU&~X{Ph6y*@j&6+BhVuy-rt znprPA_g!evMS;H15TLhQwEiK`T}oJD)_$=yIGH>+o(z;$_+5_z=R!`IMa(cqInU(N z`w@D`^7b8yfKH&`#JB=FRW$0cI0{roKMvZG=T9kn1?+8iN0n52!w8=AE}KHvxT1d( za!82GcvK9SPe~%#8XU|n0@vV%tONEP3UhN7{1Zeo9^|5K6tYVDO(M3*Q-pWeDOl8l z4c{jtSzZh)doM+1RgQR7w}gdI4@n^YulYCzVroZ@ak$ADQC!&o2r6lj4*jKAB212r3^t1{ZBL={Dr#*NP#rPnc>#fTCazjbpi zO>*lDK2Mf>E|b$uR>7xdouWhmMR~12<2UrB0aKDh8KekXNv9MNx!NkxB$)~YI{rS< zBh<~SRnna!wgxXG7^Vuv?<5UYf*I#=9wLi_+x+5ET zFZB0nTWWLiqXX|)92^jK*aux#VxMIF#jWWk)5@1x*U&0>t7$-{LanJqEDb7uta^81 zX1LMj4UogQtssq5pBQ+@g2g)xTB(#nW)$AYqxFuQ+|z4_xd3|2$+7i;i?U7E<@HyqRX<6swhdU|1r~#ldTQuII+Y z>?maL1-Qi?=u`g#1pK=HCez?_2G@PvCt$WgIu%zDGDG{5OT!$7+JB@FkU6_UGX4Br ze^9SM=mOtakg!6cu%_f~+e0_F-pr9FH2IuOHR^6%m{m-c9%eC-^w4l_JT^%NV<{3{ z+;w!^n!Fq=q?)}Hk3%BwW_Tf)8t~L5E0X_|< z+@E}!(Km%$OtU4s3f^pkl}5eAQ&pTeYI`8xmmIniS@;?OkAW8KK=Gf8=c+X`Jyz`p zI`i<7Z#;;6N8!e{J+NG&CEiE=3fS&R{cZjZTtQ&r39Gc>1k2WJzgkUbAL?lo%mr#X zaSZuGh7Nta4|hNbFB1%~c4XyT)8Sf>u(nZ+$eUBEjJPr>2L<23;6f1>4M_%{y4JBV( zl4x0*ctzrm3jZjC|JIEMR8xx>w^d{SnHG6Hv0RwFI9pYuL!A1XpL_N*jt{@}I^Kh#iu?X`cDvGjB znv8a&=Z2&~NF1%$Ew^s&wc$ict;YEo2_gqT_`eU>|8gI85|E+^t|^_pw}V1%96FPE z1HmPX_xx|uFf_3-R$x1seB?tvyuL5Uc1am<{X1XAb_^aUD^O~?$b|>wA)l10Koo9) zA{szclt`%q*Ly9~Rly7(&KOhAu%7KF;(Rt4T8q%n{#3I* z%J3r6-)5G7?$TS6@2PE(p=%}s+i*D;iEZKXlSnpUt@5-nwoM;Cd5XX_ydEOr!!%`r zepyGe1={KtZ*O+;PE^822obXFaXXV)v>k%7HxT_(o0&(m^SHbW!TEb)j8WPl08gl$ zuCjlmSI@rt2`Q)BMhd3K4}}LQge$_NRjO1hLNDJ^k7+O2~;d80g7PCD|)i0%9cqCVmG6z}(NR+?eyI_8!TO7+sJQFv~1FbGo*! 
z&x$|eRjC5%CT};-q4cVtV@O-Dr~l^j*bat(=va|)-S?XSxr{}+2ww{%ra)ov^rm)v zzb^Z*`np4yKLH#9^JA}Z;?`a<`4W?yATwtnjuBE>>j&gMRf%lFwqVwBEd1n;Ks_yF zuVLzWt^|GwTzJOT+2z~T(^HyAno(wPe4nT0O4>r-hXc9fM@3@?O=PZL0Dsv8h&zA& zS&g4@r<23#0ET*L55tlF)$ua70ce=){|+oWtC=#8INuU1Ql(KKw2uBB$+>F=a7v~-vQ(xSS!2z$Ip6j#j|%YK z-^4(}qi1$vpSp)G#PfdqL7oX#0!dc?Vt@e3NRk6RG)B>T2>ZUW#u?xRg)(}t_&J(&Yo+_U<)i^TpysYNVQ$r>Jz#|ovI$I(Ur zJ(^F)VA@S=xSBdjB$UUkaVBWz^C#^jRfjR0;^{De29nAN>mA^UTnbV2i!L2B}RdQziUB}VFimnp*EerB(#P^h&K~(!3<)FEL<)rv{ zC^X-SUkE}N>TLzxqx5MYMM^vgX;^V+I6ywa2ukq_S@M$bNx{Dqi3X_kdE`oluD_Tm zaJ^)Jhx?}%5g8@Yo!mnF#w0iuioo^de@#1q9^Uyf^k#k-a?PE1gyX;O<)8o}0?0qr z=4D-)08dD8M=gMvJnFJ%4ZUI}&oTI!qm(A7!~v|lX7c#f(6}JL9PK5NVL$R1jBwEd36R}xi%7NEs2F8I8&059yhutN zhULiK)NGOIjtqe50A*d3B9%8z1n>m7e<$iBWY8}_D%SY*c3UVj5%GB0=O1dx19vW+ zuc548Liw32=o<^w-;rF$*D|VkJ7u7{_(sC4TKUygy}*CWt_jNIODLMH zh(3HKj15-JvhV}>#|8t;qh)|4szjlMy8^6L|4Q{8EF!=y!7yJ#@8vuzd&HQa!%nOJ z*{n%{!lS6*qK~}wCMBxAt^??W@hrm?*u?qCEfEMKIn7)fCPYYdo#}YG{Dq9f>Mb02 z8WFL+*ax4V0GGDB0RSNN6m=0H`AGz^RyE`yCpm!a(L*pM83gA}8j$85o>uEk<%Vg# zwJK+HNOF;R_y6(smSI(HYy0<7l9}PAtl`ng3{fobV&+GNjFTo zyKBA!*WS-w`#GNX>;F54&U@VB9^cUH;0oZivaqC8d3BK-Cs5c1&r*}zB8nslP_ zXvhaBN4W79+Z_FJv>1FiekfxBhFmG{-mlTRJ}>!F^kon- z?u}0JQv}2j6ed5ZEabOtxPMJ&rSMHuEbsBY4;v~PNi6DE4Oj{Jw|Zm-K9Z37ag^k% ze&O-^Ua_m2fGQv$VWx8qg*%mvf!we#@_r{FJBI+(xb{U)cdzE(B~Cz-iUzBN3%}Qt zTQSJ8|6~3Nhzoy}e(nstUrM{*S=j?sA&|+tU`)6?`G-QJeg{L(>YKPu-ZL!c-?qE8 za-F-#X?7e#!Zf%hKrVnY-rX6hg)08Q0(e-p7lIc{y|VGEAO{DPJhc&W6?k036}*Ap zeG6%AKw-Ajp~7v=@q%H)C)i|rx4Bq)kp7A}zNtHjBh%Z`k7yyR8byBO1qw1!E6ySV zD)N3q-OKST7cpU+;*|{TVZL5&V!{IQV=O!++e6W*iYGtOpNigO`~OoNlJy^Th?ES& z7+0k+qG1l88aU65c{*tXP*LI`MSROrXx?kse_zUw>tNrK=^AC7@-X23pE`_$5u4ra zs}^ddtMQ=3)0+16W%4idNWzq05u&3y2pq55>7hReHe%b9Vo;N*H-zYW5Ns$DJPZe? 
zAl2PEw+i2scg5xkwOpyCB-Nb5%-u|1W%%|;&tWP?90$7-z$Kdy^dJOuR^tCl{*mzi zBmYo~=`{c8=+GK6<`DfxAtjH{4RjX&5#9Tv%KlUCk=N3R91H5-4`uLw?Gb8I8G-Nx zvkiY2^u_7%l}G_WO|ld;bKCSZeBybj7)${J@{?b{>gt;3!OWsJagnSxIqyA%=#o_aH_OzUpSo)wKlD?FX`9J<*BX3S%<>k$6qwa$w zxp`pQ^?Lma;nV;1lkH#A9(1j@EDMZYdXXNYf`4QI(U=_O|NR7rM4bQqZ}^83 z{aw}vWv3qYhbCS(>fh0Jh(LtZ2Fr{47SO5ii%Ike^Jtz(_!qj&wExF=#GhD7CF^^o z-8~WBecj)G209wZiPxQyBE=6&YXT@0;5B4Exi2~VnQ$qJNI`u-ST9HqqR9WL=KsI$ zF#JDndqCeVBl%y_2E3rW0`zE1*zOBs>%BW zJq^HNSRQ-NU#mZ9Q&-X!`Ch{taLA-Amh`fTfcg()pNmGXGGZATEM|8U;lLHN!VE963!=tw>EhAE#*mezBFZ3lzv&DsYMwNb=SOy#wV{Vy|02G9e{)mK-uuLWePn#2W9xbk z4f-Bvr}M>jPy@3=#!cTp?3n*_c%TK3e*Ul-0Rbnd`@30bqIfASEY_UI!iOdR{@9$2 zL4x*r`hH4XEnYI!!u%-mO*@M#rDV!Cv~}44j*{!^y1zREqUnOJr2`y*x`eGTH;Q~! zI4K<8aXOp#*BVnHFGb};Uw}fj5#Y%z(zicV%xg>q7OkVze>l>9sLWphrJlB)6&D&b zE*i>B&=`H$++14B_Wr}y>ovEc2|mz%=tp>}`+7us0o1igM73X$^##V0?Z%N7auM#{ zod*X>LKhe2InIJ9CEV$-`FQuMXMuDU@NyzKkt2le>W;;o$OfS=kvSS3w)lT~XZB;8 zAD%?2pLx|O*J%b4wKO*3?H~R2sR2|CTF+NX2CnloXd@;}{sp4RI-x;`Nc3eHnFPg4My z*bvM9Ck?=bkWi)Dvk8cNR>r-8brfVW83bxo}_`DN*c%=x`A5s$)_$(*|;?8t|wB!Kd`hDRbxHPf!Cc*v#fcevlB zSmy2?!q3MeZ`ACL)zE)Fumk_mB>V#%>p^hf_h?mt&%{1Ru3a4NMT=s8&ZizV4_i~E z$w7bPKhR|Z7zg9#2m7=0?JVwkazOUU(Cntn90WRJ3(DKse(;*_XQEycYa|qcN_I$d zSVhUa%c*CXVJMkgEdJsLl=qL#X5ETO7*VEWD4HR1QgmkO_6lxU-F(9`kJi|Co^<@i zZ?+o6+l!~n9Q|P2I#o*^$KKrz8S{e^=v)2^+mES+sz z3=$xq`I{y1Bw3`tlST){az8ePk1ycHnRFfLu|lLIMP5Q;Id&ZBeM8BZc|G*T&YksU4b6HF9tp>P7oT>iZ6Nsk z9-JHSAuN1(C9iWMyS`>ZvPx|ktmWm#aV~1I+CV3P3*Vqmt3l!VxmOj_wPP`m>^nert7k^gl%P$~GpM1}c%>*wq1G_@wsU~`}k4!lg&~5j8oI}$g*bnM@ zJ61pQaZaFkD8aOzk3L%cxlh()oH^I)(ykx~adnw6=6IJ}a9shEA)zm3ZskJDQ492AnO|kk zlaGH>uFmF;T@>oif?P|)@c7+msk1#H#HUtT_=H)^zT)o)V1_fSMxR9-F5X6XUVN;; zw=`_`8?5!o>A|;nIvDW>gSHtfpsI?pNj4aYmdMSgMke$>?F<>Wfn#ewxAl;7(18vc zbQXsW@_td(aS&BadB)T?HZI$phDRUtM`!zw$pq;LA23!M1TI)lY_b`wSQU{eW@ufM zh=3(UWEe-C@JC*gvpmxy_r+eblrzg)5GxFaF07{ir1 z?xJhy$7zBT+I(M?r+QK03yu$fAJog%j>ox#9&In|Q^SF0U$Ci#4tvxo^ElWsIX{_i z4>Zm|)_UA1VF}1-69rq2!6zOd!}h8)i{@2Qy%gLN7bHh3s@T!s)sxrD{dRpZH=Z2{ 
zaD5iBx${s0k52J;H&CJc15X?Kl1E0cikTKZGr&jD^oh|nEFEQZ5b4TP1>FhZf1lBC zYr_QLdY09}-)_aeko%;GZeub>mziyhv-?M#*R~aw>>I8-*6qidFKj8f;;ya&REZf| zm9mSAFzD!fZPY*XcX39E8%*T+iUV+IToGL^b?xMXXCfdER*aXqPI3&Uv9G|}7}xUE zQY$J_L%ZVJv0pt@3o`>});nHM01y22wU`5CKj=64VP$SEBbPAx;zkf|uW$b5S;p@S z3q;6Q=WJ;`Rv!s)-1a6J@h+33+PjDsL7RRXr#vZ*OZXf zYvw+RP?uM>%~NwOweu4>PW+zY*M@wd&DoT{hKjqg|00T8*pU||0%4L2z9E+B<57vn z)?v;yZrG#33mQ}Asn$`yTCuM+XV;?TA`Oo zm+@1iI1f?2gyuA1msS^!p#N(DVs_!OAfdm&`6(>I?Cs*RqbZGB^{kdFEr<`JGw|1p zY(N#b;#}s&>X)4pLm=&sXn!uM`^AakW#HI<^BLvOVgC;)%)A02^7Xm+kV=fNiSJXl z$L%EUpxc$vv8u0{C-srpK&Pl-GR<}V`9u4o1FzWtqLE{|b{tSV7Fj5bYs_Y#S}&%@ zx{M*IV!fGeuK`hiR<6XAIDjMmO<`sh*<=pPkj%r{RI$EuDp5mTnz|R7MgMfE-2RpA zTa^2nf9L<&?Gn9Ew0d<&FAnzQ9$M-tJvsA{vHT*y&epnA_X{1_t6heFL|=(YU=A;{$8N5Y$KJ+2ac%L4YmAaHVpf&Va0= zgL>CL=C~l-fw2vb=lgm&;yo$DSWIWURdOkLNlGHH*keK4Bg>EcdKq7#PE;zI-2RXPL9 z=w*zr!r*@N3w#;gE#?4FUvbXJ1W1;^&HkB(msSBeHo);oULnY3TO3U*sUF(fE}>#2 zrrWTG*%gFlp&W=SzA~1J8P;cx3f4k_PPr1pdW;H!3SU(2<4Vg>cSWDg`SUPj;(=!3wQ zTB&&~`zvFdk)%7(^ZPe%0uHs;a+-!UUo8M88(##1@0rp8AP|R>2$BPYwMxW&0TNfq z;PP{O1K+*>yPDPEO;pj37@tYVS9Qy_CiF*wDZ;UcMK&3nhU=VKI27Tc;9y zwqs&u_S~Sg_vy;5k|yc>Vm2!Tmp~(O%~(P|Bd`jF~cm9fk_=8FTPz~FCVUfA~yH79s(2U zH{}n8lW`b`o-Tg?W5~4pZmq-oLUh7QD(n#MDL4Q!cUfVxXpeBPN|pIKrok76u1+|N zYx&r`K!Nzl!@S6YKzfy;o+gNer4mCV+!J3ziJf9WR@+k(!(e<6)E#5Ncy2N_?+^g~{vU+9!;TSw+}Tvwcs4 zn9zFXJ;D8|McMR2V>-y&x)4Ha{17QN zyj}zyxosm1Aoo!VGaySgF@Do}(e|$Tp;dF4M?V<4>MLt=)+<&&vrEdyWKK*Uq|Eu} z6OGk%bUk>pOLk9jUVfjD;oN%6ROx-iW&*o5__;n5_2E4=c21jZlzt23#^642>Y|0t zIt+3j1fw|jzE7{5t;Ip=>5+ICxr`(Rm3Xek4H z2p}PPo{FZN&yDNO%dVsg8TzkcqB}hd4(4Qt7A&>6zetrW7tRBN*k;$q1$-rusJ@je z&lmaQL52$QexLkI)Q6!&cZtMOAm&;8oo(Y#LGJoCE*5_D0aL~lO!GudWumzhO!ufz zjS(;gWFK~6Ny@{rKW&Hk0l1@?;=1spiGr{gzXyoT!L1rS7b_7ww|E;t{z0Yjz+2?J zv6bYe+92$1R0IhlOc*Fk z>=6}D!fN&P{oUyS(kkZ1iJ6r*YDFJ;g`JiXr4u%Y2i70G16;PL3@6)$BD_-TZ7j*= zi7!HLx2e`DR5x_IwKqS!{Kd?oJ|r;hvk3Yy9Wk3Wxo9?FPw0{QRd%aLS|H1WP{THi zOg9x3`r$v13i%ofZ$vA_B}lySF&PhDuuvrsxvU{tN6ZqFlzPfbB46?A%6*Iqdb;_a 
ziC5~4RZIc8KB)I?RAuX(Zsy71g%_Lip<(~gp+jL?TMcXyMq?e84!d7`h5qx&!$__D z7$8+>xE#tAuVvr)M*qmBJ+Sjb@dU#^EWvR_@Cf8h70VsL-aXt$G@Jxn*Y(>OaXU?r04mIDd(`x!i6^eS}qEfKZXWBw<^ zVF&gBVjdJKeeW(E5$Rs`fvcOsdL)Ab+i%b*slOTh=9V+{sOv7C*I+ zq*R4m-rnE5VE1@Jyl30UvWA%F^Pg{-DQM^gfji6-O3-<_Rk^UucWuyZUn;|L$S*L+06+Z%EBQw zi_dNb7I8fe$Y9T%w_j5ZWX#p$xL?e73Wf?gcjhT%LL!m=b#A{w_u;FA##6S*V#**T zlF-Y8qQ+nHp!3qkWJmy_)^yNj716sEMGq)E1QR964L+dYb~befY!S54RZ~IFYt4dAo5v0FSx9l>4~_0GF2xm!G98p!+;v?x({{vxv~E z4H$|akZD!1#k7)us%dougq_x`N>wm`S}Q^_>d?8>Q=fQ>BY{rOQbpX?m(p?wViu3C z*WWd`Ci9qdCJmc?L*ERg&4y?~90CCX>(O1y@SJ9;vtj<*bHGvsDK2FjppYlo8-G)c zk$`TQH83HfApKRrE+yWoIZg{7ai>#u<;JJjoHQ$8%?`-mzYL|0Kz^7YtUcoO?wR* zyXP;BH}O|rjT->K=(c3_{cUtR&uMc(2Z^tKhfY`fO7IVBv~#bUfY0)ugOCp%%Q}dFctOn2P!uaNA)Nuk4F~V+2x45s(*-$a}L!~gUMdCu^ z-<~(Odx81U_EntOPGi1B`*#$IUyV_5Ov7HGNXGVYzU~LT2~c5u{Zw}1@AEt5b4N|h z)L3EV;x$XA{9(QbUWuKGp^AYgjrG}VD~o~|MWTofdg;VewUzkBs>r>`?hs7q{HHhM zW>f?Z8@4Ik&J{9chs@rgm~0F?Yehy5?KcXPj_Q8ntv_`>bg$t0%b_bYNiIPKb&ZK7 z!t+RSxAl$}x+iTa{PuR0uJW4&PI7SFd}Vj{)N)s4qCwKM}JlQGyW1f?jH`9SZIPqbKureg68vRMJIM6PsxvzqEnbk@XZ|<(A z6)t~ayisZ^Y1g`;G{k#mq)XTi0wtTfbYeW8>}O?M!A>Q>0mQ3dE?SFXu#3!FkX2E} zkq}uLOmSSC)8aDGs042POaPxiC`P&X-eb6HAm@VxXf3 z%otSK_U~!k{}czib=bSCXgG{!8G3B)MnKaD#|?my zw=4jM6V71B2aFS+QuZGfflN5R9rx zosT|WsmtpSj$#0ID}oYdBw9K0%Sc|{daZA25!M{d*?;1`h=UF5D@UZtzHcMzykBNB zZoSRTuwfTP3Zmk1vEB8)GcfT!mG;fspQci%Ke}m6PvTi|@uK5#tBPhB-Z62%=)S-4 zCa}i&lKr~JWB78zJCao7+0ctfCAw4ZEXsur*=MR)`IcK>{BH zDiCh_d|O9&P-mzRbRAT|9~ zLOwp^PrW}dl9#)FDC>_{S*k?hH|v7WGx`~mDH1Pt;m8N;|F~9>@zZyO<5Vo%U5Mt}9DL~3F}#}vg>}$>t#}n* z(XlF-)*JMf&D6w7Ab#kLVooI0#AZwcCnTDM<0os*F)AKC5*ZVz;@{_G{u(&4eDRf184EHt<9@si$HVWY zRlua(htB7tX@GnO|rsjydO7+Xg-#+UTo3Q zOjPMfOPi{9^7UwEX@z-aCRbK-$9(HaV5;O@z|^?=O@?Ge5gM(9iHtN<_v{6ukYA`x z6+FyCQS@3Y z`TM9m$>eag<<$;7AOv_Hfa}s}GB`S`EhQ_r@&Ib$WE3mHZ2Y(OTsMGm2N+@Yq;K_NNIB} zKONbezUU;w5s|aWOMlbL-}vqV#)^7}y~~T?=sTJp&@6iBb>U({z($J?;IYiM(;S(E z&_@TPA&!^L2S_cUp1F`5aOL2zw`0U`5YaQ0U<}sV%r4pCwY#f|S%EK$~DKif} 
z|MxT7Cc|7mu=O97UV3~2LrEcc{-!`=*UL-!`z<*k)|=Lv>w{{obNw-|Zw5!>6Q2R@ zL`hE@_AB>;i2m!7>!zYa3r%5>Y~%Lt$KLnb!X{&n(IU;IS72|wxY0Xx<$=eWwuqVM zC}5ftN4-yJ7$41Y6nH@atsgzCxW;zUN0*!^i?};8pu3(MOvQP&^h7Bi02;u@jMD7k z4ZCPR4aD~AqZlcFLwd7#&ucVJcI$JvyeQ9xwBnohq{`POk6y0|Vl0vH?yyBr6}=Lg zey9Czk(|#O}00Ay&csG7XILf}S3y){d6_*mb0388f@S=z8i&FobaXNSqK=a>;*k zr$4CtgPlO`*R#~!)HSe%1+IDI<7$NbczmMe{e@qfYOrWqcT)0J%Q7WNYhMyBgYNfb zv_!!eQ#m4v_fDEkRtBw!PT^ zgqKDRGPmG!_(q$?k271($>Wf9IJjDV4;*V4cFVGt1iZG6#8IC5kJX*eW}vKl_Iy zmZ_?XOrNiiLrfHd=ZPfX1^$G6a}^~ z`^5{n0DXfddQ`R0I%gIQO@h%K90nTl(BaoTjE?sp;LiJ1JB?l3{ZFYc$9-`3&$J`> z_F57a0B~E>Qc2Ci3BN^2g2kgL1`vF=(Jxa-`wHQ8M65WsyO+|sxoKrnz>i9AJt7fz zf#fg+fS*Mi7!NyD4k3UoxJf5rQg#e>fFCu^2<_^5xI`Ndkg0}Ek3SXg=?%=#%jbcU?aZwt(O^n zcO0a>5SplubWzJ4LK^rH99z}`+U(r_6N}SH^aBON+KtfN6isNwW zYTf2L73D=ORC?V-@cavNrFx*FmKMEG1tUHiO*94a2?h+)eTT&;DmC@)wxWiI$1-+! z?UD_an~^ATS9th@*+Y$4-LjBfV#WOc>G4$*jIhyLq*5J_u;P0{A%Rq?iM*yC;L$u= zG^$Vf^*a_g_8Lq`PtF05sQwB-!j1@fnAGmiiKs}~21{I>j0SSl(F|;Cv|rON+Gl+` z^sOmrpPE_Yntsgp#@2xEgrF6PVgImCK8qi$VZ_{ID|i`CDKw5d*ved z&dtL>9!hb)gyDZh?X|+|A0%Ya@5(o*80a`PMVKtC-{X`)=5^pM?na(VFLZu6ZPC8T zjg8rbLhyvHvj#By#2(%MggsX+61kOtQfb^x*gd31Mm$}50f4;$`VC~(^vq3wn0%E1 zDMqjLuFJ!Eb_viFWP@cM4(4lsGMJY9CerH0|M8oJpX6x_bs|Pzhws^)AYqB-8ZAPN z$K&Apw%^#iAXFaP@qfs|-DsDz$MEd{XO?dJww*w$ua-Xxb%wMp?&vmZ zV6vMYUwE@M-6Z<9kqd2HZRNMKvx5S=FR~mm0`wLf*0lU7g1j0AF#As8U6-TfPA~7< zmCkmb@6|}8QLT5g;%Oo`+PkxMWWpK^<`LxnIJDH=V`3V`nkdSZa$3y3KUuXYzyC;w z)klZCW*l2%yQe%Kfl~{9rS57Oi=MNpyi~cx7{EG zvZE{m*kTpyMg|V~IhSju|tuLWhlt%=SdQfn4?G0);dLYHzI0frsZy=5?ZI zePK&6AWH7e&F!meJ5Q|JqPJ;@Ycfg;RK#Hf0T(2dY$2ke^h4_d?Osu2%FGZM*^mt9 zLxJGA-nhFt<$Rp;Iy-}Quhxqf(C5Dc13PuF$~6h8^1r`MIx-$F3d$ANSBU}f z?QQd7*xuCh9(MkWVx}5Lvt1W2=}ioeJ#J`e`>ih^q0~c`?MBGL2}^zCzSsaFn848p zb0pbkkq6*))FkhoP;p`O^cZ~kPqI8R>{v4J1Ng{sBhPj2iqTzelfG%NU;hL^U*)+m zU0{&_yO-!Qn&5MiBHV_bxUhrSD3s!_7Z3(ZQN4!?AigJq2hT;4x>AJxR8| zeU{c>_JeE-k)Dklr4MxiJTmn>#Bfy&4-igdl zb%rnz`gMnKGPO*Xt8C8omN%=Qf9viFYZuL{Xfc4DO>qu&PL$%Tr$oOIjQIg=;o5?^s2*!m?i9j!K8mw2cHNwAkle$qIF+VORyX) 
ztC`$5x;xBgdcbl2Zs^YXC7>Slb0?o1o8t`-1E$athAvodCZF3_^(A8Z#b5&--OlXZ zNKbBjhqdF5#%`Rle=i>FG76k6ok%2IKMp{w(ykB!TWjq^MmBcYN2JPIf55PDIR>}B z_>C&EICeYtUU>6}s~eyMMc$}1Z!~FSI$b@6gGUA*FyJ;*yR=<|{vuh>&GU4i-6%5@ zqO_NGT-B*%i>?4W>hsnX7H!AnX7<##q3-)pZ@GTss7p|ppsh@LE|FB&z9CONC+;TZ88&xk-+ z2&h(?OxA6eb>Q)sanMm-ZRnOhkCRQRQ7u<~;swlz_1Qaa_h>;45#$bhhdwGfP2&RI zQ7N8*sGTGzR)CNR*~b(F;A;R?lHK=BzAf$A<|UZO@z`Mp8wWr-_XYBix^5nH7+`9g z2B>TH@9)Hnz5-6qjJHtwH?-&eN$jN_$1~j~lV^Oqn*e3a29`uR*j*y7J zrk7>Kj_CIAk#P$m}DtFZuY;C|1C8^h$z#+C_9*_I(GS5A!^xQSs{7 z+yH1Cple$zE3N^Q74i0X<>@%vP+@<*?q>_+eG!rD8Cj=y7uvTv;nT_ zZam~o3AiDaQMRN!(?>}f=HGs4IzM`{4i;-xz5!FDzvSY$ce%Kf1(2E_?l*Zsn^VrG z;^`FqDrFxSKJ3FKz6Zo+?qcS9CQ&oWfBhUCgq15Z58bsA%i0#!mPb`0Oi1a z+1`M46#Y$mcxZt88eol;3e2&R4x_GsIB8>_B&;|>alQ;tw0jQ04x#>Kl>VaP!uMSI z2nEY8hIT>!4aiKF&Vsr)gD_)%fe&I3E28_`EX}F?BC@MTSu~b~xquJ)dzrGC**V_O9dpe3jx?>-{dLnj%M|#Mk4AwN zrAg!dmhH)e`3Hpk=YQQE{-D_dlt@EQwgE`a#1z(khsv|p&i8NJ`ex#zkwcoZQ>Ic? z+7b9oY?@YOsV5sBehHTJR1q{)=eCOy0xe56Q{QR^xRH!^BY}v!q>7A%%vGK& z!e0lOocuYc`)VrSlM#EDm9es^L8`1-Gp)Cv0#N=(ruj)jPIvNruQT?UFly7wpIfrD z{CgdP9fL`34e1-M;*q_R7`#6{&;kdKuK^?SkprcUxD5KkQX=g0>x>WPBE+-xdwfR~ zg%dcl)#?QBa?IHH>R^lsF15+iZ-%!OT7V5hy3M=Ku{I*3<5`Y8 z9|XXgFyFOiv8!%ak+74dR4wM5+Cc2#E_<=TjTr@K>oL3Fdfk{VJH{`b*bHF%knG9r zcvI#?TzCN(z0Rzi@A)c2=S-5p#;EOHmxiF66w zii5k)+dd*cY6;U>jjQ6J$r0D+DN$KIx8;?iUqeNZ;Lm(RU(c6@<_L(JN;fqI-b=A} zp~L~v@cW@N^OjY5Wjr*$D7FJ;&>+$9>K$d3NIbihmuC8dHu>Z9t#$b3?!}eLc@Ct_ zWvz?v_I8g5OYZ^n(j>^F<7hCQ8C$>-rKb9Qa`XMCLpJjiR6yt!lw`@XwZ!6-C40bw zHW%+XHcxLZl?wHO39}G?h?I$v^4oJ4n`{^&oJmSI@C3j=c=dz<7#INEWvtT^@tA98 zgjL2|Hml(&54opEKCPY=4MxK*=fFCEtz~`%Ai+gJp49rSX@3f>`6pS6hSmMM9f|2r zP(o&}(%wcY;e-?5ba6-%#fHl1<2^V(4^Q>AuIkzTg-r-?Z-zQ@DT;9Y*>_@E=fasB z-W!3_eYz)YH`np{F z(LF@R<}hQQenb~zLGxvT)hbwJ4Wpo3vX&V%)v=2!y#@AA#y*Per$-W`(B2**`8R3$ zA?B~08 z{8khfl_RRdKihe?!E3OLCk3P2f~S>I3GOXyKq@SC68}MxhV8&VcKH}f4&%LCw87x?!-L9)K4X2Tsjc=-eRS& zT+O$NIel*WMS#frK|q6S>k`<_Os;DT%rP|Tm>eTK6P_0ch6qn-#7j6pf83#?e(wMd 
zBg6UN1PsK%xWU^vz}mH1lq|*l))Gg7Qy6`3_~aUy#>*k^Fr2CHYNm;?fE4m!A+08z ztx$5%j-|9&#qv8J9NHebYev#Z{qdW)0gN!+I?a1<(1QeUS7>&8YU>9n@N~u1Jx<_H zVArK5+a)j$Rs?*?AoKKw@oY}F%wWu>4U5;{4Dc;PTPW>f97Aady)$M)Y-0bmhHYdn zAAbTaEhy59Ho+i~Bn@E)$-8F!Vm3nA6g7WT3{a-qC)6+)o6-~0vxmP^#y9a_wg|JD ze6};COWX7*y6Rn1Cphqq?siW7FPAC)a7R}mfLEbo>+Y^ix_|ttMF+lHtCwOwVsMU| zn$5tWfOH>hLF5{wX>(Qd^z=86)N2I%DoOw~G-nOmC3a&Mfcu|SuQI68goB|$i#GyA z2Yn2tTr#el&ZCQGAKsn-cDY2LR5A?M#V}0k{@Mm^5vVgx^fjvwiPi${$F~$8IGu$& z#PM2m7TyI+i;Eeg>eBRO{vs)gp3X@Vb!gHi%zm#DNEHOA@0wh{%rr{7!rVW?n_N0H zW!;Rn!xsxC)ycf*N-5+mZ!O{0xdWC3Z@$F+{xl--l$E&06rdOt?lF=CkB)cc;@xIy zOTV?^tSg8INAY-6)?X3GzvFaUTI&^WSOr1&<=p8p@M3B6!kV&k>`OuOc#3#XHoJ0w ziYe}h_Yfbx!*%x9Gvnxm){MS$C7zQ=!aV}yy`bbZFn4$| zQ1e$y(Gi&Q4HhLbG%KL>XE(#6l*}E$#XeH-v3H`1Ao7Es_LfEA#Kn;h7$a_WAF_($ z1@(qjrzFat%izA{l_QetAzof83x$Fh7W=npJQ;6iVC^~YbmG4 z_k5*L7niYHN8Jy{*zi;xm+uWaVn=Pz*;-4D33Qnk=SMD*JVjveiE~D>^WY4=jaXD{ zz+H9CXsK&hMmAVrY_eI{z%1*%jI0pw`}nO~du>j=h^W5TAB5zS_Ie~o&7Y>!O5~Kj zeDgUt1+Q~w>qPQl_dIxq;Q-K~=qbdw{mr{R=lEX%+o!z@9{ zfPAjRk$EAXke|Pk6s{lB3sJuFXUQ+4sICM1Am{k^xh3L4p8*sA!(gvW9g%Yjg_k!n zW0nA2I?LF&Fj_G+5-;dlTRYG(xF(b)2z9??nosi{oXt@AiRQBUkeqK8joWwajy1 zvSQ>{cL{1R(N_A3se`E&2=y;5XpTkEvC4;vTS!}L=mzQjCgk%69S&yB-5*_tFYtHP znk09xq4^IjZ~_v(uo=r^gM!jaaWu&=)*TR`@q${!Ib=pUX9KgwZ}zd)T*P$?TU;oM z)H=Y^=duea*G}h;)ep^amk7B38ps z@B9!F``CDRSE|CTX8bQtW$M-y=0}d#K7V%kjB(5z50r=<0AcQ3&-E)Rf%3%HhNY_( z?b7+jqjhyf^#fS*)K83h1u$ z-EoY`Ou<^O>BCDY-DIlgXuY~Z9k!jLx`<>xe~+LOfXah(7TW~J@(Q>sO+fZOHMKP& z*U+r8<3=v?h&%FoFg$B4lTPa$tl9hCAaUT812LYxBdG4tGdXQ{d@#gO7~%X5X0peM zizmYzpVjBV-b>)9Q6s&Yf*_wNUeRSvtX>U8jI zq+A^F9l&QStzjGv%kwdRrwtmieRJi|1Z?Ipf6B)UED@scpAnmL)0R{t2+e7rLXm`F7rlyX2d^{`6X^C3C&1nz2tcNf$TL9#>&M&Yj| zqDvZ%eoUwRG51ST1`2)vj9i6)hOU7R69$rrb)H8#D(YMYGjqJl*~bj;#e{dXup zoIgGg#;HgzzLv7uZAvv06;~Y6g=K`L%9L!K*|KOz6a&zA0NeacsYZ`%RSw7(%C?WD z=G!b6YmAYQPmWW97z2_+kiX!O7(11Sf0b9KwIlav>^=T1U?dIhE;kROV$Qm4nZ~+A zU@v5a+pfjjYfu|@-fM;D%*^jmN^(U7U&?L!(q2$a!*VRy6MS-J2nkbRT1tcmz{bD| 
zm#3EZ^)xME`Rg8^#T|M&e9)6EV3rH;q)1;De4!%t{K)??^mk$70Dk-}Nb+uq1Ki2E zW{&;#w35ufkEDCQ3cHcCemXg?JrfYM{Okxtjj}UQ28;?@&S2C}1gr$8Wqz*{!5PiR z;Oc35VMDOf5AWVuN#hU(gYGw|=OV+eIG8{grc0;>oG}n`LM(4Q%&POi4P=n=#f-mm zk&jin&T(1iZo1oC&%*pbF8u6xq3=4;LS$|}$cm)(zUB<%aNu@knwsFa5u}(7C*?Tu zWo{|C2SD<|oDyV> zs*|a4X{7N&wrLXxF6 zg;_1?TP-aJj?%4wv<}ic_?$Lga*m(k)Dni1Q|0f)q%wY2?D*P<7c0$=gmV}Yd~@z= znVJiil6VY3WEL{`9aj9Jg$dxmL$q^aHu~WvHzWg&HLBxqbTA*hS7Zl$4#I&K&pGFp zdkdYE$!Dyl-=7|G!{51b3ZwV=$P0MHT<|{zo<1P8l^Y2-P@+d@IRUm5__%!*-gLmj z;gDq$Ln?O7UNtkTvf~kwzt(aX5id^NNpuqvHXf~oovNV40d~u)FuMyah*-pLlviE6 zOgsehNIV}J3o%fR&J4=mFD+1FhEofY>|3jtPw99DNi@Cs@_ifXf6U<1siJ~AZg^x+ z%Dpm6I-XLMX}S26MF{yz3e!PXsd5$MS<8$vI@#wURr&;#W5E~nJlkn0t}vR^32IOi zn^)A+ZiT+IsBRCO<-42}A>y44!zEVgv&mI;(X(s0&y6dgIt#nWSLdu$S3+| zX5cGrp^V9Ab$WLAWSufsqwaGupt8ERg8a$q_t{4${l;u?=oCkEkhTxSHa6ES$&kD$ z;DUY(u%dPRA-VAOOgE*S>aLEmE#OUr2q~j}26X~99D%G}18`%6y4Ni;e6ZDX18gq$ zpWC=^KDUfd@p_jFx#2uvwY_{M))%Hut7r0R0-cyXXBvN#!xfmaNF|efS+uEHDnelh zfLp05+&Viwy{eG*AOe~yehPIKHMd{zO zLVZ$+@SDDWh<`DL(}!#E_z$j@ex9`#^lik4JXS^j zZLDeKe`MJ?4+z^wtOzdtm|imFz3Yj2J>I4n*L=Y=+Z47}kO(_8I1`iSwk6ZXGZ1y^m6#=@HrA zwN4KQp*i@ZHb~Z~0AohbGpm~tM|X&63)lz7k^q2?AReRCj4*Mr-VPESb+pqrP=ion z0-l$J_L=h9f^UNZI?d2DCqc1_zMGDO*t;?Z|El9!XH3bI+s&o}IGX^3{IY;$1)>+d zEs9_l&E!6M7P6CbFE}r{csnGc6k{ZIpAu+!ysRX^XWX9Idx5`rCU9Z_3N|%$@Qa^x zvJcU(K!M<>7%k-JE}+ETs8!EId~{%TKCqn5fIK{PaC$tDD$Fr7TgxQhVxO=Rc4~H< z^^;1%ex*91T29a%*eq#-dN4s7X-Dd1KLW!dlSIJf6cl)0IdDFA*EGh&t(#8NMId@+ z*v0ys3(mFTd`(z5`I3cGR*${$3erd6W(<;I%pBba{M$D3O&LpLl?VHY zYhf4&c=YTj><-Eb%A=<Q&QP+K1TCTV?bg!L8*qBx;@$E%r z38Ga+&!wT4A}5rE4MG9`zSgA?`dTm+o=V@Xj&$*Hj~ucqEvx6UPGvNE2dEm=F)sUt zIJTFAqWz=fE`mpl!BNX=*S6oDM!Hl_Fx$EV^PBgTN|w?r_FYnYhrfPXg@6hnac&QV zvqS0NoYfgHKf_CYI%%pmuDtkM3(^*d-gr6q6nj>m!nAttJ28QUX(SpnsHZ4@s22K? zZUj}}|7-89gR1P>|ItS=knY@qfOI3VX{7}z>F$!;bR!a*Mp9`33F$_*KDjeD|o$Ni%e+z1ZsyNj&lgt*E$K zTz18uJ>~8_SD~Vy_r`vZC7m^6_cM7evpE);GLNIOkwt3dPtHaVPMpBxSX)}R-1u0! 
zdq0aMDEE+;DI4Y`co_`Jq-yE95k*i~p)Kd(n?`e*n6>Ecad9Wy) zc%e(gj?$`k0D0*3JAgd!iJV2O^FR*Z9sz4l!Bq{DWlKAFR(G!@ZcqNbuoGe$FKyQ@ zC{PLAnGz6}>%B!Bs}Jg>H&U(!&eg5uNPap|1buhw(S|v_T#^BoJNbYi$i2ZE*G4$y zuH;nFln7R>=%5Y@W)5R+*5y;~!=;W9yfOX^F6(fC%Y6v9 z{waoB($GcS%=1%Y;aLD;TS^a~H^?B-Ic=nzZp@;8sPyFpa^U-9J8un1S=H}!o7c}p zBtvwbz)iM^KIdZ~m`9Vt=fy3YkF^Uw<`ZOD9evU@cGF+9lt)u|FfTSec$Z2;w{?j{ zAWTLymjk9n^$U3>G8lve-5o$2^4ivH{{9st(2)BTAwX%Ffe_eVHs5 zX???EB%Lqke#6PF#6#+9(yjrl%@}hn^x;}}0i<4_p}JUsAY~JTih0rfLnFxTxN$D} zfQSO&|3l-Hu4Q;XnXdn!DT5kXXtSZvRlPp2{w1CUT?LIx-r0z9s3H+i=5e$IkPv6G z_+yGamhz5|f8h(g40~t2;g8mlMeG+LV@v&%8v+FvNAc%_`}%D20hUF`R7NPDAIOX3 zZGS8GuOHwF2cTczg3C@J9@WL({bo?(2VhbG{1ymX^XSYVrw+x{`Wa0`mjl!>ka5k- zv)=sf_QV1dCS_xypu*DY(U;>G;HGilG=}_d+i6>%Zwh2#mP(_2Og)vn{LwvEs`Eis zdssEA49-0O!ZiStVvz>)LD10!sFX|J&EYh?at3uQ?R?rFX+v(d_Z}gf%m0QQKvYos z{tM(0`gvucon=XGbg4PxwCW+ci++^yR|T5ti`r#;Dc?0E(U@lK8Fi||qrj-I01ZjX z^DTDqK_1Hsb8sbX=(zy>g75;sm5lra_2qRey>*Pm@n$;B5rALNIxdmD#%i&Cy!|aE z9=-#)wozW1_dr@qxlR(6E*x7%0w8{;`iqU=(oUN-PnpZ;62_J?MoRa)^q60uieR^Q zP~H2eWwZKqKZgg>Z=h?B2A)9vfGem67v_!DE}M1)qZ@~L%oAh+6uTFz!vOvTz#qC= z8(m!ot2@i5P<&+(c(~Atdjt?9Yp+>2HI+ZR8|WQ@v=&m2uglwash+l9x5_3A(M>~V zfVOBM(Miy_j~z8i)zK}y#eq~avst5Vt_{n+0wtM0Femi(tJ{Yznxpr4qMT;^L>lyo zxb54rrc|A35iZUF1m$<+f{&UxJ}T8UeF7Q%b?Y{~`|vi#nZoA_Pg+bT1|^x55}r5% zkSLayuy0Q+ctPdIZKO;%HV_izE$m~MJpo{8XVlfXE-AtS1}e$m?FxT1o;c=GQSC?& zvyho5;sb+hX8`F#`7k>4Fi(94J_zTLRThPa7q5BjF zu4t*PyBhB9o?>m6b+(GAX&Sj@JKTld{(6(aN@DH|Rp!n{Vc+`SL8^s@PTBQAW*VRd zsSbUc`{A5scjD4FiJ)EK{_d?_vm)=Im5k8~iGI-GmphaTL%=!XqWrYg6-6xmv@0zh zfLeaO5mWfh17+*eSveu%g&S?-(r|1L6RAc?1_-+MkUi4XALIEuy`1hn5JKL1xDab+=+h}EG$W|7DMzb zRyS+7vNFib6DLJCU28|%u1t6WBj3KhIlzyG`0gYKu)}ISuch^gW*h)S?h(l^WUv$! 
zE#MYq6e+7CQs+ghMIdy)jr?pD+q(ephfZQ24R8D>uq6$0(5B@tZ-R|Lbw(()QVEmE zYSR;d2K5W(B!;e|xO?VJ)z;C|a_5rMxx8+W8x&rl!a*cWNzg_E4w7CUfF=HXb^^e4 zE-{FF{Dw5b(jdy+769Pza_`+6i+7F8ua$=?e%!B!!Fq!2`|-7;zXus_rZXq_gHysM=QI>v*2ZNT0>)v0!{{I|Hbn<_^YFHWIRQ zf8?pA1?UX_Wry=@Mr+Ro=}128h0cn>qUTRv7i*)s{zc>iKLAih06ZfH5IRuEvO9NF zQ@*zB>TGZ}Q(nqcpet{Rx#Ez)iPF`^+4nx=oCl2EM=!a-6PRcl<^O z7ccyM7umq>Z-|h~r;m8mPn`mM?YV)xw$9SGgq{^&s#06L6WUq{u|Z)E=%*)l*GoB< z`He-vPXLvSkmW;~L|~tXFSgy`M!zvIv*F-EA~62~vFin>EQ4T#iouvm&Z5^rg~R^b zYkwTU)9Oo*j9;M-k+G&R+T;aGxW%ZsRGSM28jsz9M_}?Jfr5SX!eV}Mfj0B?u0rfH zTkhRyEr3mFerS8{CS(hYQ$o_88?Ea|1$^HHcCS3!6^KE$ll1FnJ7g~6FHXdN2q_IR zkKVt5J}^0b>{I;F#5B8Do1h8;P?$7f+5As?zk^mizbddX0!4^&Cy91!FQ*aj3 zD;zvg*5BMve}5cE(Y5P)%NYX9KMkpl<*wK5UxPt>Be~w-=hVqB;5D{n0K3Jtabdg_ zy*D?M#cgv4k_zSJs?(V;)FU0!KWmgn+mnJ~<*Q8K<935WEG3(Eiup4_+~&(5ei8u% z$zM6^g+Pg?qbBH@UsydVisPa*_>%TE$Cj*_^3*(w>hgPHmQ%j{SzRRLZY8t*q56p| z-pJ?62sQ%Ry}5XQzQuGxytF$ z%Zc;755uYd&>Kkm1E!3f1XWrRXS_x%kqZD6>m=5!8fBC7siFIA?LE4C)7RQLv^qEW z)cF{0NYqc-AfQA~w7!pPHi^%GzBj+{ho1J3T_aA{f6Jg7w$nM_w`u=*=NY(yqUG0% zha%cK?2i$-0IJ}=KE(=jy_&--x05P9&k8=18&<%)KU)E|l;F+1c$_~9C~^q{BtP3_ zIuf-k2L?WovONd{1NCtA^3pAYOw;_-=${f^_1_V9S8R@GWX3>WS$WY zNaK@GYN>y&)ihXN>*++f^SdD;Xk878ZAV;) zz?f(x&1k`Yq-Kq34 zH)AX-dbw0q^{!qQ3j0V38Wr7gUmz$!_iY{o@T@s4TeZ7v7&{Dwye_#NTW*oLTx5oz zja%d;85XXUTwcGfLe_z2@KI;^wlx~ftcDu1{Zq8lb^S%ffP)Fy=^+$qhFq3W+N~Ut z@DA$B;j^w}@wDIAQ3R4`ZT&5i(y=aXi?m+TAz5@lpeqcL)D`S-nw$qg$)0B2emlaf zn4E~JD7X%hUIg5@X(##0b(@R4q!M6Ec@<^OfvQD8HAMtm7b~8d7XAm`#$J|IBLjG2 zz-adbC14FX6oS0QKs^j%9;F_``qSPOi}i>Fu|;|N^q7t5P( z++b7>vs{R7@i>3UknG_@*A}y^G)mt^2C(auKiwX^!$cAFX_7DrUjZw3ui9I6xB~Ur z6~pW09XxbH%V7olfXVZ+dfgi1vtECUH+K>#94VI{qMK$<6t5o3y(P?=y^Tazh{v6 zKJUpPl7G6lZYw?_2)tlc!~d)aG&*>K6*^Cp`@ISek4`&DLsZaRavm&6p|Z^89+A*K zKqhjjej69?qCGZ30OJoe0qDQKcz5rUC%FzRE8O0cy1g2}YyfIw>+tBsKY=!b|8CqA z2=H@(_OgC*i+raw=2ibCj@fy|Jb;;SyY5rIdF7<$i|d7$_k5s8>pv~ffL8fS7SW%a zAMa-A&B5sKdiWT}_4~R27o6Klh}uj&P%>e+B$hw8aRjGO6c*aQ`|s9Spqq~9PvmRe 
zR@0-DdP`7le~YyT9GhR@e~<1j-2&>FxJ*xJSppzsa2wzSl9`|HBIdu__UwJNrMc^Y z=trUTU?4Do9{{0x0`mc&<$~OD6X8LJ)K3Y1P*WGftt5Dn>9`)d3~*>``s#JvN3&a5 zfiH>tUz34KYb-?3#N_uN+U8Idra3$?MSHRLZXM^5-lFa+ny=v$9M(>g0F}0cDQkqV ztj32$*p-aT_Mj;{R_HkbX5IK$j^m7ZMIu<9$7chepMircs67i5 z@HuH;q35yrF-@2>$i>ftVn9HjUH~>2*w>)y4b+55ZXW}qN6yu!Z3kJzhUV-kE!l1R z-T!)3RG^42HI`3mzzCs*JTLvE1)zE+MWQ(}uwx*12yV`?i3tMiSpk>atV7QoAhq+` z>*=bbzu1BEgfNq3`}*?T|92lka7}~dzB~>>sssyzgoHjVp@XiM>5VQ85#qe+PZbhm zp=6$4w`u`a!L6yF@4E=lrX+4JZ~cNG{NE2`2j2ewK6JhRtsj+G@Qhg?w|$K3{{+h? z)cFxW7T&@T!jz1YJku6Rg^?bBQHw@D$-|{Q=F?JcLFV7H{_c}3*Nh+XgOAJ|_UslT zkr@HsMROe+|0?em+~j}Rt@sag+dnOb|AzGXXVyPpW5WM@%O4Dj|8ye$1D*1Jt7-E8 z{~5RKB>rw8{r}nBpc&NLkL5U-`x}PRgt)g`TlRm_>0M*!g5krz8)WZ|3_70b{WAtZ zg$t-!U@`snDfYa9_?cm~b&>;;{2zAs4dnU%epJr?OLF7%X0ir-hiY47YvE>6iu38X zp+e=1w2*`lRtfFL86oRb;kOtlneo~o#ZBxAmOau|k(6p}krufp+6MyZm_7`XLT?!@ z4&3{9mfx}yNe?cq)}+VjL1JIi#PXe+nyxSoAM;RLyRBY7_Zi)w-PTZGi>Wx*S6Mu+ zbLw^GZr`5F9Q8~QjO>+(7@XLdbsDQZ$i%cZbQCWQUoJW`7~^^D%(!e*+cac+Xp%w$ zz?Tohil8YG2*_&Jy1Rzy8JRtSPQCTI>>d^0G8``odO4e*vu}ncn#WhfSgOcJlF(k& zUc|21);uPDdLi;)bji8pJ(QH;hqRyhw`a@h_VEZ=@eFaI9Q>oM`^U}c0_d?`9nuun z#35*t&58ua&25~Mb8w#_69iI!gK)T&J#T2g#M9(%8F9*1=+iW_0GPx=0E`AK7ljBx zC(1KV(Mx&eU{;v_bSKfEZgM?DB5xDt{+z~MLkFJPc*HoZyoHO7$^zFme=E(RV2*;ZhcBuCQIXIy}apF-;GPd2crV!^104g_wdDW8nk!D#^HU~ z(rh<*Vu|Q@77j55@*8~uTtb!~uzUPz8+dZ~7)w=aTV^nc@sMxKTn1#bMnp$1%+W}# z-HkC8vtMlbS%;lSrl7hLEGH{6Hhj)^=KF5`$loAj>$L#{YmmJa`We;=jb=fUYF`gd zvaIm*Pz)^Wp=lZD!8Jv9hkJh7!?CmICVe??&Xw7b%hFmn?PitW`rz(3EgObJ^Cdcc zcYijnPs?3&7usRBLB;vI418Ht6M9bf7sj(P#$v*Qs%(&+xRl7EpU5q!xhaL4 zVZam24;FTRERcx=u#?vA40Z^+j8IoiS&1VUlrV?W?KRgLgou-%db-u(50GP?l~)k_ zc-ukj@5z^+td-iDg)`}>Rqi>7N+Iy*2W=^uiQpDB5j^NpN7xj?iWX=}qz@T_>O&+6 z;FLbr0|D;~*?K=23A5D(h}Jf+L9uZi?7p3!ldD% zcJkG_S4S|YcblgB&7yeJr6B&WNLqvVdplo$S1K9e)=!PpJ-37 zVR*F#sVBI-b|@H+cS`$&Vg{?9>272-$x7$Ja8QWwUL8vxzIe2T^A#upl@+94 zFjvj1@9j-mJO)|0R{y4mft;EUpcj+J%AdBCRo1>m&8uvYFJR1m&*gxwv^VVC;fCFs#LGeMy2DmC+G#v(*Mr8Eoa;+GS&}Pi!D&3vg8`9UC;Xz6 
z7pwD&<|a;S4%B;sh|Y#XF^c{jw|)Fwj|!V@7uPTLM1^e7<{#2SlXK{&^dzkc6a!BZ|L=tdb}GD+n;-Md0bEq=Ncs_8Y_Ca_cD9O2oj64@ID}uteq^BqCM3Y zrpo353xMwVVS{7X3q4C%r~_LZKrOEAEeiE2)LzndPu(fCozo|oMz~|B8UYIkbbH|Q zaJ^3lg2^OwaA4MFnL*DxZA!(2b`)mQwy>#YBE@vbFDF<>V5a?fE@5%+0rGXo{bajW z<9R1bBztZJ(CD?g1as6MRsri7_!mVSwgY^S4rmC~>$=#Vb~y9OF?qyT!w619l18xW8j*5U$$iFuSioI ze^-&6LGx9G)y6W$rP1u{=9UM*#+3rooqhu2I| z!0#7`Nn!iKBt(WW7T&jGF32@Q8R8;f1OZeh(;Nf*WQjy@iT4Sn6-nJ%xKTJ2tYb`U#&r{A_pDyS+Lm?xWf< zj@xVg$bxqDseli{H0bLe5 zxGWi3L?ORVi$EFs;QtVqgoN-2XMhP$IkN=rv|)OuVxBw{x3|n@{e)4^JsO;8`*5MH zWA2z|A*hGP9%tUrtIj5{Qb){u#Koa`k}29rbW^m5BS~{7pTwXkshq%F=N9W_zOmqc=D!$deox_EHjX2;l(e$ z8EoVp{W$&MA(;4jxOrVxh1nqN%mLRDKJRkI`fR3Xu7EZKS zB#r$J$&qDsgfx6>xVNCiS7Qt=HY(_(MO!S+%7gjyG(~WiG#H)uxOpgeA}SP5#I0?i zNkKX`5Sg<^PHO zB-*C8EW&Hw`~kOlj^QwCYVkw2Ap0`#B=HjPxR&(5Q4+u=N z5RPOEM-OUYroyrIJgw$uPlEb8^AK@i^RN(Mh8g~t3$e-r`IM|4$ga5W+Cmrm%wzhJ z;`S%o_gg>OYwK*o^_Gk*t^NvT!Hy=OYBVoS;!oYQBqhz}2h3|Uuj6xRzH#zWDopSb z8LI|s%c$lE*IM#64$PzYagh>hQO~ymu`HV?^m|$DM79s$Nm1Nki()sU#8;{cFr%}C zp&_>zSb$f>@ff;AgITkX4fusDoNo0$4{l1is`Ou7V)?Cwx9+e4oKN}$1|S#(m{odt z^Oe!=XYjrXIPw1BGR$5h#C`I4{lsngdFFOdpfG!5VsF(OQ>@Q-ZSPug?mMEnNMgf3 zl2}K3*oR1zIvZpeDK8Q#f8NZKdy)FlrhVq1oy>G=PP%+tDs?e=i!%(Ss%jLaRD0ij zJMV6UVQJIJuvxit^o~fnsJkYP^yi`enfY&-W$WWH{#+|r*=;Jgl`UIV38ygU=V4bUVQuo6lLE#t8fw*zGx9zTr3no65+`BzmqZW=i zleD%DvUlsr-HbB_yFiFP+(FlUmiA0u9Qm2xYSJn+TA9UKxn4qFIMl`hgFc{+f++^k z(mivW+*1*wEIj$1WzjAx59wHB6PYB_1C|Z=4$1({CLDh#wkLo1)=p~ANi9|$I z;?0tcX_-BgA+@-kyLy7gDVbRM5G+hg+~!{7t4 zF0__CB=_BYNJ|nF-V)Cwe&+LHE+{$v;@y^8$d4(iW{`2`G(KHL!NFB}2*Z`Ewc62>cI3SvI^^_}L-W8~T$?tMd# zX-3k2w~huUxcOA<2dFmC8$R$rfNXw2g%>hUn4`by>Gj%=qmOtt2?Gt^D}J2$gRSdx zb(C5p`UEG&;lQ_S(FvE?q?cs8#IcwQKwGS+{CGoK&yXAhLCmPQH4LMn)_*y>KkHk#M8PaVUUntSCX$6tzn z{07}*xsUNEAa+LFoEcls8gur<^ zgvQ8_6H zd{x_`Wk)uri2Rg!NW}=#sy@XdUNZRK=XnOekB+$Y`4~uxP>yCJMHY{Y-blBv zJ5}=-;fT~lU3sVPuGN-4*#i9MhM(e>;^;v zDiG4F+Ke6(ZQc)q1g*O!(la4fq9$sk{&4HCFBxRhhW*gnH&D=()kF4jN^`naLtWJ7%8@$OT`~cxxX2;s5cTP=gm-gy)4KsY&g7l1DEHr)?B?oWd9^Y{x4$6;>Q$6?8^9K7h 
z#Paz=jzqQfnRUtB{CDoJk@K8de0SvjraMj@Bv%4O>3B^eoOgF-M#xkL0x^-NOHBU~iD+3<0!BZcwMZ^q}K&B=u5 zqt0wO0MjoXILk;@RmlzShD7aInt=05yFzj@pjZNJCDv=1kMb`*)YpM`G0)VeXah6* z--}7*C1}X7@vwV^h-LpBL{Lmq3+bY>@9$-`WDlH)0=+(2@AoezBQT zh)AMA;P=YYlh|akC=>TcuH{#K{%D>BA~mNKtA%6r{N2D=4n0rRAStByMfM@E4n;)# zers`B2z<}5dX`WvvWkzEs`2?>i!#wxPv`nDf-uU2BLN(?ihTCm z58*A-mC_%{IM>$cJkt9&kdcQM8Z57_t4YQta?kIsK;Dg#2<+QnVCw?9%+RF$D|{1R9=DlFP#3m~3Bgi2Mv5Zp$lT3bBKMo)#^W0Q9hV|>)Nr_C?eX z;@M=3ZG3Xe>W_kfpjNd zrzv)Q>A}S2y@5ngEG}M>j*IPTBcU!6or9H>)7rUoKQknT_MYeap+|ZI+;}>xHNzW3 z=J(B}{;f|JP};2zp>1|6cquzI6MKgbmeTCiSB0Nqu&n{34e>AcKvEYQQLDrB;8x~w zV8dnSw|F|K4^vJF{pDA*>aMf1s}maZEjxxS*g4$caOI1b)bXP|?aR2*1tb(&H0&<> z(wURC2n+osRECyjyLfP{^WiEHjs{oFa@CzkC1>urBkROhH}t@4T=ubi$8FGc>-XS5 z2(%+j67mI8lo!TbOxW+AKP0ZI=qa9$w0Q&JR2(OC2yXh|^if7~^saDFiz&Gu_mj8s zIzOCL|LQ%ST(}M3fOcpir%ADD$jl>ed0iLho&=$lwwV{lt9BNL)e9EL{1PKkArvU0SlXy zt7F%1W+oAq-1G^$QtW?N`ubHk=>}qJxbEMpd(V}z`3Fsr87Q{CnGk9$ig+hh~Bw6bO zBiVT`C)J9K{Jv&*a3z|Udrl{A;RS3X+pOS1lb_H$w@~J&n95U z{->J2tjcj0s_z&!|CcHA)OG1FY^A>_y+A5m80jp(2sfrbE_sd`UPIF%lwDqh0|1WTNY=P zKh#}r`kVXXwG1a<$G-L%t*Prjy}MA>8}HPm8E^4l<2St9OAphIhkK>B3d<%LXH*V$ zhT1q22(kxwN40@pDb^53DUJ2-#paHfAw>%kOu>dvFY3HKO`cwucGZ245JnD%$h`U2 zK+M6z_FFHD&i6J&frTk;{lEA#?i2J?#?YQ2Ppm{6_L>M=l8gyy7FJ7G<8en=VcN399Br!jHT-)#9;u>dS;-Rkz}m~7AL$nQ?uVva4!yf_{@VWilg%XmU~g5xE0MId zf4s8S?hvFEyv(6`-87AzhCvG*7gfLPPI}a^FT684Gqsy?J4^6btGWT~0Ax6(T7by# z-M>%fb0<`U>S?4IyU(jNH4nwBF-o>%VJKtAY;4!^&Q2Az6B-Z2pfg>~MW87iNCo|N za&fTXG%e1rzpnS5ou`g#ELtp}_6E~$B zIKA`Bp)pu(rQ-4M%r>|?_(8BgK@&q-{-62p!U#esaUa~wNS$*nKSS1;-TQDBdZRX` z%-cDdg}RIqCET12ekSdPXw&XnX8jd02ni96)?@DNgXc6u^VFH39PT(rRW~>?D1*%;64bD|u=Tt#z2sRf1jg--9DwanVhsf9AsHR3?MQU#pPekF~->ZAdg(mNh1TKG(z2Bib%>OE&EEo zDV#>@{WqIL62k{obJio4g8TMsNq+w9BnEs!>QDn@UD}toe~%&u(k6y;yqR}k&H2as zg}8;QrM@K&G5CiuM#}iR|434O$ULzpLvppV*Xra}k_%yD6YIKak&ZmGQ+w3yO2+@O zGCotl0}WlLVSSUq&IZC^Y3*?Oc&FsCV$?t?!M~3(kN{32d?jZJgv+z;ed4!12H$TF zJ{X;jv4G~{y=mk>LQ< zg_RDYJxNUxk<=tu(#HQ#m(97ro#%j@Ly#} zO@M>+H%e~MDLxFJSz2mVbgYs&akXbiv=h+KuleIN 
zd?epl2c_0}J37ko(nDz@%#J6ExxZ%!<6o~{msy$NHBO-BG12Id^8YqSOr=A!m)&zT z7qncPMYZCbp0_tm$=0fx-fUUomsJ0;ht%mz-^$DV8|K>R1-cTk;53Uo{AhoE7qIh& ztf|}@fgAIhT~=Zl0_jJZMc19rK%xulGMNR=>o8n}>hdso-UvB0iriF=1%;&hf3 zI_8f{Big~w|L84N9*T0yFrY3aD_xAZ)YaY+PZABt4Gp>w2j2-7}Dy0a}-uEwer}us#dD& z+2rg>8yOE$5?*N5cK72gk9tpY0@yq<)$VMLxUz4`VelK;z0pag#So8dW;4}C-Oz(`a^zVvCtO9 zbJ^Ae$0q?M@)R{ps|*F%!^D$LqN}8^Pc^koB2N+_wxZ1ZkaL(USK5X)9Gm$&$~~_M zdc#Fkn6s8KvRGt*a5UEW?t+eb{|I_J7VWey;>gK~Qa)*(Dam*pX}*O#A%ZhmY)&U> zbP#f2vIrqb;WGo)db{ZlBH;6nN#E@w+*FGeq8i z3&9Hr2jL%4u+xcI>2v8_OGm<_E7NN6jR*oM_cD9Xj)2q*SmIMQA+i_S%~T;#FH&RC z-k&dpjeLG%Wh!3lQU7x0 z^Wl@IZ=QlTqZIWR(|&9s)rvDDA4~HOlG>zc3PPzb9zZbJ2WF&PqkDhAOz{`wT<-;A z24^9q*q!cm$JTp4iC;fS434lW*c2|+QyzM5!gMA~D?rcmw0pOkqUS?{V{Jn%VNYPa zkYc*Ae39yl%OmCk;u~v^i*ze|q8-NsVfU>slL(!nI5?y8fZdB7Pg~-6eXixwoBnF#}5qdVkzc&-t_446OHx1L%&+r8dG>jx) z|G=5o8FWs5UR%({B)xl_MwN#cgt^D34ADQk$e*iC&m*9*%%y(=6CLb5&BcH>-4k*; z#;Ynj5;)6f(|yG20fQ-H1twrA!p`G<+;h{JPrq zdo+;)8S4xUcERcXR#+DP=O$AHxAfv@cGl~+(Axb@@;W4gMWUs))Kit(-F1Ups0K{4 zR?zppowGp|Y<1u0h7K){x8m(IcET>SH_`FOXB(3no!~!85EGXYM|x(*)}umRV0?b} zKzje+1i)L(wQ`G$owm=x={0y9n)4?MIn)K+Y{v=Aa7f|vuu>g5#c(%GpRj;wY|(zb zPbk32uv_@H9GG0io7Wu_(xONgdV*_0kE)nbBDi~Kl2OQn2K2^Ma)=+hhxFd1kFi zaCnOo9+yXDx&jx0P!a#Cypjghl`@=IV~fC;LIgS`lwY$_xHjWnSnZ@>Cr-kK~ z4|7mdH%NTEI=GXZY0xRG^$+uzKYkrKKTK?K`p`KFQ8Kvp8{lSFZvh(V`N%Lfg?O0$ zyj<-ow2HH@9do^Yp@KVG5hrIw<|U-G@9C)IeG)5}j_Vi~F93K~sO?KPuPvjcnk?2Wm&X#PtsoFHodwz>MA z2Q^&AC4M)RHN;1!P5Z`svFH>X%L*R(Z+1RW*w*wFuPM%&5Kr!j{Xny;a8l+%tPJ6r zY!NRs38%*kXnDRIOFTJXwx@lgIvsQ*M8C{5(xrib5Ejr5!Kr*kXVu5b!HH(9dio~A zfzk2a%z_oZp5}Wx<*d=S)n{tO3(F*#diTz|zMuhe?3&(sX zStNl+a6?*+GKsw22u8If(aszZ1rK3i-JnH_PA-Ysb;YY6{X&ojSVKoqxhlUsbM&9ACnYk;hXNG$6q+!^xjF}Uf}-J`3T#AbDEHVE?%g)B1S zsfbBhOZ0x02YKGa)%e_KPtqmZeZ5MQ;Ub7EkfjJiS1DsZaV>i7tn7**O`}lF`j7!3 z_H&~#t!HiUNFk?w^7sHYrnJDVT}MtQ38%FwZu6`e*?S@5U-#oVwfnvlvh8*#+gu^= zLdolZw}0e=+c$J|VjTU*BOs*7$6AoP2RQL@Lq5;1PCww)-b3(uZ%kQ-XYD;EiLFaT zFdsc+=*-;vP>Ni}Kg)K2x|jIlibk-n+qTqB@LnMFV5zTaMeo{Dt@Nwo5XC&khxL{i 
zMrWnvaz<65OAhNVtxip)L+l>~Nt$#r+{~UF@6v@#k-F>C>j@0a&y}AZxb+z76Y5_& z6c-J$2(Mv=aCH+6!kr-llIUp$d!NcsD0b%d?jj^)WDYLxx0R+)lgGRN7822}F&C@p zWLwX7Q*t#1xm|Rwq@IcKCxsjAqr2-~7gsA8pta6_`HusHiMs?!hzBf7M;>3nWKVdx z#p5Ko3E&F$wKXD~JC!6g)O1rL5dl>6xB3hd_x>z_XPuS(xxO3I`c3ij8m=_( z(iGI!hW)IlhXVwhfa!G*`PK(v0mKgJ(Q{o?HJxn>}6g zKNcPfnbA$ulVAr)MUcn79gFzIo(Jcpqp1h*?`AX`Nk7;k8noB?hpSX9y`%!c~n6*sMzv*A47=XZw1P#(zC2kf#uC# zhz-@Q+!IqGF*^_IDZj^Uv6(e5SudOF54&TC(fTo54|=fUG|2??0RX;ovpuE zfT&w~^8zZ>YX%VN0p;2{1;MeheEv+c zor5zW@Whv2F;wsb1pZ#;W(}QEW|r{M)qJJbZ?k30=+ek@p<%RQ3z}70ozCh@*7M%9 z`&ay4UOt%+yduM+=`_i)BW*UuI(qLUqp}A!vZs})wF$WUNw^?tDYXrBm?q7Mf%Y9*x|yU z7|Q%olAZroxGbB784pgrr;_=(Uz^7a?vu*bv5Movfgc#J^*2#j?TZ+cCfX?R;P^RD#K$Itwc5kLz3$(f|3-R@u}*KfJt2cBTB`|}pS+G%TlNa;Um z6>>TxFukekA8~j);V|*<7ydu_3Qa4gJWv$Kb42L4H6PG2 zn7>F5(D85X|JR7Szcv$$__dM$9?^I!f^L8Ra|GhAg8_4Z>o)&A;?A$V1(W>G5ov-q cmj^cp!ervd`qi&!{sO<3qB0@{Lb~t%4~ATXCjbBd literal 0 HcmV?d00001 diff --git a/doc/design/mkldnn/image/layers.png b/doc/design/mkldnn/image/layers.png new file mode 100644 index 0000000000000000000000000000000000000000..e65e1aeca47cd2f0c5289d0bb209ef394545bd31 GIT binary patch literal 57028 zcmeFYWmKHe(k_^Upa~Eh8Yg%N+PJ$lF2UVBNN@k|NJa28p(T4=>Gy<%FL-E0210X9y2`Ms|?Wbb9s-t>fw6^B(&= zlV{HsG^NFaRo(RVnu2|?%$u9<9}RQUxiuug88pMDj?BeB_ohKW?C}8?ohP` z7V@_r@FE=x5OCi(=;>01`?6di<;x z8!TP7G^(U&s=m%L5n3p1ukKD$SN}{#^4W&^hGmBZg=3h1LC^r};YE5hNL;r`kF&7} z=IE&wdaNd{jDIoP>h-J=Ve3Z>ce|{`9O%5TU|=^+)TieHS%L!k3I64QuPF0IWWlIh zt&u4kjQ0T(#7;rqX@7b}$MPo**Jjc166Kxb#OLy+;#;FrT9Hf(=EI~qx@_z@TnVX} zi`xhn-MSRHcBbA+w$RzUqGaa&lCw2e&MWg6^bX;7(AZuRR)M-&>yCo+wy1`>acSRD zsCx(9rPxVzJJUbL@*n2?DF)MUG`kaP5Iqc4=w$14=1A!am2{*|k^Nagt$zoJlC}D} zBl!LIPu4mN>$Tl*56jxbx9IzdAHqz25W9Z5BuX76BJB?lKBhIl~#!HoJU?E&c}H>;JU<2zWwgc%JB7m|h2~_qh{( z>BralBxhT54*m=%-c@Y%P16i%sOPKvg>6{DOfQYM2PPu|Pu(|bF|oQ;G}mUzF8eB0 z@vkhK5I}JEA(U)p`gbF4Z*1Q*GP|#qt8&EmR~Nv&8?@|7l7eE$nUx`($*g|Ol68%J zaBA{0XKC^J3r+f8NN=}T61f|QZu@2~Ry0jlUR64jjLowDF^(cyEzw|2eI*9_=65NN zB*EM>Ke8gs?gRr3M0TIfZ6mZKP9y{^9Q 
zaA?Hwib=5XZ%oGTxV>P#Qz>lQ6NP^iY~jsYA=0Ll4wCzN~*SJ#gaS9z2=L-5^8WQf-dq8&>mwAtl4N-<%QeMD^^s)J#B& zI_#~qtkXZ<#q!+@GqT$$UD2#034y&8*stM!7o67bb}Y=T_Jp``V9Qjt8w{>AK-F+G ze5Gr5L|IYwDP4>^5eH}+*>F1UR{jRxSCly^-6=iTvV&hIUAXNdu6dj@eP99j=Cnx= z^Kgi|Ct1(U#RIp8nsXZRLt0a6Tkqab1QZ{f(JXqCO>6a`#PNP=k!1$+Jj|5NOrGR7H#BS{MYb?T1>V z1x1{=Kh)>)mPXCHJM=d;emTE0w?(smKt$1}JQu>6 zbKc)>vMzls@05ygvVc5p8@Na+-J?7$IkSX^x{8}8^MIg&crg9bY?oW@~Y8Dw>I|LwcNFR^ud#}?U2SutCzLZ|c;KSG zlpL+HMaz#!JS+j}W@2|E7!%OG+;|+JJGZ_r&&0Y`6oed!+9(6W zY!rZ5y8Rn46h&~n*;x9SFcX6&h-IM_F~J)ZCQjV7ovxE#0mM_uzi^vr(r~k-i=)aCaUygSn43T?9`q@ zi0mcg=Mo8u=UR1pOM&#|L`)%XS**I+Gg&cn=1?$q2gA32d;UNCX)+Q?5F>+Gi%*yN zE?Z@X^JLPu#YeR~PyKwE&M*UvBL3LW6L*-d8d};VX@V`@*e%+)J63-u+HC%Zw_369b?`-XsO8F4R z`2Mu*kKXE~bXKj*<9{0jxTm>6J&>P5=MY1_xTT1xvE}`L^2PpA$`8Q-n>SR-npJE! z%XBUy8bSGg{+?68P~mm%NtLyKs57qGq;Ae*({>4-dEySh6aGyY@^6ILICMDjQb!;h zxgWbf&4HWLj!ZrV<*)o--_)RG+h{bkV58+b{dp3$kQ8QE!1PxP4l+!%LjCK-hWJpe zzqb^AQ2xYUufz{BEMWid$N#|&{~t7+Nwnu(tsLRpM_B(8Y1T)p)B?g^uY#9|RMOLG zh*Kx@nK?b>HJDzl<^kx}y+gny&8JJUtVyr2M3C|&D58MriO!!~ z#X)mkiYz(#juS{TS>XUvB9WU87os=1RstbX{vqWl@qZ4?JK!5DkRbxkVEp%o|F*$@ z`@#R+y|BghQlN<7NK!fG@kwek`wSot18bcQ2j_M2H+?wciQyY+aasu&Sqm=fv`%FD zX+S?Ru=9Sk)+nTK)UHC441K6HeWf3LijDeQj7J=RAPnRx@||ok=~uvBeS*1!Q}2UE zA`;`LpvOd-u6~=n-5cfkF9-&r@qfq4|DxXiC-a27W`+Va+gclotxtxV92fNpmAuuQ zDT{niS}XNfAT{IEWeZawmn50_kT!|SadlDfyaUMAMt(VP{aWk4THwWHC~WSlJ83-F z2W7Sa36{29;-GWGPW_Kv#_dsXe{`~% zmpM4KreE1ONR#Z}0a@5j?QEb^)k86s^P=9U*M{+uE5@5B1j_4@d8v>WmW%@re<>`D{HX5ojE5X8wi<_8@=+~vs|5E{n;k}r3Wht4qGn^az36g@(S9}xfBp+G z20wCICW64s@2@0U$vaxcD|V2!jcxh|Tda=9r@Qm)iu|FSmBl0ii<6W*SG2HUfPrwE1_=0;_Bbjnwo<*bl*Y(hiQY7yLoPl`XQZFPGpFeM>*S_(X{o%M)se(fGO0Yk_&7 z8uJ4w>M)5)D4Zpz|HWS#UiJ-z+E8AB(mKuPvaZ*KQ*eC}H1X_`ZqD2vV zPL;@F`Qca{bQ z1E`7hdIeEC-1pZv2;6`HuGDs+#zUg_uZf{V#c?$M%z)!B7)RufWNYKgI;#xO0XNeSbZF%~8} zizh()?LZo?`jj|f=4A&w3N{_e@p0Rhdw0 z)cnVAy-`jg0(f=jn=3_{G_*qFPh<*p?0n$9X|jZ&Krze!O>cRvgh*#)<#Vn>kqIaP zfj{b`rQbmJsD5koXj%>~mFQOh3(9V!?fBHL-p$_Hi5%PpprCoJWj2+-La&X3q;wNb 
zg$i8;0&_p~`|6cWd(VDR0z?&<-u#2R|At%2s!nMGbRe>c47JB++M~lcYo1(>Gz#q8 z5Xt{b!Bg<_y=yUk!)CSV$J864Lb*oj<}F8eD9XPX@RyUUt(4IGGi>#t&6~N~L(OB* zua62ii5}>AHP*4aOuNLHCiF48V}SrT$grTQw&LCJrc=;DrLE<{Oh(6Oh}-D?d~h!3kWEUxn*Q;XG;9pbx=K>`{A}_|6+F9PQ1# zFr7Do=CD@GmKIYI=D<{P@Ss_M>@d6mJGDO1DUglhFrjIFlz$Z{N;;ZERQWv4XxZzV z+~n7VvNFlINLtnBqI$fn7t*N+w}m~-7+-U`c2|WdQoBC2iqvqXgi%&3g8c;0wL6A! zvN%D`d12UaGiApPDXZ~=E+NKM0iFlO^*Y}Z>RtHFkcRbQvdH==({zz=XOEFd6sMl3!zz7-o_p!#)x<={i$&bTUiaUpl_Tc}|{3yv_@F zaM*cP_xE56o$@Bvq9Jxv>{38YihCPYS0~czX+1N#*9K-Xjx%j zYc%CNA7XojJCs(C^;h~=z$8BI^l(S(Ps5<2#BUS2XXSpw={VLV)d0`o81n?>@LZ& zL`+7~3KUN|@6wcgp`DwpBJa&MNk)iAnr=-Nc_x@P${IE?5XD844G@_woYAB+$~3#%KIX0wj6YgTX=u5#V1rU6MSecalo-o64p;6g^y z+4Vv79m`650AsQjGJf9*4|`;ct99JgXeBzM2|f64^z0kMBwb%keE(%-7EfB2_(bM? zpy0P%QQBkg=*p|k`ppyxYCv~sKZa1G5?2OH>W{XtbfmZX(!qqf8V|lNfTS`CBMnY; z4%aaB!YmC!+~%th2+3na<w-lZJR?GO2w-AIZd;--d?DgNdI5L zkX3Y+JvDTct;$0}cy4Ofaf9hrg5^d!RpO4FqHk}~v8Cy10xRmvxSI$JQmb7`=ifwn?s}IuO5&kV@{R5gyc|Ir6m4Tb z*;iN8@#cp533!c=%~j&<))r3(EQDG1309R2gTJ9G6sSH4&MtwV+t~`W0rJGO^Cw)*m%H; zQ15tJR#c77Cw&{k29Xf4;B;r1g)5EyTWckcE2KUTPWv@mfTe+H`6t?Uxe?t2x5o>x z2Fnq!57obESf-__KD0PZpEs*E`|Q=^2TC@$4Q82rpngNE5tH;|1W!_zF)dX;%d%0s zuv1V;Pn6?F=9=q=+!6n4cGR)6(WYF40Yi%*XYMvtm{3`>Mm^K_FEXKc+6ks0`*GUA zv)W<&6p_PF@l8%fDNS}>t_cB@0$yIu&c zqlr|hA8lyK7AS!bwB2?pZ2-N%4xp(Btdn5IU*moJsvuskp z-jGHwaFBFjHbtwR<{kERf3$hN2P)U>katL1yv65j>@1D1d!Tow~17B#8BuuF4 z4Cb9;{o+25nriTjQF2?ED`GJNnF8U5rG@>8)CC~}O-!P)u0XU4!gd)C#&pV&&V zVFm)=bDrpQ{UmC%L13>7A>ro>r&`vvwjbdTQHV-9;eC%Pg8iLpuF9OL14&}$SxlsJ zx{*!OjJbU%udD^A<@VvzjLsUCLP71=WT6yoi==oHe~ zx|0`51^fJX=g>nDDPH##vo3!zyG)XWC|sh=_M|J^Be35H6ms;t5qWvVrCd%E`h5KQ zCToE}1XYmx8UpnbX%WSXRWD{foS>i788)Qjn zw_qx)h=}er`xg({JTK}yn8p3j9J3WPLr`q% zmJ{_VBd#Z}rt1S|K!!7)t8jh)9TJnh4#Z2B&K=o8^+Z`XlEiL+5+6@e-l02nI24g> z#7k`+=|gy9EZ2P5Wei7v%2G_j@FCsIiz)c_^MitC8)VgOcgLV`q^{8s6I@Z|#{9*m zGwB+P|7GhljGqEPIGZe&r=#?cjV~U@pqoh!)q%Y^)qOzz-22rbUa63$Y~{z1SRIhO zMl>Ts@stuer1fSFz3@90S4QzXgG9ZOjV{q)>feL<)qBlTHT}Wp 
zLJ%gDWY(Nq_qyz@;4DXDBSLw(n*Z)q8!Gd{EYp$F~%79wR!5HS&f;q zf%#BWiOjYvEu65-{Q+oJxltKP&n!`cHp5Z;WF+K~`6S1@r>lEZ0ae_+P@?WKGN-vO zltbfE?HchqG}~-ZUHQ`jYj(}1$2`q!x~BKCUgf4sV(yfmrN3GSO1p+YzKZBV+4G?b zq>6Tta;se?!r^DK>7P0%5-&^|-o{*3k;}SirC?U9^>f|014)ZBdyBV8^S^p|n#l96^guL!x_>rL;!1t4_4#I&bNXIi9VQBVOX6tW= z>*!N!ti)?vH@7N9QE@JDEGmm{cJUX6;hT|fx}|jBVH~_OJ*Ek>JM3c~`j4s-t_@)F zdcG}b(|}tQ_Nbe9c9et}Hpup0tHRv>Wk7wo*Bh&yZLou`QBRe3ptuI#EdD=!rp@d_ z`1(X*>Fk?&?u~3SW+$Y-Dh&Vs(BlNdElr%F;Fe^CW(KsIdNF5pVysO6c`sg!gUo>jNPX`zQ&ZBK6i2;h!yV$;yzoagT7j@vv2K zeX7?gbnceYw=|v-JUw@B%yI>f=UU3+2k%lOI8q2F0y!oMMz*uY4H9K~3({lpNO(Yp z)1T5|h5gafzL1~)Y_XGJiE!p{eek>2C-ew!mcfTApCpXWE1FikyBH~1zV%rpCzmmtsnN%En^vi`q@o;hq6nmwdgNs zR-yqjyS6#}U3YtNdb#hbF>-w}8u@k-f*=XzJ=KopteQ1|vMokes!+My)0O zivi>au9sO+HTpbqPm6v%Zu$xK+sw(&!RmDTxTWMEzF86<`s<_{>KD{dt<8V%_)x#O zPDM-`@QHKBHtniW2VWo^8r{3`l#m z-1gLyY~Ow{q<`0P|Lase>h{k2c2+w*E&2>tI@$o`B=;*N;{^!ZA!ieR{bu7s)Vphc z1eIt%@<%6Krmrh*71vF~qJh~DRhgGdRI3n9@9clNj2TOW7=2;6#KIhFt94rv%D`c~ zO9|i4+R4NDNt3INu-pWVmd2mP$7#`eE)EyGE3xg3NeJ~{F!#p~Z}jP|WSN0Y@bIhN zdfJaAknU~(nH^}eYp?kY*sj)@o`2wfE^ksV_}BE00WAAk>)X?cMZDK4!q>YW=J80m z2dich9=O!OJH8W_2Lie1K`N}ilZm8&cCoi`wT)}7#6$r7a?-uwS3f(U-E?s2Dsbu_ zlsL(GKDk&OWW1ek&;8H-Q8Q|o6ttpgXK7{rZOb}32x7Y7+Q*A$O9~oM*yAho|9~BF zKafCdZ%2Y(BVn=-VC4gOo7!82=8>?+bU_}JAKUm#^=e$i5XGJ^^ZItiaZwg^0l>d2|CmoUuSm*L^uJ?uSjb)ZhiK4S!n1CbkqE)E^BW zpi!>h6|o3Hqoqt0<|^+#!utxD(6g8NO!^-E`Z&gX4D2yEPlP2|*_lUA+o5a>(q>)i zs9TzU!3-=}fGXk`ZCQ z*B$+5hJ#kE797L!=Zfy6N6uQ&3?KUXFU1}K7XzG&Jzi>g2;^q@pB=mkb(7}QF}%Ku z0_Oo(fC=tgTrQz1t6JJ;{;8sD{C0$a_vz}N*j*bs$}Q!|7^2;oQ5zQr=c=0ea?e?C zQW6rJ_qcOgW)C%g`JUDEnRzwd_sQ}u-Rp)4Ez9Kr9ZB2HJU~VJCrFw)W||J2`84SJ zEhGNXywVwa8*|+Eni{kR=dLkS_n{qWIY7UhUN?_8UGAG7 z%!CCgq`)P7O#83SNr7Nn&&_~7b!@ffVBp1% z1#{?8U#XJNCsjGO{t_6cu>G4Y!e($=!K>2R4H`ZH)D1S{?!*@^5gOS!`>FJ-1EhhF zb|gnhv-rf~k@EFW+c-5G|{26^$bPB!WH_@5W;_}50|4dMx&d8>Sp6X~R~~TZmePHKaRtB+kJ9aPeT6nEulsgN z_RqC?_kd1dps52pS%NP~eEl@ag`Y3ZKYy5yJFcUtSWkNbt7dVL`^?}so~+5Adrjj5 
z==0)j<(Ch2B?)-qEI7%6C3`n}!5GT2t&InW@_5(gq`+B}>cTBvw`U*3qQdS@+qxIo=xpS8!9w_yj$l)8~=z@qw~ z^Z^LnUup(QBy^+bueQ1p9NqLif{)i&>uEyllCahkdD}=KyRe(5?IS*mSo(=ep-N+Micb-|Q+K_H z>Mf`TM%rZ9_?H_~!lZI4_OpGKRU=33UjJc!_5LZgA0J@&v(-P;?p&_x{PYWn&teV6 z$}ie_LB%QwlQMU7z2QrbeM~R_ig51k(zigIh+scCXBqb`rn@ac0Qkc7NQvPGGUS?E z%Wv>f%;4nJ|fg7k6l$ z>9ztRf1IDX>~I%N7;SMI(m}Zb0NKi8$>9cTG)^{$S@OvsE;YqoLewgC*8rdpn^&s4 z-kl%bY&u|O&|Yd;Qzi|Lf`7m0Tsq=BpPJAe^HI;0Xl}@Am@^Nyfkc)_=rk`y-)^i& zBL_5BGj;{|4 zsb@5pk?2P^i9fa+6Mp=9?{ zp`q9ET;UQTZ|~sf%>JcncGl?b=%>}a_|2x~yMCI&#Je!fp>fdBzG(r%b^T#Br0V=c-vFlSm{QRx>8~)5uI@tIV8_v5 zR1Pp328OdwkJIwIyOKEJR&gn%+n0+hB=zPzFiV$P9~@};!Ix^&DHiV7H|`@F4R>_;>xGlExzo; z5@EyPTGavg=W>=Q8wB38Z~GV^UJ+G0ztVmxB`$hRA1{{}YPf=d_J>#Dj1iuq)2e*fj(eEYk6jH6Pvn9JVIxyoszXs^(d^JmjLOplf zEFRyy+4Tc-SvkE)cmLrya`@`jtiwC-)xgC9$AJyUSH4~5_iyFFqKfOD>HR$i$kqAt z4R#>3nq<#PlGkl{g#MAqh#T`YuQhHac0Ilpds`g9kd3JhB6wKtqzaKI0=l;-AEt*5 zF__rfY*=bw`?#+=iI^`O3b&0)QJa8;z7_{ZVBB{Gt?p!%J`2jvVBlq*a2xWe@47K} zca6N?3HjwcmOdC;0CHkM?f0H%8UOJVU0dAr?yFml)Am1=Nt2QodWV{c1na_)j; zwJJ4X;K&nirQ)UcK76N0hV{mog8ElzoLv*&D2Y1^v+l|Hl+33~|qW<^u>=E3aHTon##x~NaR(Wn-U zQTfzb0w#G2IB(>G{AG#+8TRP(y{8=k^HDLV%ly1ulp}-60Xg=Bc6yOFlkfOQxl2)7KfSz$v1rwr2c*&~d%6W^OXsOxc9ktTQL3+Xnjvrmaf ztN;gjhIYob`Zo&e5A8uj3mLrxLV_dOiRcn-ArtyzjQrVc9>^;nO};{fH)0&P(xf;6 zcG}=wwso2gfQ#Ud$wkr=#%aNeS<_#T)%0BtC1F$9X6i;8*ym2#h-HP1GY4TH5=}ek zct14X18_WY#J!e+wL=n@ju#jM-<66%Q}oQ#b=d`Er~d|%;sLF!QORFfj}OaIXp`G{ z=*+SS6n8&Ez2Bs?ERFr$GJ~Ztr(CvH)Gm02-{=fP^w7;Xo3;AK)SR6&IN-|y>1))j zCTk`OL9TqA(G0zh^8Oqvwj~}_X;i4~=wx`F`?w#3EHCcedXPA|+$P8P88uMM!jp{Y ze|MMA2sS=*R=8?f-_dBSMVTv#{s> zfFqe-rS1ASD|6fLz9A+#YuV{{uHe_ov-)fn0cC#%G9dd2=B&MqKlaC>$8Vp$t6&## zC-{t&5h>`<^RiTEKk_><92HDsI?r;aIyA!D?+r+lTyf45z)vSY;xTeZ2;L;T8W8eyCh5E3@=cEKB|r)gNi6PRx-;_UHo#0qNRp(8mf}dc z-3Zulb!t)~2Trv3(QsDJkVIVhG!&iY&>8m)JcI`cak0}O3}y2H`4dSn#dh{&|Kp{c z{)jS=aE>}mBEU3$`I5-L@H(G;0Cb676RUrsbS2dA^5yXm(BtIFL3oqibO$o!3G#9f z;mY4uPA5bEZn@O5f{qtxpuWpwxmc)1(qzw3_tJLCb+-0oj!WioMiLj|vXv%ndv00O 
z9Ag__6G_LRKQ5aP?j8QptWSrk%$eP80_lkC(5ax6a;O+KhWsP z{Cwx;Q=yw$C=!twx@y>mfSBvIL}-!}BM#uI9hxp#Ry)#m<`$cUMQ&$62(oHWX*MHE zQHZj(jh5zgBUuNi=H(P;^q1*J-GwGM;P6uaJ6T@4OOL_DnwD~Rl7q3ko;clH?V?^2 zkiJxmzFffFD8w4{oo}zjcx)<7ulK#tNB9f^6*=RX0%9w_q>Q;mXSGS_r&nlB;xz|Z z7mn%o7a>?3Ee#Y5HM;u?UUH8>tVxnDY$InF*+ZGbcebhhd0#1yy6L|E8`d`w4V*0* zk;dWS;NC(;JOq(*>yH%@rhD&jA3Dlz5VwJ|Sp4}BeeXg5~cbxc|}UlRYPZ(!~noAg5-B=2U^aG%y!wmQ~r$So)+=t zYWk?da?j_89E87e2b9C!UGNKq zQrK0ao|Csw-iSFTYV=_WYWhUJj|EF*WE~LO`J9IWuF2jYgml6M?t0bO_7a>!-znIO zkC%MP{_y)^_eQ>okoX8uR9@fV&~`OT%QO`Fv0k^fBz4mA8=eps(Q)9)c};Pt?GDq; znt39MjBQ2Ewx9R8q3Os`-TLC<*}ZdD;$W=vWBv)`ZSAOyKaHC)bfgk7f1 zI7oZyFSoZULsqA}*&uKWFM2+=8i-ir!xr}FR_#K+oSVnzD>DH^bjhr z;H1jh$|rUQHw1+P3*xIYUA2C8^^=e<3Ybgs@7G_0-J%hI^Z?LKid|L*bW7xNHADP(#u1*GqZYNpOVkNX9lny4TCAjJ8+lnhSY;!9vI+^t#P=woswc3ICkNkE$9I zH{mmfC0oqzLB_&IKQQGwmX0R*n2q132`H?NuAPF13Ge|0zaNho^#fUXx%xe)x&VYK zH)oPxKOvWbiE01pCedmXHBHhT%7P*1`t|y;I_QuSs(jjmqYDH?eM{GpQYE~IWG8`p z!!?QAchMP^-vZkK?m0vK9%8K0CYLTYqgujW^E&)AY6mMFBhur(BZl6st4l9sgxfu* z)=dfV<+I1GxlTXv%KUi;xPHVKQ9?5O6^B}sm#c7N_f;|>sH(#Cu4Ar_Rkgs>w zC80|G_3EDz(S!&T#gA;?$u^WHkDN1Et(Z5 z1eb3BnVfd%s}IK6CW4NO&fdf1dX?{2F5nevG?`HziN^$2=`pE4EXlMGGb&Y*O~zQd zF0zIwY<_E{=!P`I_Tx#lULS?+Po8^fe>LYR{te`55xNt-pRlklyMH9^N4b6-P5KZh z6hKDUbCA+vk2~?AufW}+nS0G*r`j;6;wCg*eX=|} z-FFei7mr$O_V9Oq(y_};s6TwMjx`6lwdh*9KEy%WRBeuIKc3ZZ@A1!`h=x5OM%KQ+ z`GHpUKazh&qjC+hA2$}0M~Bx_0#Me|(eIy#no`4r=52RL)%#PJnFRBY#FeKpRe=x) zfogo-5mdErizNm%)u6cy$5=`-ILp(n(q-w(B{U<^eGs z?QXg8LB1t-+s_y<)FNZb1{1-d4I!V~bci@M)V6bRbhqy6cX^fG?T%}gYv*m>&Vf_$ z1)^v(qUeJ0o>lT0ZrRC@g~|M`S`>GrqB%^_VXKvVz}x*!KGqNpD4%&|wiJ96E$0Xm+&sP#|}iK&r+!=Dz*36 zuv#Kz0%;*OCCvV?j>=a=Y67VKM$q*`#$|sK!DB|FMbGen9cMjd96u7!b7|8KMb0l7F%Er)w0y$DU7FXbV8)CM!RVC% z1IKTOwckJpw?T|ihmkIj?k%`X=!Yu~cV_#{G|Z zLP7W4Z8(f$Mj3NQMqiP7UE&>iqB-3^LS#Qiwca8!$_W@h54Plr=itd+Odr4hSWlXA zw6Q(;vg`fZ=sx~Pnu^5nGDhc{YM1RO8TuxnI z;M-;Y!_9pb$m7g#!($eCyc7%!$lz9=Ejx{l+mn|A;o&cl%#8GF|W0z=t zlfA61jlfvNEfVd_8YiCrSy>wd1f|*uTZ6ub>&xqDJS 
zBn4&HaVv!xYEX9H+0(i=PU}?O=d?hLwXjL>U*u?>$A}pI5Lyln$g;F~V-w$=|I>iY z$T+J2n=|3vw;vr3_g;sze=1VhVnbjXEzLW2+vf)9TJy)Nr)Nb6%3tQ6yZft1Qyny} z!AK>>;-%=&58riw9hfbPqHSGPnI4`m>yzN01}SqD9FepVBG~(zBBs%;9UHYHB879;{km57`)Z^1=l6b*W zW_!MR#Mroaghnu;(As$XiFh>7Ce4j29%zR?Fu;^JJE#`kUXy3Q4wpbY;s~K)1<9@z z7sDxAqV2Z+DidSClf^@%FiL-3-Qr8(a#o+O2vb^L+&(q%F83~r>-6*0pUviw?5ugi z@nXUb5bY~TiLMxC^-9J44Y&f=R|c*frY>t(?iSb*U5Q!f}3WS@Y7cg9*&t*Gc=2MH^zkOICS5r)U36DR~ zz7onbt$$}6f^XApc>j`~lmU;h?eZrz!@Hi6lycWU9-VVc=P3v~kSqK@#Z7VwLkZEO zQmYvp0sbm{lxCSkus4xoM?je>-`U47I+DII!2v?{X4=wsAMa3hG{!*}?0dLcLW9%!PTQk*h!n{=VOBlQs+Sk=NkBq5lP%b?;& zo+nICu{3yP%$r~h0#XM+X=4Iy?3Roo&DneUVNXO=|C#Lj^)2vS#UbA+}4b@ zl;X4(6Rs_h@ZOFVHAPeC9Hqrpaw+2b`NntR8{}2V!-1qcPHEfVTCA5E4W|$6%)$VYQR~bA-j~)_C4fa?0p>fEN!YWE*6w zhJjwu4ZU?fM0&k>TM?tq&2(+Bsd$@*?m2&kFWZP8ZwnIco?!7sHHZ_4!!gjV`>Q^21-!CI@^aXU1e zB|zHvjmzZB(Hb}MEr`6AlLnx6Qa0yIZ#+On1(NSPZbZ_mtFO?f!U(%x)Y+04_TuE@ zeLLywhbP^7?m$j#H3^D%!I^s(VFM&13GGobP?hD$+3bpx?uJ>!RTWBnU|Kj{qzu$=6n?&y-e51<8rpui2sALhXOAy)^Z8RI z5MU(DgJ|+JUREw6+94p$2Kox)o(@*G0cx=Z1)Ei(B7R2;0<0BH%Tv{*u&1e5UK=bU zUlwkE?vYWuLeBDPJ*gXT`Fi)6cRzh6q$F`7QzBVfd|xm|w>M;MC5WBvFMP&>`uU_a z+jg;VhFf}hiultL**`0O|K~!^mzYmcqgehjZa1~wx&%To^kW4uIcnzAd?EQdo9XdJEq z*jDx=M1|>Bx@g(Q?yt-3>8|Sl>uM(`VUCcZWW9zL2jXCRn`FHp{=N!gSXcz!HxZbS zp;Iv|5CpMxznPS#yW*+vT+REPK&r||bTVKB8uq`(dwpe2`@MN|qZ6aydk5ZSu*&-;1=0Oj@4Yn^Zl#jbR*`vx?uM-vIO&d^_@-s_{&l6>Z zasd(P)y?tB+I~ol3!bWj#4Mt)b-iNkj8aQ@$h8J4{9Dx* zwQ}IxPdJy;(1gHF$1c#-F)(5-06G%>90kXKX7pL^g9%7uZUeSDHRcvxJ7-+P zB!+f~b$s}86oljFIv*yA;zb*>V*asbiEoR;>y24UMds=(ZLuJ9P zNx1kO!Oom_gIm|5_2RjxU>Q>7+(<^8z^k>Fq!}KP*y#f656*0!Kz;{ho0g*F`ImS2 zS-QO#X%}p5Dg#cm^lIaJnJ@QwP{c2&m%jJ%U~Q&S+RrQyzK*o#EO!RFN*HUnmAcxb zNnYQ^?~&g2Rn(Y$w*(>Wh}yi1H9s&=W@vaNw`x{HxVU=75!KB`Th-N4P5~fATVJ;} zza3=djH-cd2$Q?OsF<=(2r&8QXDvOvhjQ?xEDC3 zfwJjgD5rESB&(>MW>uasGztox6@IZ#PO}05-ATjP*Z=C+pAo47uD_U}5 zLS{M)JJM5o)36-(1#3?af)YhfL5tXL?%L@sSLiooxpU8dX&J(S8qPh1$`~*>hjZTBbPA9D$p1Lq}EF! 
ziEntuieI1Q`%9R=)@H)&8(XL}Kx4ay`f;yfNqUH8(Qx6ikmfypV|30Rc&`5_**4L- z(pf_^vx*v8L-Z5Lp4+QBa)+iY4&^bCoQ;`?2uFLT+q~}dkm%P5jW{?uQY@R#?};b* zTYz^?#O?U6VGH4qOv?;1+y~6VubCI(Z8!gey|)aDI{Ny4Rk}sG6%>$0hVDihLFrby zly2z;5$O&Gkd~Bgqy!wg2kGu+cs6?O^W6XEyglc|xvu-Am)8vQi@o;RYpw6+J5riI z#?ag!d_wRk1S^KpS9?x4&N7Qoyd=`>oQ+M0HHYI+2`L_X6KDO?q&aqM-@OV_N#_xs zrJmaBI&ND1bRqD}N5!9g{6&KBb_W3;`?dy(MugRO!}cz#yeMfOkwsD|QKtG;kvrY0um>Xo>i6ISbOExjQYBj$tYqxaIZU9T%Gg(N z8Kf&|6ImFjpqH~DZo5CoiZre8=?UXn>ZP@P|+;e*auJN~-$hPf_(!_e{jG_-x6pv#4z6%FUsARBcdO=4H;oA@rfNwPg=!A|FBd- zV)l1spLg4jYA-6nqFaD~!VWTu06D0%ov$qfVxiJ^R%X|X$0VANpEKafduesXk*#ll z(*}6n=i9Ba!M~{6{Mkf4$T2+5_ACi7@LFHQ`FNIhj<0S(UT~F7prgC&kq-vED!}+$ zH-5`n+WczNAJ#zL#=Pmt_Q_vCGz2DjF{G41yRFO8*Ga7N^WdnR{}a+lS!Xdi9SBoI z>y`vlS7Laa?qvd6ZvN1L(Ed5CqU43RdqJiz(pVr-sut~k@TO;dQUw=JQM~=Kx>k4v zZ!RXDG(I%UjhZ89Wz_ZysJDJvrc( zG1g|YzlT)egnf)T_Ye_=anttDAT*-;>5_G#W>U-~4$Fg`bOU(xT+}laqtH>D3QxDF zQxjeeb0?&@EFAyW9ip6+;=IE)A4Uv}YnF73R@;W-q}OUui38#5sL*&~Ru?4O;YT}P z;TG(r>E$Sg6gVCD@0Ggj7EyvDB08`jlBN|DOH0G3TeQwdTs=>hrs3T5;^rA_%F{pF zdhLvUS!5bEn4gx_@hQ@}Aa!)HeuO7s$b?0-`c~LDZM@{A?_~Wr@ruHD2RhRNeid)y z!!vsy{B{TIG?&{YZnGdVVs?|@8|Ip}*?k>7L1f#9#pE)^e6%|Y?H$-16=8XHM)XUl zJ^Sr95z<-02DvyoCbEhyrolH6{C`mB@f)7ld11?Pcx@uO90nRKZsj{TOD3awyM(98 zVbo??5OO)|C=$|V&Pca}6FQry2B!L*c3C(E3RjxjsKKFSoESA>brw!~V^sT|x`TS8(Voh^n9+EU&uu+69U#ZLj zGzFFCPl2(CgOJ;D3dRbi04RzWDU?w}bZhWE>NG=_C$V83)lsh{suAVEKcxy68zJT> zdUhq7&yDj%GT%YrN_Hp4W{Zk?u+BGdg6mi|qV-Xg}WRNi%&D zdkbA>t_gh~2hMOl1{*RgsnPV6z|2wSn+j0}#^nY3CyWSi9{Kay6yJ_6>a63(m5Zf~ z@p5Xnxu>ie#8exc)b55Zn(V&2FO|<*Qc)||BXvVr=>w?e^nPBD{nS6FM ztx?00Loe`L#4(%2Qe zHp2_f#X%H7+hFZt8?!VO z!X{t5m8RJriaR?ol{xzy7dO}XG-*E|_l7ju*Cbf0GlqBUOr}_Lj=E>dKBap@CV7q%FDtE$Tur{RFBEV>@^faXcy@EpPJ&VW1#*^c<@sB8Fq9s&( zAIvGM`xEnnhY`!(`VGGi=1{sqdIIf?rjcqPZc$GxV(hX8f2&I}qH_>IZ+6?YYNy{{ zXAmDr@v=FmmLCsUp_NNHy#x!wi0?93Er}r|o0GmRFo}{&oT4~!6o&=fI?B06Cl@IR zIUT>jZ#O&6B0$<`%b2C!kErcUS55q>eQJC3YZ(ef-8$b9W~W?>+ua`sr`sHg$)A&> zONcH`Iu}#u4YLc<9KjhoDyA+sR<{ 
z7Z@miHA$!y?v_NfYq-I|yp38ju)DRX9Ir1h0OZG4uG1aZg6It!h=P)R4uZ2i#SpR- zW1@^J&R1?p@D}!c)8!3B`Uq8B#S{NelwAbG3i<#0kk0v$8$y z?-IWohx`_xC1~+frcJ;@!|%*xVCE}kj%8Y%C75OXE}xbNCU7zhVIK+3zL9j=liwns zcxa5L0^e*8sBI@sb*@=g(TrTGes+=TE(2@5a?B-;KTUY%AEu z5R>!Ib2!AX1<`nmsX$YL`7*0zUr*-Mdd#jf%nR{da>gmrGyKxFIn{$XpgnT)ROsje zTRpn`sVtoc9FL=S01U$+nw!=upfBjyNKzn%d&ZpVN_3>y8#c{$)s>omMztD=6I&!V z-1y9Tti9lHl$>gh6P~?~zs8_df3PNv&TzKoEP`+Ay~MF)DXu40aE_w|)!t%n%z;R6 z5d#-UC2@3vnDvlwHTUs% zu51d-tI#&v50|Ph7rCv_eIG;ee1GE@s&3DJ91j9}um?oBn4Z{=2mOtb{gE-g32!;! zF{OgI@35m&uD)f@02n1e&1w3*EnewUvK1?_j%pT!yT~1!C9IQ!qmv_SU^?wn0^2;z zHES1hlC(ZDd4%2bsDh=C!p+Yv%bqj{)grk2?@t*(wkmuTq1m|{HzgwMfXSi{9Ol)CDG6*cq`y8qDcfaBNZ8*&$?UFq?2{qyn z>i6Va<_#$WN1p9yEhzYGCIlp6(`ycFhUJD5=BI!)*OepV!Lcnp7{ETR9CW)=f)sQ% zBek50+UNS^KBJ}C3!I9nQ;(ejajjT!+lS0|d&?)#_tCR+>^>YqoAp*c19CSKr;?5! z=@?x)U1hAqKjF;iraxvx-9=7;5|^V0i6jbC9*nssKRf*JQF=HC9XA6tif~01!cqXy z9EUu#2?T}ebuFRivp=wAZto^=sy@)B$bg>3_W&#eFAm{g)3;XZ1&koekwc$dLaNu3 z_aj{KbZ0p!3(wKdZT8lqxpXL*UF^MF5o8eX)tE|@uTzzy@S;O<4kDRuDaG9KN3mmZ zZ+;}BK6R5UA)k}%@&bvrRiK|VkY`_23cN$`r_)_Nj1I^-!+7)MuzwXcRv%vUX@)+;=@&ho3jfHCmqJyJ*nS$;LeiK0O1Tc~ zq2QSPaHDW7N!2=HyXEor!38GIuX*~rY^K<4gpvQ^th*U>ToD&d%1E+SnF4hd2%q8S z!s<6fA@)~EzO=r!u&+G%=V|RBlFcjbrp#Rshe@h>6Dg!G+YcA;GMe%e&Rx1Ffk1dq zZ38}PLUKjI*|N%TAFUr~?Ngqu${HFX zhL7I7d##_U>j6^dXr|F>VpkSY<+i=|4;MdmL9eY548L9}g0le|*QSF>Q_cZ4 z_Mst4OnGvuWn-1pdAz*G2b#W>cJm`8Yw}PBLljL#ukZ8&Z#PFGUjqFAzLd2_of5${ zG8Pzy06pEF8B`kGNs9~XzDm&F_7!pu0k;wqfSWPk2Wd^4k44nWjg3wC$_09m+fuE_ zOT`}0M4YpTaV-O(mLms_SIyGk$CLJ_TAz56q-cyjADwg2^5BZ(xnztH*{LMEZ$zAO z%YO07xLacc=W=H#xpUuI7koz6EJMMPmr2Bue$Y(T5P#oGd~0U}%y*!eW}1?d`qTDp zKe99ZS}E-cvG|&rJ1_{@KM6^BXn4c8@|EF?%Kgx$Il1p1yheIL(d2|ENlyIk{M<4q zI_IUuAhdi>`oXtmn_a+nKxe}zW+Jm;vw?)LzrQ?%*%2mphb-Xb&bQhi!8O37vZr9M z>ej8#4+n1)mUmXR!QTmPI_u>$O+s_~Uf?L!H5n9D%c#XWKfn1z`$Y@$&+!>rs%~JV zO>|LN6$ah`Bl$B1dmC!L39FuF0=zJ7F#n+XjZHLgPYhOG zV-sld);KH}C_dm$^MIIsriGo)9sU9-j%hcB)4kNMgfOuRgBhFdRsVsv-}zl1SCqZP zqj}h)%#CwEl_Y0oe^P(o(-G_qlzEPcSTgqm0H*j{jB+%%eBCv3R^JL2`u;Ehr9S8s 
zG$wD4Yz?Yo_?X}y_q3l!7t{8^MlTt0S zqW$kdYNi8qtGffuRkNTH+=ucA$Q5{+kFMYoq0#Oj-aL+6j*Q#c%pUb7{^e%*LDg5EQ^cO7PejNpMQFFA+8jP^FX|h2ES76 zmos@XCB#~Pu+j3(2G|Eoj51YKC?!9A0~2FQNQa-mr$L4DvqJlrQZ3@lBD&=& z1f871?BNjx&XE^q0JN2!!^GgLGge(~mv(32F&22&P#b*Ze(MFEqR2FNqy+lHQbFp? zMME_)f=ro3f8U!G=Wa)Q$#JAB+@7v`a5H}_#@7H}c^1bTEFSb&m@bE7NmA%3Of{u_ zXr#t>He(Sg=O==t?Khe7pfDE99Z*|-NQkzT^Kl9 z0tXLnVfTLj1TFS2j6nC~%jRZa9BY|X3)HO@#97p}FPS!Q$w$fI!#aV~THCc+ z|B^7$>l>xEY8ov@8Dre_V2CPqVBRToibUR6zx;02g?nnPl?w#GEmI~Ibo+aZ;k`t0 zl-N)|mHXR1jW35)3U9s#nDVi7Dz~>|ne&Z77RP}?c&@z%P2kz}*!@hpNi58`{m|rg z{+1Nq&%E`l%@;wL;vTb;Qs>2O3=anG&zz2y^a`5FJ@=3~2c4wmxwT-=$dxi3f*GS< z=LKE5xFYV`_1f3mZ_I{^&;pt>OOxj3F9(oTSCA>ZG;`VFe*w`t7?Nm}^>|9&F;V0M zJ-~`^dnB}J=5<44KohA$X_!J9U4~%$dfTy==^lIxUepgBde#wnzke9(TO9269%6i5 z`%TlMN}2m+d@5HIvZ2Q9{oVXrxy2!XtRmyN#5KO@$`h1Lv=JPs{a1CRFH(XjW_OQl zN<5)bX>C9x9SaI7gGGAX~jVkhU|Wg(ACx2g#BHR+a6 z<0nlYabSs(P_w$dT1nrs*2oTZ_(l|IVY(Fg-@NWV; z62-h<@tG4EF$Q*)_(iQA3pe$w<{Hz``$CrD$RqrE3GHx-w>LZbp1|>nEg=a^-SMK& z%h9?-wG>+7eqzi%isvG3TI+ShU42HgM=nDa>9dL#I0Y)j<(bE)>$@`WP!^r}A*$_r z+eX_MqmE_5?x&Rg@`Na0H(2=b#uKiz-YNRY(9Y51Tf1Qd&*-;3JohpeAx9dFZJmLZ z-Dn~-v}hf*T}~os*Dp3q%=ZVpQLZ&O)QvVQo{DIBKwYd=$#o281aHQPQ8M93`T48_7=MPomd)3J$9mex6r*3p z;C;Cph-=|F&Y7u@B41F5GgJl4QzptJ85UyMc>WW1Z&6o7bB(!v6z0MnujlOVLV1%n zH+gmQEA102HK?}mXLCQBu}}%W^puL;Sg(>?$*4V6x}vK86G`0FI_Jaw1@#?Lb9F7R z#mM*aZ?UwX4bXQ#E-U;~a4s8Ahqg@|gOfhvQ_QuP0RvPqJRmPmX_#Z=0F5}4lOVWB zrka}KRnu*6Mr{4e8y0e$1R6>LUsQ3~M9I%s0AOR2Y+An_ji?!gx!VG7i4|Uz~@#_7=|481~j(c3fI1$a@n8|~l%(9Xfr}up!iA{Mfxc!LXz0Sl(1Zr`6K z+1;`K<35)wOBg$}mz`0<#|%uX+D@EgjuoC&lg6)_Q}do4z5lw<4FBx1^5XzVC4P1< z4`a5iVq~pZr&O_a{OT1C6pGPe0aDPW>UUZDdKN%V6bc>5$0gQ(%EEQAMgE8@-A+@^ z{Z50$E_{hXozTm16fat``OUTlq!C@RWdjQHb0U)tvURdlVBDKyH(CIU_O`ey9?TKV zg^mdFADgc#<~zR(SnY`W=3N_wYaf&x+(iVYT&)f)$5uqHmLE%GpefQlRQBG#`Q(Oi zze7W>Gqu`<83_mzkK7dUSI~O=p9$6A{hjrk+fDTf7wph5LDYTU%4hBdX@RtPb6+pW zDGl8$V%i{Xr-bfmvt;i}MMMz*jK`bk+`--lAr~k%gcKN)i&klTvwWO*X%iQK?oK^o zF+d{kQ(=GHWtiOut_qCsIYnLoq3NLMebJNCdt7Q6mO$PPRGkO&Qh;62R@((mdhsr6 
z^(jkRhihpnzN9}!HX<25H!6b*w;*2uf?mhRkHYGaddEWIg;E0w*FzIaNaiHOl=q!vg z46DBmJL!3SO~I@_V!3hB3RzvD&iDz`@)JVtb$yP{hais%i+>aP9Q^&#a0l0&KbwNy z89O*C_Dl?199O?M4QcNNn@LcWa)II0Tin17f6K+IB^vf?nObbFm><({f!kKZe5DFs zlB24;x92Oajek;*)y&wl7K%$FE@_2#N#Q^3o43#1&O*;Ey6&P$jYr8m{VaAYRNR0x z2>Slp;D>7f3oz>#x4xv}J?6$6@D><)+aQD*a=Tk_G>LSCSBpJUcBiKE^@TZH`=KDQ z?zXOtI<0Rbj4a4u!&=^nm)ly5hpHQ!stsF$#cojv zNm6MH^qpJaRc1Lh+2eZMJiRICu@J~Mr&+h1k<~*(&UN+6b;hUOT*B1JXvQ(vxkdq99~iSYpm8$7waDVQD$kRnQ{LcNqk6SC2hOcL_)&NulPY?3tLxiqs#+{0@DSH%6FvOhz z6$-#tpQC+Cw{|qnfna!b3sY21!H5<=N`|2&1^wazU+GUZohjo@BCi!Om;!#UaR9u+ zNDmL56NPV6@2F2%P&xWIN{7&1?U198XOWScKGKqJm}Z?XBc}JWLj3eT>gzrL?8z$p z)HY_Im06&0bVtgjg?(&=oJ=+ghypp|S3YJI!p|QI$01R8n+$>jd$=b1@p(#o?L7;0__^R;ZaibS(Kj_s~bfMeTbLfYo)UQ-PaSvj& z;f)J5LI!0^#J&Hs;$emNau97w@MEMN!o4vjwTRsFd8gMQB#eVkkN}rOdp3^yW=ffP z^8?`&<5@QH7Q0m<#A3b>|Jwx+6~#*zEUK+%T2+bxf2pT$=cIR5tv$5R%&NDGp%wj` zrzp>0laBobuIH&Cloae+Mkt1c!}H`WM(7b z7V1B`;12OXyU82#T*O{A!eBK2h(%B^n#|`-S+`J2Lry@2SlX~kfxnaMbb@hDJ(kYE zP7{GV3l?)wx7OL8i+nG=?P_tZJNcr#VsvtA5Jy~X471deY|Br-+t56pl53`RWd2&{ zFd$p)TH7KY3kr$Oomg9gdxIHKYpP>2dAB4~#-r^*J+_C+RE=-1L7^$<>kCB`l$85F zG864_8%fwiobq06^zJ>AFhI|-%yRMp22)+f@jeZ|cF)cw8GF>eNQ%CJSnWm%G+m@n zn@$)EQ`cG73)7~teXY5qXs~4_?@^1EB+5e09uAx$64S>W)cT!O>TdK#%WL?{IOhan zrW-75AWdXT=|*FCEtraP)~&aoKh`F&__Q{d@^?V(m2<|*1%Kv7Rl5uavP{KIuyGgv zAR0qFkmAIMazwP`WUb(_$~X~ulk%W5$l{PS0->kv4``7+iE^R@S6vT?0{h6wO^=B8 zJ0V8e_EhRd8?1D{@Z-u1_iWX%aF6~_0d3ARftoF7w&q%!Vtvot)gEbL8ombLQ}of= zlaHuAtzP=H^!0h>?Hy;w?QH)7&E==eok*uaoxJ*)(7(8LuR>~7Cx2_FKNhJSl40I%eEZW=Us>?V5=bhZ% z6lx^f-H$MeK)ft6!UxXR8M>$%n?Ss|Kt=RRZ*#OxExpB6Vrtm3OSH>S@-fps@&P91 zfF2|Y2>p2V##u!f(*7I|zMCsC2JgfhWXE~0*SEmD4M>sLq9`P7GVUcjNEZtcwO6%o zDDA=Ky!LCahUc87lS}VNMvT+*g^7~3Ga^&9TEBOGOriRiWouYCuKXhd^uDzAc!8R( zx0ZlM#3>-SJ~-;}Xp0_p6b%lf-#6sL3tVo}F~}G1qkk0k_Pe&1;k66d5QRAtH<-@$ z1gg)r^1YR>NBdP*V5r-kk^fE+Xmdw405GGaxsJPC029**BxRo*IVSTdy?46 zBUF5z;~z>eHqth(&#RMofH#SoCSBvkO}5VNzNBh{Ym6rdD%&k|*J2ebDLyyPc9vk@ 
zCZZ#a>63^s-)nUOgB?SNMJb*@=Y^s)1V;-wZMlRTUYy^}WrTg8l2m&ESDK$tDDAuOe6OHsxhL?Rl|x>@H@3ixuT6otM3VaxrG z5(rHV@6xLWSJ|W&Z<{wfr88`d4#6N{uhYwXWxwxAbg6?skiyt~DVcTdz&{Dse5gOb zur@2XNC@C&WW5M7f#4jPBzHkdPiHtWU!TlhJI@S9&d~j$;>FG!EnFyh-Rz=KC#Mr` zK~7$@Cs%X<00)QN2f7_YoKdb;z+w!2Lf6+z9_S-6GC24JVrEFTM?6G|H0c~r1Lz!@ zXHMrao*_I7C!*)u1?sD*AswH(^2aFHENw3``R)a->D8VT9sSwAf39eJCob~1ItemY z?AXC;_i8TCB;iR|)VJ%U#0W7BJ1*2ssI3_osze9IcD>E?H@U_Opg!;pA#esrR_1$C z3Jo%nSvIiNG@#+1&XrG=@8oDPnx7OJp0u*xb?Lt)I{<@iFp{V)lemSKfsGF{Zd1GC zJULISgh7a4oLvxoq9+t}s$06Dw&1IipWp)EyM)w*_4mBEOEhYF#;HbdYg z9K-_Cs`B3V$S>;Do%R2&g!H!Z2PlF18gPrcqQ6Tb$po;H;s)EDSI7>e{qB6&(59Ka zFe!pqxwm)XSCaNTOT{0`9V=MW{f7DX(LR#adAB+0*RUUninnsw;b3V7M(|b%JM;va z=tFR@rJ@kvd8H5;*# zWJKpoHSSDOttIb^z^4J~K0)q2@a7C-*C9vBH4Y%v&#I$dwVYqkBmcBoSI_| zDCGbc{8-!gprj>l#hLsqwI4`M0L*O>OlC?O-Up;9?e}u;cbjiIl_CZn_^haehjcW* z6%e#U88VFqzJX148Hs{{FMuH!X*;|K8eP!}T0p)>E0n(Dt%?9`Rir>B_cZRDDdqYV z?Q+oL6d9l@G(xY&V|BnD-2x{HBoFqy2dm|I;&mPrCr$jfED?U!dl(8qo}5bHOz4Li z;%gZ|zWKnBZ^R6LPk1cbg{Iz|ss4;m@lObG?iiH?nM=E)>bN9neRmK5ta=}zO1R1d zp;;K6uUcOYQH<)AZNPa*QAD*}T)f8k;Sb)kSeRGdXM4gVKUBANWrpN|u%vn`#_^L zfs!x}Iug@W$;3%6v^TD9k(Cssq17t^&m#AO7eV zogtUu@y!>ii9JckB>HtIF^n1=$bGpe^%8m?XffCKxrSgr%l6P17-;roK@#^QMTRBz zVk5j_UH1-20u*4ri=W?Xi5{F)f8VtQ!Yq@sZie_Q${Uqfe-!Y(p{eDL!1KgSw zp0yYf97ms5e1s@mWZa5UIgONQjTU!DLLN&s{|>Y_=_Ga!uBP$fVbi3C+Ub|W`s#)V zn`dC;l3XBYiQ+3i`8C7s=nngkRTia1WU|p4i&eZ}u*=^O82H~xHsb1f6`Vqg>VfWk zf<_=U6HgIInbtrryXJgznzbC?=^l^2?`q<6hrBUtnA=+2zD0zrJjnnu(#T>f6~R+9 zN5tm!c~FLVI08SHq+iEN=iCxS!FLDbisdL47-lE1xQ5yAA@m-dV)nWey$4+Uh+&oz zcAkFEYWairv|76i&h5^daY`7^@85w|W;ckpGTr9+)GIEwHg|1LbW(W{K;yQom*>2l) z@=vo@z!7Q(f33@a;@duuzZX@x&Oy?wDvh?mS&FflqM=x{inLargsr?gqEf$xr`n-} z$L!T)o#amU@)DI9ek0#$tC#6;b|uz?*MeqUc~%o^H}geYVdu-yHwz)i z$EjcQ--N}u;tlQO>?bCm`-@%H>*Lv^^HI^l;L5}gplG??47HNtjb*y1@HUEu#{ODPX_y7N zb*4>ZfPDlC%0ESP^6A3$m*pRD&y3G+4wV~Ccjl3seg4^>0VGhr@M-X8fEtNt#=67T zZPwn%mL^l2;te!eT(y~P0A6s%1Y;L07@o?tj``-Jxf<7QsPh{E zV2xP`i_X{{<4$MJbB+r(!RW0{aEWFxH 
z?!yj*1EsAMOYEIuW`0r=;*xOv{@nx^*xz7iq4|ARL)VE_M3P!cMjT`fpk)H5OF)fZ zpB>M#-8jjL1WGi|PIdl$V#Xr|95Sfiv#G{|xLTGko*?xDtWeFFbZ6J%%w1N=-G$i~ zAL&F-mUjRU=^wKVSjz&uT$T4H233GH1Er(JiXSbuqpsWa7iazbe36OUG?ng4zwZE@ zfPt3t$x~^+r3E_66;J;2X53?^gc`sF-~GG+HSS-WEpD5HciwJt)vqkb&qo1`4`eCl zm8k03?$3RX27qkUv}mV<6-nCEyf0JJ*_K~8LoUHHD)=MI9nKTiaB0pq08EG%4mTdY z>jA5X2IneTk1PKC=fEA6P`Jz$?Q@(x(%<{dE`2A22c_=>gB;F*2Lk}|3P#xP(2BuB zgp~VW;cO_r@uK?@bO!=y%w7h!@>)BfaQ05r41Iid%FB~@Ja~|QeUpd99{P{i27dG7 zxWQ!o3NSP3E=UReP@3fP5%NgcEGJ ze+zg!6!LLbDn6t?=GLAA5ZPaGa6IVh7dg?NDV5Y0JI>w_ycQ^~$aQx>4ZX#1L_6GO z+xqvFq>9Q4Bi4^QbREm?Imwl5#APSIOMp&63N@ed{C9xH;AtRt#w2KVqhELc?7cv@ ztxp3cm-oFVb+F8y=_~z&PexwPYET^E( zdmp@Y0l-wz16&QoF4PY(t|L1JXNGTFhuF?YJrA-wXq1(j?;rl{*yucef%BU^F-swke$~QPJpmfk^isXO11&7idgP~dF^amf5kO1Zt2$}5@T%rX#SXvvu zSN-XU2nH!xSA(oigN@7&U-y?i3lpb=^Um_X(&*m>+*9^l3OTw=OiVcVsTr)T>h8NX z4A5Fl_lF$q^J*H@@;QPl9j{-SiYS5;3W%mfc+3$V-5f-OKW{*P4P)eFdM^$l2lIb|3Ce<{Y%ELv1qXA0Wu=Ys~pfTBME!N+5kHtDE*3#eAord?uO^V2UTi*U{Qe2p?Yuajh$;%qmZ zeH$Ah^I5nC{QGD$T}q173J2h2F@=a4vlPQezAcSMpt&StUbSDUPWun?2fbCIJDa4E zwQ_89DRY%nl{mtig`>5Sed)m+S50J0+T#T|){tJy<$ujh*TzokIMvT~SuiZ7i139W*6mR?x zTr6oZ)osi_Z_ERr#*5X!ZqeA!M**lwykOv2*%G%*@R#`%M?AUQA!zwTk#&IWCOm@# z0bXn{&jI5~%a{B2ROn&%4&J^2%r?MO>=()oC>*%F?mQE!8DP`j?Z61sZ|7uB2qBK@5o zAUR59P7I5^lKKYj;I7~68=PA`OUDqKwJJ0+7oaJ;Wi_fcm2w}zfK@}1=PHIyqlM0% zJjSl%_{-><76`4?v#q?KF;oF0UB4@KQa!(A#l8P)53Ec=Ma?B_;9c~#@A-5r+|Cgn zpO}aqL)b98I{T(99+hoc#OX9c__dSzEk9=_P{PhivFQ8uTu!@sNr|e*xXwqlAdc;A2>0e;r8rnB=2OfBvDScWZ9 z>ng_-vBq9JjB@A3`Sfe~IC>ob9OuLR0CZNQIM6c9ssiDnjW++Gf#L+y)Ggt^ULPPh z%j24Y)t1=Xy031EfPTvztFBF<8(>d|&Z59))!|qMl*xAP! 
z2pk0(GqL&Je7n+!a0uDB^@5#&CjeTE2g^49byI+=$roYM?)_Q@?#2;O^&i?ks(^p{ z{xoBM`Knqr9tT_YO2pH6nHNc7BMzL0N-G5dgfCZDeMUvw+x{!H% zCFt7P=yQ%+=zU0 z{`>`HWA-)EOJ)jKoY9+L5FQTkuPWR@$Zbw5*uVVNqV-OB+@WFNaIYsWS>y8JE|s~+tkk+T{O`AdCG)ciY#*_}6ysXJa=nVpau|4)Yx$~KAECdx z4_bNv%_n&}`{LMAnDhZ`@-Odl^M;^L@=Vc$OM>YxlllPihdYny3+LaH?`{qwSDOg# zss%1XaSITQ(uo+RPwKYBQLp(lF`@4lqc?Kn0A_ELVTR@Z`S}-FXL=j8nR_+Ds&$vAHE|`F(Er=tz9Y|qYgq;%UyAKtH2!ZM zEB`;V0)7_PE5?I<5pQ&&ScN?$5j(luf>CJo`K~?pi64OFz$Ky5B@Ant_xO*Dg8u^^ zy=4D9dIJo+7n^c#5ON=YHkCH^<{~F zW(9ZQ90~kt;QI4FmiPaAPxycL&j0B$A4Iu1HG@T`nq8lz76sqg)RHUWOnbSG)!jfj z#>Oe2z^pOI6hKC&z2<;v^F|R?ea?@&uKUY$vwj@o?aO}DFI2w^%GftKd4Z4I3)^`i z=%kZU5@i%tCGeyRvTHQa8ZYo}kkN+3Cjs+Z9z8&Soz6YzJE{kpYWVcy^(U(V)g1!F zC%3&dFKmC<>3lG|E=jSZ@7atHX3-IUUPevr2`qb-(oScJP;G<@Z<8=}ALR~bS$sx` zKT|FU+=1Ch+=`aK2&3L5Y5(6=pb{{*gN<>Ox8K{lRm>g7#U4C6x0Z!I&-$8WAgjHN z{fujh5*q&Wg=;*b9W^OHG^KC#!_M^M@R4>k#wF_+D!3FX3|E_0&_3m>+6@$A0OvM7 zSTPkCN$L|M{TPNzJlbGkB8_ABu^y;Jd`1AS>_jx7$`$-&d^ef2LKR;3@3S#KP_s5n z|GnMg`6hrNnp(iW*S?G#OEzoLOXJxS|ZRu#$Ec<(*WF+=lBt!F_UMKv^67%EW`ne(H(jRmR(GXoqE%O@0W`sWO{ zkCY?}G-zQOOsaS+_>R%}-)qEjnrAQ=o~N94s9U8oph`-39?z1z>_ApMBjJ`ju{mJi z;5CF)Z(M@;Pdnz1D8^J3&>+(;qeqINy@So2+2czRUMueUMNCF`^rnR@W}iKC6iseN z+*#^ZW2xldK;Z2dN&(4^|1Gm&7cmKAM<`Ksm9|sQ&Aqx#JZ}e1;N;*f+q8%&l;@4g z#oHk_$M$&?3RQ99OGqR0&0Smut$RAd!Ya#vC3jhq$8wSh9Lx8qI&KFLZt5)*-__3l ze0)o8@zL%vUCqzYqiR**+~966Xno-1%VHf`RyT=!k*W=y3LCj1(dvhW?gpOO3jt$2 zRg-p;k*uaZ6}Nncvc<8gEa{4C8o$=-6^8gDwn=~q+5h>oPq1w*CW4J{v)xc%UkUc) zw~QJeSe~qit2~Vr&z?0;5gXA9w`_D*tZ14NvWn<@rEUMzS%v6)NqbPNk2@+ya^=Va zXY&M>{!3heFYRX*t_+po%r2uIelLUr&73j0Us%Q$xL7B>u?he)^&Ju1O@?LZM2sO%SDh!~E&d>&oEl|UK&hg&Jryzlo zPdVx%r5ozW2Q{xl*s2ctxXeQb-;z6orJ6Qt&{L?q;~d+BnzQASR05y9^08nuX_6^Z zzBFV<;<7o}8t&BQ2D9sLFEyXq{QNEh#!qA-^*zk4bJaJwNt{&ZA+`<1{tQNTK|3u% zV1&f3iSvegcXAOpb8)bm(Pmh_skh9sPXf+o!krs0hm@A8_xdTSBdjvX$!kyr%2zIr zt*Z*&oQ$DR(aX|c#-xEz)NDa&=&Q}HuNCt(YcK`tGRL`7yzV-UxF`nc(F~e(xY;^3 z2wNcx`9B;oFZll>(8S7L_euz{>a96IXI_tD0~+n*pqe2^0&drD@9wY&qzs@qHOLKZ 
zEZJvx)y}VwUjGo1TVCIF3adJ+gg_wBFjF!lcs+Av~2kDo|Mog}!LlR70YYdVh0Mz$Lyb^grv{Fvao*dQUz?lre652i-V&{h(V4E_ zj)b@r;0ryZA}-r*&C$<(TPuRB^3YE6wZrGh74aB`v=70bNiOI1iH~`bNaPeTdi~x} z0l(mDD%7Hw_^4-7`R9juc9B8IX}Xqoqn4e-x}< z+wHZW$0>J)sLwx?{Y$S_{OL06aF4vEu@E3yag+0b!a zc^5~y6HKfmCWU&?s9bceHaia32sYq6Q4@VEbfV%T5YVxx>c`PadAd+doc_y67ame< zqA_#642>%->8a1+0{id)r7r4>Y+boXhM!>}73cQ*_qd-eff7D-k+i$J)89y23uQpb zru%yKKO(8z*b$%whkUQL3}|NdNoKRBhbwFcPVuSwZL+*rQfmO0%!>c$-p7?Y`$WBmQ!PeegWnf4r(O_ znMufTxlq@j&kZ^QCip@CNYgH(0L(~%0sH==Anos`{?s#+K_c?Y%K8?r5z>v@$Txu^ z##mxJ94dt441#IDH9()ssz)Q~TB^a+9bp+%G)8=~zf?&>%ia$zKH zp=S}GYoAp9eKJz(;dI1W60nW62n#^Aef(&tB2cO72$Gs(dJrs2jLHXGdPWXJ|gVh(;0&-#aQ!BwQ z??JTr0w`WY8lH^C-m!$6{q0;{P-p=K*E?z@Ny<2n_aPvnt(r8Mw0o9NlSIVx*n^gMO4{`6CV!l_muV-1J`!sR>ggBOPqUtD8+%A) zRQMU5=x1UJm<@inCe~TUA!$y+o(uC(pIWY`+e=Rrv^@6ZHlUogS(KU~jeI=e(P*-y}~obO4#xcHBK<@vV^wPk0Dz#}cyUR!IPFyC+)jQJOew8y@#2S|t(8xG7JvRjQixSJng7(R> z#rOwh8(Qk2vf%lLdu`#w6&SCR(gQ>WwPg@II(|W`n)4svjL$huzzK1Io9GxbE+oPN^2eO@D=U{=)$T!+i zICc}Wz1@sGntPqg_4K&ybLLS9!`RiHPCy03Mf)gJrIA`c(t?~L;nI%b8hOUt3M6@N zcsDv{R7N1ga{qB2P%m0*R|ur2*O#T9ZonZZX!MRfE+wu!cc^`&^k=3h7}&*HY($}J z;#$mcj=FS< zJ@3g+ZFd8P`D`pxxMxy;Q%5Pm&d|D_1$lYal)VLg5VS(9a9pT)2(ofh7XoXCo z9wX$)xh-Ohmx>rvYCtUGc&k>j8U4S?`_8Z?x~^RnD+oMF(@+)ZA_$>31?jzaLXnct zrME;xLBW7D>4aWF6KPVTRFPgnFCry$gb+I9jORU{`hMU4_nhlG^JfBcWios1Rql1K z*?SEFxQd@Do|St=#VU{YN4}l+kS_o?IlPkJKa>%FTQ|F)o6!2a)r6ITEOT>|=Ka%Vt2Uci5BDMnNfNA;1Aay8@Cus|x@pge(&Y?yeS-1<@ zv^NUV%5&}8BOXX57QW{D2F!Q+!%N&-eXBxM&JrUmEchGqMMp zJr@2jVwb#PosxN z0^Knyo$q($2RZA@Ul{cm;wP!7TxwnPB(i|b*~nJw!ndB8mO>l6)XMAIvi7h35ZDaI zN`AD#*yvi;>d-%AOAZ}F8VN|iA_8YJzK}NgKlyZvq%1`-u)m~Gk>W*A4&7e#Pr<7> z)bvzEl**iSEKPYmo8)AsmrZw$P3^`aJVmW;01rp$q~4%SBl|xbhp7|-XrUbxEO3KH z@Pb@_oegRPnp2SuECFArVhu+zje&a$w7zjvP!6{t!qfjHr}DEHrd;0e2`$MGliQ3; z{F=osgN@qX1%bFQ;s|81&&3!XN2fp*KdO<$@}c#^qwe0|xFY{bRAtU4YJJT5)+;6x zl53SRZmu8Ca|0VPf1&DVSI+U6OujT^gkSIj{ouO?^sE2T@>NDW7r;^9WCcHP#)`l* z2rH|=YIqiY0PjqAHIJ(ld~IWp+WTE7lGBKJq)3%{NAYC@ 
z0oZ3}#ZGtm^4_Itc7vxlb&u!->#s^@8YB+qC1C4bs@yvp`t$L3$_8(|DtIkJeg}-a z^qUz;$l8haJD*hTuP{)%^rAN!!OzfsIcr5i1u?fpvY7n8+q{>KrV({3-7d)oB^3Q! zbt|X86-lS-VbjXftAf_+uW7#Qc59Ar`A}9S1`&Svp1!?#EioFXZ~EaAtvl3*KJmPV z%hXHCqlKA8-VP$WF7wTL^ySYux#H#zY#)V*W*%c* zd5#L*g2b;BbCosA1NH=d(^EVz5# z^jgnPkm`L{sJY)kF!@@fi*>fu+1PXIEBzwHa5y8Cs7dSI-x34heFSB=pp)N8jl;n) zho1S7mMDapdqZ0V zy->g!fPE`e4^w8OJzkOQKpU5ygxT~ngc@2(yuN#7=SOrX?lYBll~GQYYOCTXA*I>K zF>ct}=AD{700sBR_L_0zeUgFZkxgc)5~+IWo!#3kG0>}Y2}58wz)$}ssx1PwIx{`n5fPyn{cK6MK5tE$ z<>4?5FKSdV6ZydH|?Kw01aL%v!1cX1~eQuIK+3lDLj{ z?4<|#?pX+-(*~S2%SRX|15cBRLK0Q}t-U(D$5^kZL12OK@>Q6m_ln$>`o5@N-m|}Q zXv;*12k<_<&Jyz&E+PjIcSO5DJkuZY^>WKea;ymQwMZ%m9G2Sw640mm_lx+6=yFO; zv{sa?Sl-O%gu>l_8=nZ9@?YHtw{(hEU?Z^LPU(de#5#ZjJ3d%b8wMSQ4 z*RsmH$$GySm==k}-b2h)v}z@}99Y~;cC^ux z$0*Wsuq9$h&iRV0LxaBZa1-qgFh=MS5jDWnNISA(0A%sJHYfo56KY;Qs5F#CkD|HS zx600FN0=6xrz4*)V@@kyg3l*fx7fB+2ZoV_v^m#Wo$R4+6jc;>AEB<2=1&6!k(~yY ziAzB``8SRbLoea9&g8{h;#?10HriI`UaI(?vRTE_max3&q^vaApRNM(B6#Mk4cWbS zR2z}D$cZi2q#F&KPQ)+DG)J}nDy``3smd?(*o2$cbc_4kPyYt!qLHGnp*FqU<5z$n z8GcC=fQ>&r4uX4_lyU@W;kv#(uk2ehOl6`wNsk|<^%#>}i>oeUOxlQAD;U*QkdW(C z6Aml9`{p`B-iPyOzY-qy?jDq==d6L~>IR5y73yvhzz;+Pa}X9y@g(;4b;;?KK-|Gw zd#WH`l@fP(s+GVfvd?Js)}ctPXP>EOo=snzd@OG&xzU+FblGO*c$Gz^pwM;`V1q7e zGDvw|jMYkL);7{DF8lQ9j_T7>+m>Nx0zv*Xk@v;rcXi3aigTPwuK)s4Gbj0I3g95Q z6^DOlraJiydx@#fyyS_ilU(L^zdzk}y|gTN@f@L%FL}cAG%4wI8Z}+mB_U7EijoZGd81Rc2h^IVb>DVi z|K96QIONLT4$y~%U(A9QXkYEAkgY*T?HglU`98g@ll6;Qa zMv?zly$fkid^v>@K|U3aXr_}d`BfXS=5o@CBPlV^>ns=^VA2IxntGG_ACjMlM^vIO z%$f^5KnIyvzjp-LL4K^hGX&le78{>i?|-!({Pb>`9CkAzN2^&@m=rPoPK_j}W#o~9 zQ*;@hbQR;nWkLUR!)R*QX|uL9M|u2*AMeP!or&g-RwPeMPTz3lo#zd9{4(SeyzoRa zgyNBsmYa>Lb??{*d3#O$awx+mdwsE&&V~D}A2b2CNXGrgl_4*s0})8v=Q^Qr;;mZV2shmdXw=AiOuCxavAYlhNGVZe(6o9+;@!vQS5k9}MjmHUzN zp@?~ag5bMn>T*_5>pTTJMB?6N3vak1asGh$NcD%hAiD^5VToO&RwQpf@!62}Me6d1 zv2vaJMq1GujUR*P{%Q3_pvEg)4D@70@5S^S69So;Ql&hWy1Uy`qv{2-JKtq>C7%sj z`8*Vq&c5BSxBRZ?Q^VYB*P622P+s}B?h;etkfpe0Uz7BiVQeV3iJNrNz#Z(8-Sk(- 
z$ehf<^Cz`@D}ADdT#3ZM;i&Po_>_|hkcj(WfA8x0GUp;kS;+*=nd)@Vnx^4B^3Ru* zNj}IA8vUqpy;t_8xfM%q-#qVwF#dXvTs;ya6{B^tpP_LUa<*?y9CTfc2q}-)zNS8+ zyI3Sehflssgbn(BYRgUf){C$8^s|F4<+(bcy;x4i%#E7`powUC5JXub>vqGB)_{6e z&8f|jbZ~R+%G}sX>(J)*#f2Pi^i0SqFb09{=NL0Ie32^#<$@8^=#BO?90KqU8IzdbU; zz)%Lsd}ZZxQ#EhF!>J}(nImPw&Z%L|WjnU^qtkQ4eLN|r6R9TR8~cj(@`a_DZ?lpk z^P6XiRdI!E`hUqJH>t)B`>J#8$^*KDSusAnq?}At?>%V=A&*nLkDgvb{ZcM6=x(>~ zd!uW8^6fqpIJgk>WGX?gdZ5_Qbo6FV7tK8o{ivgMOlTn%E1E-{`DQ%>V}=njkb*i) zpC*u{@6z-Tss>8zE0CQyKn#})gA5G{)n@zIH7@aJR@bf>zVC|$!}Hg`)YoRFZVV$r zw}0rTF*=)Y=rL;aPAM7MjZoyIJANV=yP>Jw74M;Ona2mhlKBNis$U7W+DTqYu$%i> zRhl4jc-&yS@oCR6ZEZ{gQV%T+)1J`^2?P1_*CWLBwDSfvtMmdI%cyzP=yjJIx`#J) z+_s*o^1iwYJtwRLdd?fuev_|f4~$n$uoq769VP~2EC@N}4bvXO^MN2YA73r0DS@LC z1r+Xd&Q%o!Z%u4w*e}v-X=C!3*a9r!m$~B0F28_`C>85NGykk-esfZs6FYdj>i}l*ISxZA`7q4ip^sf-y;3iipw%K6H zG=OP59z5q14Sw&$#pKZBa2AXJi~02_>JP`Z&yLiAs|b-|`|5-D4o)Tb@<-Mow^@vn zlN$VOK11N;UHOOGW$Z|g=xx<&xc6mLhOQ0-u}xUjYh}5`ZfhRFBo+Y+LsS&ra?aOdt@wMZ}%Wt#4u#p$>OzoaZ^w{(R4a9_DlE-C5*Tc-8#hR^OAgH zpmi7dZP-hm4*w>sJ!4N?WghD~TWIQahRr;fQ9X2XG3gSzBDKkjEek8$Y7*WE+3b8`wM-YO|W(l;e#U5zBgS84W&157-K=vQ6jvrKaQ=sDH^&&i#jL`nrUi zZLJ7A+wqr2#zP$^twQQ-^q3cztd{I+MN8w`Kd?x-Ubgx0iyj>0UNMXqZ`=0T@wmL5 zWP1_e6#a-4uuJ|cQzSI=Az&klrslH@kyk2oM_s1g(&mj)G5O(9!!K);kBBJ(t@hXxcY& z#)=KyWiA$Yt`UB7-mwN_vga0|D&{6CoVgeNPdHB=^QAjI0yKR4<_`kBSzt{%wt%_N zH?p}*HROKIz*`>yqM^u`;r6F1nOF?`k@+~;Ypv?YQcYR2C(h4~(f;4{25JJ`d9E>3 z#!ik`R1Xw-m{z+9Y&xy0iGM1tE-nP=Ru>%_iT(1eUTV(2&*L6)aM-J-T8IkU;Ei+J z+zj9m8ev9(yPI1!e*m7D)Ub3o9^vHfa$b^{lgZ|cee)^F83E_dBLS4cp_n<6Ey_xv0169;o&)grr1I6qlCbhWJFE0 zEDQS5ff)otC~a@)u1h5D2LX~U>=<)d!0cS6~ek>ERj?~r{FHl*f$ zN2m?e#5;jRJ3Mg;lxRPgsM#X}l^p&N2q);(D;^j4W$wumR!2U!9kecVBaM~kL!_d0 zv3y!9FIRIC`GTP4wn}NcgYj#}3Z!|tGv`_c-mD@PJ+u?GiF~t%(UFmvkvS{4blaHF zmB*R3BSpi5T);R&a?X$4h29@Gc{JGHFl447<#9SP{`J$L2vnW42lL{%J+Db@J#Ze8 zu57cV)P%FPsBlwHki%{*fUBK~uf&6yOmQP;8fJ}FrJkeT5xRr@b0*XDJYw~0fyLXp zke%Rr$i)q&+zUc@`#gbp!DKpg+@|M3Jf|HSIr6Yb&K)0fH|#|i?ds=tz@=R!4Ra2u 
zvRX5XV&^C##?K~D#5H$PcEqkeE_w$}F8gniC!A{Isoik0Wa@IL>F>@G(M|rRc}scX zqh(IsXDj}|15aC;3@xg<x@l?(L{H}Al&kf#_XOQdwc5*E4iVf=< zWxVUjgzlxNLk@E6+GI%b!#*=ip_8{|<z`8lOHu=J5_JZS$*igHfD~e?7lMMT3&)AMmW7;F=gC+{{R!HgTtf}(l z;%DQ)>9GCf4-Rt#`kEKxg>b|nr3Of5pG^vi?D8W`ziVAqR6x9^4?rAcoOH#$uTF>K zowZl(or9BFrd5+6eV0s3;Om;QZb2oG#q7)&1kC<0%P4~15Y^q@Dt2$zS5AH75p&H#Y=UzTorsio|F~fWu zmr1V6ny!_A77Pn4Fd$7k2j4142W{jDSK@S<&Xl}#Q$htulO_tu$~ysl9WL=(eXW9# z7|rnU_1aw}!=b>bDL)O$a=FyRA%iId?kdl}^^*x5ikwUIIA-Zt1rQimA{D!ot~{Un z0Hd)MhhLY#LHml!QmUn zfX%>LsKNktBoLBaH?P7*Hcuwch^Z-LjPAr81_$hg7yuE8YknKL4;eD(c`l^voGO)~ ze1dfpsTz4T_OgA*D5z=B{z_`u&S-3CUfbN1XJfELkbKBMgE5H#4v1@eR4X{~AOgbC zuqPU<2TB*7D}!&Sz;y@ta~WlB6ij_uDSi-<2o8@O4N59J>P;4BM6Wk3r}kGzm{+#;eH4==3cVTrlIPzQ_MwT_q4=pvHcnlpCktu=DbRtL8=RVH&t{YdV^$4x; zF%sgyyuw=r^;miyU^%F+;Hax6&UHxA2&!_N$Z0+;qjRvTb+UQoQg0fV2`zm$XjbVC z?t0(KhwUc)yd@2g$wwFdG)@><@4I$xL15C2o#DDGiMS$7*22vnP7@HwQn>fGm*+P= z#e_nOLdwql)2Ru04fOb&5`-q6V!4E`c)*sH-d7~i)x$=8hgFmCkjW6!KHKa=D8X+- z!I^)eV`WW5=)699Mf9+1pa2zAMLZi@51BGDm%F|3;^X2CAMJYnHGS9$^&e>L3r%)t z(p~6NgIbHP>G$x_2=01}>?oQ}&oZlXpGzgxA3-ss5YRdut0Cobr9V0?bGKdO(Fmt& z>$Pz*R7_ipyxm7?j|kuFFnMiqbK$~A&Mlr0k6o(ZkF+xtM}fJ0MIy36AA2f}tu65L z#^l3u&oRYnxrb|~ObTaDQv$Y>EO_g~Job<4dO41>Rv{sV(A1^(O@zU87@`VjTI|+? 
z7V&vmo!Ue3s=ZIX1281q!MCBnRf6+lMq)*@#9pNZp$|N)Gl`8{<4kH7w*^9nxZ^CN zxOoHtgm_ zGR~eKomj402g=^yuxK#B{0uQk{#);&s_Ou(YmRYu+*w6dKgO-TGA1v;I3;yB#Fd)K ztk%wpr_R=Un}5Zzm#X(nF(^;DBluz0X#Y$Uyo6qjjw20dars*PB;bnStWNL#?93$N zcBGhQ<<+P8(o+E)UZNNu!l4R)xAVFL&E=9fHtT-8`a6#0LXs8J-P7?KlDR4NbXsVR zNl7WXG)h{#(pgDH^nKU;VOOh@`Vk=O)Std%2-{gDq|nV%?FU*)R^|NVK`~%MBay@= z0T11w`u9x>(>@tlXE4I9e_8#}=ceX`s%biWg~jRV>KozxQZg~N)RTp(^oWYB*Ydny z{v@TStg|7;y42KeO|vyG8ha_Bs3WqGAx0+6qwu_AARO)23u3plRcb?mRnU@x)TPCn zUYi^WyxV`@;eCm_)bt!S$I7qXew_IL83==pwD4+-^&eZwhv7CkI*9Px4Q{8CpvL_| ziax0=YBk=f-`~G1udP?{N5Y)pS;#e&QcLnKC50dX#l zln~7^@Kf${8Mk}7yQdloF)QcqUj(#1mq>#oIhQ=p+S&?my&tsxcdiQj<4;z9F2nfu z6#Pdn7=8QL_Ds@L1=kg?;(U!C@q3QXz5lpjAH!202wA7(aT=yQk5~9F3t+%h~~Ypg#7Jkp5I{PA=C?|Xt4S7A=?Bh z@(HG4H>1itPG|eFeMU_sE_5h#z&2=8ltIMtIZZod&_T~{ojrdG7gAdmz=|+k^Y=0Q zsdEov>JKc$Af@pqG@5^rITHjDJlfDdKlN~-58fmr=`-k?*FuF^n~$6Yr2C69`d1=& z{?hDQpgJG-keapgySU;f19AE-^TLg92Zv#VJ4Bl`=ry!o-SjFHJ( zTtR;P4RC(=V2V%~XxnM!>teZA&^8~%ORRznq5i!?EmW)Y?G@d&Ai6=`4r@Sahz5?` z@uq%F#q2P?D0$b8%gBYs^zVXeQj}G1rLmiENBc+SP$wyz4k#bgod(va-Ejx@(`eMF zN%zgHu7u&nb3%tlm`87cZ?eWcaA8z|-I^GWhM~FQxH}XC#i=J0osWhZGle>`^8?3G zQS!FdrFL( zRtKs|>BJk8Bi7i2v^d_tx_w;@e~y#S*gB^2R1gdQiTrE9L=O2`U$_c*2 z`d@tCWhUyL?=<(o$A<9!2I4q*`hLI*;>?YMt<10PhU{_NaCeqsLzEHjtiujmK^@{l5(Y!k)olEzUO8n3K1dAki`|MA%AQD7zC|q zK4dnYA3SFg*`ekTEqIoCcPP80RH_BqcHOY}XWQ9_$Pn3A3TNIv-$SOqBnbN_)YK>F z)5U;4@c*{7&mlURAYg7_6VuP`3mb4!hT-Lcr`p`k#vByzXC0%pLK`zacbp9dnxnkK zm|VTMR4>M~>Evc{)h{h~jeMHfq@W0Uw)pLnh{eI7(Qe4}9j`3??))p^cU8DHNZbh! 
zZbP$eSx3lv?t`-GvyIHx*LZ;pcRszUnlG<_AJl`W?lU&`Q{I*1(?V*(O=2BlF+|r1Kfz8NTQ}G`;UCme#LS`9PV3?2mJh zTYvH%!!v)Fs;DJokZW^Lx!c0^o&|1xy}V( z<}`~vm>C&$+xu^MaejQY^UCe4E5r(LA#Z=X5MQm;Uy0LJd+eKuL3CI2iDH0ka)b=@|lVLeU>`;y&*AcE4o9-%w1uYcP3kKfSs6PmrKgu^C*h6NEZsl@->q zpCXwV!SC5;WfOnyo?2L&N2nNaE>;|^ zn;El6Y3<{Hb_XSx=3ZS4r(sMN>ab~g;tn?Pbh(^9!KQ;?D7J2jnvJoP*|WX<_8?4mE}^7 zw;gu3*3wibeJJzyER-Bi7(?74S#Ep9A_5HG()Mf(E9+5C9uI;GAo+I=bvB++cm%y3jtPxbzJPln=KKhW z#TZ)wj~T?&un=gGIams%tAyfrqbh=@cv6DOTnQ^tfIYq<#gQJY#xJjG-Bu=95+}be z{+H8EVue0wR{=&Lgde$g+?Z~;l{?XxAKG%rXdhcd`Em{7x9VX(+_|rqMS5@BsWUJZ z?O$U&N_X61e^QJ8lDc&IjndT#U&idYDP>zHE}VSWboA(vM;{OGL*G!ztNyzry7?uo`ZhBJC(t*LlZ{5_JL{N3C z(AUihC<%O9@Q3El^CeZPEeR#=TmhV_fr#UQX}Pm?!z--GBWt-~aHgPwwpt^zq7ygp zYM}*RDIDD8x)Igbg!NlRB`1|qOIj@N{=y&2or#*fPetR7M}={;b%1x59uz<;^*_{7 z1$~oU(g}yCgC;{`KtL_*!NvroBCMs~;mXu)UTqv`W}+z(>clC}&?vd5QA3r~9qB;8 z3Hur{U*`dFq4Zqa)~=HrsQ)z%Hbc+!?o)Nwh5YP+`3E>{A0LU89`E$m2oFwc>DLF= zh=VZJ8|sT;a(*6d@}lPA&-|P=wi7Lmd|j7g8#BuSTIe-e`4q_r`ECVP{QB$C3Bz?T zyE+}z+Ld_{tl~QM)ll1JMJI7jL3*10$O%$V#pnHFuO>e-G7{S96UNFu*FC4kGAjL# zNVd@Iia=LYmcb|sjSSeX;==<7de=gQCz#W$5DBj)Zw)&UW6Ua}2o=(!@_j!0^0&u5 z*I)d5Jm=5#{7K6GFJ9P80+Z%`+wSUgr96O(FUa`bk!HAc99VOF%9tJYfc-(Jz>&Py zf=-qzcpQHJp!320j7RxnL)iSIW&CtHl4MTnV?SK7K=|mk+7h<2WBNfOS{PQiXHtX! 
z7cxnYtsKG1?7=Y`jqkJqoaHB1nZn_XeMKeAO^Ko_%0%C<#2)Fb^osIFcQO7^9oOZG z9OO%yj-Axj&~nb!cuU`}2{Ul+tXYKgI&)Wo__3VbtxUbgBtt((7|GC$x7#Tj&&~#h zxin3j^(LyCx;puQ_`zQ}FSB=46~CGJ^;(VK+dls(2@S>Jjf8+SD+eXI7fC{j7nr>y zT9)b6oyBe(y5*~?YJAcKg-$qpUNKX4gico}USOT;E)~=!6k0JbD#K`aM`^1R30BJq zOgp5cN(jNqL2klvqkhkXq@Nq~_I7A!{$q3xUA@%V6L{B~ z4@bk1%LS`rz2X>9sD{J*AvSoBe@PT$cZ)a75ofrS)WeZ#H3e*n=iqP)6eET)DH!DQ zNS!^J>`sL^%hD&$pUl8h+IkM9Jnv*O4J&x{;U^D-hrSHl8NAxV-+hQZ?73hz`7wLm$?IzyMXxQBXJs-hpp3eBfxAujH4zaa@Es0+6 zHP?!EtL1Yr$go0M0TSM~(6w|jGGm9un<7$@@Bt$tBL~8QxP7>*ppau0Mgitp!L~E{ zI`Zj;>(9&o2iJM^gw&yt&|6nQ^%-6ja~^R26z;I-2%aUAmyk%jVl zNBw2lBj&?gEdwIXK=AafeYi=f@Nr)UA)s$Oap?;12W(vEpBWj3(YjSDIfTPxrxL%h zRv!!ll!xGQu}|Vb`&p2I9g7!%zCJ%T@eSQh=>8LkurfzR6CwS*kRm zra&OBo0S(MldbLv$O|?E;pSnkp_#R5q3&X-`t7&e5*95O`d=f4xaS*l%&0I{D zTOeI5;b4W8`8_un_jr)^=5=VM7XcA;TFM41vnoLPchP=JKJQ@8J9^*#bG!};)PDF# zp~#_j!a)g84LaV@vE3NBwxG}mw~2O`H+V#{XfA< zRRfWV9pV1^L9oay_2sd%XYl#nLo9!#3!oYW;J0GEWrA<5jl(8VA&vIk{F^@Ts%zwO zErK{hGW{dl$bZdCk-CGNM=IpB7}Epx^~(LD7ta{)BKxnmq(T^fw&NW`t!C_C*}56P zCwB~9V$C@Z0_GOH&Wkxdf%(l(s?iVr?B)lbD=9)%^p^S8<0jQ4a^W|)Z7b&i-QQQWL5H?Pk^e=zpCRT2l^rD-6NS#aBqAYh0|2l zMPt_+<0$gNU+LtIZ0={4$N=w7=p^hru^5_O?56xp*PUa1cRzi{HcYP8+Ryvh!IIX( zwpS_f*ssz8NJG+LKA383jV7SRc=>6yv-bZ*} zj2YK)9&l#fxW;9E;#{(pL}1bdu>Mi@h0oy>J{Sqz6DgSymcJwR`CIgw>`abc02x0s zN%PiTB`?x~>G-LugRziPV7ZLs0+CVh+`pwFpO+`Pw(n)Ay;s*}bh#oU%ww{|bObFy~0)sm2;jM?TphIP#L_s_^SeZhrGbsTU#$ z6^EYITW+h*z$mUb^$=+v1)ip9MlB~_mk@h%M7L*dx;lps2l=n8V4Bgax;>W$i`h}e zNK_o^SrgkTYv$OszulYQ0xBnN+=eHMss@KT@^aJ9CdO^69+H`tzqs&rPme@vLp(v8 z{ww3BM0Tj9)$foRd_~skd&_htro=06Ws7hTfXFcJvNvZUjnVioX4P6FeXf-97Lz{^ z02z?L1iOvpBhJi?-k0<|CUsKpr$ma*TKl_%7<#x1>86O(Tp6KkU*l?K2hNnkettEJ387J zT!b;dmTBreJu<*3=sSu?OEY{6YLY275Adt#L*0MxqR2uH*Y~mXO9|-NHUjH%>!&`+ ziN@4Um%XyPj(}_(?pPj7w%qtP=Es-if&9+a1}SW?%<1P7AXhM`iFpnmEk1oK%1E*a-^gwM;7Ex8{k$7ZQe>=Ra*L!IB#KWiU zEe)d!0S}}-^7W=Y!=G2Gr%&ex3U~~(*r8%f0l$b0CiINnH%~$z?-_ZBd&QktFsN)u zt6S@le=+v(`qMr)6V>59F35-(U(>wWM>r9k?oo(v#`ADH)GI1FGDDX2k?M!5D}?kk 
z`?P88PpgvAzL`UB)F#sMC(;_>)@jEB1_`yGmw=M~m-C3Z%=FrS(6_*Lw+%Vtby}{4 z{Q4us(xv$YvKX)5dJVjyhFv)8BC#6k%#1GAmmo=wLo~^( zt5XlFA!Vi*{p}6;jnn#>eQbvx%~{zdlvVKoe2iTfpM+x}5wMi{zC7;u?Q&VMXx^&5 zHgJg{w_8VQX=Q65|6O*dgCx6B0X2P`8ns|`s>WrVUZ1Mrs1g4Fd#&y7VcFW3u2pP? zQe7h)dWM;@5&>R_q`%p%u7Cz*hBI&zcY9dEroytdNh;aYo7-A-^ynamk6t+exKV>X zBR62ga(|MGe*iuqFyH6Y(hQhGf^T)swS6dLFJ(d9xF8eYA9FSqwFJ(cCYiFMgXNVqCUXuaxs;!56?2 zQQsQb!!|LqrD43~4~lgZbOijIi!e6)rAV}~IOOce@~M27Vm2DWS~|SF!=-18^CIeW?xSrHdHGdS?4(VYKdD49wAt}XiH&!*xZ z(%BPjDl`Y0&FGz1(>A~F2-hr4(QDoV)9tg7mIRe!%Jz$w-4FolxLE1Y`H6kIDzd#p z1m*2wR-ZLpVuw5}0H`dKg9pM#RL4;~DkoEawTS2oKH4nJP{=a(^^0}wNXik2$?6@+ zTu?Qi0WziCh}blN+^GFfgN^Hf)AW9T3QHwcW|-}c^nRvVn*#FAY#WUx9f5&TDeR_! zpG&?yZCID@eS-I@KzOm z>hc!A>N=l3^m36N(z=&eI=A{;8{r*ezSW)%PrCyhGJgJinqn#yHl#5! z<}0wR)wRs(m`bnQv-$+6_@S)dtUQSWGkL{?BBYgDlor1M5egQeBHEac+Qqw2-*Ut2 ztnJM0uBpi=v+4NaWT5yBHL=koK#foFH6q+Rs#J*{$E?-`Fkf{V?BFQFb}bSJ;SFt{ zi(6@tMS$dzCan)jvyh4{m^n%TE1K$k_sN&qILV5+C$h+@eors^mvUKfExKx?TcH=Z7f7B@8Ts;E5 zBb|dOy~AdJ@RK=Iq#qZ(t5aw;-~cI=PLs>bJvQ5b8*ns^0~}VFtCvfR>fq-Kf4~oA zD}yp?w3mxP(VHp@^YO*g9ZRFzmZz2{5hQbnt*Z`p#POzyw~&YPvy&vruMy=k#g}E6 zBVd<_diA(kjlS}Mhg6p6k>)iQ`)}?n(QZ0^kHg_0bUrpu17GQ zUokVDHydBnl~G}t2(P6ni|&;ulU@ugo-<=Bi_5x|y@B4Qnp&hK`%}h~{0AV}j-@mP zeLdM#H&lIpZ;Q|??ZAS0BOv$HSK>qQVD<*;`j~UVyqu$7)Z1xtwfr5 zoFaEaeG2=if|#9syDf@lqjyLQ4Dm@mU7t$ZTcKck|I^*D-II+}($$4>8K@5!hJy*@ zNnwF0R>N!2w7BvR(sPcH9jnX^x26F)-P6D7S_$^yt$lq|W;9paD50af#3`P+NrHi~ zh+^<-0(WD=VE7|adOK543*d587{#A_li~CBqe+F81RwsTeph{pf6mUl&YMTAlau?y zlQu^IH^7@u#k@t{_;Gx{+rMviGyu@_fP9G$aDM5)0FyGvZU+$k0J;hG(=2}KI6&jhG|F*5YsCDQ+#jS6Fg^Gm7kfPIKS;h=Ce`b1qIZ7fs0ep{qU5&7*6T@dfVWCnsu_D=FGa|p?MM@7; zdmK&eruY3{Nc80bjBC6apc=j31V5Mksgg2XWefIfV(1f%is6j2)BKdHq3FMiOf-qgr$J%0A z>~AD@&cWdK7|uPn`{Tj>U%%kBF;T^J?vY?}dOADw+^<>}aC|sW0{uGEGAok+r_Vot zQYCO01>Ph7b&iP2IS-sHw}Da(aNL)UxeT11w*jm7WmTf^PHb83JpwfxTMx-k!NDT@Rkt#hv2q=mQQlvy`M4CvI z&>;zmN(m%LC{hBrA)y2kAS5CEi_iOh?-*x{cbq@xd7kr~@tq$T_qa2Xd);fUx#nE+ zTGzFnU$-?su=m(r003~n;@V&K0KhH|0I;+5*FEBIfESG)iZ44t?ai+MYVmT*;+x$* 
zmu)Ts03Xu!@$dX1zLyNX<{SzDNPYhK*^#X-bsPY&?y~sn^35oZHPV6laHq?Pd`3Dw zF>$xdrrA*XuV*eDs;s=ku{T$((>(lnNAmsANnhDyoiX3=OE(I}#tcd#?wkKbJ#o1F z*B8vDH>XbQ%xg)oKbevC+ld9kGwJDxiRqneIL(9LPKaZ1Iv1xGrViAfU zbn_97d&I2mk`9#JA$~jFjA=VoeC;w(S=a>tTsoZ?E55s*m@X~8y!>rb{F^twUojEi zoWIg50RY5b{Z-t3z|gNJ6~&ihC;ylK^f^JIC9L}r`TcR)Wb`!2c-MEd72X(1eQ3$v z5Ab9r#=f4bQS5~90k^Xk;Y8DinPlm>BLIM0O=7HPfUQAX$1{E5_gy7KJCg{b&g#rG z5G5T8>rn{x+f}qxHq+n438BP1hi6TeB z+}UaX07~q~=Dl8c{mSE$IeSx%xU-(oChpqb3I36*5PJO3I$^ zsS+&_NmT&Az^R2@{_)Z<7XBNF4GTAlW%>4`J?3>{7W)is_&_WrHSkwJd`X7*g-!;x zdsR4dLENW09mNxPkRWkm_U6G(=Hr%>q#Xb~1No?6E8YD!59VA{I!Df1JpWv{-UL3T z>y;YT7_kghQS{d?jE|ghXQ$0Xact(R`?cE@wl{ICKTFlR`D_k#~yHTG@~fh02Jc-KO}_>vpF3Y zZH%%oQn>Q9o1)ob7P}xjmA#j^(2xM<PTGaN2d? zP^z&+K4npB7if#!!c$%XVxy1MwlQ%fPQs5BU%O$lJyy-lRn_zPwAP?48PR+u37%T; zU*SkEQ<`^4@~w&jJN;p>MUszXMEE(Y>z3pl!s3?YP|J`$fY&7l??EzeMPBO ziY7(Py|Wb_TlE{kvrl|zVTp+TEa;g5Wl!_3#l}oAz`xmq03!#HX<2d66cW>B& zvGaIUdVp@fHX*u~t10g2IC^Qk#Z3dLxngxtTaekDcEpHN{dks*v5R}iKLU6Ye#K<< zw4{l?PVV&_olGnfp_@V&p=kIl2Wu89(=D!1t={oiR)nKi6*>?JobS~;EkZ`9x1?0u z{B2=>6#j~joQOM!l!L8)?dYjfxrU7XRQ%%uaEJD+!y^w2hRMXj`o4DDS4LwhvI4U{a zd<-(?>pcw;t{q89N-ih_(rdEE`%6rcXAwFZvXPjI*hmc9Sdn*yow~d^Q}7XZkRl%N z97yKcS)_O>9Q&11pO%FpNQU`F^B3VUylrN|4%>I{O}WWH!x*D!gw7X9M<|>V>n$g{ zSgq6N`@ju9Q(*zkb3y>J)18Y{F5&mKi&mEx)9_h1b6xgt`-$I~YjgLYIEf&W+Y{+zqnpkS zu|4yOGY2$het2Bjv=a$OlQfcg+i`x7W5e2Bw|p4MDJXwKat8yQynCa%PWz=K!x?Lq zGoiE?=#;)-JTbEaka}1=a5@6NLccmum+dv!=gK4rRAM0lyCIX5*p4wsr$jGfD%?AX zMuP6~nF~=Yy*q0!2-6eB`PG5L9(FOBL^l^z4fW!>o7UcLlFv--mv3PF?3K)O+-tK5 z6N~)9FqYAd=7Ld=Gv4=Nm&f1ql6ROM?8F(^dU~8WQL%Yu{DWN{t22&AGAsnxwx7A9 zJ0r}PQjQZDZBAgfxAD26M6SK;#&Cr{I7!iobguK|PC%n&ZxZaxoj@0zrQJirBL*jZ za-}fCoT>9l8a<`;avZSlp>t=5j*o87=+^vwXVQpJLaP)81%21=HYpW~t}FF;Y-gs; zj2(D+6%1$DnqAYBKa@jeKhH0(LqTGH(AL)O%iCu(Tr2y01}hLAmkRW9KHS!L;*bvH zxMqzb0FbKu)79Crv>FRl zx*nb0AQzIyks|b~!@ra@C~NM0s^+({j=h!?gNx!riM&0xN7U$KWSr89)xM&Q*TkOg zjb>@mq&R8F87IcxAAqsxH+NgFoY5K_4?PeX-Ah>Ggy5PsUq`U8w2jyn%34?8sP-(A 
z&r-xS#@$Y`bCSooNBk%j?D9Tl1>U2tj`0uOx;fd&?NBtHk>cWtr-ebpvR|$RqMfqF z4lK(Q(?WLw=I)+>}RID}pSsy*Hm`5JI?50rF>KL6l$}pfl zJX5!bpWw82e{_4SD7hpl$WRvG3Tp5II@R@ra3Z@UE$H-ZLJTZ2-o}Yt4UU~s>Sp65 z0QU>irC;ik6W8udKMV+0Ht?ZQ<3&~U&WLIKnMNtPOQY)G#^R00owsX05&|TW)$71U z&#>ZbZWJZj-pxk{Xgxjh-`BVyiDg_iX zg%2l%n@6@8Pip_bnBYKmGIU>}t8PWn=tpfhX=VQIy@-Xl!e`IB@kH<>gP@#JIU38ZVu+SgHCde{K# zb!bFnPXgMpwyC!b2`dFJ^2SXNn@Jo^$7oB1iA zHU?-sN-v+-R*Z{Uycn!B^i^A)_61SpDQpc*iWwqoX|)d^8QXQsPgz&h8dq?3A9D{bWVGuvIf=ei^mLoyL&j{dJfz)O zn_lGTRNSYV`uRv5=5iStRCI8i`WtBBC1N%#xai>bVawt~N@AV7 zOmz$TvJJC$IX63l1rU$1MGYoN!v2SY31XwVgZ}u45!8q86idOIx?I=_szN9W%-@#n z0CZ)XSm-thSwyqb@=vQ&Q}#o4`szEl87Lm#yKN>P#zO+xW_D>jR44g`tCYUZjG`Up zv~e>yZgA_NQi{@>Y+uVEksYuz%BXgFMw9rO1MLQrqOMsbLeBi*V?^#a7|f98)NWoo zXLDT>7t-6wW`oH!lAySGThEsegK71K0cv;wa}-Q*zRK8|NT1@59x{O#_C^(Pz7u=u z>P*#?`^!#L0xAiAADNI?v8-wDz5u19&Mb(w=wM?LD~~JIXI9U2mbGkIj6YCW zqM+L1+7eN{QQDYkZLNgLloncjZO#UNV5(Mj_FfIBs56ItO@?Fe+HUVM2eY*`i$tch z4A^1n&g%eAqUplAK&;8K=mrH_AWz5Y)dLUuB4 zejlM%f{uqTK3?_@dKLD`iBcTsyaO=QATCpvpC(v-3M1w7GqO-68NYEib>r?D?Oz4? 
zYBtcUWW_7se4YPUMasLhe6n-vgt%Zl_ZOXe?xQlZR=V*rohvRQ^)vr(8ToIn_WxI^ zT)p+VoWP^++`_xvUDG-`e9o;3B@~XjL)VU&BRtmYzkQIt5!4@sD2Uk{o6n-j0=iZt zIe?CJKX?dp38u*7FEyAW|KvUwEmAw2b^zvjgC`|Dj|7T_oxZ}cVRWY+xcJL!2mkem z$Ny*p35Dt51Kln#W`cw{JdUhc0-R=M(++(X&4iBDl(^D5Bg4ijRZ@zeeYvEndVO0E zx%C^Wxp{Ct^+ugEzW&L$$m1`yj75z#J(c1IVU6o=CC1Sh>=b;=_VW@i#lNjp^U3{G+H~~vpWGrGUJ<=c zFPug1{f|dvrPDoSJcG_g?wDJeZn;o|8++qQkxlkNSa>_mP`+j&vx{;x+oCaGZ2xT$oHL9{PQQR295X7|Z`N ziSItvZ8xE6Mg8$s%MYPBKCYQ9GvQ?Q#auz(Q#g7AF4&$E9M_7v&&CfTLMRtR3x7&1 zo=$Uuvr9TeT3SDfJ+z>upJK6UIAum7a!ftn=McnI-_e-$x;xGJ$PGmhV&2o!c=b;8 zUsC1v>QkWmI7TN%QL1^xmyp01n!q?~Bg=9O(nw>`^lX&floU(S-VVKVXa0_-PN|#4 z_n4nh4tuoy(`ktTaHx3dop}8Q`KD7*ZN(mMv|4Ja?y6vd9-K+n3|Mu8d$7L_K~bdb_iy^ z{am_~AUbLN8}nc+C$j|9vB}z?X1w7GW<&5%QP@_xP@<#w+tM(@miCl_n9Yvs0=L}k z%<7-M1)j;!)`L7;ppUjp>nQArhe~oZG$oMaIW-C2@o6skn=2>^_wg~oXw#@}Q3Rch z6cp-o*ag)BXF^5WL7%w$Fw4Ks|A2Z!vvv)J#5YTowM_@+501wD4UY}!#Kips!${j- z*1DwSoK+n{SSI#Q%Z&srL2c1n|3EgHdWGC6I`$4f`jcpp&b1VSp!4_`I_Ng7ydnG> z=Dt=(aEz>3G=dzvx6sc6(=L?#=M5k23P=r)r)33^ku;#Pfgp zz)z2NzL>a%twSRgfr_X~WFfc(S6 z0Om{SMr~u=S?KJNxM(>~+g;)2T7)7>*1vPRr$8EMV|P27)7KboGuNUiWVzC>$Z5(}$=3&O0A zJjP+M^447R`-(d*ZE~owkLl6q;8wGKlEsQweM2B>Ak@f-yU{1=|I-H@$7nkOj$pM; z7cOp}T?sevMfPISb4(|sxl?XgA+4)bnOU<>xZ5=*teOhQeK%8H|B%rl3YAK(&|Ktx z7qU9A+Tol-%#ER;51|7N%?MbB7~6uniKGsj5&`9Fu`>=_e+_-tS$iO0AE)MQ4+tvAbEV5F^054=X7%qV5@t0LWD-X|qkniiQndYc zgQug3E9$LgC*PWE)!~rGt~z^8Oc3Xxj@A?Lr@AYD?UnwXuCX4geB4X*w-^3$M3dta z_fho>j!!&vIqX7vajq;pTLH2PrQxt~Czh~kpRa>jU zrh;F1A=TedwSr5IDOVw;FXu-pUQ@cos$4Z>UUch!=}Q*8?Qg(^vDvfQbX2+EdD^t9 zm(EKVCv{UK7rV;Rh?8~d@3D4@b7HuTfN7}yBC}f4x}daML2%;2rJC1^_LUkH4FV7L z6HeWiI>cp>Zlgw$1y&cgM2Am!I4F$&r)OEp=Euu$X04Fg(|vbyY&bP|-lF98>dOXt zR=BGo;SdK5hl|moMJQUU#Y(?rm6f-Fh*wNrwQACa3Fce;7bnIW2!#=UZ#BHZ)>-E* zLQ}~+TpQnZgcmEsqxcOemQW;Nzr0~jb%Sv~rVz_{S#VQ<4Nq3Ffx``Ry@%-$Vmr#) za7iZi#6n*@PkTlf6o>)N_ibm^DBHxqU3(jSVj=nc)XGcOI4)Z1_}DVJLS;cj_QM$t zd9dTAKeuk{ch;*QeV@+sP~{6vf9oe_VkHq7hw=|p0{0mq`zF#h+l=hhZ^JJhHv@a% 
zt{zwVOhGMw)VuITHZmgJH*Td391~nZ@Y$S5IX&O6hbkU7{PNcR9fW&&9>{o27yn=G zk9*y)kC{$j^_#OE0b|bXW`)t8K1dJ_G7+v6)rP>QV^aoM_t?&hM1wh;EV?1Hf)$eU z$HmGGt2_41bqzwxYxwOVc4?vQ-q3i*87Har&6dubp3-BVHO4hsG`cro6Gu#<1*^Ed zn?n1Ej89GAxoqEo)s7_IVGmMCc*?8yv%U0fb_Iq-cRmpeyar+3)0g$;sI{}k)_>c?2Q>qoGWUu;7~ zzS{Vkx9{;)O+yQ=HzTTX4N|Ac_t0|oCJ6Qz9TyBm8(BKfAOyfEzh-2+5k71{olaOh z-6JCi(dRk21RM(D4b_l-%U8a0n(P4Ubnfr!6~1oJ-q9$RpQBxCa_`x0rWt!`ReEBU zVVtp1rdVbMG0Meg7p=-L|8%1Od2aX+0(DAjLcwPQ62QuC5co_CI<#ew3j|uk8!V1+ zlS-#IwQt_ZbB4sMmDYHDy!k6)vz5O`N{~uv#ahGBg(GBEqnD6hv>0i)0!cUT&ul=G zdCCITkr-o=vkDX}($$4CI{EiXHLQhsO2n*X-1$+WhwTGWSdZ;uQ<`C9G$hpJ(V_gP zO4`*Ve)ULo*ZFKM!@I)r${Ws=rV@e9cR1GgxDmtPN5g)iyDEtX3Nzpouauu zj}Bj?Ej_e>l<1%J(6gs9xU`W3u0&iODzfnhA^viwH1;dkjnalW?4 za(jy3AjU0sxXieYG$9y?pUtugw-3O>=kMS&>kj#DqK-5Ovex`3oy+P#F)_!ji!a!H zF{fxI7(KC?z??$X?8Ew-XFgCmasxR2)5limMasmR%}M-A2w%%qwW;XlHYla=bX}IB zDsE--^weKP<0t)wKRWa`BZY!0X-;gGcC9~U`ORWN?DCwpnuJqR z!Hu(l=+JO98KI-9>eY~fn;=<-#_8Z1t@ByoRj#Hs)OesoTrTq3Z6FS&F!2>tQ(hTN zvOGoArq)ZX`%bo7LPS(VPjcf}Z)dr$Z^k7s-De^QC(MSbQga6#K3#3@XfXB`+!JX) z9*#kF-gl?{C3=05w{~jEk5z0ix2St^-t{WX{uB|(QH6MM+Y;`^ z*~N zapvVfQpjE1+V&r{Wzt&>gA_)FK=YUT^0f^f7e54hHOLLvsJVh-*YF>4*Et!A%!1{< zn4|@!+pN?6fbMYeDiu`kklX~wC1gw`ew_e|vkD+x+vM({6s*`MHFpZ10Qlg4C$}^~; z$-;-9)T&QKE$Y3M`+l#(`it{IJf_jVFGA0bs-0SJF#c&jk$d|D+36E)D8fN8W}pLI z?#BJ*QxI9tFWAaIm*yWz{tg}f;7g%93H>`hg?kVw`n(`jAC$n(U>Dfh6(64K0RH+N zVfoRw2mj&zgh{0lp6*zA^BJ@yv;ENR8%{aOiPA4muWZ7{$o4URl9-ZcKE*bV)+xkS zMKotX)nNbFa7+aH9huX<)+*JphM(YrgZ?3Lh zb>AL8q-W-{?McT9O+aMJ()UlVD5P*+TS{ktdSIGA^$Tmo+ z)Rb_K>LJ#wC$*?vX>LWjRy*VJE-x;att!u%%YR*&8ylDQLX{2_H3P?bz8xtHkE)uP zY$Ush_GG`9Mr+p?)w(`}OfF8?KGjmc1_`ysDcwMbAXBfsNkkIJ%EXW7RywTGa|x9? zPIYaUDqMZWW!@@WaBcc&VC#L88HWlepO+7y;P=`_^jM=8EFxo}(a(xXSC6tm^&P0I znOQr}8U`5K3CweQ+ze6UmYVkj@BCOf7pm@h$*=6f3!U5iy~=)jPl&^-*8oBSuGb!_ z8YgAw)fbt=+!K2OUZ@V=fj4q^9y4B()oF(v7d+^k4r`kUxg=KH=PuPBp+mM;IzO-`2IvzgMNHEG{3d z#cRbtH71d?_K?__IA2Q5R<24q+b&|ubzwC7w<@E)mIu6x3wHYs>jSQx%=s-xaS6H? 
z?O4;^(!&J=*GYSfKYSXa&{lITgYwa=jiuR?g2Jzk+UuS9Ua_n0rrf~$KS_U7-k>{; z@%yq{!c7}Xg~iB`AJ;#@qX%uw9=Ka%o42I~_EvjL9C>)ApLMg5wb&wheNmKOTl?lV zx!vWjyv}mBm;9$}LjTA|_>2^rYcI8U`y?Wrd@)uK-U>eRCL3~#m1HywXceE3n^g z8BE|hk#H_+Yr&g}Y-ZDC#xQhP7n z=zj^p`;$Uv@gJ$3XoHq=6H{=cvZEUPv`-bn*CHzzIa3^3o!*5Sw+VAn#TpC7_Ukj6 zdV>y{r!}5Y?dh7Jg?feGa5Q23bhvhtqK}s<;OJr8DnnkkF!b(=2C% zU$u+=v;MTZ6#b?sa&;dvHcW_`2$#4}HvdX6RI2^4cKVen6+0440YKk5-JHz!)5l#b zP)=<-WY<<^106vhYEhno8`lvPhz9rbC=Bpq7Y>%}UvAEL}*!}+<7dtPx4Aj5IzC(qIQJ{k?{FUQrM7cc|7hG0k*&_Yy;M0&3&G!$$jf;{M{^WF2*f{wr zAP?Lf&ZTu?R+~1<##f*QPR@uBU|%oHT4iuiyWCymicv9iM+Z_Tzs{%0d(?I~Q*I~* zm+!(1$M`g(%p%sg)zg87E4Q7@hSLi;YCaty0ddiNnoqzYmdz@hiLs}HC!{RZv-qv( zpEauRq){IJ7|jg#Wb2HZH+Cuj`)jfuIkC|4iI6!T14Re)Z-*56?}Rp^vvic!E3Oz% z&AeUV^;A4kE1V11ygaE9ZXOfPuqSJ0mUh+#T}I%NS(}Lm%)e$B=(nbDTkg#)*Uq-Q zmq`d$ucWtz@w9#kcM#Y)!QT4+Ob(S7`lMsP9zf7m=7SptA0@@jX=5xTd}ZzQWsnplkMnwe$`MKz`pjr!_0KAB;C@5a?fJSdX}w>VRN@|HBv10{hU)2Vs=t%uvf9cG6v znC6b%EU*q0IYmA=%)Zi`6jFY+& zr%ehQc)7`6B7)hBwpO})qlIs1zaljfvV6tmX*TRi+rWAK)GE82*zsBY(B+VT48LMT zfD$fb#gcpa+wbg#m2?w&SY65KnRZp-czDl8u_xs ziJM}V=Barg=x9pe}K7`C8?SN5g>3fR$NS12-XV zxY0WEQH8a6;Er30iID9@WaN*5b&sUl0atJ5nkpzIw5VSGTmx-|V*@?{F|<|c@+b=M zhB`YDo3rg*<%u*wG%gDtT>eWJ_swbYCo#jLzs+}LCYw6u4x6}0Ri4GYGj)fhvBEi` z+6ghiVbkvrBriuxi@;zEM{K-_~rjdw8 z*{6aaR(qr#!98^8XBAu5Keyj;Yx9#4vCu2;m23Y+^xp6%F>xS=qMUMW-`#Mt31x8@ zKt4ez*72_^ZIK5;9K$3pFW;;KyD|*Lwa&*wuMXWXL12}ZY^DGQ@2;%p;mi6LW!|H{ zq3=))%SkVmY!Gz0*9%G8AoTj*8+ya}(O+{>#mUp^jlgk-Z0LZIxR^dYn+?f=$|z9C2QGERsvip8HK;Ww}C~R*0kpOK+A{7oM5Aj(%xpF!j3sqwoB4 zVfd8k4-H1{i|!CWeKohk{&GwHde*L2!!BysuS`R7+JrTEBmI>*9KyEpOC5V#I~NKe zX4sbeWhi%3Zau3(h5Vp@9Mj2KsB<4oh;7bpe_NSl=B$NNa2pjPc<6;Dvu;}w_fB0Y zk=S6f*ixgBR$anvj_!w)TMPm3sPX>lCz+L1zwi0;=JSHtoRC+3ShdJkxg&coQdFnd zE~HZrMiJj@9SV<>+eCWQ0lT?y5q+_vc=VmCCT6Lju$qN$daAk z>@L7^66{(P4U+R7KXUUztUF%;d+XIfVwK)y-6Oyx>%@u7VdxJ}4*wsF+qCsqY<-qE?Brr7 zt}@1NQ<}8#n==*-)hKIR;|SxpGtF6w1E0~Q4nzu(%lOt8DLV7L+VXkGcjLSvZGwR& z^}+pIb12LuNxtmu33_1!wY&kgALc~)lX$bPgI`n8)M?@?sAwqhxC(_+(2n>JDI3$- 
zQg->llbi0QHm*8oa(u{enqj^YRqw}%?8o{`YM=uB`+bbUxpndew`SCac(98~P>CO_ zxnB;QC+o^jj~z4!jc5ZErGV&_Eey=={00G}9n4x~NGrrbVkJ>ZbK{5h;d&#> z^Mm|xMm}8w_h(CAL0CHbvpKCXN6qJ#rz16xO<^@kqsm})(GWFDo-;b6LHz2J16BKj z8lH`Xdf+jOh7LtimFiZ+&n>Hr+|+?c3W(?;64Jh2pH;$zwK|Pt?~!B(*2*yB{PPpK z{hI?k)M`nJPhrCVE*ejFhr5p+8fG?L2B)fo+x%Sr$~Xsfh9?BcBzv3Avevofg>b00NH>%*H~YXRPDhBLx-(PZy#uplzEf9>35-D3CJStGu260y zhpF4>UsBXuoypzZbDr54(rST=`JJJ#I7YoYW@69USPH6RMkxotCBw@G@T^xdlHs@C zmXp9po_ zSFjr)C%2%ZfBo#HFvXDJBBkB0_tb?tQgg4oiP9J5Q{a2~IvniHhiZzf2y2;AJ59E7 z*Q~ys!o}tdfL-8ZLsYB^%t!w$u08)Sr6CTL%EcMapT)r(j}F9=(M;L46m>>ncpl_@ zm6?l&I&Q9rrwpXO%MXbAyL{Q^43#eLl9t;t;FaTq*3i*pvcTn)MLn1SpD{-1#vlJM zd(FocyaY;0MZWspWPfuu8~#&Ipg`%ZYDp-DmAn(_`YEdNLeO?c82)fo!vwwzSkJ7{S86j#@UdV; zUNv29Wf?j4&4*Hx;x`%bDnyf!!JGz;H;}xAo5wR}+5XZn^SkXRw0DOpku;0Z8C1pj zH2vY>?Rs=L$hbLw%sZwpH#W5u>L;P!UR`w$btpM2joDD(h_cP6=S(qBARQ`oXpsGa{_WfVZ$G=d!_Rl82$o@4FG4n~`r`7c*XA z4qD$be_6zPxfoWrvtFy;{bB-z3mAI$lVi3q+d&ho{0GMgBJ)pH_5HIg^Z$j{`wyie z;F9SKU%ZttC+^EP_|HtcZEhfV1Z_j(=iRhg_pAPUO2&P>{5!2QBh)n(} zLvb9vw<~Zt>L=IrpP<+LKW=1WXemhwx7<>mc=y8B?7}GI|DL|Ldwp^4h*F-KtwD0<4;< zcE_=NI#hzghTQ&1LfHS~4EwKA1pm7X;o0M*pmU5A$Gg8Z_jn2p-!kuj19%DI6z-oh z-g)nQJCC*&e?vi$(;Kn={iQDEp-O93514PpChX9iX=Pxni5jv_P&lRnIDb||Xhm#b zMQpsDvhgMfDOhD~X&l1~KZ5kaWFf}hZG-A`P2`|KeyN8e)I0i!GZu182bM|s00NR@ zB97l?aH6Nq&Ij}*gN=?A#&KD}Ld=5FRCgEVgjiplPtGOCy|gu`j2VXV=gmvT(X}xR ziMx9jje}HL|FDa_t9x#471fcTNt`(%*4S^L3KWwh+ zN7`<#h>2}RxUcou70o1K>NMEb>fIwI2=;J7nkY)Qeh)KR`;*H#)uAjQ)_`#exH(g^mv#KLO|IC0#*f&=2&YAbM z9Q*`@TOEjhja`M1G7n^u5qqDV+7Vd3|g`c|EN_B(DkWob0*hyOUzo}fwHJ6CDl%3Y!L<%Hi<{);qxs^%PJExr!k z*V9=8ICx(~5KYW@469smdFw4ZrY+BzRTOQoN`i(-yPj_K$djA21-=QJKoPmP=DPjw zV|2axP7soA*KIiS?SOw@l(^9Dl~?FLU;62+zzpE0^m9_w(b6>&?i5j*We!r$@%o)2 zc6Fo=IqFW?o`ZdoX;(R3TYI)oyFhnN`Dn$dlHMB5*u#sgp1A7c-(M}qP+dDz)JK92 zX$MKfiq;h)cDWvITR8o(ZTB?PyRH06aYXk4QkUEcPW$OXNOJox{SEfMH(S=D#!yey zA=*H4;O^XJxRB72l+YwzQp67;bk@U%3MzJC3Ig28vY!)QpN|2~Ti>>ph~#}yrc(FwVZ5i#^{` zM^8*&tC-dQd!NV9fiU|&Z_LaF)q>O1(=zR7!apyo-9I%In44p5S|0FCN}Aeba#J;v 
z%!SR4i@nk6V8NFpfb4mI@*E0Cwy*a)R-GedkJqW%;sk(>YtAntxHGlQqOwh?4&HFx{hW` za#Fd?Da617uj>m@0yCHM)uveD*Kp=W{Yro3Zgw=1e?)Yb7o_!z1!$KU?D@X>2(hT4 z9v(Zli`13lYg2<2@x&esXXui;v|@z}(j_1H9g#$(qZl4Z99yWr&Smr`8tBz}IghtM ztRX|CoNIdSWF`~?Od8zp<@W__^{*|jh~>6aENzeDS)L~ed=)GGco77TIr?4aADuY^7)!YOj?s$b zRt9=n&2+riK}fbT17?PwPZ;36gWBEv+!r*q-|vblZHt!wRB@pa>`2-H*7~y4ve$W3 zNl`Y4rjFE^p15}~9v9`BaRLnPztOoCP}OlB@H$_dj#q)=Wtwii(%@rW%OjleSaLfG zb889NC)5?)i;#%Ve~T@c;ot*3FAJtBl;Z13N?Ju+u-K@QUx>6Xn3T9*Y((uFx7;=} z$gfcBaNMiIO1%|LR;XWGg*cP25Qzf@_s7`=1ZQ%{f$$mBieG}vy@RXCu}%h;EO0y-PUPQm-^_57Ca3Pt;BbHGc01U;CR%)*R71E5tSXDzMzsPQ{Ch>=hwdn`XzYN$ zg~K+q5jk;`1onx$v;PPFr;Z!rG^?hXLI8-V<5) zp}!w1uI`B_#uQE)?2J0IzYe8+a>|CJc1C_3!VlLI70xd2Ih=(%ZCUkdJ5BWA_&5@z#!4;p;Q(2)pJp@yOT^w)c}2ThiE#-0Lmd*Cb+@0gQ;yzMi(lh+TyS+pM+m z-wJTYhI2hLa54?Ef=|FOpuh7tS+47^pDqVN&&!L4qQJuXlKKkoO&xBC$pMKN`G>xT z?7>RaWSH#!*;>rx_>NAKT^P08F>c7ScCRnDmYkwSX~e0&w0ztCezErBnpu~!WBlckpUD@#64~XnkT;i@f<6}p6S~e$ z8tY#Cd zYwGOmdJ>$0We3k@OG>gU6IBOEKFcj?Q6Kk3s3~nmjz*q%t(<+7sdOS-B$;uK%M4(8 zY`h2r+}8a$udz=90~}{Xu|V-K+9f}N))L_BA1Bh%VoPHf@dfhoTTjt^5`P92`6!v>H_u{jz6F<;IqYP*1=oM0&DQM=cG?H z4^Ba+J8|W}J1mGJOj!&R{H%R(=T#{FDM- zWa$%=St^5RD6s#t-rxLb*Yy8eY-T*(NR){es6lRXE8|O^{ZAFE;yV4`aq!Z=tIYa$ z9wzm_w@?3&m)rrw%hTi7_qU4JVRiskv5!gU`RFlJyBI3?&(pN>sQ+6I+WM#2=FWMp bXvf(?U#`U2=OD4O0xZmI|Ej&>`RM-vK7}V@ literal 0 HcmV?d00001 From c218961a6b9d8603e881fde0bc87e2cf058d5c7a Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Sun, 26 Nov 2017 21:22:33 +0800 Subject: [PATCH 058/275] modify for code review by qingqing --- paddle/operators/math/unpooling.cc | 4 ++-- paddle/operators/math/unpooling.cu | 4 ++-- paddle/operators/math/unpooling.h | 6 ++---- paddle/operators/unpool_op.cc | 2 +- 4 files changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index a3a24a6892..d8647c6b23 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -60,9 +60,9 @@ public: void operator()(const 
platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor * input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad) { + const framework::Tensor& output_grad, + framework::Tensor * input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 358847b315..d3eaa48547 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -114,9 +114,9 @@ class Unpool2dMaxGradFunctor { void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor * input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad) { + const framework::Tensor& output_grad, + framework::Tensor * input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 88e88ba117..bf79354ed9 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -14,8 +14,6 @@ limitations under the License. 
*/ #pragma once #include "paddle/framework/tensor.h" -#include "paddle/platform/device_context.h" -#include "paddle/platform/hostdevice.h" namespace paddle { namespace operators { @@ -37,9 +35,9 @@ class Unpool2dMaxGradFunctor { void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor * input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad); + const framework::Tensor& output_grad, + framework::Tensor * input_grad); }; } // namespace math } // namespace operators diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index b5f3d56e96..ada9ce8ce5 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -78,7 +78,7 @@ class UnpoolOp : public framework::OperatorWithKernel { auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Y"); - std::string unpoolingtype = + std::string unpooling_type = ctx->Attrs().Get("unpoolingtype"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); From cfd7721b51c2009bfbc9049d25da5eab6aa29745 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 27 Nov 2017 11:13:07 +0800 Subject: [PATCH 059/275] add unpool_op.h modify --- paddle/operators/unpool_op.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index e22171649e..ae11a9f4f8 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -28,7 +28,7 @@ class UnpoolKernel : public framework::OpKernel { const framework::Tensor* in_x = context.Input("X"); const framework::Tensor* in_y = context.Input("Y"); auto * out = context.Output("Out"); - std::string unpoolingtype = context.Attr("unpoolingtype"); + std::string unpooling_type = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector 
paddings = context.Attr>("paddings"); @@ -53,7 +53,7 @@ class UnpoolGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); framework::Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - std::string unpoolingtype = context.Attr("unpoolingtype"); + std::string unpooling_type = context.Attr("unpoolingtype"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); @@ -65,8 +65,8 @@ class UnpoolGradKernel : public framework::OpKernel { zero(device_ctx, in_x_grad, static_cast(0)); } math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, in_x_grad, - *out, *out_grad); + unpool2d_max_backward(context.device_context(), *in_x, *in_y, + *out, *out_grad, in_x_grad); } }; From cda3a7747a657e630164c6802b9f1382e29c855b Mon Sep 17 00:00:00 2001 From: peterzhang2029 Date: Mon, 27 Nov 2017 12:55:52 +0800 Subject: [PATCH 060/275] bug fix when using hsigmoid with gpu --- .../layers/HierarchicalSigmoidLayer.cpp | 140 ++++++++++++++++-- .../gserver/layers/HierarchicalSigmoidLayer.h | 10 ++ 2 files changed, 134 insertions(+), 16 deletions(-) diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp index d62a8d846e..f93a9937d1 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp @@ -64,49 +64,113 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { batchSize, codeLength_, /* trans */ false, - useGpu(deviceId_)); + false); Matrix::resizeOrCreate(preOutput_.grad, batchSize, codeLength_, /* trans */ false, - useGpu(deviceId_)); - + false); IVectorPtr label = getInput(*getLabelLayer()).ids; - preOutput_.value->zeroMem(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuOutput_, + output_.value->getHeight(), + output_.value->getWidth(), + /* trans */ false, + false); + 
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); + cpuLabel_->copyFrom(*label); + cpuOutput_->copyFrom(*output_.value); + } else { + cpuOutput_ = output_.value; + cpuLabel_ = label; + } /* add the bias-vector */ if (biases_.get() != NULL) { - preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW()); + if (useGpu_) { + Matrix::resizeOrCreate(cpuBias_, + 1, + numClasses_ - 1, + /* trans */ false, + false); + cpuBias_->copyFrom(*biases_->getW()); + } else { + cpuBias_ = biases_->getW(); + } + preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_); } for (size_t i = 0; i < inputLayers_.size() - 1; ++i) { MatrixPtr input = getInputValue(i); + if (useGpu_) { + Matrix::resizeOrCreate(cpuInput_, + input->getHeight(), + input->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeight_, + weights_[i]->getW()->getHeight(), + weights_[i]->getW()->getWidth(), + /* trans */ false, + false); + cpuInput_->copyFrom(*input); + cpuWeight_->copyFrom(*weights_[i]->getW()); + } else { + cpuInput_ = input; + cpuWeight_ = weights_[i]->getW(); + } preOutput_.value->mulByBitCode( - numClasses_, *label, *weights_[i]->getW(), *input); + numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_); } // keep consistent with the clipping in the following softrelu preOutput_.value->clip(-40.0, 40.0); preOutput_.value->sumByBitCode(numClasses_, - *label, - *output_.value, + *cpuLabel_, + *cpuOutput_, -1); // scaleSum preOutput_.value->softrelu(*preOutput_.value); MatrixPtr sum = - Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_)); + Matrix::create(batchSize, 1, /* trans= */ false, false); preOutput_.value->rowSum(*sum); - output_.value->add(*sum); + cpuOutput_->add(*sum); + if (useGpu_) { + output_.value->copyFrom(*cpuOutput_); + } else { + output_.value = cpuOutput_; + } } void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { IVectorPtr label = getInput(*getLabelLayer()).ids; + if (useGpu_) { + 
IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); + cpuLabel_->copyFrom(*label); + } else { + cpuLabel_ = label; + } preOutput_.grad->one(); preOutput_.grad->softreluDerivative(*preOutput_.value); - preOutput_.grad->subByBitCode(numClasses_, *label); + preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_); if (biases_ && biases_->getWGrad()) { + MatrixPtr biases_grad = biases_->getWGrad(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuBias_, + 1, + numClasses_ - 1, + /* trans */ false, + false); + cpuBias_->copyFrom(*biases_grad); + } else { + cpuBias_ = biases_grad; + } preOutput_.grad->addByBitCodeBackward( - numClasses_, *label, *biases_->getWGrad()); - + numClasses_, *cpuLabel_, *cpuBias_); + if (useGpu_) { + biases_grad->copyFrom(*cpuBias_); + } else { + biases_grad = cpuBias_; + } /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); } @@ -115,9 +179,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { /* Calculate the W-gradient for the current layer */ MatrixPtr input = getInputValue(i); if (weights_[i]->getWGrad()) { + MatrixPtr weights_grad = weights_[i]->getWGrad(); + if (useGpu_) { + Matrix::resizeOrCreate(cpuInput_, + input->getHeight(), + input->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeightGrad_, + weights_grad->getHeight(), + weights_grad->getWidth(), + /* trans */ false, + false); + cpuInput_->copyFrom(*input); + cpuWeightGrad_->copyFrom(*weights_grad); + } else { + cpuInput_ = input; + cpuWeightGrad_ = weights_grad; + } preOutput_.grad->mulByBitCodeBackwardWeight( - numClasses_, *label, *weights_[i]->getWGrad(), *input); - + numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_); + if (useGpu_) { + weights_grad->copyFrom(*cpuWeightGrad_); + } else { + weights_grad = cpuWeightGrad_; + } /* Increasing the number of gradient */ weights_[i]->getParameterPtr()->incUpdate(callback); } @@ -125,8 +211,30 @@ void HierarchicalSigmoidLayer::backward(const 
UpdateCallback& callback) { /* Calculate the input layers error */ MatrixPtr inputGrad = getInputGrad(i); if (inputGrad) { + if (useGpu_) { + Matrix::resizeOrCreate(cpuInputGrad_, + inputGrad->getHeight(), + inputGrad->getWidth(), + /* trans */ false, + false); + Matrix::resizeOrCreate(cpuWeight_, + weights_[i]->getW()->getHeight(), + weights_[i]->getW()->getWidth(), + /* trans */ false, + false); + cpuInputGrad_->copyFrom(*inputGrad); + cpuWeight_->copyFrom(*weights_[i]->getW()); + } else { + cpuInputGrad_ = inputGrad; + cpuWeight_ = weights_[i]->getW(); + } preOutput_.grad->mulByBitCodeBackwardError( - numClasses_, *label, *weights_[i]->getW(), *inputGrad); + numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_); + if (useGpu_) { + inputGrad->copyFrom(*cpuInputGrad_); + } else { + inputGrad = cpuInputGrad_; + } } } } diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h index 9afd40b167..2483572ded 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h @@ -80,6 +80,16 @@ protected: int codeLength_; /// temporary result of output_ Argument preOutput_; + + /// The temporary variables in CPU memory. 
+ MatrixPtr cpuWeight_; + MatrixPtr cpuWeightGrad_; + MatrixPtr cpuInput_; + MatrixPtr cpuInputGrad_; + MatrixPtr cpuBias_; + MatrixPtr cpuOutput_; + IVectorPtr cpuLabel_; + }; } // namespace paddle From c8bb66314173e68aec897f8e4a3f988ad227adc0 Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 27 Nov 2017 14:21:34 +0800 Subject: [PATCH 061/275] Refine roi_pool_op to avoid warning --- paddle/operators/roi_pool_op.h | 49 +++++++++++++++------------------- 1 file changed, 21 insertions(+), 28 deletions(-) mode change 100755 => 100644 paddle/operators/roi_pool_op.h diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h old mode 100755 new mode 100644 index bd7736d631..3812c66c65 --- a/paddle/operators/roi_pool_op.h +++ b/paddle/operators/roi_pool_op.h @@ -133,54 +133,47 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); auto* argmax = ctx.Input("Argmax"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = - ctx.Output(framework::GradVarName("X")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); auto pooled_height = ctx.Attr("pooled_height"); auto pooled_width = ctx.Attr("pooled_width"); - if (x_grad) { - int channels = in->dims()[1]; - auto in_stride = framework::stride(in->dims()); - auto roi_stride = framework::stride(rois->dims()); - + if (in_grad) { const int64_t* rois_data = rois->data(); - int rois_num = rois->dims()[0]; - - T* x_grad_data = x_grad->mutable_data(ctx.GetPlace()); + const T* out_grad_data = out_grad->data(); + const int64_t* argmax_data = argmax->data(); + T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + set_zero(ctx.device_context(), in_grad, static_cast(0)); - size_t roi_offset = roi_stride[0]; - size_t batch_offset = in_stride[0]; - size_t channel_offset = in_stride[1]; + auto in_stride = framework::stride(in->dims()); + 
auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out_grad->dims()); - const T* out_grad_data = out_grad->data(); - size_t pool_channel_offset = pooled_height * pooled_width; - const int64_t* argmax_data = argmax->data(); + int rois_num = rois->dims()[0]; + int channels = in->dims()[1]; - for (size_t n = 0; n < rois_num; ++n) { - size_t roi_batch_idx = rois_data[0]; - T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx; + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = rois_data[0]; + T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; for (int c = 0; c < channels; ++c) { for (int ph = 0; ph < pooled_height; ++ph) { for (int pw = 0; pw < pooled_width; ++pw) { - size_t pool_index = ph * pooled_width + pw; - + int pool_index = ph * pooled_width + pw; if (argmax_data[pool_index] >= 0) { - size_t index = static_cast(argmax_data[pool_index]); + auto index = argmax_data[pool_index]; batch_grad_data[index] += out_grad_data[pool_index]; } } } - batch_grad_data += channel_offset; - out_grad_data += pool_channel_offset; - argmax_data += pool_channel_offset; + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + argmax_data += argmax_stride[1]; } - rois_data += roi_offset; + rois_data += roi_stride[0]; } } } From 6c5f928a3e099eb787111c8fe5120118ef2e5155 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 27 Nov 2017 14:27:25 +0800 Subject: [PATCH 062/275] enable inference benchmark --- benchmark/paddle/image/googlenet.py | 2 +- benchmark/paddle/image/resnet.py | 2 +- benchmark/paddle/image/run_mkldnn.sh | 69 ++++++++++++++++++++++++++-- benchmark/paddle/image/vgg.py | 2 +- 4 files changed, 68 insertions(+), 7 deletions(-) diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index a88ecac67d..5b1f0ca006 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py 
@@ -9,7 +9,7 @@ use_gpu = get_config_arg('use_gpu', bool, True) args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list", "test.list", module="provider", obj="process", args=args) settings( batch_size=batch_size, diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py index 6ae1857642..f8c1c2df88 100644 --- a/benchmark/paddle/image/resnet.py +++ b/benchmark/paddle/image/resnet.py @@ -10,7 +10,7 @@ is_test = get_config_arg("is_test", bool, False) args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list", "test.list", module="provider", obj="process", args=args) settings( batch_size=batch_size, diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh index f768f6c29a..c78079fa45 100755 --- a/benchmark/paddle/image/run_mkldnn.sh +++ b/benchmark/paddle/image/run_mkldnn.sh @@ -8,13 +8,13 @@ function train() { use_mkldnn=$4 if [ $4 == "True" ]; then thread=1 - log="logs/${topology}-${layer_num}-mkldnn-${bs}.log" + log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log" elif [ $4 == "False" ]; then thread=`nproc` # each trainer_count use only 1 core to avoid conflict - log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log" + log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log" else - echo "Wrong input $3, use True or False." + echo "Wrong input $4, use True or False." exit 0 fi args="batch_size=${bs},layer_num=${layer_num}" @@ -30,13 +30,74 @@ function train() { 2>&1 | tee ${log} } -if [ ! 
-d "train.list" ]; then +function test() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/test-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + if [ $thread -gt $bs ]; then + thread=$bs + fi + log="logs/test-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or False." + exit 0 + fi + + models_in="models/${topology}-${layer_num}/pass-00000/" + if [ ! -d $models_in ]; then + echo "Training model ${topology}_${layer_num}" + paddle train --job=train \ + --config="${topology}.py" \ + --use_mkldnn=True \ + --use_gpu=False \ + --trainer_count=1 \ + --num_passes=1 \ + --save_dir="models/${topology}-${layer_num}" \ + --config_args="batch_size=128,layer_num=${layer_num}" \ + > /dev/null 2>&1 + echo "Done" + fi + paddle train --job=test \ + --config="${topology}.py" \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=10 \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_test=True" \ + --init_model_path=$models_in \ + 2>&1 | tee ${log} +} + +if [ ! -f "train.list" ]; then echo " " > train.list fi +if [ ! -f "test.list" ]; then + echo " " > test.list +fi if [ ! -d "logs" ]; then mkdir logs fi +if [ ! 
-d "models" ]; then + mkdir -p models +fi + +# inference benchmark +for use_mkldnn in True False; do + for batchsize in 1 2 4 8 16; do + test googlenet v1 $batchsize $use_mkldnn + test resnet 50 $batchsize $use_mkldnn + test vgg 19 $batchsize $use_mkldnn + done +done +# training benchmark for use_mkldnn in True False; do for batchsize in 64 128 256; do train vgg 19 $batchsize $use_mkldnn diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py index 420884ed8e..97f4dbe0e1 100644 --- a/benchmark/paddle/image/vgg.py +++ b/benchmark/paddle/image/vgg.py @@ -9,7 +9,7 @@ layer_num = get_config_arg('layer_num', int, 19) args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list", "test.list", module="provider", obj="process", args=args) settings( batch_size=batch_size, From 20654cf78a051a5079c68de7f7ff69239b063ba8 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 27 Nov 2017 14:54:39 +0800 Subject: [PATCH 063/275] modify for type check rewrite --- paddle/operators/math/unpooling.cc | 20 ++++++------ paddle/operators/math/unpooling.cu | 32 +++++++++---------- paddle/operators/math/unpooling.h | 4 +-- paddle/operators/unpool_op.cc | 26 ++++++++++++--- paddle/operators/unpool_op.cu.cc | 8 ++--- paddle/operators/unpool_op.h | 8 ++--- .../paddle/v2/fluid/tests/test_unpool_op.py | 2 +- 7 files changed, 58 insertions(+), 42 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index d8647c6b23..ab6212f387 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -19,8 +19,8 @@ namespace operators { namespace math { // All tensors are in NCHW format -template -class Unpool2dMaxFunctor { +template +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -35,7 +35,7 @@ class 
Unpool2dMaxFunctor { int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const T * indices_data = indices.data(); + const T2 * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { @@ -54,8 +54,8 @@ class Unpool2dMaxFunctor { -template -class Unpool2dMaxGradFunctor { +template +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -71,7 +71,7 @@ public: const int output_width = output.dims()[3]; int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const T* indices_data = indices.data(); + const T2 * indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); @@ -90,10 +90,10 @@ public: } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index d3eaa48547..c8fd58eca5 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace math { -template +template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const T* indices_data, + const T2 * indices_data, const int input_height, const int input_width, const int channels, @@ -45,10 +45,10 @@ __global__ void KernelUnpool2dMax(const int nthreads, output_data[out_offset + out_index] = input_data[i]; } } -template 
+template __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, - const T* indices_data, + const T2* indices_data, const int input_height, const int input_width, const int channels, @@ -76,8 +76,8 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, /* * All tensors are in NCHW format. */ -template -class Unpool2dMaxFunctor { +template +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -90,7 +90,7 @@ class Unpool2dMaxFunctor { const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const T* input_data = input.data(); - const T* indices_data = indices.data(); + const T2 * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int nthreads = batch_size * output_channels * input_height * input_width; int blocks = (nthreads + 1024 - 1) / 1024; @@ -98,7 +98,7 @@ class Unpool2dMaxFunctor { dim3 grid(blocks, 1); KernelUnpool2dMax< - T><<<<(context) .stream()>>>(nthreads, input_data, indices_data, input_height, input_width, output_channels, @@ -108,8 +108,8 @@ class Unpool2dMaxFunctor { /* * All tensors are in NCHW format. 
*/ -template -class Unpool2dMaxGradFunctor { +template +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -124,7 +124,7 @@ class Unpool2dMaxGradFunctor { const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; const T* input_data = input.data(); - const T* indices_data = indices.data(); + const T2 * indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); @@ -134,7 +134,7 @@ class Unpool2dMaxGradFunctor { dim3 grid(blocks, 1); KernelUnpool2dMaxGrad< - T><<<<(context) .stream()>>>( nthreads, input_data, indices_data, @@ -145,11 +145,11 @@ class Unpool2dMaxGradFunctor { } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index bf79354ed9..e086b891a1 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { namespace math { -template +template class Unpool2dMaxFunctor { public: @@ -29,7 +29,7 @@ class Unpool2dMaxFunctor { framework::Tensor * output); }; -template +template class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index ada9ce8ce5..f00459cd85 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -66,7 +66,15 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { } class UnpoolOp : public 
framework::OperatorWithKernel { - public: +protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + +public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" @@ -102,6 +110,14 @@ class UnpoolOp : public framework::OperatorWithKernel { }; class UnpoolOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } + public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { @@ -118,9 +134,9 @@ namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 4949fc467e..0a1d8b5996 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -16,10 +16,10 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_GPU_KERNEL(unpool_grad, ops::UnpoolGradKernel, + float, int>, ops::UnpoolGradKernel); + double, int>); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index ae11a9f4f8..c294221181 100644 --- a/paddle/operators/unpool_op.h +++ 
b/paddle/operators/unpool_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -37,12 +37,12 @@ class UnpoolKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(context.device_context(), out, static_cast(0)); } - math::Unpool2dMaxFunctor unpool2d_max_forward; + math::Unpool2dMaxFunctor unpool2d_max_forward; unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } }; -template +template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -64,7 +64,7 @@ class UnpoolGradKernel : public framework::OpKernel { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); } - math::Unpool2dMaxGradFunctor unpool2d_max_backward; + math::Unpool2dMaxGradFunctor unpool2d_max_backward; unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, *out_grad, in_x_grad); } diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 106af9f5d9..3fdee9091f 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -53,7 +53,7 @@ class TestUnpoolOp(OpTest): output = self.Unpool2d_forward_naive(input, indices, self.ksize, \ self.strides, self.paddings).astype("float32") self.inputs = {'X': input.astype('float32'), - 'Y': indices.astype('int16')} + 'Y': indices.astype('int32')} self.attrs = { 'strides': self.strides, 'paddings': self.paddings, From f9c2a5c38e3800387aaedcc05bf0e49d0f568a65 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Mon, 27 Nov 2017 15:56:45 +0800 Subject: [PATCH 064/275] modify for code review zcd --- paddle/operators/unpool_op.cc | 4 ++-- paddle/operators/unpool_op.h | 4 ++-- 
python/paddle/v2/fluid/tests/test_unpool_op.py | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index f00459cd85..addceca159 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -46,7 +46,7 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(vector defalut:{0,0}), " "paddings (height, width) of unpooling operator.") .SetDefault({0, 0}); - AddAttr("unpoolingtype", + AddAttr("unpooling_type", "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( @@ -87,7 +87,7 @@ public: auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Y"); std::string unpooling_type = - ctx->Attrs().Get("unpoolingtype"); + ctx->Attrs().Get("unpooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index c294221181..f05d22b49f 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -28,7 +28,7 @@ class UnpoolKernel : public framework::OpKernel { const framework::Tensor* in_x = context.Input("X"); const framework::Tensor* in_y = context.Input("Y"); auto * out = context.Output("Out"); - std::string unpooling_type = context.Attr("unpoolingtype"); + std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); @@ -53,7 +53,7 @@ class UnpoolGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); framework::Tensor* in_x_grad = context.Output(framework::GradVarName("X")); - std::string unpooling_type = context.Attr("unpoolingtype"); + std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = 
context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 3fdee9091f..22826dc1b3 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -58,7 +58,7 @@ class TestUnpoolOp(OpTest): 'strides': self.strides, 'paddings': self.paddings, 'ksize': self.ksize, - 'unpoolingtype': self.unpoolingtype, + 'unpooling_type': self.unpooling_type, } self.outputs = {'Out': output.astype('float32')} @@ -70,7 +70,7 @@ class TestUnpoolOp(OpTest): def init_test_case(self): self.Unpool2d_forward_naive = unpool2dmax_forward_naive - self.unpoolingtype = "max" + self.unpooling_type = "max" self.shape = [6, 4, 5, 5] self.ksize = [3, 3] self.strides = [2, 2] From 6cf2dcbc1f3aa0dd2274a57f910c7666840d4126 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 27 Nov 2017 16:03:35 +0800 Subject: [PATCH 065/275] Add cuda profiler tools. --- paddle/platform/cuda_profiler.h | 70 +++++++++++++++++++ paddle/pybind/pybind.cc | 5 ++ python/paddle/v2/fluid/profiler.py | 59 ++++++++++++++++ python/paddle/v2/fluid/tests/test_profiler.py | 17 +++++ 4 files changed, 151 insertions(+) create mode 100644 paddle/platform/cuda_profiler.h create mode 100644 python/paddle/v2/fluid/profiler.py create mode 100644 python/paddle/v2/fluid/tests/test_profiler.py diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h new file mode 100644 index 0000000000..d3a6e59727 --- /dev/null +++ b/paddle/platform/cuda_profiler.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +namespace paddle { +namespace platform { + +static std::vector kCudaProfileConfiguration = { + "gpustarttimestamp", + "gpuendtimestamp", + "gridsize3d", + "threadblocksize", + "dynsmemperblock", + "stasmemperblock", + "regperthread", + "memtransfersize", + "memtransferdir", + "memtransferhostmemtype", + "streamid", + "cacheconfigrequested", + "cacheconfigexecuted", + "countermodeaggregate", + "enableonstart 0", + "active_warps", + "active_cycles", +}; + +void CudaProfilerInit(std::string output_file, std::string output_mode) { + std::array buf; + std::string tmpl = "/tmp/cuda_profile_config.XXXXXX"; + PADDLE_ENFORCE_LT(tmpl.size(), buf.size()); + memcpy(buf.data(), tmpl.data(), tmpl.size()); + auto result = mktemp(buf.data()); + PADDLE_ENFORCE(strlen(result) != 0); + std::string config = result; + + { + std::ofstream ofs(config, std::ios::out | std::ios::trunc); + PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate()); + for (const auto& line : kCudaProfileConfiguration) { + ofs << line << std::endl; + } + } + + PADDLE_ENFORCE(output_mode == "key_value" || output_mode == "csv"); + cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; + PADDLE_ENFORCE( + cudaProfilerInitialize(config.c_str(), output_file.c_str(), mode)); +} + +void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); } + +void CudaProfilerStop() { PADDLE_ENFORCE((cudaProfilerStop())); } +} +} diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index f55a1edce3..c16d3e0cbe 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -37,6 +37,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/operators/nccl/nccl_gpu_common.h" +#include "paddle/platform/cuda_profiler.h" #include "paddle/platform/gpu_info.h" #endif @@ -460,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle. m.def("op_support_gpu", OpSupportGPU); #ifdef PADDLE_WITH_CUDA m.def("get_cuda_device_count", platform::GetCUDADeviceCount); + + m.def("nvprof_init", platform::CudaProfilerInit); + m.def("nvprof_start", platform::CudaProfilerStart); + m.def("nvprof_stop", platform::CudaProfilerStop); #endif return m.ptr(); diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py new file mode 100644 index 0000000000..b94ef67b48 --- /dev/null +++ b/python/paddle/v2/fluid/profiler.py @@ -0,0 +1,59 @@ +import paddle.v2.fluid.core as core + + +def nvporf_init(output_file, output_mode=None): + """ + Initialize the CUDA profiler. + This methods must be called before nvprof_start. + + :param output_file: The output file name. + :type output_file: string + :param output_mode: The output mode has Key-Value pair format and + Comma separated values format. + It should be 'key-value' or 'csv'. + :type output_mode: string + """ + if output_mode is None: + output_mode = 'csv' + if output_mode != 'key-value' or output_mode != 'csv': + raise ValueError("The output mode must be 'key-value' or 'csv'.") + core.nvprof_init(output_file, output_mode) + + +def nvporf_start(): + """ + Enables profiler collection by the active CUDA profiling tool. 
+ """ + core.nvprof_start() + + +def nvporf_stop(): + """ + Disables profiler collection. + """ + core.nvprof_stop() + + +class profiler(object): + def __init__(self, output_file, output_mode=None, enabled=True): + self.enabled = enabled + if not self.enabled: + return + self.entered = False + nvporf_init(output_file, output_mode) + + def __enter__(self): + if not self.enabled: + return + if self.entered: + raise RuntimeError("The profiler traces are not reentrant") + self.entered = True + nvporf_start() + return self + + def __exit__(self, exc_type, exc_value, tb): + if exc_value is not None: + raise exc_value + if not self.enabled: + return + nvporf_stop() diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py new file mode 100644 index 0000000000..7da7a28cf6 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -0,0 +1,17 @@ +import paddle.v2.fluid.profiler as profiler +import paddle.v2.fluid.layers as layers +import numpy as np + +place = core.GPUPlace(0) +exe = Executor(place) + +epoc = 8 +dshape = [4, 3, 28, 28] +data = layers.data(name='data', shape=dshape, dtype='float32') +conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) + +input = core.LoDTensor() +with profiler("cuda_profiler.txt") as nvprof: + for i in range(epoc): + input.set(np.random.random(dshape).astype("float32"), place) + exe.run(framework.default_main_program(), feed={'data': data}) From 539462839bced49df37f77a06838de5cf6354410 Mon Sep 17 00:00:00 2001 From: peterzhang2029 Date: Mon, 27 Nov 2017 12:57:39 +0800 Subject: [PATCH 066/275] bug fix when using hsigmoid with gpu --- .../layers/HierarchicalSigmoidLayer.cpp | 78 +++++++++---------- .../gserver/layers/HierarchicalSigmoidLayer.h | 1 - 2 files changed, 38 insertions(+), 41 deletions(-) diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp index f93a9937d1..6317b66a45 100644 --- 
a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp @@ -75,10 +75,10 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { if (useGpu_) { Matrix::resizeOrCreate(cpuOutput_, - output_.value->getHeight(), - output_.value->getWidth(), - /* trans */ false, - false); + output_.value->getHeight(), + output_.value->getWidth(), + /* trans */ false, + false); IVector::resizeOrCreate(cpuLabel_, label->getSize(), false); cpuLabel_->copyFrom(*label); cpuOutput_->copyFrom(*output_.value); @@ -90,10 +90,10 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { if (biases_.get() != NULL) { if (useGpu_) { Matrix::resizeOrCreate(cpuBias_, - 1, - numClasses_ - 1, - /* trans */ false, - false); + 1, + numClasses_ - 1, + /* trans */ false, + false); cpuBias_->copyFrom(*biases_->getW()); } else { cpuBias_ = biases_->getW(); @@ -104,15 +104,15 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { MatrixPtr input = getInputValue(i); if (useGpu_) { Matrix::resizeOrCreate(cpuInput_, - input->getHeight(), - input->getWidth(), - /* trans */ false, - false); + input->getHeight(), + input->getWidth(), + /* trans */ false, + false); Matrix::resizeOrCreate(cpuWeight_, - weights_[i]->getW()->getHeight(), - weights_[i]->getW()->getWidth(), - /* trans */ false, - false); + weights_[i]->getW()->getHeight(), + weights_[i]->getW()->getWidth(), + /* trans */ false, + false); cpuInput_->copyFrom(*input); cpuWeight_->copyFrom(*weights_[i]->getW()); } else { @@ -129,8 +129,7 @@ void HierarchicalSigmoidLayer::forward(PassType passType) { *cpuOutput_, -1); // scaleSum preOutput_.value->softrelu(*preOutput_.value); - MatrixPtr sum = - Matrix::create(batchSize, 1, /* trans= */ false, false); + MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false); preOutput_.value->rowSum(*sum); cpuOutput_->add(*sum); if (useGpu_) { @@ -156,16 +155,15 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) 
{ MatrixPtr biases_grad = biases_->getWGrad(); if (useGpu_) { Matrix::resizeOrCreate(cpuBias_, - 1, - numClasses_ - 1, - /* trans */ false, - false); + 1, + numClasses_ - 1, + /* trans */ false, + false); cpuBias_->copyFrom(*biases_grad); } else { cpuBias_ = biases_grad; } - preOutput_.grad->addByBitCodeBackward( - numClasses_, *cpuLabel_, *cpuBias_); + preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_); if (useGpu) { biases_grad->copyFrom(*cpuBias_); } else { @@ -182,15 +180,15 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { MatrixPtr weights_grad = weights_[i]->getWGrad(); if (useGpu_) { Matrix::resizeOrCreate(cpuInput_, - input->getHeight(), - input->getWidth(), - /* trans */ false, - false); + input->getHeight(), + input->getWidth(), + /* trans */ false, + false); Matrix::resizeOrCreate(cpuWeightGrad_, - weights_grad->getHeight(), - weights_grad->getWidth(), - /* trans */ false, - false); + weights_grad->getHeight(), + weights_grad->getWidth(), + /* trans */ false, + false); cpuInput_->copyFrom(*input); cpuWeightGrad_->copyFrom(*weights_grad); } else { @@ -213,15 +211,15 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { if (inputGrad) { if (useGpu_) { Matrix::resizeOrCreate(cpuInputGrad_, - inputGrad->getHeight(), - inputGrad->getWidth(), - /* trans */ false, - false); + inputGrad->getHeight(), + inputGrad->getWidth(), + /* trans */ false, + false); Matrix::resizeOrCreate(cpuWeight_, - weights_[i]->getW()->getHeight(), - weights_[i]->getW()->getWidth(), - /* trans */ false, - false); + weights_[i]->getW()->getHeight(), + weights_[i]->getW()->getWidth(), + /* trans */ false, + false); cpuInputGrad_->copyFrom(*inputGrad); cpuWeight_->copyFrom(*weights_[i]->getW()); } else { diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h index 2483572ded..7f896e61ca 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h +++ 
b/paddle/gserver/layers/HierarchicalSigmoidLayer.h @@ -89,7 +89,6 @@ protected: MatrixPtr cpuBias_; MatrixPtr cpuOutput_; IVectorPtr cpuLabel_; - }; } // namespace paddle From 8a283dbc9e78f8c2f00d04180986abfb7d6b29df Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Mon, 27 Nov 2017 19:13:28 +0800 Subject: [PATCH 067/275] Update docs for fm layer --- .../paddle/trainer_config_helpers/layers.py | 21 ++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 32287cce6c..288aebb5b4 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -7423,18 +7423,25 @@ def factorization_machine(input, Factorization machines. .. code-block:: python - factor_machine = factorization_machine(input=input_layer, factor_size=10) - - :param input: The input layer. + first_order = paddle.layer.fc(input=input, + size=1, + act=paddle.activation.Linear()) + second_order = paddle.layer.factorization_machine(input=input, + factor_size=10) + fm = paddle.layer.addto(input=[first_order, second_order], + act=paddle.activation.Linear(), + bias_attr=False) + + :param input: The input layer. Supported input types: all input data types + on CPU, and only dense input types on GPU. :type input: LayerOutput :param factor_size: The hyperparameter that defines the dimensionality of - the latent vector size + the latent vector size. :type context_len: int :param act: Activation Type. Default is linear activation. :type act: BaseActivation - :param param_attr: The Parameter Attribute. If None, the latent vectors will - be initialized smartly. It's better to set it by - yourself. + :param param_attr: The parameter attribute. See ParameterAttribute for + details. :type param_attr: ParameterAttribute :param layer_attr: Extra Layer config. 
:type layer_attr: ExtraLayerAttribute|None From d4c2f2f219d3719a32f48a0c2975b736cd8f5c02 Mon Sep 17 00:00:00 2001 From: ranqiu Date: Mon, 27 Nov 2017 19:57:56 +0800 Subject: [PATCH 068/275] Refine the doc of layers.py --- .../paddle/trainer_config_helpers/layers.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 469e667e80..b0f21bdb46 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2985,8 +2985,8 @@ def spp_layer(input, A layer performs spatial pyramid pooling. Reference: - Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition - https://arxiv.org/abs/1406.4729 + `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition + https://arxiv.org/abs/1406.4729`_ The example usage is: @@ -3087,8 +3087,8 @@ def img_cmrnorm_layer(input, Response normalization across feature maps. Reference: - ImageNet Classification with Deep Convolutional Neural Networks - http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf + `ImageNet Classification with Deep Convolutional Neural Networks + http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf`_ The example usage is: @@ -3154,9 +3154,9 @@ def batch_norm_layer(input, y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift Reference: - Batch Normalization: Accelerating Deep Network Training by Reducing + `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift - http://arxiv.org/abs/1502.03167 + http://arxiv.org/abs/1502.03167`_ The example usage is: @@ -5413,10 +5413,10 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): to be devided by groups. 
Reference: - Maxout Networks - http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf - Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks - https://arxiv.org/pdf/1312.6082v4.pdf + `Maxout Networks + http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf`_ + `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks + https://arxiv.org/pdf/1312.6082v4.pdf`_ .. math:: y_{si+j} = \max_k x_{gsi + sk + j} @@ -5481,9 +5481,9 @@ def ctc_layer(input, alignment between the inputs and the target labels is unknown. Reference: - Connectionist Temporal Classification: Labelling Unsegmented Sequence Data + `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks - http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf + http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_ Note: Considering the 'blank' label needed by CTC, you need to use (num_classes + 1) @@ -5555,9 +5555,9 @@ def warp_ctc_layer(input, install it to :code:`third_party/install/warpctc` directory. Reference: - Connectionist Temporal Classification: Labelling Unsegmented Sequence Data + `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks - http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf + http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_ Note: - Let num_classes represents the category number. Considering the 'blank' @@ -5777,8 +5777,8 @@ def nce_layer(input, Noise-contrastive estimation. Reference: - A fast and simple algorithm for training neural probabilistic language - models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf + `A fast and simple algorithm for training neural probabilistic language + models. 
https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf`_ The example usage is: @@ -5893,8 +5893,8 @@ def rank_cost(left, A cost Layer for learning to rank using gradient descent. Reference: - Learning to Rank using Gradient Descent - http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf + `Learning to Rank using Gradient Descent + http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf`_ .. math:: @@ -6429,8 +6429,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None): smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if} \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases} Reference: - Fast R-CNN - https://arxiv.org/pdf/1504.08083v2.pdf + `Fast R-CNN + https://arxiv.org/pdf/1504.08083v2.pdf`_ The example usage is: @@ -6636,8 +6636,8 @@ def prelu_layer(input, The Parametric Relu activation that actives outputs with a learnable weight. Reference: - Delving Deep into Rectifiers: Surpassing Human-Level Performance on - ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf + `Delving Deep into Rectifiers: Surpassing Human-Level Performance on + ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf`_ .. math:: z_i &\\quad if \\quad z_i > 0 \\\\ @@ -6733,8 +6733,8 @@ def gated_unit_layer(input, product between :match:`X'` and :math:`\sigma` is finally returned. Reference: - Language Modeling with Gated Convolutional Networks - https://arxiv.org/abs/1612.08083 + `Language Modeling with Gated Convolutional Networks + https://arxiv.org/abs/1612.08083`_ .. 
math:: y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c) From ef3420e2b940d23bbc5cbb1b80d4bca457507257 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Mon, 27 Nov 2017 19:02:42 +0530 Subject: [PATCH 069/275] Fix the latex comment syntax in sgd_op.cc (#5940) * Fix the latex comment syntax in sgd_op.cc * Change \textunderscore to \_ --- paddle/operators/sgd_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 72f4e4d5cb..5576d7b8be 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -55,7 +55,7 @@ SGD operator This operator implements one step of the stochastic gradient descent algorithm. -$$param_out = param - learning_rate * grad$$ +$$param\_out = param - learning\_rate * grad$$ )DOC"); } From 966a442eb0799b6e25d601d2f27affc1cc74aefd Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 27 Nov 2017 21:53:16 +0800 Subject: [PATCH 070/275] fix grep socket error in lscpu command --- python/paddle/v2/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 33a0829ba8..70f61e8499 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -83,11 +83,10 @@ def set_omp_mkl_env_vars(trainer_count): '''Get the number of physical cores''' if platform.system() == "Linux": num_sockets = int( - os.popen("lscpu |grep \"Socket\" |awk -F':' '{print $2}'|xargs") + os.popen("grep 'physical id' /proc/cpuinfo | sort -u | wc -l") .read()) num_cores_per_socket = int( - os.popen( - "lscpu |grep \"per socket\" |awk -F':' '{print $2}'|xargs") + os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l") .read()) return num_sockets * num_cores_per_socket else: From 623f62a7dc9ac46b5f80be3ebc8d6518b03ea295 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 27 Nov 2017 22:01:49 +0800 Subject: [PATCH 071/275] Add cuda profiler tools and expose it in Python. 
--- paddle/platform/cuda_profiler.h | 33 +++++------------ python/paddle/v2/fluid/profiler.py | 29 +++++++++++---- python/paddle/v2/fluid/tests/test_profiler.py | 35 ++++++++++++------- 3 files changed, 53 insertions(+), 44 deletions(-) diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h index d3a6e59727..c096ce37c5 100644 --- a/paddle/platform/cuda_profiler.h +++ b/paddle/platform/cuda_profiler.h @@ -14,33 +14,15 @@ limitations under the License. */ #pragma once #include +#include #include #include namespace paddle { namespace platform { -static std::vector kCudaProfileConfiguration = { - "gpustarttimestamp", - "gpuendtimestamp", - "gridsize3d", - "threadblocksize", - "dynsmemperblock", - "stasmemperblock", - "regperthread", - "memtransfersize", - "memtransferdir", - "memtransferhostmemtype", - "streamid", - "cacheconfigrequested", - "cacheconfigexecuted", - "countermodeaggregate", - "enableonstart 0", - "active_warps", - "active_cycles", -}; - -void CudaProfilerInit(std::string output_file, std::string output_mode) { +void CudaProfilerInit(std::string output_file, std::string output_mode, + std::vector config_flags) { std::array buf; std::string tmpl = "/tmp/cuda_profile_config.XXXXXX"; PADDLE_ENFORCE_LT(tmpl.size(), buf.size()); @@ -52,12 +34,12 @@ void CudaProfilerInit(std::string output_file, std::string output_mode) { { std::ofstream ofs(config, std::ios::out | std::ios::trunc); PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate()); - for (const auto& line : kCudaProfileConfiguration) { + for (const auto& line : config_flags) { ofs << line << std::endl; } } - PADDLE_ENFORCE(output_mode == "key_value" || output_mode == "csv"); + PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE( cudaProfilerInitialize(config.c_str(), output_file.c_str(), mode)); @@ -66,5 +48,6 @@ void CudaProfilerInit(std::string output_file, std::string output_mode) { void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); } void CudaProfilerStop() { PADDLE_ENFORCE((cudaProfilerStop())); } -} -} + +} // namespace platform +} // namespace paddle diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index b94ef67b48..f31d6f0a61 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -1,7 +1,20 @@ import paddle.v2.fluid.core as core +import subprocess +__all__ = ['CudaProfiler'] -def nvporf_init(output_file, output_mode=None): +NV_FLAGS = [ + "gpustarttimestamp", + "gpuendtimestamp", + "gridsize3d", + "threadblocksize", + "streamid", + "enableonstart 0", + "conckerneltrace", +] + + +def nvporf_init(output_file, output_mode=None, flags=None): """ Initialize the CUDA profiler. This methods must be called before nvprof_start. @@ -10,14 +23,15 @@ def nvporf_init(output_file, output_mode=None): :type output_file: string :param output_mode: The output mode has Key-Value pair format and Comma separated values format. - It should be 'key-value' or 'csv'. + It should be 'kv' or 'csv'. 
:type output_mode: string """ if output_mode is None: output_mode = 'csv' - if output_mode != 'key-value' or output_mode != 'csv': + if output_mode not in ['kv', 'csv']: raise ValueError("The output mode must be 'key-value' or 'csv'.") - core.nvprof_init(output_file, output_mode) + flags = NV_FLAGS if flags is None else flags + core.nvprof_init(output_file, output_mode, flags) def nvporf_start(): @@ -34,13 +48,14 @@ def nvporf_stop(): core.nvprof_stop() -class profiler(object): - def __init__(self, output_file, output_mode=None, enabled=True): +class CudaProfiler(object): + def __init__(self, output_file, output_mode=None, flags=None, enabled=True): self.enabled = enabled if not self.enabled: return self.entered = False - nvporf_init(output_file, output_mode) + self.out_file = output_file + nvporf_init(output_file, output_mode, flags) def __enter__(self): if not self.enabled: diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py index 7da7a28cf6..1fec5c99bf 100644 --- a/python/paddle/v2/fluid/tests/test_profiler.py +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -1,17 +1,28 @@ +import unittest +import numpy as np +import paddle.v2.fluid as fluid import paddle.v2.fluid.profiler as profiler import paddle.v2.fluid.layers as layers -import numpy as np -place = core.GPUPlace(0) -exe = Executor(place) -epoc = 8 -dshape = [4, 3, 28, 28] -data = layers.data(name='data', shape=dshape, dtype='float32') -conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) +class TestProfiler(unittest.TestCase): + def test_nvprof(self): + if not fluid.core.is_compile_gpu(): + return + epoc = 8 + dshape = [4, 3, 28, 28] + data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') + conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) + + place = fluid.GPUPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + with profiler.CudaProfiler("cuda_profiler.txt", 'csv') as 
nvprof: + for i in range(epoc): + input = np.random.random(dshape).astype("float32") + exe.run(fluid.default_main_program(), feed={'data': input}) + -input = core.LoDTensor() -with profiler("cuda_profiler.txt") as nvprof: - for i in range(epoc): - input.set(np.random.random(dshape).astype("float32"), place) - exe.run(framework.default_main_program(), feed={'data': data}) +if __name__ == '__main__': + unittest.main() From bf360c7746db9a5084fb58dc73452cc19048f54a Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 27 Nov 2017 22:19:59 +0800 Subject: [PATCH 072/275] fix pipe_reader unimport packages --- python/paddle/v2/reader/decorator.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 0695542690..7e457f987d 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -14,13 +14,16 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers' + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader' ] +from threading import Thread +import subprocess + +from Queue import Queue import itertools import random -from Queue import Queue -from threading import Thread +import zlib def map_readers(func, *readers): From 9abc0e04c1974ad16bf27d783dcb6b53da315a73 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 27 Nov 2017 19:04:07 +0800 Subject: [PATCH 073/275] fix conv and conv_trans op doc --- paddle/operators/conv_op.cc | 61 ++++++++++------- paddle/operators/conv_transpose_op.cc | 90 +++++++++++++++----------- paddle/operators/conv_transpose_op.h | 1 - paddle/operators/pool_op.cc | 24 +++---- paddle/operators/pool_with_index_op.cc | 18 +++--- 5 files changed, 108 insertions(+), 86 deletions(-) diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 7a36a9b21a..462e6d9cbc 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc 
@@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, .SetDefault({0, 0}); AddAttr( "groups", - "(int default:1), the group size of convolution operator. " + "(int default:1), the groups number of the convolution operator. " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "when group=2, the first half of the filters is only connected to the " "first half of the input channels, while the second half of the filters " @@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, Convolution Operator. The convolution operation calculates the output based on the input, filter -and strides, paddings, groups, dilations parameters. The size of each dimension of the +and strides, paddings, dilations, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. -Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch +Input(Input) and Output(Output) are in NCHW format. Where N is batch size, C is the number of channels, H is the height of the feature, and W is -the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements. -These two elements represent height and width, respectively. +the width of the feature. +Filters(Input) is MCHW format. Where M is the number of output image channels, C is +the number of input image channels, H is the height of the filter, and W +is the width of the filter. +Parameters(strides, paddings, dilations) are two elements. These two elements represent +height and width, respectively. The input(X) size and output(Out) size may be different. 
Example: Input: - Input shape: (N, C_in, H_in, W_in) - Filter shape: (C_out, C_in, H_f, W_f) + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ Output: - Output shape: (N, C_out, H_out, W_out) - where - H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1; - W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1; + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where +$$ + H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 +$$ )DOC"); } @@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, .SetDefault({0, 0, 0}); AddAttr( "groups", - "(int default:1), the group size of convolution operator. " + "(int default:1), the groups number of the convolution operator. " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "when group=2, the first half of the filters is only connected to the " "first half of the input channels, while the second half of the filters " @@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, AddAttr>("dilations", "(vector default:{1, 1, 1}), the " "dilations(d_dilation, h_dilation, w_dilation) of " - "convolution operator. Currently, conv3d doesn't " - "support dilation.") + "convolution operator.") .SetDefault({1, 1, 1}); AddComment(R"DOC( Convolution3D Operator. The convolution operation calculates the output based on the input, filter -and strides, paddings, groups parameters. The size of each dimension of the +and strides, paddings, dilations, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. -Input(Input, Filter) and output(Output) are in NCDHW format. 
Where N is batch +Input(Input) and output(Output) are in NCDHW format, where N is batch size, C is the number of channels,D is the depth of the feature, H is the height of -the feature, and W is the width of the feature. Parameters(ksize, strides, paddings) -are three elements. These three elements represent depth, height and width, respectively. +the feature, and W is the width of the feature. +Filters(Input) is MCDHW format, where M is the number of output image channels, +C is the number of input image channels, D is the depth of the filter, +H is the height of the filter, and W is the width of the filter. +Parameters(strides, paddings, dilations) are three elements. These three elements +represent depth, height and width, respectively. The input(X) size and output(Out) size may be different. Example: Input: - Input shape: (N, C_in, D_in, H_in, W_in) - Filter shape: (C_out, C_in, D_f, H_f, W_f) + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$ Output: - Output shape: (N, C_out, D_out, H_out, W_out) - where - D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1; - H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1; - W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1; + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\ + H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1 + $$ )DOC"); } diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 3e55ef036a..678b192dea 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -39,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { "ConvTransposeOp input 
dimension and strides dimension should " "be consistent."); PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), - "ConvTransposeOp paddings dimension and Conv strides " + "ConvTransposeOp paddings dimension and strides " "dimension should be the same."); PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], "In ConvTransposeOp, The input channel should be the same " @@ -62,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( "The format of input tensor is NCHW. Where N is batch size, C is the " "number of input channels, H is the height of the feature, and " "W is the width of the feature."); - AddInput("Filter", - "(Tensor) The filter tensor of convolution transpose operator. " - "The format of the filter tensor is CMHW, where C is the number of " - "output image channels, M is the number of input image channels, " - "H is the height of the filter, and W is the width of the filter. " - "We enforce groups number == 1 and padding == 0 in " - "the convolution transpose scenario."); + AddInput( + "Filter", + "(Tensor) The filter tensor of convolution transpose operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels," + "H is the height of the filter, and W is the width of the filter. " + "We enforce groups number == 1 in the convolution transpose scenario."); AddOutput("Output", "(Tensor) The output tensor of convolution transpose operator. " "The format of output tensor is also NCHW."); AddAttr>( "strides", - "(vector defalut:{1, 1}), the strides(h_stride, w_stride) of " + "(vector default:{1, 1}), the strides(h_stride, w_stride) of " "convolution transpose operator.") .SetDefault({1, 1}); AddAttr>( "paddings", - "(vector defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution " + "(vector default:{0, 0}), the paddings(h_pad, w_pad) of convolution " "transpose operator.") .SetDefault({0, 0}); AddComment(R"DOC( @@ -88,21 +89,26 @@ Convolution2D Transpose Operator. 
The convolution transpose operation calculates the output based on the input, filter and strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. - -Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch -size, C is the number of channels, H is the height of the feature, and -W is the width of the feature. Parameters(ksize, strides, paddings) are two elements. -These two elements represent height and width, respectively. +Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Filter(Input) is in MCHW format. Where M is the number of input feature channels, +C is the number of output feature channels, H is the height of the filter, +and W is the width of the filter. +Parameters(strides, paddings) are two elements. These two elements represent height +and width, respectively. The input(X) size and output(Out) size may be different. + Example: Input: - Input shape: (N, C_in, H_in, W_in) - Filter shape: (C_in, C_out, H_f, W_f) + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ Output: - Output shape: (N, C_out, H_out, W_out) - where - H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; - W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\ + W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f + $$ )DOC"); } @@ -117,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( "W is the width of the feature."); AddInput("Filter", "(Tensor) The filter tensor of convolution transpose operator." 
- "The format of the filter tensor is CMDHW, where C is the number of " - "output image channels, M is the number of input image channels, D " + "The format of the filter tensor is MCDHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels, D " "is the depth of the filter, H is the height of the filter, and " "W is the width of the filter." "We enforce groups number == 1 and padding == 0 in " @@ -130,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( "the number of channels, D is the depth of the feature, H is the " "height of the feature, and W is the width of the feature."); AddAttr>("strides", - "(vector defalut:{1, 1, 1}), the " + "(vector default:{1, 1, 1}), the " "strides{d_stride, h_stride, w_stride} of " "convolution transpose operator.") .SetDefault({1, 1, 1}); AddAttr>("paddings", - "(vector defalut:{0, 0, 0}), paddings(d_pad, " + "(vector default:{0, 0, 0}), paddings(d_pad, " "h_pad, w_pad) of convolution transpose operator.") .SetDefault({0, 0, 0}); AddComment(R"DOC( @@ -144,23 +151,28 @@ Convolution3D Transpose Operator. The convolution transpose operation calculates the output based on the input, filter and strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. - -Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch -size, C is the number of channels, D is the depth of the feature, -H is the height of the feature, and W is the width of the feature. -Parameters(ksize, strides, paddings) are three elements. -These three elements represent depth, height and width, respectively. +Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the +number of channels, D is the depth of the feature, H is the height of the feature, +and W is the width of the feature. +Filter(Input) is in MCDHW format. 
Where M is the number of input feature channels, +C is the number of output feature channels, D is the depth of the filter,H is the +height of the filter, and W is the width of the filter. +Parameters(strides, paddings) are three elements. These three elements represent +depth, height and width, respectively. The input(X) size and output(Out) size may be different. -Example: + +Example: Input: - Input shape: (N, C_in, D_in, H_in, W_in) - Filter shape: (C_in, C_out, D_f, H_f, W_f) + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ Output: - Output shape: (N, C_out, D_out, H_out, W_out) - where - D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; - H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; - W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2]; + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\ + H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\ + W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f + $$ )DOC"); } diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 0fc0735788..1cacb770e6 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -63,7 +63,6 @@ class GemmConvTransposeKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); - // TODO(Zhuoyuan): Paddings can be added in future. // groups will alway be disabled in conv2dtranspose. const int batch_size = static_cast(input->dims()[0]); diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index d8c58618cf..e26ffd86e5 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -105,7 +105,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, // TypedAttrChecker don't support vector type.) 
AddAttr>( "paddings", - "(vector, defalut {0,0}), paddings(height, width) of pooling " + "(vector, default {0,0}), paddings(height, width) of pooling " "operator." "If global_pooling = true, paddings and ksize will be ignored.") .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, @@ -122,15 +122,15 @@ Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. The input(X) size and output(Out) size may be different. -Example: +Example: Input: X shape: $(N, C, H_{in}, W_{in})$ Output: Out shape: $(N, C, H_{out}, W_{out})$ - where + Where $$ - H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ )DOC"); @@ -177,7 +177,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut {0,0,0}), paddings(depth, height, " + "(vector, default {0,0,0}), paddings(depth, height, " "width) of pooling operator. " "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. 
(Currently, @@ -199,12 +199,12 @@ Example: X shape: $(N, C, D_{in}, H_{in}, W_{in})$ Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ - where - $$ - D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ - W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 - $$ + Where + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ )DOC"); } diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index 4958fa6454..b9c42a6912 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -142,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut:{0, 0}), paddings(height, width) of pooling " + "(vector, default:{0, 0}), paddings(height, width) of pooling " "operator. " "If global_pooling = true, paddings and will be ignored.") .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, @@ -166,10 +166,10 @@ Example: Output: Out shape: $(N, C, H_{out}, W_{out})$ Mask shape: $(N, C, H_{out}, W_{out})$ - where + Where $$ - H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ )DOC"); @@ -220,7 +220,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { // TypedAttrChecker don't support vector type.) 
AddAttr>( "paddings", - "(vector, defalut {0,0,0}), paddings(depth, " + "(vector, default {0,0,0}), paddings(depth, " "height, width) of pooling operator. " "If global_pooling = true, paddings and ksize will be ignored.") .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, @@ -244,11 +244,11 @@ Example: Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$ - where + Where $$ - D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ - W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ )DOC"); From 57e68e574026d2853b6fcec069647322959493b7 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 28 Nov 2017 08:15:33 +0800 Subject: [PATCH 074/275] modify for code review by qingqing 2nd --- paddle/operators/math/unpooling.cu | 48 +++++++++--------- paddle/operators/unpool_op.cc | 49 ++++++++----------- paddle/operators/unpool_op.cu.cc | 18 +++---- paddle/operators/unpool_op.h | 6 +-- .../paddle/v2/fluid/tests/test_unpool_op.py | 36 +++++++------- 5 files changed, 72 insertions(+), 85 deletions(-) diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index c8fd58eca5..99e6fd052a 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -29,19 +29,19 @@ __global__ void KernelUnpool2dMax(const int nthreads, T* output_data, const int output_height, const int output_width) { - int bsize = input_height * input_width * channels; - int csize = input_height * input_width; - int out_bsize = output_height * output_width * channels; - int out_csize = output_height * output_width; + int in_n_stride = input_height * input_width * channels; 
+ int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (int i = index; i < nthreads; i += offset) { - int bidx = i / bsize; - int boffset = i % bsize; - int cidx = boffset / csize; - int out_offset = bidx * out_bsize + cidx * out_csize; + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < (output_height * output_width)); + PADDLE_ASSERT(out_index < out_c_stride); output_data[out_offset + out_index] = input_data[i]; } } @@ -57,19 +57,19 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, const int output_height, const int output_width, T* input_grad) { - int bsize = input_height * input_width * channels; - int csize = input_height * input_width; - int out_bsize = output_height * output_width * channels; - int out_csize = output_height * output_width; + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; int index = blockIdx.x * blockDim.x + threadIdx.x; int offset = blockDim.x * gridDim.x; for (int i = index; i < nthreads; i += offset) { - int bidx = i / bsize; - int boffset = i % bsize; - int cidx = boffset / csize; - int out_offset = bidx * out_bsize + cidx * out_csize; + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < (output_height * output_width)); + PADDLE_ASSERT(out_index < out_c_stride); input_grad[i] = output_grad[out_offset + out_index]; } } @@ -93,10 
+93,8 @@ class Unpool2dMaxFunctor { const T2 * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int nthreads = batch_size * output_channels * input_height * input_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax< T, T2><<(context) @@ -129,10 +127,8 @@ class Unpool2dMaxGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int nthreads = batch_size * output_channels * input_height * input_width; - int blocks = (nthreads + 1024 - 1) / 1024; - dim3 threads(1024, 1); - dim3 grid(blocks, 1); - + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad< T, T2><<(context) diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index addceca159..49a5129188 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. */ + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+Indicesou may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/unpool_op.h" namespace paddle { @@ -25,7 +25,7 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The input tensor of unpool operator. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); - AddInput("Y", + AddInput("Indices", "(Tensor) The input tensor of the indices given out by MaxPool2d. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); @@ -50,12 +50,10 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( - "input: the input Tensor to invert - indices: the indices given out by MaxPool2d - ksize – Size of the max pooling window. - stride – Stride of the max pooling window. - "It is set to kernel_size by default. 
- padding – Padding that was added to the input" + "Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 + /07/iccv2011.pdf + PyTorch: http://pytorch.org/docs/master/nn.html?highlight=unpool# + torch.nn.MaxUnpool2d" )DOC"); } }; @@ -79,27 +77,20 @@ public: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of UnpoolOp" + PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" "should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UnpoolOp should not be null."); - auto in_x_dims = ctx->GetInputDim("X"); - auto in_y_dims = ctx->GetInputDim("Y"); + auto in_y_dims = ctx->GetInputDim("Indices"); std::string unpooling_type = ctx->Attrs().Get("unpooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE(in_x_dims.size() == 4, "Unpooling intput must be of 4-dimensional."); - for (int i = 0; i < 4; ++i) { - PADDLE_ENFORCE(in_x_dims[i] == in_y_dims[i], - "X size must be eq Y size!"); - } - - + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); for (size_t i = 0; i < ksize.size(); ++i) { output_shape.push_back( diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 0a1d8b5996..9b5ac667d3 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+Indicesou may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ #include "paddle/operators/unpool_op.h" diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index f05d22b49f..dfd4ef12b5 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -2,7 +2,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at +Indicesou may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 @@ -26,7 +26,7 @@ class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* in_y = context.Input("Y"); + const framework::Tensor* in_y = context.Input("Indices"); auto * out = context.Output("Out"); std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = context.Attr>("ksize"); @@ -47,7 +47,7 @@ class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { const framework::Tensor* in_x = context.Input("X"); - const framework::Tensor* in_y = context.Input("Y"); + const framework::Tensor* in_y = context.Input("Indices"); const framework::Tensor* out = context.Input("Out"); const framework::Tensor* out_grad = context.Input(framework::GradVarName("Out")); diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 22826dc1b3..b3c6c85025 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -5,16 +5,16 @@ from op_test import OpTest def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): s0, s1, s2, s3 = input.shape - out_H=(s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] - out_W=(s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1] - out = np.zeros((s0, s1, out_H, out_W)) + out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] + out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1] + out = np.zeros((s0, s1, out_hsize, out_wsize)) for nidx in xrange(s0): for cidx in xrange(s1): for h in xrange(s2): for w in xrange(s3): index = indices[nidx, cidx, h, w] - hidx = (index - index % out_W) / out_W - widx = index % out_W + hidx = (index - index % 
out_wsize) / out_wsize + widx = index % out_wsize out[nidx, cidx, int(hidx), int(widx)] = \ input[nidx, cidx, h, w] @@ -26,34 +26,34 @@ class TestUnpoolOp(OpTest): self.op_type = "unpool" self.init_test_case() pre_input = np.random.random(self.shape).astype("float32") - N, C, H, W = pre_input.shape - H_out = (H - self.ksize[0] + 2 * self.paddings[0]) / \ + nsize, csize, hsize, wsize = pre_input.shape + hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \ self.strides[0] + 1 - W_out = (W - self.ksize[1] + 2 * self.paddings[1]) / \ + wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \ self.strides[1] + 1 - input = np.zeros((N, C, H_out, W_out)) - indices = np.zeros((N, C, H_out, W_out)) - for i in xrange(H_out): - for j in xrange(W_out): + input = np.zeros((nsize, csize, hsize_out, wsize_out)) + indices = np.zeros((nsize, csize, hsize_out, wsize_out)) + for i in xrange(hsize_out): + for j in xrange(wsize_out): r_start = np.max((i * self.strides[0] - self.paddings[0], 0)) r_end = np.min((i * self.strides[0] + self.ksize[0] - \ - self.paddings[0], H)) + self.paddings[0], hsize)) c_start = np.max((j * self.strides[1] - self.paddings[1], 0)) c_end = np.min((j * self.strides[1] + self.ksize[1] - \ - self.paddings[1], W)) - for nidx in xrange(N): - for cidx in xrange(C): + self.paddings[1], wsize)) + for nidx in xrange(nsize): + for cidx in xrange(csize): x_masked = pre_input[nidx, cidx, r_start:r_end, \ c_start:c_end] input[nidx, cidx, i, j] = x_masked.max() arg = x_masked.argmax() indices[nidx, cidx, i, j] = \ - (r_start + arg / self.ksize[1]) * W + \ + (r_start + arg / self.ksize[1]) * wsize + \ c_start + arg % self.ksize[1] output = self.Unpool2d_forward_naive(input, indices, self.ksize, \ self.strides, self.paddings).astype("float32") self.inputs = {'X': input.astype('float32'), - 'Y': indices.astype('int32')} + 'Indices': indices.astype('int32')} self.attrs = { 'strides': self.strides, 'paddings': self.paddings, From 
85e6906f0b1b301bd4218b2534e05c2f8961fd79 Mon Sep 17 00:00:00 2001 From: ranqiu Date: Tue, 28 Nov 2017 10:19:59 +0800 Subject: [PATCH 075/275] Refine the doc of layers.py --- .../paddle/trainer_config_helpers/layers.py | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b0f21bdb46..4bd94861af 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2986,7 +2986,7 @@ def spp_layer(input, Reference: `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition - https://arxiv.org/abs/1406.4729`_ + `_ The example usage is: @@ -3088,7 +3088,7 @@ def img_cmrnorm_layer(input, Reference: `ImageNet Classification with Deep Convolutional Neural Networks - http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf`_ + `_ The example usage is: @@ -3156,7 +3156,7 @@ def batch_norm_layer(input, Reference: `Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift - http://arxiv.org/abs/1502.03167`_ + `_ The example usage is: @@ -5414,9 +5414,9 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): Reference: `Maxout Networks - http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf`_ + `_ `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks - https://arxiv.org/pdf/1312.6082v4.pdf`_ + `_ .. 
math:: y_{si+j} = \max_k x_{gsi + sk + j} @@ -5483,7 +5483,7 @@ def ctc_layer(input, Reference: `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks - http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_ + `_ Note: Considering the 'blank' label needed by CTC, you need to use (num_classes + 1) @@ -5557,7 +5557,7 @@ def warp_ctc_layer(input, Reference: `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with Recurrent Neural Networks - http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf`_ + `_ Note: - Let num_classes represents the category number. Considering the 'blank' @@ -5778,7 +5778,7 @@ def nce_layer(input, Reference: `A fast and simple algorithm for training neural probabilistic language - models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf`_ + models. `_ The example usage is: @@ -5894,7 +5894,7 @@ def rank_cost(left, Reference: `Learning to Rank using Gradient Descent - http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf`_ + `_ .. math:: @@ -6430,7 +6430,7 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None): Reference: `Fast R-CNN - https://arxiv.org/pdf/1504.08083v2.pdf`_ + `_ The example usage is: @@ -6637,7 +6637,7 @@ def prelu_layer(input, Reference: `Delving Deep into Rectifiers: Surpassing Human-Level Performance on - ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf`_ + ImageNet Classification `_ .. math:: z_i &\\quad if \\quad z_i > 0 \\\\ @@ -6734,7 +6734,7 @@ def gated_unit_layer(input, Reference: `Language Modeling with Gated Convolutional Networks - https://arxiv.org/abs/1612.08083`_ + `_ .. 
math:: y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c) From f96bc313e87a8a8ef73907d153c28e117e3c8d3f Mon Sep 17 00:00:00 2001 From: Yancey Date: Tue, 28 Nov 2017 10:34:49 +0800 Subject: [PATCH 076/275] fix path env in build.sh (#5948) --- paddle/scripts/docker/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index fda2a2f1b7..a2fdc5ce69 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -16,11 +16,13 @@ function cmake_gen() { echo "using python abi: $1" if [ "$1" == "cp27-cp27m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} + export PATH=/opt/python/cp27-cp27m/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" elif [ "$1" == "cp27-cp27mu" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} + export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" From dc82a30908d0d75948491b0a669abfd690b4acce Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 28 Nov 2017 10:41:07 +0800 Subject: [PATCH 077/275] Refine CheckStyle Script (#5942) * Refine CheckStyle Script * Disable linkchecker for build_doc.sh --- .travis.yml | 2 +- paddle/scripts/travis/build_doc.sh | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index c51e02eb79..e2d49daa19 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: script: - | timeout 2580 
paddle/scripts/travis/${JOB}.sh # 43min timeout - RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; + RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi; - | if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 28d82343ed..7d54f0254c 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -11,8 +11,9 @@ make -j `nproc` gen_proto_py make -j `nproc` paddle_docs paddle_docs_cn # check websites for broken links -linkchecker doc/en/html/index.html -linkchecker doc/cn/html/index.html +# It will be failed now! +#linkchecker doc/en/html/index.html +#linkchecker doc/cn/html/index.html # Parse Github URL REPO=`git config remote.origin.url` From a88d98c413d3ba70c37228e3d9d5e1cda77e9fa0 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 28 Nov 2017 10:46:31 +0800 Subject: [PATCH 078/275] Add comments --- python/paddle/trainer/config_parser.py | 16 ++++++++-------- python/paddle/trainer_config_helpers/layers.py | 1 + 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 9ec6ba6347..deb77e6fd7 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2400,15 +2400,14 @@ class CropLayer(LayerBase): image_conf.img_size_y = input_layer.height image_conf.channels = input_layer.size / (input_layer.width * input_layer.height) - + # only support for 4-dims inputs and NCHW order if (len(self.config.inputs) == 2): self.set_layer_height_width( self.get_input_layer(1).height, self.get_input_layer(1).width) self.set_layer_size(self.get_input_layer(1).size) else: - # NCHW order self.set_layer_height_width(shape[-2], shape[-1]) - self.set_layer_size(reduce(lambda x, y: x * y, shape)) + 
self.set_layer_size(reduce(lambda x, y: x * y, shape[1:])) @config_layer('batch_norm') @@ -3865,18 +3864,19 @@ class SwitchOrderLayer(LayerBase): else: in_h = input_layer.height in_w = input_layer.width + out_dims = None if input_layer.has_depth(): in_d = input_layer.depth in_c = input_layer.size / in_h / in_w / in_d + # batch_size, depth, height, width, channel out_dims = [0, in_d, in_h, in_w, in_c] - size = reduce(lambda x, y: x * y, - out_dims[reshape['width'][0]:]) else: in_c = input_layer.size / in_h / in_w + # batch_size, height, width, channel out_dims = [0, in_h, in_w, in_c] - size = reduce(lambda x, y: x * y, - out_dims[reshape['width'][0]:]) - + # Because (reshape['width'][0] > 0) always be true. + # So out_dims[0] won't be used. + size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:]) self.set_layer_size(size) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 8e127c9489..bfa395ee13 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -6854,6 +6854,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): :param input: The input of this layer. If two inputs are given, the second one will be regarded as the reference. + And the input must be 4-dims and in NCHW order. :type input: LayerOutput | Sequence :param offset: The crop offset. 
:type offset: Sequence From 0a8a86e0c9733dd85e82c58d2042d1abb7c85b73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Tue, 28 Nov 2017 11:02:24 +0800 Subject: [PATCH 079/275] Send recv op (#5520) * WIP send recv op * WIP send recv * put grpc impl in details * put grpc impl in details * update wip * update proto * update proto * update proto * clean cmake * wip on op implementations * wip on op implementations * compile ok adding ut * wip unitest * add extern cares for linking * wip add ut * working version send recv * revert optimizer.py * update test cmake * add libtool to dockerfile * update cmake dependency * update cmake depends * update cmake grpc depends * fix cmake dependency * fix compile error * fix compile * follow comments * update * update copyfrom --- .clang-format | 1 - CMakeLists.txt | 2 + Dockerfile | 2 +- cmake/external/cares.cmake | 45 +++++ cmake/external/grpc.cmake | 58 +++++++ cmake/external/zlib.cmake | 2 + cmake/generic.cmake | 47 ++++++ paddle/framework/lod_tensor.cc | 163 +++++++++++++++++-- paddle/framework/lod_tensor.h | 9 + paddle/operators/CMakeLists.txt | 25 ++- paddle/operators/detail/CMakeLists.txt | 1 + paddle/operators/detail/recv_impl.cc | 44 +++++ paddle/operators/detail/send_impl.cc | 54 ++++++ paddle/operators/detail/send_recv.proto | 37 +++++ paddle/operators/detail/send_recv_impl.h | 87 ++++++++++ paddle/operators/detail/simple_block_queue.h | 52 ++++++ paddle/operators/load_op.cc | 56 +------ paddle/operators/recv_op.cc | 121 ++++++++++++++ paddle/operators/save_op.cc | 68 +------- paddle/operators/send_op.cc | 84 ++++++++++ paddle/operators/send_recv_op_test.cc | 125 ++++++++++++++ 21 files changed, 941 insertions(+), 142 deletions(-) create mode 100644 cmake/external/cares.cmake create mode 100644 cmake/external/grpc.cmake create mode 100644 paddle/operators/detail/CMakeLists.txt create mode 100644 paddle/operators/detail/recv_impl.cc create mode 100644 paddle/operators/detail/send_impl.cc create mode 
100644 paddle/operators/detail/send_recv.proto create mode 100644 paddle/operators/detail/send_recv_impl.h create mode 100644 paddle/operators/detail/simple_block_queue.h create mode 100644 paddle/operators/recv_op.cc create mode 100644 paddle/operators/send_op.cc create mode 100644 paddle/operators/send_recv_op_test.cc diff --git a/.clang-format b/.clang-format index 9ba433b173..aff93435f5 100644 --- a/.clang-format +++ b/.clang-format @@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true BinPackParameters: false BinPackArguments: false ... - diff --git a/CMakeLists.txt b/CMakeLists.txt index 65164b8472..e76512166f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,6 +133,8 @@ include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/nccl) +include(external/cares) +include(external/grpc) include(cudnn) # set cudnn libraries, must before configure include(configure) # add paddle env configuration diff --git a/Dockerfile b/Dockerfile index 150344a811..857d3f3e5f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,7 +29,7 @@ RUN apt-get update && \ automake locales clang-format swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - net-tools && \ + net-tools libtool && \ apt-get clean -y # Install Go and glide diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake new file mode 100644 index 0000000000..e05111ee18 --- /dev/null +++ b/cmake/external/cares.cmake @@ -0,0 +1,45 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: c-ares is needed when linking with grpc. + +SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares) +SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares) +SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE) + +ExternalProject_Add( + extern_cares + GIT_REPOSITORY "https://github.com/c-ares/c-ares.git" + GIT_TAG "cares-1_13_0" + PREFIX ${CARES_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND make + INSTALL_COMMAND make install +) + +ADD_LIBRARY(cares STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION + "${CARES_INSTALL_DIR}/lib/libcares.a") + +include_directories(${CARES_INCLUDE_DIR}) +ADD_DEPENDENCIES(cares extern_cares) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake new file mode 100644 index 0000000000..f431c037fd --- /dev/null +++ b/cmake/external/grpc.cmake @@ -0,0 +1,58 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE) + return() +ENDIF() + +include (ExternalProject) + +SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) +SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) +SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) +SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) + +ExternalProject_Add( + extern_grpc + DEPENDS protobuf zlib + GIT_REPOSITORY "https://github.com/grpc/grpc.git" + GIT_TAG "v1.7.x" + PREFIX ${GRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND make + INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install +) + +# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them. 
+ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") + +ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") +ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgpr.a") + +ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") + +include_directories(${GRPC_INCLUDE_DIR}) +ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) + diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index a98e069b7c..1638cd8fdf 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -50,6 +50,8 @@ ExternalProject_Add( ) LIST(APPEND external_project_dependencies zlib) +ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) IF(WITH_C_API) INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 7b82d409a3..c917ca0ff4 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -467,3 +467,50 @@ function(py_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction() + +# grpc_library generate grpc code using grpc_cpp_plugin and protoc +# then build the generated protobuf code and grpc code with your +# implementation source codes together. Use SRCS argument for your +# implementation source files and PROTO argument for your .proto +# files. 
+# +# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep) + +function(grpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating grpc ${grpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") + set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") + set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") + cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") + + add_custom_command( + OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" + DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) + + # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it + # as compiler warnings instead of error. Should try remove the warnings also. 
+ set_source_files_properties( + ${grpc_grpc_srcs} + PROPERTIES + COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") + + set_source_files_properties( + ${grpc_library_SRCS} + PROPERTIES + COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") +endfunction() diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index a0f2906c74..fdf6de4bab 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -13,6 +13,8 @@ limitations under the License. */ #include "paddle/framework/lod_tensor.h" +#include "paddle/framework/data_type.h" +#include "paddle/framework/framework.pb.h" #include "paddle/memory/memcpy.h" #include "paddle/memory/memory.h" @@ -27,11 +29,11 @@ namespace paddle { namespace framework { -std::ostream& operator<<(std::ostream& os, const LoD& lod) { +std::ostream &operator<<(std::ostream &os, const LoD &lod) { os << "{"; - for (auto& v : lod) { + for (auto &v : lod) { os << "{"; - for (auto& i : v) { + for (auto &i : v) { os << i << ","; } os << "}"; @@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) { return os; } -LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { +LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) { LoD new_lod; new_lod.reserve(level_end - level_begin); for (size_t i = level_begin; i < level_end; i++) { @@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { return new_lod; } -LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, +LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, size_t elem_end) { PADDLE_ENFORCE_LT(level, in.size()); PADDLE_ENFORCE_LT(elem_end, in[level].size()); @@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, 
size_t level, size_t elem_begin, res[0].assign(in[level].begin() + elem_begin, in[level].begin() + elem_end + 1); for (size_t lvl = 1; lvl < res.size(); lvl++) { - const auto& in_level = in[level + lvl]; - const auto& above_level = res[lvl - 1]; - auto& out_level = res[lvl]; + const auto &in_level = in[level + lvl]; + const auto &above_level = res[lvl - 1]; + auto &out_level = res[lvl]; out_level.assign(in_level.begin() + above_level.front(), in_level.begin() + above_level.back() + 1); } @@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, // to make the first offset equals 0, all the elements minus the first // element size_t front = res[lvl].front(); - for (auto& ele : res[lvl]) { + for (auto &ele : res[lvl]) { ele -= front; } } return res; } -LoD ToAbsOffset(const LoD& in) { +LoD ToAbsOffset(const LoD &in) { // the lowest level stores relative offsets if (in.empty() || in.size() == 1) return in; LoD result = in; for (int level = result.size() - 2; level >= 0; level--) { - for (auto& ele : result[level]) { + for (auto &ele : result[level]) { ele = result[level + 1][ele]; } } return result; } -bool operator==(const LoD& a, const LoD& b) { +bool operator==(const LoD &a, const LoD &b) { if (a.size() != b.size()) { return false; } for (size_t i = 0; i < a.size(); i++) { - const auto& a_level = a[i]; - const auto& b_level = b[i]; + const auto &a_level = a[i]; + const auto &b_level = b[i]; if (a_level.size() != b_level.size()) { return false; } @@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, } using LoDAndOffset = std::pair>; -LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx, +LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, size_t end_idx, size_t start_level) { LoD sub_lod; @@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx, return LoDAndOffset{sub_lod, {start_idx, end_idx}}; } -void AppendLoD(LoD* lod, const 
LoD& lod_length) { +void AppendLoD(LoD *lod, const LoD &lod_length) { PADDLE_ENFORCE( lod->empty() || lod->size() == lod_length.size(), "The lod_length should has the same size with the appended lod."); @@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) { *lod = LoD(lod_length.size(), std::vector({0})); } for (size_t i = 0; i < lod->size(); ++i) { - auto& level = (*lod)[i]; + auto &level = (*lod)[i]; for (size_t len : lod_length[i]) { level.push_back(level.back() + len); } } } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor, + const platform::DeviceContext &dev_ctx) { + // TODO(typhoonzero): serialize to ostream + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + framework::TensorDesc desc; + desc.set_data_type(framework::ToDataType(tensor.type())); + auto dims = framework::vectorize(tensor.dims()); + auto *pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.memory_size(); + auto *data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto &gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), 
size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } + { // the 4th field, lod information + // uint64_t lod_level + // uint64_t lod_level_1 size in byte. + // int* lod_level_1 data + // ... + auto lod = tensor.lod(); + uint64_t size = lod.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + + for (auto &each : lod) { + size = each.size() * sizeof(framework::LoD::value_type::value_type); + os.write(reinterpret_cast(&size), sizeof(size)); + os.write(reinterpret_cast(each.data()), + static_cast(size)); + } + } +} + +void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + framework::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + + void *buf; + platform::Place cpu = platform::CPUPlace(); + switch (desc.data_type()) { + case framework::FP32: + buf = tensor->mutable_data(cpu); + break; + case framework::FP64: + buf = tensor->mutable_data(cpu); + break; + case framework::INT32: + buf = tensor->mutable_data(cpu); + break; + case framework::INT64: + buf = tensor->mutable_data(cpu); + break; + default: + PADDLE_THROW("DataType %d not supported", desc.data_type()); + } + is.read(static_cast(buf), tensor->memory_size()); + } + { // read lod 
+ uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + lod[i] = tmp; + } + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 21bdfca111..9411c96aea 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -189,5 +189,14 @@ std::pair> GetSubLoDAndAbsoluteOffset( void AppendLoD(LoD* lod, const LoD& lod_length); +/* + * Serialize/Deserialize LoDTensor to std::ostream + * You can pass ofstream or ostringstream to serialize to file + * or to an in-memory string. GPU tensor will be copied to CPU. + */ +void SerializeToStream(std::ostream& os, const LoDTensor& tensor, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, LoDTensor* tensor); + } // namespace framework } // namespace paddle diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index a4c4374cf2..7e5d4fd640 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -205,8 +205,24 @@ set(DEPS_OPS tensor_array_read_write_op gru_op adagrad_op - sgd_op) + sgd_op + save_op + load_op + send_op + recv_op) +add_subdirectory(detail) +op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) +set_source_files_properties( + send_op.cc + PROPERTIES + COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + +op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) +set_source_files_properties( + recv_op.cc + PROPERTIES + COMPILE_FLAGS "-Wno-error=non-virtual-dtor 
-Wno-error=delete-non-virtual-dtor") op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cross_entropy_op DEPS cross_entropy) @@ -235,6 +251,10 @@ op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) +# FIXME(typhoonzero): save/load depends lodtensor serialization functions +op_library(save_op DEPS lod_tensor) +op_library(load_op DEPS lod_tensor) + list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) @@ -242,6 +262,8 @@ endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") + + cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) @@ -251,3 +273,4 @@ if(WITH_GPU) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) +cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt new file mode 100644 index 0000000000..f6bdc63cc2 --- /dev/null +++ b/paddle/operators/detail/CMakeLists.txt @@ -0,0 +1 @@ +grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc new file mode 100644 index 0000000000..89dc504522 --- /dev/null +++ b/paddle/operators/detail/recv_impl.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "send_recv_impl.h" + +namespace paddle { +namespace operators { +namespace detail { + +Status SendRecvServerImpl::SendVariable(ServerContext *context, + const VariableMessage *in_var, + VariableMessage *out_var) { + framework::LoDTensor t; + // TODO(typhoonzero): deserialize in_tensor and run pserver network. + std::istringstream iss(in_var->serialized()); + framework::DeserializeFromStream(iss, &t); + lodtensor_queue_.Push(std::move(t)); + // Block until the sub graph is done. + t = lodtensor_return_queue_.Pop(); + std::ostringstream oss; + // FIXME(typhoonzero): get context from op. + framework::SerializeToStream(oss, t, platform::CPUDeviceContext()); + std::string *varname = out_var->mutable_varname(); + *varname = in_var->varname(); + std::string *serialized = out_var->mutable_serialized(); + *serialized = oss.str(); + + return Status::OK; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc new file mode 100644 index 0000000000..da1ddf75d2 --- /dev/null +++ b/paddle/operators/detail/send_impl.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "send_recv_impl.h" + +namespace paddle { +namespace operators { +namespace detail { + +bool RPCClient::SendVariable(const framework::Scope& scope, + const std::string& inname, + const std::string& outname) { + ClientContext context; + VariableMessage msg, out_msg; + // FIXME(typhoonzero): pass device context to here. + auto ctx = platform::CPUDeviceContext(); + auto* var = scope.FindVar(inname); + PADDLE_ENFORCE(var); + // TODO(typhoonzero): support SelectedRows + PADDLE_ENFORCE(var->IsType(), + "Only support LoDTensor, %s has wrong type", inname); + const framework::LoDTensor& tensor = var->Get(); + std::ostringstream oss; + framework::SerializeToStream(oss, tensor, ctx); + msg.set_varname(inname); + msg.set_serialized(oss.str()); + Status status = stub_->SendVariable(&context, msg, &out_msg); + if (!status.ok()) { + return false; + } + std::istringstream iss(out_msg.serialized()); + framework::LoDTensor ret_tensor; + framework::DeserializeFromStream(iss, &ret_tensor); + auto* outvar = scope.FindVar(outname); + framework::LoDTensor* out_tensor = outvar->GetMutable(); + // FIXME(typhoonzero): do not copy. + framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor); + return true; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto new file mode 100644 index 0000000000..66f84678b3 --- /dev/null +++ b/paddle/operators/detail/send_recv.proto @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +syntax = "proto3"; + +package sendrecv; + +service SendRecvService { + // For parameter server round-robin like hashing, do not split tensors. + // Send and recv only one tensor + rpc SendVariable(VariableMessage) returns (VariableMessage) {} +} + +// VariableMessage is serialized paddle variable message. +// It can be: +// Tensor +// LoDTensor +// SelectedRows +message VariableMessage { + string varname = 1; + bytes serialized = 2; +} + +message VoidMessage { + +} \ No newline at end of file diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h new file mode 100644 index 0000000000..b9a5340a86 --- /dev/null +++ b/paddle/operators/detail/send_recv_impl.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include "paddle/framework/data_type.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/operators/detail/simple_block_queue.h" + +// #include +// #include +// #include +// #include +#include "paddle/operators/detail/send_recv.grpc.pb.h" +#include "paddle/operators/detail/send_recv.pb.h" + +#include + +using grpc::Channel; +using grpc::Server; +using grpc::ServerContext; +using grpc::ServerReader; +using grpc::ServerBuilder; + +using grpc::ClientContext; +using grpc::ClientReader; +using grpc::ClientReaderWriter; +using grpc::ClientWriter; +using grpc::Status; +using sendrecv::SendRecvService; +using sendrecv::VariableMessage; +using sendrecv::VoidMessage; + +namespace paddle { +namespace operators { +namespace detail { + +class SendRecvServerImpl final : public SendRecvService::Service { + public: + explicit SendRecvServerImpl() {} + + Status SendVariable(ServerContext *context, const VariableMessage *in_var, + VariableMessage *out_var) override; + + const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); } + + void Push(const framework::LoDTensor &tensor) { + this->lodtensor_return_queue_.Push(tensor); + } + + private: + SimpleBlockQueue lodtensor_queue_; + SimpleBlockQueue lodtensor_return_queue_; + SimpleBlockQueue selected_rows_queue_; + SimpleBlockQueue selected_rows_return_queue_; +}; + +// RPCClient is a class to send tensors to pserver sub-network +// using different hashing methods. 
+class RPCClient { + public: + RPCClient(std::shared_ptr channel) + : stub_(SendRecvService::NewStub(channel)) {} + + bool SendVariable(const framework::Scope &scope, const std::string &inname, + const std::string &outname); + + private: + std::unique_ptr stub_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h new file mode 100644 index 0000000000..4489921757 --- /dev/null +++ b/paddle/operators/detail/simple_block_queue.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace operators { +namespace detail { + +template +class SimpleBlockQueue { + private: + std::mutex mutex_; + std::condition_variable condition_; + std::deque queue_; + + public: + void Push(T const& value) { + { + std::unique_lock lock(this->mutex_); + queue_.push_front(value); + } + this->condition_.notify_one(); + } + + T Pop() { + std::unique_lock lock(this->mutex_); + this->condition_.wait(lock, [=] { return !this->queue_.empty(); }); + T rc(std::move(this->queue_.back())); + this->queue_.pop_back(); + return rc; + } +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index b0838eed16..4e58b84430 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase { out_var_name); auto *tensor = out_var->GetMutable(); - - uint32_t version; - fin.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); - framework::TensorDesc desc; - { // int32_t size - // proto buffer - int32_t size; - fin.read(reinterpret_cast(&size), sizeof(size)); - std::unique_ptr buf(new char[size]); - fin.read(reinterpret_cast(buf.get()), size); - PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), - "Cannot parse tensor desc"); - } - { // read tensor - std::vector dims; - dims.reserve(static_cast(desc.dims().size())); - std::copy(desc.dims().begin(), desc.dims().end(), - std::back_inserter(dims)); - tensor->Resize(framework::make_ddim(dims)); - - void *buf; - platform::Place cpu = platform::CPUPlace(); - switch (desc.data_type()) { - case framework::FP32: - buf = tensor->mutable_data(cpu); - break; - case framework::FP64: - buf = tensor->mutable_data(cpu); - break; - case framework::INT32: - buf = tensor->mutable_data(cpu); - break; - case framework::INT64: - buf = 
tensor->mutable_data(cpu); - break; - default: - PADDLE_THROW("DataType %d not supported", desc.data_type()); - } - fin.read(static_cast(buf), tensor->memory_size()); - } - { // read lod - uint64_t lod_level; - fin.read(reinterpret_cast(&lod_level), sizeof(lod_level)); - auto &lod = *tensor->mutable_lod(); - lod.resize(lod_level); - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size; - fin.read(reinterpret_cast(&size), sizeof(size)); - std::vector tmp(size / sizeof(size_t)); - fin.read(reinterpret_cast(tmp.data()), - static_cast(size)); - lod[i] = tmp; - } - } + framework::DeserializeFromStream(fin, tensor); auto place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc new file mode 100644 index 0000000000..c69e416e10 --- /dev/null +++ b/paddle/operators/recv_op.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include +#include +#include +#include + +#include + +#include "paddle/framework/data_type.h" +#include "paddle/framework/executor.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/send_recv_impl.h" +#include "paddle/operators/detail/simple_block_queue.h" + +namespace paddle { +namespace operators { + +void RunServer(Server **rpc_server, + std::shared_ptr service, + const std::string &server_address) { + ServerBuilder builder; + builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); + builder.RegisterService(service.get()); + std::unique_ptr server(builder.BuildAndStart()); + *rpc_server = server.get(); + LOG(INFO) << "Server listening on " << server_address << std::endl; + server->Wait(); +} + +class RecvOp : public framework::OperatorBase { + public: + RecvOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) { + if (!rpc_service_) { + rpc_service_.reset(new detail::SendRecvServerImpl()); + std::string endpoint = Attr("endpoint"); + server_thread_.reset( + new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint)); + } + } + + virtual ~RecvOp() { + rpc_server_->Shutdown(); + server_thread_->join(); + } + + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + // blocking get one var from client. 
+ const framework::LoDTensor &t = rpc_service_->Get(); + framework::Scope &recv_scope = scope.NewScope(); + // set graph input var + auto *var = recv_scope.Var(Input("RX")); + auto *tensor = var->GetMutable(); + // FIXME(typhoonzero): do not copy + framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); + + auto *block = Attr("OptimizeBlock"); + auto *program = block->Program(); + framework::Executor executor(dev_ctx); + // Run sub graph to get optimized tensor + executor.Run(*program, &recv_scope, block->ID(), + false /*create_local_scope*/); + + auto *out_var = recv_scope.FindVar("Out"); + // push back + rpc_service_->Push(out_var->Get()); + } + + protected: + // grpc server instance to track status and gracefully shutdown. + // borrow an pointer from server thread. + Server *rpc_server_{nullptr}; + // grpc send/recv service implement to register. + std::shared_ptr rpc_service_; + std::shared_ptr server_thread_; +}; + +class RecvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("RX", "(Tensor) Input tensor to be saved"); + AddComment(R"DOC( +Recv operator + +This operator will recv tensor from send_op +)DOC"); + AddAttr("endpoint", + "(string, default 127.0.0.1:6164)" + "IP address to listen on.") + .SetDefault("127.0.0.1:6164") + .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr("OptimizeBlock", "type BlockDescBind*", + "optimize network run in server"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc index 56909fb65f..d4921cb80c 100644 --- a/paddle/operators/save_op.cc +++ b/paddle/operators/save_op.cc @@ -88,73 +88,7 @@ class SaveOp : public framework::OperatorBase { "SaveOp only support LoDTensor, %s 
has wrong type", iname); auto &tensor = var->Get(); - - { // the 1st field, uint32_t version - constexpr uint32_t version = 0; - fout.write(reinterpret_cast(&version), sizeof(version)); - } - { // the 2nd field, tensor description - // int32_t size - // void* protobuf message - framework::TensorDesc desc; - desc.set_data_type(framework::ToDataType(tensor.type())); - auto dims = framework::vectorize(tensor.dims()); - auto *pb_dims = desc.mutable_dims(); - pb_dims->Resize(static_cast(dims.size()), 0); - std::copy(dims.begin(), dims.end(), pb_dims->begin()); - int32_t size = desc.ByteSize(); - fout.write(reinterpret_cast(&size), sizeof(size)); - auto out = desc.SerializeAsString(); - fout.write(out.data(), size); - } - { // the 3rd field, tensor data - uint64_t size = tensor.memory_size(); - auto *data_ptr = tensor.data(); - PADDLE_ENFORCE(size < std::numeric_limits::max(), - "Index overflow when writing tensor"); - if (platform::is_gpu_place(tensor.place())) { -#ifdef PADDLE_WITH_CUDA - constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB - std::unique_ptr buf(new char[kBufSize]); - auto &gpu_dev_ctx = - static_cast(dev_ctx); - platform::CPUPlace cpu; - uintptr_t data = reinterpret_cast(data_ptr); - while (size != 0) { - size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - boost::get(tensor.place()), - reinterpret_cast(data), size_to_write, - gpu_dev_ctx.stream()); - gpu_dev_ctx.Wait(); - fout.write(buf.get(), size_to_write); - data += size_to_write; - size -= size_to_write; - } -#else - PADDLE_THROW("Unexpected branch"); -#endif - } else { - fout.write(static_cast(data_ptr), - static_cast(size)); - } - } - { // the 4th field, lod information - // uint64_t lod_level - // uint64_t lod_level_1 size in byte. - // int* lod_level_1 data - // ... 
- auto lod = tensor.lod(); - uint64_t size = lod.size(); - fout.write(reinterpret_cast(&size), sizeof(size)); - - for (auto &each : lod) { - size = each.size() * sizeof(framework::LoD::value_type::value_type); - fout.write(reinterpret_cast(&size), sizeof(size)); - fout.write(reinterpret_cast(each.data()), - static_cast(size)); - } - } + framework::SerializeToStream(fout, tensor, dev_ctx); } }; diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc new file mode 100644 index 0000000000..a3059847f2 --- /dev/null +++ b/paddle/operators/send_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include + +#include "paddle/framework/data_type.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_registry.h" + +#include "paddle/operators/detail/send_recv_impl.h" +#include "paddle/operators/detail/simple_block_queue.h" + +namespace paddle { +namespace operators { + +// TODO(typhoonzero): this is a simple implementation which only send +// one tensor +class SendOp : public framework::OperatorBase { + public: + SendOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) { + // init client when the operator is created at runtime. 
+ if (!client_) { + std::string endpoint = Attr("endpoint"); + client_.reset(new detail::RPCClient( + grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials()))); + // TODO(typhoonzero): how to call InitVariables + } + } + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + auto iname = Input("X"); + auto oname = Output("Out"); + // TODO(typhoonzero): currently it's non-blocking, + // should block until server responds. + bool ret = client_->SendVariable(scope, iname, oname); + if (!ret) { + LOG(ERROR) << "send variable error"; + } + } + + protected: + std::shared_ptr client_{nullptr}; +}; + +class SendOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor to be saved"); + AddOutput("Out", "(Tensor) Output fetched from server"); + AddComment(R"DOC( +Recv operator + +This operator will recv tensor from send_op +)DOC"); + AddAttr("endpoint", + "(string, default 127.0.0.1:6164)" + "IP address to listen on.") + .SetDefault("127.0.0.1:6164") + .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker); diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc new file mode 100644 index 0000000000..ac03eb3752 --- /dev/null +++ b/paddle/operators/send_recv_op_test.cc @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +// TODO(typhoonzero): add python bindings for this test as +// a RemoteOptimizer. + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/program_desc.h" + +USE_NO_KERNEL_OP(send); +USE_NO_KERNEL_OP(recv); +USE_OP(sum); + +// global for simplicity. +std::unique_ptr recv_op; + +void InitTensorsInScope(paddle::framework::Scope &scope, + paddle::platform::CPUPlace &place) { + paddle::platform::CPUDeviceContext ctx(place); + auto var = scope.Var("X"); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + float *expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + + auto out_var = scope.Var("Out"); + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({10, 10}); + tensor->mutable_data(place); // allocate +} + +void AddOp(const std::string &type, + const paddle::framework::VariableNameMap &inputs, + const paddle::framework::VariableNameMap &outputs, + paddle::framework::AttributeMap attrs, + paddle::framework::BlockDescBind *block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(paddle::framework::DataType::FP32); + } + } + + // insert op + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +void StartServerNet() { + 
paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + InitTensorsInScope(scope, place); + + // sub program run in recv_op, for simple test we use sum + paddle::framework::ProgramDescBind program; + paddle::framework::BlockDescBind *block = program.MutableBlock(0); + // X for server side tensors, RX for received tensers, must be of same shape. + AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block); + + paddle::framework::AttributeMap attrs; + attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); + attrs.insert({"OptimizeBlock", block}); + recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, + {{"Out", {"Out"}}}, attrs); + paddle::platform::CPUDeviceContext ctx(place); + recv_op->Run(scope, ctx); +} + +TEST(SendRecvOp, CPU) { + std::thread server_thread(StartServerNet); + sleep(5); // wait server to start + // local net + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + InitTensorsInScope(scope, place); + + paddle::framework::AttributeMap attrs; + attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); + + auto send_op = paddle::framework::OpRegistry::CreateOp( + "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); + paddle::platform::CPUDeviceContext ctx(place); + send_op->Run(scope, ctx); + + auto in_var = scope.Var("X"); + auto tensor = in_var->GetMutable(); + float *expected = tensor->data(); + + auto out_var = scope.Var("Out"); + auto target = out_var->GetMutable(); + // send fail cause output is none. + EXPECT_NE(target->memory_size(), size_t(0)); + float *actual = target->data(); + for (int64_t i = 0; i < target->numel(); ++i) { + EXPECT_EQ(expected[i] * 2, actual[i]); + } + recv_op.reset(); // dtor can shutdown and join server thread. 
+ server_thread.join(); +} From 3022a790828dc75ce17deb1fb114d45838124f3d Mon Sep 17 00:00:00 2001 From: peterzhang2029 Date: Tue, 28 Nov 2017 10:59:29 +0800 Subject: [PATCH 080/275] add gpu test in test_LayerGrad --- paddle/gserver/tests/test_LayerGrad.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index cacf106929..12d6600916 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) { config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); - // Not support GPU now - testLayerGrad(config, - "hsigmoid", - 100, - /* trans */ false, /* useGpu */ - false); + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "hsigmoid", + 100, + /* trans */ false, /* useGpu */ + useGpu); + } } TEST(Layer, multi_cross) { From ed4810e55d44bc10c9adfedd4836c0ed1100a46e Mon Sep 17 00:00:00 2001 From: "Wang,Jeff" Date: Mon, 27 Nov 2017 19:22:43 -0800 Subject: [PATCH 081/275] update the write_docs_cn.rst to include using paddlepaddle.org viewer tool --- doc/howto/dev/write_docs_cn.rst | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index 731a63f945..25a967da83 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -8,7 +8,26 @@ PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两 如何构建文档 ============ -PaddlePaddle的文档构建有两种方式。 +PaddlePaddle的文档构建有三种方式。 + + +使用PaddlePaddle.org工具 +-------------- +这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。 + +文件工具是使用Docker,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后即可用以下命令启动工具 + +.. 
code-block:: bash + + mkdir paddlepaddle + cd paddlepaddle + git clone git@github.com:PaddlePaddle/Paddle.git + git clone git@github.com:PaddlePaddle/book.git + git clone git@github.com:PaddlePaddle/models.git + + docker run -it -p 8000:8000 paddlepaddle/paddlepaddle.org:latest + +之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 使用Docker构建 -------------- From ab1af66b1281b941c75d5c000141ce912ab1e37b Mon Sep 17 00:00:00 2001 From: peterzhang2029 Date: Tue, 28 Nov 2017 11:30:18 +0800 Subject: [PATCH 082/275] --amend --- paddle/gserver/tests/test_LayerGrad.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 60a4feff03..c5359f272b 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -685,8 +685,8 @@ TEST(Layer, hsigmoidLayer) { testLayerGrad(config, "hsigmoid", 100, - /* trans */ false, /* useGpu */ - useGpu); + /* trans */ false, + /* useGpu */ useGpu); } } From 10acacf1bfa770ca96b8a8e4925a8c63fb099d94 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Tue, 28 Nov 2017 11:44:03 +0800 Subject: [PATCH 083/275] fix 404 link in dist arch doc --- doc/design/refactor/distributed_architecture.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md index ac7e98ccf1..08b09400a6 100644 --- a/doc/design/refactor/distributed_architecture.md +++ b/doc/design/refactor/distributed_architecture.md @@ -86,7 +86,7 @@ This could be fixed by making the parameter server run the same computation definition as the trainer. 
For a detailed explanation, please see -[Design Doc: Operation Graph Based Parameter Server](./dist_train.md) +[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md) ## Distributed Training Architecture From 0aceeee1fae98c0ad012f1c85adf91a49b4365fd Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 28 Nov 2017 12:03:16 +0800 Subject: [PATCH 084/275] Feature/remove g program (#5930) * Unify fluid submodules to fluid module Change books just use `import fluid`, not submodules * Remove g_main_program/g_startup_program Use default_main_program/default_startup_program instead * Typo * Fix CI --- python/paddle/v2/fluid/evaluator.py | 4 ++-- python/paddle/v2/fluid/executor.py | 4 ++-- python/paddle/v2/fluid/framework.py | 10 ++++----- python/paddle/v2/fluid/io.py | 19 ++++++++--------- python/paddle/v2/fluid/layer_helper.py | 7 +++---- python/paddle/v2/fluid/layers.py | 2 +- .../fluid/tests/test_array_read_write_op.py | 4 ++-- .../v2/fluid/tests/test_conditional_block.py | 8 ++++--- .../v2/fluid/tests/test_executor_and_mul.py | 12 +++++------ .../v2/fluid/tests/test_lod_rank_table.py | 3 +-- .../v2/fluid/tests/test_operator_desc.py | 8 +++++-- .../paddle/v2/fluid/tests/test_parameter.py | 10 +++++---- python/paddle/v2/fluid/tests/test_program.py | 21 ++++++++++--------- .../v2/fluid/tests/test_shrink_rnn_memory.py | 6 ++++-- python/paddle/v2/fluid/tests/test_variable.py | 4 ++-- 15 files changed, 64 insertions(+), 58 deletions(-) diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py index bd4a6fda1f..137c573622 100644 --- a/python/paddle/v2/fluid/evaluator.py +++ b/python/paddle/v2/fluid/evaluator.py @@ -26,9 +26,9 @@ class Evaluator(object): name(str): The name of evaluator. such as, "accuracy". Used for generate temporary variable name. main_program(Program, optional): The evaluator should be added to this - main_program. Default g_main_program + main_program. 
Default default_main_program() startup_program(Program, optional):The parameter should be added to this - startup_program. Default g_startup_program + startup_program. Default default_startup_program() Attributes: states(list): The list of state variables. states will be reset to zero diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index 3e26d1b983..bdc82eede9 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -1,6 +1,6 @@ import numpy as np from . import core -from framework import Program, g_main_program +from framework import Program, default_main_program __all__ = ['Executor', 'g_scope'] @@ -103,7 +103,7 @@ class Executor(object): fetch_list = [] if program is None: - program = g_main_program + program = default_main_program() if not isinstance(program, Program): raise TypeError() diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 6d6ea23f55..1c42e4d44f 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -6,7 +6,7 @@ import proto.framework_pb2 as framework_pb2 __all__ = [ 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program', - 'default_main_program', 'g_startup_program', 'g_main_program' + 'default_main_program' ] @@ -654,13 +654,13 @@ class Parameter(Variable): # program is a global instance. 
-g_main_program = Program() -g_startup_program = Program() +_main_program_ = Program() +_startup_program_ = Program() def default_startup_program(): - return g_startup_program + return _startup_program_ def default_main_program(): - return g_main_program + return _main_program_ diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index e5b2aa3b91..e147ac22ad 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py @@ -1,8 +1,7 @@ import os import cPickle as pickle -from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \ - Variable +from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable __all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params', @@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): """ if vars is None: if main_program is None: - main_program = g_main_program + main_program = default_main_program() if not isinstance(main_program, Program): raise TypeError("program should be as Program type or None") @@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): :param executor: executor that save variable :param dirname: directory path :param main_program: program. If vars is None, then filter all variables in this - program which fit `predicate`. Default g_program. + program which fit `predicate`. Default default_main_program(). :param predicate: The Predicate describes a callable that returns a variable as a bool. If it returns true, the variables will be loaded. :param vars: variables need to be loaded. 
If specify vars, program & @@ -107,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None): """ if vars is None: if main_program is None: - main_program = g_main_program + main_program = default_main_program() if not isinstance(main_program, Program): raise TypeError("program's type should be Program") @@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None): def get_inference_program(target_vars, main_program=None): if main_program is None: - main_program = g_main_program + main_program = default_main_program() if not isinstance(target_vars, list): target_vars = [target_vars] @@ -177,12 +176,12 @@ def save_inference_model(dirname, :param target_vars: Variables from which we can get inference results. :param executor: executor that save inference model :param main_program: original program, which will be pruned to build the inference model. - Default g_main_program. + Default default_main_program(). :return: None """ if main_program is None: - main_program = g_main_program + main_program = default_main_program() if not isinstance(target_vars, list): target_vars = [target_vars] @@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None): :param executor: executor for retrieving the value :param name: the name of the parameter :param program: the program where the variable is found - Default g_main_program. + Default default_main_program(). 
:return: the LoDTensor for the variable """ if program is None: - program = g_main_program + program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 5f88555511..7762b0d88f 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -1,8 +1,7 @@ import copy import itertools -from framework import Variable, g_main_program, \ - g_startup_program, unique_name, dtype_is_floating +from framework import Variable, default_main_program, default_startup_program, unique_name, dtype_is_floating from paddle.v2.fluid.initializer import Constant, Xavier @@ -22,7 +21,7 @@ class LayerHelper(object): def main_program(self): prog = self.kwargs.get('main_program', None) if prog is None: - return g_main_program + return default_main_program() else: return prog @@ -30,7 +29,7 @@ class LayerHelper(object): def startup_program(self): prog = self.kwargs.get('startup_program', None) if prog is None: - return g_startup_program + return default_startup_program() else: return prog diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 28bc3d214b..5a76c79db1 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -1,4 +1,4 @@ -from . 
import core +import core import proto.framework_pb2 as framework_pb2 from framework import OpProtoHolder, Variable, Program, Operator from initializer import Constant, Normal, Xavier diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py index b7790b0106..f6120aedec 100644 --- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py +++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py @@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.backward import append_backward_ops -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import default_main_program import numpy @@ -66,7 +66,7 @@ class TestArrayReadWrite(unittest.TestCase): append_backward_ops(total_sum_scaled) - g_vars = map(g_main_program.global_block().var, + g_vars = map(default_main_program().global_block().var, [each_x.name + "@GRAD" for each_x in x]) g_out = [ item.sum() diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py index d953ee7ddc..2b9d8f351a 100644 --- a/python/paddle/v2/fluid/tests/test_conditional_block.py +++ b/python/paddle/v2/fluid/tests/test_conditional_block.py @@ -1,7 +1,7 @@ import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -from paddle.v2.fluid.framework import g_startup_program, g_main_program +from paddle.v2.fluid.framework import default_startup_program, default_main_program from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.backward import append_backward_ops import numpy @@ -19,7 +19,7 @@ class ConditionalBlock(unittest.TestCase): cpu = core.CPUPlace() exe = Executor(cpu) - exe.run(g_startup_program) + exe.run(default_startup_program()) x = numpy.random.random(size=(10, 1)).astype('float32') @@ -29,7 +29,9 @@ class 
ConditionalBlock(unittest.TestCase): append_backward_ops(loss=loss) outs = exe.run( feed={'X': x}, - fetch_list=[g_main_program.block(0).var(data.name + "@GRAD")])[0] + fetch_list=[ + default_main_program().block(0).var(data.name + "@GRAD") + ])[0] print outs diff --git a/python/paddle/v2/fluid/tests/test_executor_and_mul.py b/python/paddle/v2/fluid/tests/test_executor_and_mul.py index 558273e30d..b1ef87c5cb 100644 --- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py +++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py @@ -1,9 +1,10 @@ import unittest -from paddle.v2.fluid.layers import mul, data, sequence_pool + +import numpy import paddle.v2.fluid.core as core + from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.framework import g_main_program -import numpy +from paddle.v2.fluid.layers import mul, data class TestExecutor(unittest.TestCase): @@ -19,10 +20,7 @@ class TestExecutor(unittest.TestCase): a_np = numpy.random.random((100, 784)).astype('float32') b_np = numpy.random.random((784, 100)).astype('float32') exe = Executor(place) - outs = exe.run(g_main_program, - feed={'a': a_np, - 'b': b_np}, - fetch_list=[out]) + outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out]) out = outs[0] self.assertEqual((100, 100), out.shape) self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np))) diff --git a/python/paddle/v2/fluid/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py index bbc11930b9..30d619fe31 100644 --- a/python/paddle/v2/fluid/tests/test_lod_rank_table.py +++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py @@ -1,6 +1,5 @@ from paddle.v2.fluid.layers import lod_rank_table, data from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.framework import g_main_program import paddle.v2.fluid.core as core import numpy import unittest @@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase): tensor = core.LoDTensor() tensor.set(numpy.random.random(size=(17, 100)), cpu) 
tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]]) - exe.run(g_main_program, scope=scope, feed={'x': tensor}) + exe.run(scope=scope, feed={'x': tensor}) var = scope.find_var(rank_table.name) table = var.get_lod_rank_table() self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items()) diff --git a/python/paddle/v2/fluid/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py index e8362d2e9c..ce34d95ac8 100644 --- a/python/paddle/v2/fluid/tests/test_operator_desc.py +++ b/python/paddle/v2/fluid/tests/test_operator_desc.py @@ -1,11 +1,15 @@ import unittest -from paddle.v2.fluid.framework import Variable, Program, g_main_program + import paddle.v2.fluid.core as core +from paddle.v2.fluid.framework import Program, default_startup_program + +main_program = default_startup_program() + class TestOperator(unittest.TestCase): def test_error_type(self): - block = g_main_program.create_block() + block = main_program.create_block() try: block.append_op() self.assertFail() diff --git a/python/paddle/v2/fluid/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py index 13f6278ad8..694344acbb 100644 --- a/python/paddle/v2/fluid/tests/test_parameter.py +++ b/python/paddle/v2/fluid/tests/test_parameter.py @@ -1,17 +1,19 @@ import unittest -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import default_main_program import paddle.v2.fluid.core as core from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.io as io from paddle.v2.fluid.initializer import ConstantInitializer import numpy as np +main_program = default_main_program() + class TestParameter(unittest.TestCase): def test_param(self): shape = [784, 100] val = 1.0625 - b = g_main_program.global_block() + b = main_program.global_block() param = b.create_parameter( name='fc.w', shape=shape, @@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase): self.assertEqual(core.DataType.FP32, param.dtype) self.assertEqual(0, 
param.block.idx) exe = Executor(core.CPUPlace()) - p = exe.run(g_main_program, fetch_list=[param])[0] + p = exe.run(main_program, fetch_list=[param])[0] self.assertTrue(np.allclose(p, np.ones(shape) * val)) - p = io.get_parameter_value_by_name('fc.w', exe, g_main_program) + p = io.get_parameter_value_by_name('fc.w', exe, main_program) self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val)) diff --git a/python/paddle/v2/fluid/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py index 15653a1dbf..1a9313c68a 100644 --- a/python/paddle/v2/fluid/tests/test_program.py +++ b/python/paddle/v2/fluid/tests/test_program.py @@ -1,37 +1,38 @@ from __future__ import print_function import unittest -from paddle.v2.fluid.framework import Program -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import Program, default_main_program import paddle.v2.fluid.layers as layers +main_program = default_main_program() + class TestProgram(unittest.TestCase): def test_program(self): - b = g_main_program.current_block() + b = main_program.current_block() self.assertEqual(-1, b.parent_idx) self.assertEqual(0, b.idx) - b = g_main_program.create_block() + b = main_program.create_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) - b = g_main_program.create_block() + b = main_program.create_block() self.assertEqual(2, b.idx) self.assertEqual(1, b.parent_idx) - g_main_program.rollback() + main_program.rollback() - b = g_main_program.current_block() + b = main_program.current_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) - b = g_main_program.create_block() + b = main_program.create_block() self.assertEqual(3, b.idx) self.assertEqual(1, b.parent_idx) - g_main_program.rollback() - b = g_main_program.current_block() + main_program.rollback() + b = main_program.current_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py 
b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py index 05f6a56064..86db4c64b4 100644 --- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py +++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py @@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.layers as layers from paddle.v2.fluid.backward import append_backward_ops -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import default_main_program import numpy +main_program = default_main_program() + class TestShrinkRNNMemory(unittest.TestCase): def test_shrink_rnn_memory(self): @@ -36,7 +38,7 @@ class TestShrinkRNNMemory(unittest.TestCase): append_backward_ops(loss=mem3_mean) x_grad = exe.run( feed={'x': tensor}, - fetch_list=[g_main_program.global_block().var('x@GRAD')])[0] + fetch_list=[main_program.global_block().var('x@GRAD')])[0] self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1) diff --git a/python/paddle/v2/fluid/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py index 92ffdceb6c..f1e4c0ba21 100644 --- a/python/paddle/v2/fluid/tests/test_variable.py +++ b/python/paddle/v2/fluid/tests/test_variable.py @@ -1,5 +1,5 @@ import unittest -from paddle.v2.fluid.framework import g_main_program, Program, convert_np_dtype_to_dtype_ +from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_ import paddle.v2.fluid.core as core import numpy as np @@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase): self.assertRaises(ValueError, lambda: convert("int8")) def test_var(self): - b = g_main_program.current_block() + b = default_main_program().current_block() w = b.create_var( dtype="float64", shape=[784, 100], lod_level=0, name="fc.w") self.assertNotEqual(str(w), "") From ea7359c60bdf6062b1296f471f50cbeaf8da243e Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 28 Nov 2017 12:47:17 +0800 Subject: [PATCH 085/275] Refine code and comments 
1. Remove checking for num_neg_samples. 2. Fix dims of Output(Cost) and Input(Bias). 3. Renamed num_sampled_classes to num_neg_samples. 4. Add TODO for add more distribution sampler. 5. Init grad_data of bias by zero. 6. Refine comments. 7. Register a kernel for type double. --- paddle/operators/nce_op.cc | 95 +++++++++++++++--------- paddle/operators/nce_op.h | 15 ++-- python/paddle/v2/fluid/tests/test_nce.py | 14 ++-- 3 files changed, 77 insertions(+), 47 deletions(-) diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index c365d5d922..bb9346b134 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -1,16 +1,16 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ #include "paddle/operators/nce_op.h" @@ -39,25 +39,25 @@ class NCEOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0], ctx->GetInputDim("Bias")[0]); } - auto num_sampled_classes = ctx->Attrs().Get("num_sampled_classes"); - auto num_classes = ctx->Attrs().Get("num_classes"); + auto num_neg_samples = ctx->Attrs().Get("num_neg_samples"); + auto num_total_classes = ctx->Attrs().Get("num_total_classes"); std::vector sampled_labels = ctx->Attrs().Get>("sampled_labels"); - PADDLE_ENFORCE_EQ(num_classes, ctx->GetInputDim("Weight")[0]); - PADDLE_ENFORCE_LT(num_sampled_classes, num_classes); + PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]); if (sampled_labels.size() > 0) { PADDLE_ENFORCE_EQ(sampled_labels.size(), - static_cast(num_sampled_classes)); + static_cast(num_neg_samples)); } // set dims of output(Out) std::vector out_dims; out_dims.push_back(x_dims[0]); + out_dims.push_back(1); ctx->SetOutputDim("Cost", framework::make_ddim(out_dims)); // set dims of output(SampleOut) std::vector sample_out_dims; sample_out_dims.push_back(x_dims[0]); - sample_out_dims.push_back(num_sampled_classes + num_true_classes); + sample_out_dims.push_back(num_neg_samples + num_true_classes); ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims)); ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims)); } @@ -76,34 +76,59 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim]."); - AddInput("Label", - "(Tensor) A tensor of shape [batch_size, num_true_class]. " - "'num_true_class' is the number of target class in each sample."); + AddInput( + "Label", + "(Tensor) A tensor of shape [batch_size, num_true_class]. " + "'num_true_class' is the number of target classes in each sample." 
+ "The number of target classes per sample should be same. " + "If you have a variable number of target classes, " + "you can pad them out to a constant number by either repeating them" + " or by padding with an otherwise unused class.)"); AddInput("Weight", "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the " "total number of class."); - AddInput("Bias", - "(Tensor) A tensor of shape [num_class]. 'num_class' is the total " - "number of class. It is a dispensable input.") + AddInput( + "Bias", + "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total " + "number of class. It is a dispensable input.") .AsDispensable(); AddInput("SampleWeight", - "(Tensor) A tensor of shape [batch_size] storing a weight for " + "(Tensor) A tensor of shape [batch_size, 1] storing a weight for " "each sample. And it is a dispensable input. The default value of " "sample is 1.") .AsDispensable(); AddOutput("Cost", - "(Tensor) A tensor of shape [batch_size]. Cost of samples."); - AddOutput("SampleLogits", "An intermediate tensor.").AsIntermediate(); - AddOutput("SampleLabels", "An intermediate tensor.").AsIntermediate(); - AddAttr("num_classes", "Total number of classes."); - AddAttr("num_sampled_classes", "The number of negative classes.") + "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples."); + AddOutput("SampleLogits", + "An intermediate tensor of shape[batch_size, num_neg_samples + " + "num_pos_samples]." + "This tensor is output of forward kernel and used in backward " + "kernel to compute grads." + "Given X is the dot product of input tensor and sampled labels' " + "weights." + "Then 'SampleLogits' is sigmoid(X).") + .AsIntermediate(); + AddOutput("SampleLabels", + "An intermediate tensor of shape[batch_size, num_neg_samples + " + "num_pos_samples]." + "This tensor is output of forward kernel and used in backward " + "kernel to compute grads." 
+ "") + .AsIntermediate(); + AddAttr("num_total_classes", + "Total number of classes in all samples."); + AddAttr("num_neg_samples", + "The number of negative classes. The default value is 10.") .SetDefault(10); - AddAttr>("sampled_labels", ""); + AddAttr>("custom_neg_classes", + "This attribute only be used in unitest. Classes " + "in this list wiil be used as negative classes " + "for every samples. Under normal conditions, " + "user should avoid setting this attribute."); AddComment(R"DOC( -Computes and returns the noise-contrastive estimation training loss. +Compute and return the noise-contrastive estimation training loss. See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf). -By default this uses a uniform distribution for sampling. -The number of target classes per example should be same. If you have a variable number of target classes, you can pad them out to a constant number by either repeating them or by padding with an otherwise unused class. +By default this operator uses a uniform distribution for sampling. 
)DOC"); } }; @@ -119,7 +144,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("SampleLogits")); PADDLE_ENFORCE(ctx->HasInput("SampleLabels")); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")), - "The input(Out@GRAD) should not be null"); + "The input(Out@GRAD) should not be null."); auto x_dims = ctx->GetInputDim("Input"); auto x_grad_name = framework::GradVarName("Input"); @@ -154,6 +179,8 @@ class NCEOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad); -REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel); +REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel, + ops::NCEKernel); REGISTER_OP_CPU_KERNEL(nce_grad, - ops::NCEGradKernel); + ops::NCEGradKernel, + ops::NCEGradKernel); diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index 3017bccdca..c41393d260 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; +using framework::Tensor; template @@ -35,8 +35,8 @@ void PrepareSamples(const framework::ExecutionContext& context) { auto label_dims = label->dims(); int num_classes = context.Attr("num_classes"); // for unitest - std::vector sampled_labels = - context.Attr>("sampled_labels"); + std::vector custom_neg_classes = + context.Attr>("custom_neg_classes"); // random machine std::random_device rd; std::mt19937 rng(rd()); @@ -54,12 +54,13 @@ void PrepareSamples(const framework::ExecutionContext& context) { for (; j < num_label; ++j) { sample_labels_data[index++] = label_data[i * num_label + j]; } - if (sampled_labels.size() > 0) { - for (auto label : sampled_labels) { + if (custom_neg_classes.size() > 0) { + for (auto label : custom_neg_classes) { sample_labels_data[index++] = label; } } else { for (; j < sample_labels_dims[1]; ++j) { + // TODO: support more distribution sampling 
sample_labels_data[index++] = rand(rng); } } @@ -176,6 +177,7 @@ class NCEGradKernel : public framework::OpKernel { auto d_bias = context.Output(framework::GradVarName("Bias")); if (d_bias != nullptr) { T* d_bias_data = d_bias->mutable_data(context.GetPlace()); + std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); for (size_t i = 0; i < sample_labels->numel(); ++i) { d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; } @@ -183,7 +185,8 @@ class NCEGradKernel : public framework::OpKernel { // get d_w auto d_w = context.Output(framework::GradVarName("Weight")); if (d_w != nullptr) { - d_w->mutable_data(context.GetPlace()); + auto d_w_data = d_w->mutable_data(context.GetPlace()); + std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); auto d_w_matrix = EigenMatrix::From(*d_w); auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); for (size_t i = 0; i < sample_labels->numel(); ++i) { diff --git a/python/paddle/v2/fluid/tests/test_nce.py b/python/paddle/v2/fluid/tests/test_nce.py index 82978f2d23..6cbf468e0a 100644 --- a/python/paddle/v2/fluid/tests/test_nce.py +++ b/python/paddle/v2/fluid/tests/test_nce.py @@ -18,25 +18,25 @@ def nce(input, weight, bias, sample_weight, labels, num_classes, samples.append((i, num, False, w)) sample_labels.append(num) # forward bias - sampleOut = np.zeros(len(samples)).astype(np.float32) + sample_out = np.zeros(len(samples)).astype(np.float32) if bias is not None: for i in range(len(samples)): - sampleOut[i] = bias[samples[i][1]] + sample_out[i] = bias[samples[i][1]] # forward weight for i in range(len(samples)): - sampleOut[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) + sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]]) # forward activation - sampleOut = 1.0 / (1.0 + np.exp(-sampleOut)) + sample_out = 1.0 / (1.0 + np.exp(-sample_out)) # forward cost out = np.zeros(batch_size).astype(np.float32) b = 1.0 / num_classes * num_sample_class for i in range(len(samples)): - o = sampleOut[i] + 
o = sample_out[i] cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) out[samples[i][0]] += cost * samples[i][3] - return (out, np.array(sampleOut).reshape(batch_size, - num_sample_class + num_true_class), + return (out, np.array(sample_out).reshape( + batch_size, num_sample_class + num_true_class), np.array(sample_labels).reshape(batch_size, num_sample_class + num_true_class)) From ab9d59c5396002a1c0695075164da5109c530150 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 28 Nov 2017 14:45:11 +0800 Subject: [PATCH 086/275] Fix double type error while using eigen api --- paddle/operators/nce_op.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index c41393d260..7a91070329 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -using framework::Tensor; +using Tensor = framework::Tensor; template @@ -107,12 +107,11 @@ class NCEKernel : public framework::OpKernel { auto input_mat = EigenMatrix::From(*(context.Input("Input"))); auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); for (size_t i = 0; i < sample_labels->numel(); ++i) { - Eigen::Tensor result = + Eigen::Tensor result = (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) * weight_mat.chip(sample_labels_data[i], 0)) .sum(); sample_out_data[i] += result(0); - // activation_->forward sample_out_data[i] = (1. / (1. 
+ exp(-sample_out_data[i]))); } // forward cost From 985e4ab62dc6ca2eb023d8c1e0c633dc235c847a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 28 Nov 2017 15:35:36 +0800 Subject: [PATCH 087/275] Add Python wrap of conv2d_transpose and its unittest (#5946) * Add Python wrap of conv2d_transpose and its unittest * Follow comments * Fix format --- paddle/operators/conv_transpose_op.cc | 18 ++-- paddle/operators/detail/send_recv.proto | 6 +- python/paddle/v2/fluid/layers.py | 93 ++++++++++++++++++++- python/paddle/v2/fluid/tests/test_layers.py | 9 ++ 4 files changed, 112 insertions(+), 14 deletions(-) diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 3e55ef036a..314b577d00 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -74,12 +74,12 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( "The format of output tensor is also NCHW."); AddAttr>( "strides", - "(vector defalut:{1, 1}), the strides(h_stride, w_stride) of " + "(vector default:{1, 1}), the strides(h_stride, w_stride) of " "convolution transpose operator.") .SetDefault({1, 1}); AddAttr>( "paddings", - "(vector defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution " + "(vector default:{0, 0}), the paddings(h_pad, w_pad) of convolution " "transpose operator.") .SetDefault({0, 0}); AddComment(R"DOC( @@ -101,8 +101,8 @@ Example: Output: Output shape: (N, C_out, H_out, W_out) where - H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; - W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; + H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + H_f; + W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + W_f; )DOC"); } @@ -130,12 +130,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( "the number of channels, D is the depth of the feature, H is the " "height of the feature, and W is the width of the feature."); AddAttr>("strides", - "(vector defalut:{1, 1, 1}), the " + "(vector default:{1, 
1, 1}), the " "strides{d_stride, h_stride, w_stride} of " "convolution transpose operator.") .SetDefault({1, 1, 1}); AddAttr>("paddings", - "(vector defalut:{0, 0, 0}), paddings(d_pad, " + "(vector default:{0, 0, 0}), paddings(d_pad, " "h_pad, w_pad) of convolution transpose operator.") .SetDefault({0, 0, 0}); AddComment(R"DOC( @@ -158,9 +158,9 @@ Example: Output: Output shape: (N, C_out, D_out, H_out, W_out) where - D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; - H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; - W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2]; + D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + D_f; + H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + H_f; + W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + W_f; )DOC"); } diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto index 66f84678b3..962c7d5981 100644 --- a/paddle/operators/detail/send_recv.proto +++ b/paddle/operators/detail/send_recv.proto @@ -17,7 +17,7 @@ syntax = "proto3"; package sendrecv; service SendRecvService { - // For parameter server round-robin like hashing, do not split tensors. + // For parameter server round-robin like hashing, do not split tensors. 
// Send and recv only one tensor rpc SendVariable(VariableMessage) returns (VariableMessage) {} } @@ -32,6 +32,4 @@ message VariableMessage { bytes serialized = 2; } -message VoidMessage { - -} \ No newline at end of file +message VoidMessage {} \ No newline at end of file diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 5a76c79db1..6adfac3a32 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -1,7 +1,7 @@ import core import proto.framework_pb2 as framework_pb2 from framework import OpProtoHolder, Variable, Program, Operator -from initializer import Constant, Normal, Xavier +from initializer import Constant, Normal, Xavier, Initializer from paddle.v2.fluid.layer_helper import LayerHelper, unique_name import re import cStringIO @@ -1587,6 +1587,97 @@ def array_length(array, main_program=None): return tmp +def conv2d_transpose(input, + num_filters, + output_size=None, + filter_size=None, + padding=None, + stride=None, + param_attr=None, + param_initializer=None, + main_program=None, + startup_program=None): + """ + The transpose of conv2d layer. + + This layer is also known as deconvolution layer. + + Args: + input(Variable): The input image with [N, C, H, W] format. + num_filters(int): The number of filter. It is as same as the output + image channel. + output_size(int|tuple|None): The output image size. If output size is a + tuple, it must contain two integers, (image_H, image_W). This + parameter only works when filter_size is None. + filter_size(int|tuple|None): The filter size. If filter_size is a tuple, + it must contain two integers, (filter_size_H, filter_size_W). + Otherwise, the filter will be a square. None if use output size to + calculate filter_size + padding(int|tuple): The padding size. If padding is a tuple, it must + contain two integers, (padding_H, padding_W). Otherwise, the + padding_H = padding_W = padding. + stride(int|tuple): The stride size. 
If stride is a tuple, it must + contain two integers, (stride_H, stride_W). Otherwise, the + stride_H = stride_W = stride. + param_attr: Parameter Attribute. + param_initializer(Initializer): Parameter Initializer. Default is Xavier + main_program(Program): the main program + startup_program(Program): the startup program + + Returns: + Variable: Output image. + """ + helper = LayerHelper("conv2d_transpose", **locals()) + if not isinstance(input, Variable): + raise TypeError("Input of conv2d_transpose must be Variable") + input_channel = input.shape[1] + + op_attr = dict() + + if isinstance(padding, int): + op_attr['paddings'] = [padding, padding] + elif padding is not None: + op_attr['paddings'] = padding + + if isinstance(stride, int): + op_attr['strides'] = stride + elif stride is not None: + op_attr['strides'] = stride + + if filter_size is None: + if output_size is None: + raise ValueError("output_size must be set when filter_size is None") + if isinstance(output_size, int): + output_size = [output_size, output_size] + + padding = op_attr.get('paddings', [0, 0]) + stride = op_attr.get('strides', [1, 1]) + + h_in = input.shape[2] + w_in = input.shape[3] + filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0] + filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1] + filter_size = [filter_size_h, filter_size_w] + elif isinstance(filter_size, int): + filter_size = [filter_size, filter_size] + + filter_shape = [input_channel, num_filters] + filter_size + img_filter = helper.create_parameter( + dtype=input.dtype, + shape=filter_shape, + attr=helper.param_attr, + initializer=param_initializer) + + out = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type='conv2d_transpose', + inputs={'Input': [input], + 'Filter': [img_filter]}, + outputs={'Output': out}, + attrs=op_attr) + return out + + class ConditionalBlockGuard(BlockGuard): def __init__(self, block): if not isinstance(block, ConditionalBlock): diff --git 
a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 87dc6d1a62..62b2a0f9a1 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -65,6 +65,15 @@ class TestBook(unittest.TestCase): print str(program) + def test_conv2d_transpose(self): + program = Program() + kwargs = {'main_program': program} + img = layers.data( + name='pixel', shape=[3, 2, 2], dtype='float32', **kwargs) + layers.conv2d_transpose( + input=img, num_filters=10, output_size=28, **kwargs) + print str(program) + def test_recognize_digits_conv(self): program = Program() From 76a65a83a015a38bd8f6654b4dc27d6040bcd5d8 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 28 Nov 2017 15:54:54 +0800 Subject: [PATCH 088/275] Fix comments style --- paddle/operators/nce_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index 7a91070329..8df20f432d 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -60,7 +60,7 @@ void PrepareSamples(const framework::ExecutionContext& context) { } } else { for (; j < sample_labels_dims[1]; ++j) { - // TODO: support more distribution sampling + // TODO(wanghaoshuang): support more distribution sampling sample_labels_data[index++] = rand(rng); } } From 696b0253e597a38edb948daf3278adc52a69b004 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 28 Nov 2017 18:28:35 +0800 Subject: [PATCH 089/275] Refine paddle/v2/fluid/profiler.py. 
--- paddle/platform/cuda_profiler.h | 8 +- python/paddle/v2/fluid/profiler.py | 78 ++++++------------- python/paddle/v2/fluid/tests/test_profiler.py | 2 +- 3 files changed, 30 insertions(+), 58 deletions(-) diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h index c096ce37c5..b6311cb23d 100644 --- a/paddle/platform/cuda_profiler.h +++ b/paddle/platform/cuda_profiler.h @@ -29,10 +29,10 @@ void CudaProfilerInit(std::string output_file, std::string output_mode, memcpy(buf.data(), tmpl.data(), tmpl.size()); auto result = mktemp(buf.data()); PADDLE_ENFORCE(strlen(result) != 0); - std::string config = result; + std::string config_file = result; { - std::ofstream ofs(config, std::ios::out | std::ios::trunc); + std::ofstream ofs(config_file, std::ios::out | std::ios::trunc); PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate()); for (const auto& line : config_flags) { ofs << line << std::endl; @@ -42,12 +42,12 @@ void CudaProfilerInit(std::string output_file, std::string output_mode, PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE( - cudaProfilerInitialize(config.c_str(), output_file.c_str(), mode)); + cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); } void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); } -void CudaProfilerStop() { PADDLE_ENFORCE((cudaProfilerStop())); } +void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); } } // namespace platform } // namespace paddle diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index f31d6f0a61..2dbba72c64 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -1,9 +1,9 @@ import paddle.v2.fluid.core as core -import subprocess +from contextlib import contextmanager __all__ = ['CudaProfiler'] -NV_FLAGS = [ +NVPROF_CONFIG = [ "gpustarttimestamp", "gpuendtimestamp", "gridsize3d", @@ -14,61 +14,33 @@ NV_FLAGS = [ ] -def nvporf_init(output_file, output_mode=None, flags=None): - """ - Initialize the CUDA profiler. - This methods must be called before nvprof_start. - - :param output_file: The output file name. - :type output_file: string - :param output_mode: The output mode has Key-Value pair format and - Comma separated values format. - It should be 'kv' or 'csv'. - :type output_mode: string +@contextmanager +def cuda_profiler(output_file, output_mode=None, config=None): + """The CUDA profiler. + This fuctions is used to profile CUDA program by CUDA runtime application + programming interface. The profiling result will be written into + `output_file` with Key-Value pair format or Comma separated values format. + The user can set the output mode by `output_mode` argument and set the + counters/options for profiling by `config` argument. The default config + caontains 'gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d', + 'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'. + + Args: + output_file (string) : The output file name, the result will be + written into this file. 
+ output_mode (string) : The output mode has Key-Value pair format and + Comma separated values format. It should be 'kv' or 'csv'. + config (string) : The profiler options and counters can refer to + "Compute Command Line Profiler User Guide". """ if output_mode is None: output_mode = 'csv' if output_mode not in ['kv', 'csv']: raise ValueError("The output mode must be 'key-value' or 'csv'.") - flags = NV_FLAGS if flags is None else flags - core.nvprof_init(output_file, output_mode, flags) - - -def nvporf_start(): - """ - Enables profiler collection by the active CUDA profiling tool. - """ + config = NVPROF_CONFIG if config is None else config + core.nvprof_init(output_file, output_mode, config) + # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() - - -def nvporf_stop(): - """ - Disables profiler collection. - """ + yield + # Disables profiler collection. core.nvprof_stop() - - -class CudaProfiler(object): - def __init__(self, output_file, output_mode=None, flags=None, enabled=True): - self.enabled = enabled - if not self.enabled: - return - self.entered = False - self.out_file = output_file - nvporf_init(output_file, output_mode, flags) - - def __enter__(self): - if not self.enabled: - return - if self.entered: - raise RuntimeError("The profiler traces are not reentrant") - self.entered = True - nvporf_start() - return self - - def __exit__(self, exc_type, exc_value, tb): - if exc_value is not None: - raise exc_value - if not self.enabled: - return - nvporf_stop() diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py index 1fec5c99bf..e8f24251b9 100644 --- a/python/paddle/v2/fluid/tests/test_profiler.py +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -18,7 +18,7 @@ class TestProfiler(unittest.TestCase): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - with profiler.CudaProfiler("cuda_profiler.txt", 'csv') as nvprof: + with 
profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: for i in range(epoc): input = np.random.random(dshape).astype("float32") exe.run(fluid.default_main_program(), feed={'data': input}) From 5e7e90ce8f09d1a970fb131f01c42b1882a1c06b Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 28 Nov 2017 18:28:35 +0800 Subject: [PATCH 090/275] Refine paddle/v2/fluid/profiler.py. --- paddle/platform/cuda_profiler.h | 8 +- python/paddle/v2/fluid/profiler.py | 82 ++++++------------- python/paddle/v2/fluid/tests/test_profiler.py | 4 +- 3 files changed, 33 insertions(+), 61 deletions(-) diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h index c096ce37c5..b6311cb23d 100644 --- a/paddle/platform/cuda_profiler.h +++ b/paddle/platform/cuda_profiler.h @@ -29,10 +29,10 @@ void CudaProfilerInit(std::string output_file, std::string output_mode, memcpy(buf.data(), tmpl.data(), tmpl.size()); auto result = mktemp(buf.data()); PADDLE_ENFORCE(strlen(result) != 0); - std::string config = result; + std::string config_file = result; { - std::ofstream ofs(config, std::ios::out | std::ios::trunc); + std::ofstream ofs(config_file, std::ios::out | std::ios::trunc); PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate()); for (const auto& line : config_flags) { ofs << line << std::endl; @@ -42,12 +42,12 @@ void CudaProfilerInit(std::string output_file, std::string output_mode, PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE( - cudaProfilerInitialize(config.c_str(), output_file.c_str(), mode)); + cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); } void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); } -void CudaProfilerStop() { PADDLE_ENFORCE((cudaProfilerStop())); } +void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); } } // namespace platform } // namespace paddle diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index f31d6f0a61..2069b713fa 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -1,9 +1,9 @@ import paddle.v2.fluid.core as core -import subprocess +from contextlib import contextmanager __all__ = ['CudaProfiler'] -NV_FLAGS = [ +NVPROF_CONFIG = [ "gpustarttimestamp", "gpuendtimestamp", "gridsize3d", @@ -14,61 +14,33 @@ NV_FLAGS = [ ] -def nvporf_init(output_file, output_mode=None, flags=None): - """ - Initialize the CUDA profiler. - This methods must be called before nvprof_start. - - :param output_file: The output file name. - :type output_file: string - :param output_mode: The output mode has Key-Value pair format and - Comma separated values format. - It should be 'kv' or 'csv'. - :type output_mode: string +@contextmanager +def cuda_profiler(output_file, output_mode=None, config=None): + """The CUDA profiler. + This fuctions is used to profile CUDA program by CUDA runtime application + programming interface. The profiling result will be written into + `output_file` with Key-Value pair format or Comma separated values format. + The user can set the output mode by `output_mode` argument and set the + counters/options for profiling by `config` argument. The default config + is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d', + 'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace']. + + Args: + output_file (string) : The output file name, the result will be + written into this file. 
+ output_mode (string) : The output mode has Key-Value pair format and + Comma separated values format. It should be 'kvp' or 'csv'. + config (string) : The profiler options and counters can refer to + "Compute Command Line Profiler User Guide". """ if output_mode is None: output_mode = 'csv' - if output_mode not in ['kv', 'csv']: - raise ValueError("The output mode must be 'key-value' or 'csv'.") - flags = NV_FLAGS if flags is None else flags - core.nvprof_init(output_file, output_mode, flags) - - -def nvporf_start(): - """ - Enables profiler collection by the active CUDA profiling tool. - """ + if output_mode not in ['kvp', 'csv']: + raise ValueError("The output mode must be 'kvp' or 'csv'.") + config = NVPROF_CONFIG if config is None else config + core.nvprof_init(output_file, output_mode, config) + # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() - - -def nvporf_stop(): - """ - Disables profiler collection. - """ + yield + # Disables profiler collection. 
core.nvprof_stop() - - -class CudaProfiler(object): - def __init__(self, output_file, output_mode=None, flags=None, enabled=True): - self.enabled = enabled - if not self.enabled: - return - self.entered = False - self.out_file = output_file - nvporf_init(output_file, output_mode, flags) - - def __enter__(self): - if not self.enabled: - return - if self.entered: - raise RuntimeError("The profiler traces are not reentrant") - self.entered = True - nvporf_start() - return self - - def __exit__(self, exc_type, exc_value, tb): - if exc_value is not None: - raise exc_value - if not self.enabled: - return - nvporf_stop() diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py index 1fec5c99bf..973e94b976 100644 --- a/python/paddle/v2/fluid/tests/test_profiler.py +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -18,9 +18,9 @@ class TestProfiler(unittest.TestCase): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - with profiler.CudaProfiler("cuda_profiler.txt", 'csv') as nvprof: + with profiler.cuda_profiler('cuda_profiler.txt', 'kvp') as nvprof: for i in range(epoc): - input = np.random.random(dshape).astype("float32") + input = np.random.random(dshape).astype('float32') exe.run(fluid.default_main_program(), feed={'data': input}) From 6375c8cacbf72da741590361c887758d7a5323f5 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 28 Nov 2017 18:53:37 +0800 Subject: [PATCH 091/275] Fix MacOS compile (#5978) * Fix MacOS compile * Update GRPC * Unset PROTOBUF_EXEC --- cmake/external/grpc.cmake | 12 ++++++++++-- cmake/external/protobuf.cmake | 24 +++++++++++++++++------- 2 files changed, 27 insertions(+), 9 deletions(-) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index f431c037fd..1330ef82dc 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -23,6 +23,11 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) 
SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) +IF(APPLE) + SET(BUILD_CMD make -n | sed "s/-Werror//g" | sh) +ELSE() + SET(BUILD_CMD make) +ENDIF() ExternalProject_Add( extern_grpc @@ -33,7 +38,11 @@ ExternalProject_Add( UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_IN_SOURCE 1 - BUILD_COMMAND make + # NOTE(yuyang18): + # Disable -Werror, otherwise the compile will fail in MacOS. + # It seems that we cannot configure that by make command. + # Just dry run make command and remove `-Werror`, then use a shell to run make commands + BUILD_COMMAND ${BUILD_CMD} INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install ) @@ -55,4 +64,3 @@ SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION include_directories(${GRPC_INCLUDE_DIR}) ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) - diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index be7f6a9465..7cfe1e6807 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -15,7 +15,18 @@ INCLUDE(ExternalProject) # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp FIND_PACKAGE(Protobuf QUIET) -SET(PROTOBUF_FOUND "OFF") +macro(UNSET_VAR VAR_NAME) + UNSET(${VAR_NAME} CACHE) + UNSET(${VAR_NAME}) +endmacro() +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(PROTOBUF_FOUND) +UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE) +UNSET_VAR(PROTOBUF_PROTOC_LIBRARY) +UNSET_VAR(PROTOBUF_LITE_LIBRARY) +UNSET_VAR(PROTOBUF_LIBRARY) +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. function(protobuf_generate_python SRCS) @@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB) # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # make `protobuf_generate_cpp` happy. 
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) - FOREACH(dep ${protobuf_DEPS}) ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep}) @@ -128,11 +138,11 @@ endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") if (NOT "${PROTOBUF_ROOT}" STREQUAL "") - find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include) - find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib) - find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin) + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) + find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") SET_PROTOBUF_VERSION() From 23b3fef062ce41d7b19060fb1190452c9160da59 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 28 Nov 2017 19:06:50 +0800 Subject: [PATCH 092/275] Make 'scale_op' supporting int and int64 (#5986) * Make 'scale_op' supporting int and int64 * refine .cu file --- paddle/operators/scale_op.cc | 4 +++- paddle/operators/scale_op.cu | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index 5745580504..e5c10fec4d 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, 
ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); REGISTER_OP_CPU_KERNEL(scale, ops::ScaleKernel, - ops::ScaleKernel); + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu index 820fd4e685..0d70775159 100644 --- a/paddle/operators/scale_op.cu +++ b/paddle/operators/scale_op.cu @@ -16,4 +16,6 @@ REGISTER_OP_GPU_KERNEL( scale, paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); From c975fe1bdeac914847f59bee588feba0c76220f9 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 28 Nov 2017 19:34:03 +0800 Subject: [PATCH 093/275] batch norm support matrix input (#5980) * batch norm support matrix input * update gpu code * format code --- paddle/operators/batch_norm_op.cc | 15 ++--- paddle/operators/batch_norm_op.cu.cc | 31 ++++++---- .../book/test_image_classification_train.py | 3 +- .../v2/fluid/tests/test_batch_norm_op.py | 60 +++++++++++++++---- .../tests/test_image_classification_layer.py | 28 +++++---- 5 files changed, 93 insertions(+), 44 deletions(-) diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index f884e6efa9..ac97bd83ab 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel { const auto x_dims = ctx->GetInputDim("X"); const TensorFormat tensor_format = StringToTensorFormat(ctx->Attrs().Get("tensor_format")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + const int C = (tensor_format == TensorFormat::NCHW ? 
x_dims[1] : x_dims[x_dims.size() - 1]); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "Input X must have 3 to 5 dimensions."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); @@ -146,8 +147,8 @@ class BatchNormKernel : public framework::OpKernel { const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = (tensor_format == TensorFormat::NCHW ? x_dims[1] @@ -339,8 +340,8 @@ class BatchNormGradKernel // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = (tensor_format == TensorFormat::NCHW ? x_dims[1] diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index 726d1ea1b8..7b2f318700 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -29,14 +29,21 @@ void ExtractNCWHD(const framework::DDim &dims, const TensorFormat &tensor_format, int *N, int *C, int *H, int *W, int *D) { *N = dims[0]; - *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; - *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; - *W = dims.size() > 3 - ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2]) - : 1; - *D = dims.size() > 4 - ? (tensor_format == TensorFormat::NCHW ? 
dims[4] : dims[3]) - : 1; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; + *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3]) + : 1; + } } template @@ -56,8 +63,8 @@ class BatchNormKernel : public framework::OpKernel { // NCHW [batch_size, in_channels, in_height, in_width] const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); int N, C, H, W, D; ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); @@ -180,8 +187,8 @@ class BatchNormGradKernel const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); int N, C, H, W, D; ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py index cc45b10b90..0f0cc5b540 100644 --- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py @@ -69,8 +69,7 @@ def vgg16_bn_drop(input): drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) fc1 = fluid.layers.fc(input=drop, size=512, act=None) - reshape1 = fluid.layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1))) - bn = fluid.layers.batch_norm(input=reshape1, act='relu') + bn = fluid.layers.batch_norm(input=fc1, act='relu') drop2 = 
fluid.layers.dropout(x=bn, dropout_prob=0.5) fc2 = fluid.layers.fc(input=drop2, size=512, act=None) return fc2 diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index 71f9599e0d..e766a68c0e 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set): def _reference_training(x, scale, offset, epsilon, data_format): + x_shape = x.shape + if len(x_shape) == 2: + if data_format == "NCHW": + x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) + else: + x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + if data_format == "NCHW": n, c, h, w = x.shape x_square = x * x @@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format): offset_tile = np.reshape(offset, (1, c, 1, 1)) offset_tile = np.reshape(offset_tile, (1, c, 1, 1)) y = normalized * scale_tile + offset_tile + if len(x_shape) == 2: + y = np.reshape(y, (y.shape[0], y.shape[1])) return y, mean, var elif data_format == "NHWC": x_square = x * x @@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format): mean = x_sum / element_count var = x_square_sum / element_count - mean * mean normalized = (x - mean) / np.sqrt(var + epsilon) - return (normalized * scale + offset), mean, var + y = normalized * scale + offset + if len(x_shape) == 2: + y = np.reshape(y, x_shape) + return y, mean, var else: raise ValueError("Unknown data order.") @@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): # (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon)) # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation + x_shape = x.shape + + if len(x_shape) == 2: + if data_format == "NCHW": + x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) + grad_y = np.reshape(grad_y, + (grad_y.shape[0], grad_y.shape[1], 1, 1)) + else: + x = np.reshape(x, (x.shape[0], 1, 1, 
x.shape[1])) + grad_y = np.reshape(grad_y, + (grad_y.shape[0], 1, 1, grad_y.shape[1])) + if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) grad_y = np.transpose(grad_y, (0, 2, 3, 1)) @@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): grad_x = np.transpose(grad_x, (0, 3, 1, 2)) x = np.transpose(x, (0, 3, 1, 2)) grad_y = np.transpose(grad_y, (0, 3, 1, 2)) + + if len(x_shape) == 2: + grad_x = np.reshape(grad_x, x_shape) return grad_x, grad_scale, grad_offset @@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest): momentum = 0.9 # N, H, W, C: 2, 3, 4, 2 - n, h, w, c = 2, 3, 4, 2 + n, h, w, c = 2, 3, 4, 5 x_shape = [n, h, w, c] scale_shape = [c] @@ -184,20 +211,23 @@ class TestBatchNormOp(OpTest): print 'python: NHWC, NCHW, backward checking passed' def test_forward_backward(self): - def test_with_place(place, tensor_format): + def test_with_place(place, tensor_format, shape): # attr epsilon = 0.00001 momentum = 0.9 - # N, H, W, C: 12, 3, 4, 2 - n, h, w, c = 2, 3, 4, 2 - - if data_format == "NHWC": - x_shape = [n, h, w, c] - elif data_format == "NCHW": - x_shape = [n, c, h, w] + if len(shape) == 2: + x_shape = shape + c = shape[1] else: - raise ValueError("Unknown data type.") + # n, h, w, c = 2, 3, 4, 2 + n, h, w, c = shape[0], shape[1], shape[2], shape[3] + if data_format == "NHWC": + x_shape = [n, h, w, c] + elif data_format == "NCHW": + x_shape = [n, c, h, w] + else: + raise ValueError("Unknown data type.") scale_shape = [c] x_val = np.random.random_sample(x_shape).astype(np.float32) @@ -219,7 +249,10 @@ class TestBatchNormOp(OpTest): # for gradient test # y_grad = np.ones(x_shape).astype(np.float32) y_grad = np.zeros(x_shape).astype(np.float32) - y_grad[0, 0, 0, 0] = 1. + if len(y_grad.shape) == 2: + y_grad[0, 0] = 1. + else: + y_grad[0, 0, 0, 0] = 1. 
# y_grad = np.random.random_sample(x_shape).astype(np.float32) x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad( x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, @@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest): places.append(core.GPUPlace(0)) for place in places: for data_format in ["NCHW", "NHWC"]: - test_with_place(place, data_format) + test_with_place(place, data_format, [2, 3, 4, 5]) + test_with_place(place, data_format, [2, 3]) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py index 8e8e1b0a8c..2fd609d447 100644 --- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py +++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py @@ -1,6 +1,6 @@ import unittest -import paddle.v2.fluid.layers as layers +import paddle.v2.fluid as fluid import paddle.v2.fluid.nets as nets from paddle.v2.fluid.framework import Program @@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase): def test_batch_norm_layer(self): main_program = Program() startup_program = Program() - images = layers.data( + images = fluid.layers.data( name='pixel', shape=[3, 48, 48], dtype='float32', main_program=main_program) - layers.batch_norm( + hidden1 = fluid.layers.batch_norm( input=images, main_program=main_program, startup_program=startup_program) + hidden2 = fluid.layers.fc(input=hidden1, + size=128, + act='relu', + main_program=main_program) + hidden3 = fluid.layers.batch_norm( + input=hidden2, + main_program=main_program, + startup_program=startup_program) - # print str(main_program) + print str(main_program) def test_dropout_layer(self): main_program = Program() startup_program = Program() - images = layers.data( + images = fluid.layers.data( name='pixel', shape=[3, 48, 48], dtype='float32', main_program=main_program) - layers.dropout( + fluid.layers.dropout( x=images, dropout_prob=0.5, main_program=main_program, @@ -61,7 +69,7 @@ 
class TestLayer(unittest.TestCase): main_program = Program() startup_program = Program() - images = layers.data( + images = fluid.layers.data( name='pixel', shape=[3, 48, 48], dtype='float32', @@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase): def test_elementwise_add_with_act(self): main_program = Program() startup_program = Program() - image1 = layers.data( + image1 = fluid.layers.data( name='pixel1', shape=[3, 48, 48], dtype='float32', main_program=main_program, startup_program=startup_program) - image2 = layers.data( + image2 = fluid.layers.data( name='pixel2', shape=[3, 48, 48], dtype='float32', main_program=main_program, startup_program=startup_program) - out = layers.elementwise_add( + out = fluid.layers.elementwise_add( x=image1, y=image2, act='relu', From 6ed135413a71bc2e5a44d762af564d056a5165c3 Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 28 Nov 2017 21:49:39 +0800 Subject: [PATCH 094/275] Fix useGpu in HierarchicalSigmoidLayer --- paddle/gserver/layers/HierarchicalSigmoidLayer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp index 6317b66a45..236f8096bd 100644 --- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp +++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp @@ -164,7 +164,7 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) { cpuBias_ = biases_grad; } preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_); - if (useGpu) { + if (useGpu_) { biases_grad->copyFrom(*cpuBias_); } else { biases_grad = cpuBias_; From 6fc9a9fd690e2d5fe48f2b39ed2575a04ef32103 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Tue, 28 Nov 2017 23:15:09 +0800 Subject: [PATCH 095/275] modify for del T2 and doc update --- paddle/operators/math/unpooling.cc | 20 +++++----- paddle/operators/math/unpooling.cu | 39 +++++++++---------- paddle/operators/math/unpooling.h | 4 +- 
paddle/operators/unpool_op.cc | 19 +++++---- paddle/operators/unpool_op.cu.cc | 8 ++-- paddle/operators/unpool_op.h | 8 ++-- .../paddle/v2/fluid/tests/test_unpool_op.py | 4 +- 7 files changed, 52 insertions(+), 50 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index ab6212f387..dbc3936971 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -19,8 +19,8 @@ namespace operators { namespace math { // All tensors are in NCHW format -template -class Unpool2dMaxFunctor { +template +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -35,7 +35,7 @@ class Unpool2dMaxFunctor { int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const T2 * indices_data = indices.data(); + const int * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { @@ -54,8 +54,8 @@ class Unpool2dMaxFunctor { -template -class Unpool2dMaxGradFunctor { +template +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -71,7 +71,7 @@ public: const int output_width = output.dims()[3]; int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const T2 * indices_data = indices.data(); + const int * indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); @@ -90,10 +90,10 @@ public: } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class 
Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 99e6fd052a..9cdd61f6d5 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace math { -template +template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const T2 * indices_data, + const int * indices_data, const int input_height, const int input_width, const int channels, @@ -45,10 +45,10 @@ __global__ void KernelUnpool2dMax(const int nthreads, output_data[out_offset + out_index] = input_data[i]; } } -template +template __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, - const T2* indices_data, + const int* indices_data, const int input_height, const int input_width, const int channels, @@ -76,8 +76,8 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, /* * All tensors are in NCHW format. 
*/ -template -class Unpool2dMaxFunctor { +template +class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -90,15 +90,14 @@ class Unpool2dMaxFunctor { const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const T* input_data = input.data(); - const T2 * indices_data = indices.data(); + const int * indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * input_height * input_width; int threads = 1024; int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax< - T, T2><<<<(context) - .stream()>>>(nthreads, input_data, indices_data, + .stream()>>>(input.numel(), input_data, indices_data, input_height, input_width, output_channels, output_data, output_height, output_width); } @@ -106,8 +105,8 @@ class Unpool2dMaxFunctor { /* * All tensors are in NCHW format. */ -template -class Unpool2dMaxGradFunctor { +template +class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, @@ -122,18 +121,16 @@ class Unpool2dMaxGradFunctor { const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; const T* input_data = input.data(); - const T2 * indices_data = indices.data(); + const int * indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = batch_size * output_channels * input_height * input_width; int threads = 1024; int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad< - T, T2><<<<(context) - .stream()>>>( - nthreads, input_data, indices_data, + .stream()>>>(input.numel(), input_data, indices_data, input_height, input_width, output_channels, output_data, output_grad_data, output_height, output_width, @@ -141,11 +138,11 @@ class 
Unpool2dMaxGradFunctor { } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index e086b891a1..bf79354ed9 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { namespace math { -template +template class Unpool2dMaxFunctor { public: @@ -29,7 +29,7 @@ class Unpool2dMaxFunctor { framework::Tensor * output); }; -template +template class Unpool2dMaxGradFunctor { public: void operator()(const platform::DeviceContext& context, diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 49a5129188..2505148764 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -50,10 +50,15 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( - "Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 + "Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ + W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] + $$ + Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 /07/iccv2011.pdf - PyTorch: http://pytorch.org/docs/master/nn.html?highlight=unpool# - torch.nn.MaxUnpool2d" )DOC"); } }; @@ -125,9 +130,9 @@ namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); + 
ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 9b5ac667d3..d8214fc687 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -16,10 +16,10 @@ limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_GPU_KERNEL(unpool_grad, ops::UnpoolGradKernel, + float>, ops::UnpoolGradKernel); + double>); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index dfd4ef12b5..f618a7c0ba 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -37,12 +37,12 @@ class UnpoolKernel : public framework::OpKernel { math::SetConstant set_zero; set_zero(context.device_context(), out, static_cast(0)); } - math::Unpool2dMaxFunctor unpool2d_max_forward; + math::Unpool2dMaxFunctor unpool2d_max_forward; unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } }; -template +template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -64,7 +64,7 @@ class UnpoolGradKernel : public framework::OpKernel { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); } - math::Unpool2dMaxGradFunctor unpool2d_max_backward; + math::Unpool2dMaxGradFunctor unpool2d_max_backward; unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, *out_grad, in_x_grad); } diff --git 
a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index b3c6c85025..292b9bc14a 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -50,7 +50,7 @@ class TestUnpoolOp(OpTest): indices[nidx, cidx, i, j] = \ (r_start + arg / self.ksize[1]) * wsize + \ c_start + arg % self.ksize[1] - output = self.Unpool2d_forward_naive(input, indices, self.ksize, \ + output = self.unpool2d_forward_naive(input, indices, self.ksize, \ self.strides, self.paddings).astype("float32") self.inputs = {'X': input.astype('float32'), 'Indices': indices.astype('int32')} @@ -69,7 +69,7 @@ class TestUnpoolOp(OpTest): self.check_grad(['X'], 'Out') def init_test_case(self): - self.Unpool2d_forward_naive = unpool2dmax_forward_naive + self.unpool2d_forward_naive = unpool2dmax_forward_naive self.unpooling_type = "max" self.shape = [6, 4, 5, 5] self.ksize = [3, 3] From a5feb771592d1bd7340ff7132518d6c52829b8e7 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Mon, 27 Nov 2017 17:12:21 -0800 Subject: [PATCH 096/275] address pr comment --- paddle/math/float16.h | 839 +++++++++++++++++------------ paddle/math/tests/test_float16.cpp | 2 + 2 files changed, 482 insertions(+), 359 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 3b22174148..65c0489e1f 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -16,9 +16,14 @@ limitations under the License. */ #include +#ifdef PADDLE_WITH_CUDA #include +#endif // PADDLE_WITH_CUDA + #include "unsupported/Eigen/CXX11/Tensor" +#include "paddle/platform/hostdevice.h" + #ifdef __GNUC__ #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__) #else @@ -31,25 +36,12 @@ limitations under the License. 
*/ #define PADDLE_CLANG_VER 0 #endif // __clang__ -#ifdef __CUDACC__ -#define PADDLE_HOSTDEVICE __host__ __device__ -#if CUDA_VERSION >= 7050 +#if defined(__CUDACC__) && CUDA_VERSION >= 7050 #define PADDLE_CUDA_FP16 #include -#endif // CUDA_VERSION >= 7050 -#else -#define PADDLE_HOSTDEVICE -#endif // __CUDACC__ - -#ifdef __arm__ -#define PADDLE_ARM_32 #endif -#ifdef __aarch64__ -#define PADDLE_ARM_64 -#endif - -#if defined(PADDLE_ARM_32) || defined(PADDLE_ARM_64) +#if defined(__arm__) || defined(__aarch64__) #define PADDLE_ARM #endif @@ -58,19 +50,12 @@ limitations under the License. */ #include #endif -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_32) -#define PADDLE_NEON_32 -#endif - -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_64) -#define PADDLE_NEON_64 +#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ + (PADDLE_GNUC_VER >= 62 || PADDLE_CLANG_VER >= 37) +#define PADDLE_WITH_NATIVE_FP16 #endif -#ifdef PADDLE_ARM -#ifdef __F16C__ -#undef __F16C__ -#endif // __F16C__ -#else +#ifndef PADDLE_ARM #include #endif // PADDLE_ARM @@ -78,27 +63,20 @@ limitations under the License. */ namespace paddle { -struct float16; - -namespace fp16_impl { -// Convert from float to half precision in round-to-nearest-even mode -PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f); -PADDLE_HOSTDEVICE inline float half_to_float(float16 h); -} // namespace fp16_impl - // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated // and aligned at least on a 2-byte boundary, which leads to efficient // memory access of float16 struct and also makes float16 compatible // with CUDA half, ARM float16_t, and Eigen::half data types. 
struct PADDLE_ALIGN(2) float16 { +public: uint16_t x; - PADDLE_HOSTDEVICE inline float16() : x(0) {} + HOSTDEVICE inline float16() : x(0) {} - PADDLE_HOSTDEVICE inline float16(const float16& h) : x(h.x) {} + HOSTDEVICE inline float16(const float16& h) : x(h.x) {} #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16(const half& h) { + HOSTDEVICE inline explicit float16(const half& h) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&h)->x; #else @@ -107,78 +85,64 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16(const Eigen::half& h) : x(h.x) {} + HOSTDEVICE inline explicit float16(const Eigen::half& h) : x(h.x) {} -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) +#ifdef PADDLE_WITH_NATIVE_FP16 // __fp16 is a native half precision data type for arm cpu, // float16_t is an alias for __fp16 in arm_fp16.h, // which is included in arm_neon.h. - PADDLE_HOSTDEVICE inline float16(const float16_t& h) { - float16_t tmp = h; - x = *reinterpret_cast(&tmp); + HOSTDEVICE inline explicit float16(const float16_t& h) { + x = *reinterpret_cast(&h); } #endif - PADDLE_HOSTDEVICE inline explicit float16(bool b) : x(b ? 
0x3c00 : 0) {} - - PADDLE_HOSTDEVICE inline explicit float16(int8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } - - PADDLE_HOSTDEVICE inline explicit float16(uint8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } - - PADDLE_HOSTDEVICE inline explicit float16(int16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } - - PADDLE_HOSTDEVICE inline explicit float16(uint16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } + HOSTDEVICE inline explicit float16(float val) { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = __float2half(val); + x = *reinterpret_cast(&tmp); - PADDLE_HOSTDEVICE inline explicit float16(int32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } +#elif defined(PADDLE_NEON) + float32x4_t tmp = vld1q_dup_f32(&val); + float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); + x = *reinterpret_cast(&res); - PADDLE_HOSTDEVICE inline explicit float16(uint32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } +#elif defined(__F16C__) + x = _cvtss_sh(val, 0); - PADDLE_HOSTDEVICE inline explicit float16(int64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v, s; + v.f = val; + uint32_t sign = v.si & sigN; + v.si ^= sign; + sign >>= shiftSign; // logical shift + s.si = mulN; + s.si = s.f * v.f; // correct subnormals + v.si ^= (s.si ^ v.si) & -(minN > v.si); + v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); + v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); + v.ui >>= shift; // logical shift + v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); + v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); 
+ x = v.ui | sign; - PADDLE_HOSTDEVICE inline explicit float16(uint64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; +#endif } - PADDLE_HOSTDEVICE inline explicit float16(float val) { - float16 res = fp16_impl::float_to_half_rn(val); - x = res.x; - } + HOSTDEVICE inline explicit float16(bool b) : x(b ? 0x3c00 : 0) {} - PADDLE_HOSTDEVICE inline explicit float16(double val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; - } + template + HOSTDEVICE inline explicit float16(const T& val) + : x(float16(static_cast(val)).x) {} - PADDLE_HOSTDEVICE inline float16& operator=(const float16& rhs) { + HOSTDEVICE inline float16& operator=(const float16& rhs) { x = rhs.x; return *this; } #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline float16& operator=(const half& rhs) { + HOSTDEVICE inline float16& operator=(const half& rhs) { #if CUDA_VERSION >= 9000 x = reinterpret_cast<__half_raw*>(&rhs)->x; #else @@ -188,87 +152,75 @@ struct PADDLE_ALIGN(2) float16 { } #endif - PADDLE_HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { + HOSTDEVICE inline float16& operator=(const Eigen::half& rhs) { x = rhs.x; return *this; } -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - PADDLE_HOSTDEVICE inline float16& operator=(const float16_t& rhs) { - float16_t tmp = rhs; - x = *reinterpret_cast(&tmp); +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline float16& operator=(const float16_t& rhs) { + x = *reinterpret_cast(&rhs); return *this; } #endif - PADDLE_HOSTDEVICE inline float16& operator=(bool b) { + HOSTDEVICE inline float16& operator=(bool b) { x = b ? 
0x3c00 : 0; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int8_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint8_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint8_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int16_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint16_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint16_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int32_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint32_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint32_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(int64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(int64_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(uint64_t val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(uint64_t val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(float val) { - float16 res = fp16_impl::float_to_half_rn(val); - x = res.x; + HOSTDEVICE 
inline float16& operator=(float val) { + x = float16(val).x; return *this; } - PADDLE_HOSTDEVICE inline float16& operator=(double val) { - float16 res = fp16_impl::float_to_half_rn(static_cast(val)); - x = res.x; + HOSTDEVICE inline float16& operator=(double val) { + x = float16(val).x; return *this; } #ifdef PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline operator half() const { + HOSTDEVICE inline explicit operator half() const { #if CUDA_VERSION >= 9000 __half_raw h; h.x = x; @@ -281,186 +233,504 @@ struct PADDLE_ALIGN(2) float16 { } #endif // PADDLE_CUDA_FP16 - PADDLE_HOSTDEVICE inline operator Eigen::half() const { + HOSTDEVICE inline explicit operator Eigen::half() const { Eigen::half h; h.x = x; return h; } -#if defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 61 || PADDLE_CLANG_VER >= 34) - PADDLE_HOSTDEVICE inline operator float16_t() const { - float16 h = *this; - return *reinterpret_cast(&h); +#ifdef PADDLE_WITH_NATIVE_FP16 + HOSTDEVICE inline explicit operator float16_t() const { + return *reinterpret_cast(this); } #endif - PADDLE_HOSTDEVICE inline explicit operator bool() const { - return (x & 0x7fff) != 0; - } + HOSTDEVICE inline explicit operator float() const { +#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 + half tmp = *reinterpret_cast(this); + return __half2float(tmp); + +#elif defined(PADDLE_NEON) + float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); + return vgetq_lane_f32(vcvt_f32_f16(res), 0); - PADDLE_HOSTDEVICE inline explicit operator int8_t() const { - return static_cast(fp16_impl::half_to_float(*this)); +#elif defined(__F16C__) + return _cvtsh_ss(this->x); + +#else + // Conversion routine adapted from + // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion + Bits v; + v.ui = this->x; + int32_t sign = v.si & sigC; + v.si ^= sign; + sign <<= shiftSign; + v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); + v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > 
maxC); + Bits s; + s.si = mulC; + s.f *= v.si; + int32_t mask = -(norC > v.si); + v.si <<= shift; + v.si ^= (s.si ^ v.si) & mask; + v.si |= sign; + return v.f; + +#endif } - PADDLE_HOSTDEVICE inline explicit operator uint8_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } + + HOSTDEVICE inline explicit operator int8_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int16_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator uint8_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint16_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator int16_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int32_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator uint16_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint32_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator int32_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator int64_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator uint32_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator uint64_t() const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator int64_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator float() const { - return fp16_impl::half_to_float(*this); + HOSTDEVICE inline explicit operator uint64_t() const { + return static_cast(float(*this)); } - PADDLE_HOSTDEVICE inline explicit operator double() 
const { - return static_cast(fp16_impl::half_to_float(*this)); + HOSTDEVICE inline explicit operator double() const { + return static_cast(float(*this)); } + +private: + union Bits { + float f; + int32_t si; + uint32_t ui; + }; + + static const int shift = 13; + static const int shiftSign = 16; + + static const int32_t infN = 0x7F800000; + static const int32_t maxN = 0x477FE000; // max flt16 as flt32 + static const int32_t minN = 0x38800000; // min flt16 normal as flt32 + static const int32_t sigN = 0x80000000; // sign bit + + static constexpr int32_t infC = infN >> shift; + static constexpr int32_t nanN = (infC + 1) + << shift; // minimum flt16 nan as float32 + static constexpr int32_t maxC = maxN >> shift; + static constexpr int32_t minC = minN >> shift; + static constexpr int32_t sigC = sigN >> shiftSign; + + static const int32_t mulN = 0x52000000; // (1 << 23) / minN + static const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) + static const int32_t subC = 0x003FF; // max flt32 subnormal downshifted + static const int32_t norC = 0x00400; // min flt32 normal downshifted + + static constexpr int32_t maxD = infC - maxC - 1; + static constexpr int32_t minD = minC - subC - 1; }; -// Arithmetic operators -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 -__device__ inline float16 operator+(const float16& a, const float16& b) { +// Arithmetic operators on GPU +// CUDA 9.0 provides built-in arithmetic operators for half while +// CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are +// for users to write similar CUDA code in CUDA 7.5 and 8.0 as in +// CUDA 9.0 regarding the half data type. 
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \ + __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000 +DEVICE inline half operator+(const half& a, const half& b) { + return __hadd(a, b); +} + +DEVICE inline half operator-(const half& a, const half& b) { + return __hsub(a, b); +} + +DEVICE inline half operator*(const half& a, const half& b) { + return __hmul(a, b); +} + +DEVICE inline half operator/(const half& a, const half& b) { + float num = __half2float(a); + float denom = __half2float(b); + return __float2half(num / denom); +} + +DEVICE inline half operator-(const half& a) { return __hneg(a); } + +DEVICE inline half& operator+=(half& a, const half& b) { + a = a + b; + return a; +} + +DEVICE inline half& operator-=(half& a, const half& b) { + a = a - b; + return a; +} + +DEVICE inline half& operator*=(half& a, const half& b) { + a = a * b; + return a; +} + +DEVICE inline half& operator/=(half& a, const half& b) { + a = a / b; + return a; +} + +DEVICE inline bool operator==(const half& a, const half& b) { + return __heq(a, b); +} + +DEVICE inline bool operator!=(const half& a, const half& b) { + return __hne(a, b); +} + +DEVICE inline bool operator<(const half& a, const half& b) { + return __hlt(a, b); +} + +DEVICE inline bool operator<=(const half& a, const half& b) { + return __hle(a, b); +} + +DEVICE inline bool operator>(const half& a, const half& b) { + return __hgt(a, b); +} + +DEVICE inline bool operator>=(const half& a, const half& b) { + return __hge(a, b); +} + +/* +DEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(__hadd(half(a), half(b))); } -__device__ inline float16 operator-(const float16& a, const float16& b) { +DEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(__hsub(half(a), half(b))); } -__device__ inline float16 operator*(const float16& a, const float16& b) { +DEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(__hmul(half(a), 
half(b))); } -__device__ inline float16 operator/(const float16& a, const float16& b) { - // TODO(kexinzhao): check the cuda version that starts to support __hdiv +DEVICE inline float16 operator/(const float16& a, const float16& b) { float num = __half2float(half(a)); float denom = __half2float(half(b)); return float16(num / denom); } -__device__ inline float16 operator-(const float16& a) { +DEVICE inline float16 operator-(const float16& a) { return float16(__hneg(half(a))); } -__device__ inline float16& operator+=(float16& a, const float16& b) { +DEVICE inline float16& operator+=(float16& a, const float16& b) { a = a + b; return a; } -__device__ inline float16& operator-=(float16& a, const float16& b) { +DEVICE inline float16& operator-=(float16& a, const float16& b) { a = a - b; return a; } -__device__ inline float16& operator*=(float16& a, const float16& b) { +DEVICE inline float16& operator*=(float16& a, const float16& b) { a = a * b; return a; } -__device__ inline float16& operator/=(float16& a, const float16& b) { +DEVICE inline float16& operator/=(float16& a, const float16& b) { a = a / b; return a; } -__device__ inline bool operator==(const float16& a, const float16& b) { +DEVICE inline bool operator==(const float16& a, const float16& b) { return __heq(half(a), half(b)); } -__device__ inline bool operator!=(const float16& a, const float16& b) { +DEVICE inline bool operator!=(const float16& a, const float16& b) { return __hne(half(a), half(b)); } -__device__ inline bool operator<(const float16& a, const float16& b) { +DEVICE inline bool operator<(const float16& a, const float16& b) { return __hlt(half(a), half(b)); } -__device__ inline bool operator<=(const float16& a, const float16& b) { +DEVICE inline bool operator<=(const float16& a, const float16& b) { return __hle(half(a), half(b)); } -__device__ inline bool operator>(const float16& a, const float16& b) { +DEVICE inline bool operator>(const float16& a, const float16& b) { return __hgt(half(a), half(b)); 
} -__device__ inline bool operator>=(const float16& a, const float16& b) { +DEVICE inline bool operator>=(const float16& a, const float16& b) { return __hge(half(a), half(b)); } +*/ + +// Arithmetic operators on ARMv8.2-A CPU +#elif defined(PADDLE_WITH_NATIVE_FP16) +HOST inline float16 operator+(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fadd h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator-(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fsub h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator*(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fmul h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} + +HOST inline float16 operator/(const float16& a, const float16& b) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fdiv h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0", "v1"); + return res; +} -// On ARMv8.2-A CPU -#elif defined(PADDLE_NEON) && defined(PADDLE_ARM_FP16) && \ - (PADDLE_GNUC_VER >= 71 || PADDLE_CLANG_VER >= 39) -__host__ inline float16 operator+(const float16& a, const float16& b) { 
+HOST inline float16 operator-(const float16& a) { + float16 res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "fneg h0, h0\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [res_ptr] "r"(&(res.x)) + : // clobbers + "memory", "v0"); + return res; +} + +HOST inline float16& operator+=(float16& a, const float16& b) { + a = a + b; + return a; +} + +HOST inline float16& operator-=(float16& a, const float16& b) { + a = a - b; + return a; +} + +HOST inline float16& operator*=(float16& a, const float16& b) { + a = a * b; + return a; +} + +HOST inline float16& operator/=(float16& a, const float16& b) { + a = a / b; + return a; +} + +HOST inline bool operator==(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmeq h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator!=(const float16& a, const float16& b) { + return !(a == b); +} + +HOST inline bool operator<(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator<=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v1.h}[0], [%[a_ptr]]\n" + "ld1 {v0.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>(const float16& a, const 
float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmgt h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +HOST inline bool operator>=(const float16& a, const float16& b) { + uint16_t res; + asm volatile( + "ld1 {v0.h}[0], [%[a_ptr]]\n" + "ld1 {v1.h}[0], [%[b_ptr]]\n" + "fcmge h0, h0, h1\n" + "st1 {v0.h}[0], [%[res_ptr]]\n" + : // outputs + : // inputs + [a_ptr] "r"(&(a.x)), + [b_ptr] "r"(&(b.x)), + [res_ptr] "r"(&res) + : // clobbers + "memory", "v0", "v1"); + return (res & 0xffff) != 0; +} + +/* +HOST inline float16 operator+(const float16& a, const float16& b) { return float16(vaddh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator-(const float16& a, const float16& b) { +HOST inline float16 operator-(const float16& a, const float16& b) { return float16(vsubh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator*(const float16& a, const float16& b) { +HOST inline float16 operator*(const float16& a, const float16& b) { return float16(vmulh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator/(const float16& a, const float16& b) { +HOST inline float16 operator/(const float16& a, const float16& b) { return float16(vdivh_f16(float16_t(a), float16_t(b))); } -__host__ inline float16 operator-(const float16& a) { +HOST inline float16 operator-(const float16& a) { return float16(vnegh_f16(float16_t(a))); } -__host__ inline float16& operator+=(float16& a, const float16& b) { +HOST inline float16& operator+=(float16& a, const float16& b) { a = a + b; return a; } -__host__ inline float16& operator-=(float16& a, const float16& b) { +HOST inline float16& operator-=(float16& a, const float16& b) { a = a - b; return a; } -__host__ inline float16& operator*=(float16& a, const float16& b) { +HOST 
inline float16& operator*=(float16& a, const float16& b) { a = a * b; return a; } -__host__ inline float16& operator/=(float16& a, const float16& b) { +HOST inline float16& operator/=(float16& a, const float16& b) { a = a / b; return a; } -__host__ inline bool operator==(const float16& a, const float16& b) { +HOST inline bool operator==(const float16& a, const float16& b) { return static_cast(vceqh_f16(float16_t(a), float16_t(b))); } -__host__ inline bool operator!=(const float16& a, const float16& b) { +HOST inline bool operator!=(const float16& a, const float16& b) { return !(a == b); } -__host__ inline bool operator<(const float16& a, const float16& b) { +HOST inline bool operator<(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vclth_f16(float16_t(a), float16_t(b))); #else @@ -468,7 +738,7 @@ __host__ inline bool operator<(const float16& a, const float16& b) { #endif // PADDLE_NEON_64 } -__host__ inline bool operator<=(const float16& a, const float16& b) { +HOST inline bool operator<=(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vcleh_f16(float16_t(a), float16_t(b))); #else @@ -476,7 +746,7 @@ __host__ inline bool operator<=(const float16& a, const float16& b) { #endif // PADDLE_NEON_64 } -__host__ inline bool operator>(const float16& a, const float16& b) { +HOST inline bool operator>(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vcgth_f16(float16_t(a), float16_t(b))); #else @@ -484,231 +754,82 @@ __host__ inline bool operator>(const float16& a, const float16& b) { #endif // PADDLE_NEON_64 } -__host__ inline bool operator>=(const float16& a, const float16& b) { +HOST inline bool operator>=(const float16& a, const float16& b) { #ifdef PADDLE_NEON_64 return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); #else return float(a) >= float(b); #endif // PADDLE_NEON_64 } +*/ -#else // Software emulation on other cpu -PADDLE_HOSTDEVICE inline float16 operator+(const 
float16& a, const float16& b) { +// Arithmetic operators, software emulated on other CPU +#else +HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { return float16(float(a) + float(b)); } -PADDLE_HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { +HOSTDEVICE inline float16 operator-(const float16& a, const float16& b) { return float16(float(a) - float(b)); } -PADDLE_HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { +HOSTDEVICE inline float16 operator*(const float16& a, const float16& b) { return float16(float(a) * float(b)); } -PADDLE_HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { +HOSTDEVICE inline float16 operator/(const float16& a, const float16& b) { return float16(float(a) / float(b)); } -PADDLE_HOSTDEVICE inline float16 operator-(const float16& a) { +HOSTDEVICE inline float16 operator-(const float16& a) { float16 res; res.x = a.x ^ 0x8000; return res; } -PADDLE_HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator+=(float16& a, const float16& b) { a = float16(float(a) + float(b)); return a; } -PADDLE_HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator-=(float16& a, const float16& b) { a = float16(float(a) - float(b)); return a; } -PADDLE_HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator*=(float16& a, const float16& b) { a = float16(float(a) * float(b)); return a; } -PADDLE_HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { +HOSTDEVICE inline float16& operator/=(float16& a, const float16& b) { a = float16(float(a) / float(b)); return a; } -PADDLE_HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator==(const float16& a, const float16& b) { return float(a) == float(b); } -PADDLE_HOSTDEVICE inline bool operator!=(const float16& a, const 
float16& b) { +HOSTDEVICE inline bool operator!=(const float16& a, const float16& b) { return float(a) != float(b); } -PADDLE_HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator<(const float16& a, const float16& b) { return float(a) < float(b); } -PADDLE_HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator<=(const float16& a, const float16& b) { return float(a) <= float(b); } -PADDLE_HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { return float(a) > float(b); } -PADDLE_HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { +HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { return float(a) >= float(b); } #endif - -namespace fp16_impl { - -union Bits { - float f; - int32_t si; - uint32_t ui; -}; - -const int shift = 13; -const int shiftSign = 16; - -const int32_t infN = 0x7F800000; -const int32_t maxN = 0x477FE000; // max flt16 as flt32 -const int32_t minN = 0x38800000; // min flt16 normal as flt32 -const int32_t sigN = 0x80000000; // sign bit - -constexpr int32_t infC = infN >> shift; -constexpr int32_t nanN = (infC + 1) << shift; // minimum flt16 nan as float32 -constexpr int32_t maxC = maxN >> shift; -constexpr int32_t minC = minN >> shift; -constexpr int32_t sigC = sigN >> shiftSign; - -const int32_t mulN = 0x52000000; // (1 << 23) / minN -const int32_t mulC = 0x33800000; // minN / (1 << (23 - shift)) -const int32_t subC = 0x003FF; // max flt32 subnormal downshifted -const int32_t norC = 0x00400; // min flt32 normal downshifted - -constexpr int32_t maxD = infC - maxC - 1; -constexpr int32_t minD = minC - subC - 1; - -PADDLE_HOSTDEVICE inline float16 float_to_half_rn(float f) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - half tmp = __float2half(f); - return *reinterpret_cast(&tmp); - -#elif 
defined(PADDLE_NEON_64) - float16 res; - asm volatile( - "ld1 {v0.s}[0], [%[float_ptr]]\n" - "fcvt h0, s0\n" - "st1 {v0.h}[0], [%[half_ptr]]\n" - : // outputs - : // inputs - [float_ptr] "r"(&f), - [half_ptr] "r"(&(res.x)) - : // clobbers - "memory", "v0"); - return res; - -#elif defined(PADDLE_NEON_32) - float16 res; - asm volatile( - "vld1.32 {d0[0]}, [%[float_ptr]]\n" - "vcvt.f16.f32 d0, q0\n" - "vst1.16 {d0[0]}, [%[half_ptr]]\n" - : // outputs - : // inputs - [float_ptr] "r"(&f), - [half_ptr] "r"(&(res.x)) - : // clobbers - "memory", "d0"); - return res; - -#elif defined(__F16C__) - float16 res; - res.x = _cvtss_sh(f, 0); - return res; - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v, s; - v.f = f; - uint32_t sign = v.si & sigN; - v.si ^= sign; - sign >>= shiftSign; // logical shift - s.si = mulN; - s.si = s.f * v.f; // correct subnormals - v.si ^= (s.si ^ v.si) & -(minN > v.si); - v.si ^= (infN ^ v.si) & -((infN > v.si) & (v.si > maxN)); - v.si ^= (nanN ^ v.si) & -((nanN > v.si) & (v.si > infN)); - v.ui >>= shift; // logical shift - v.si ^= ((v.si - maxD) ^ v.si) & -(v.si > maxC); - v.si ^= ((v.si - minD) ^ v.si) & -(v.si > subC); - float16 res; - res.x = v.ui | sign; - return res; - -#endif -} - -PADDLE_HOSTDEVICE inline float half_to_float(float16 h) { -#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 - half tmp = *reinterpret_cast(&h); - return __half2float(tmp); - -#elif defined(PADDLE_NEON_64) - float res; - asm volatile( - "ld1 {v0.h}[0], [%[half_ptr]]\n" - "fcvt s0, h0\n" - "st1 {v0.s}[0], [%[float_ptr]]\n" - : // outputs - : // inputs - [half_ptr] "r"(&(h.x)), - [float_ptr] "r"(&res) - : // clobbers - "memory", "v0"); - return res; - -#elif defined(PADDLE_NEON_32) - float res; - asm volatile( - "vld1.16 {d0[0]}, [%[half_ptr]]\n" - "vcvt.f32.f16 q0, d0\n" - "vst1.32 {d0[0]}, [%[float_ptr]]\n" - : // outputs - : // inputs - 
[half_ptr] "r"(&(h.x)), - [float_ptr] "r"(&res) - : // clobbers - "memory", "v0"); - return res; - -#elif defined(__F16C__) - return _cvtsh_ss(h.x); - -#else - // Conversion routine adapted from - // http://stackoverflow.com/questions/1659440/32-bit-to-16-bit-floating-point-conversion - Bits v; - v.ui = h.x; - int32_t sign = v.si & sigC; - v.si ^= sign; - sign <<= shiftSign; - v.si ^= ((v.si + minD) ^ v.si) & -(v.si > subC); - v.si ^= ((v.si + maxD) ^ v.si) & -(v.si > maxC); - Bits s; - s.si = mulC; - s.f *= v.si; - int32_t mask = -(norC > v.si); - v.si <<= shift; - v.si ^= (s.si ^ v.si) & mask; - v.si |= sign; - return v.f; - -#endif -} - -} // namespace fp16_impl } // namespace paddle diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index 8c74bcc039..f5541d8f0f 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -55,10 +55,12 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(float16(false).x, 0x0000); // Implicit conversion to and from Eigen::half + /* Eigen::half tmp = float16(1.0f); float16 v_conv = tmp; EXPECT_EQ(tmp.x, 0x3c00); EXPECT_EQ(v_conv.x, 0x3c00); + */ // Default constructor float16 v_def; From d9673cad153be572dbf356733f4e9ae6df1d56d2 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 07:44:52 +0800 Subject: [PATCH 097/275] format code --- paddle/operators/math/unpooling.cc | 4 ++-- paddle/operators/math/unpooling.cu | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index dbc3936971..35091e849b 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -35,7 +35,7 @@ class Unpool2dMaxFunctor { int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; const T* input_data = input.data(); - const int * indices_data = indices.data(); + const int* indices_data = indices.data(); T* output_data = 
output->mutable_data(context.GetPlace()); for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { @@ -71,7 +71,7 @@ public: const int output_width = output.dims()[3]; int input_feasize = input_height * input_width; int output_feasize = output_height * output_width; - const int * indices_data = indices.data(); + const int* indices_data = indices.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 9cdd61f6d5..9f27e35d00 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -90,7 +90,7 @@ class Unpool2dMaxFunctor { const int output_height = output->dims()[2]; const int output_width = output->dims()[3]; const T* input_data = input.data(); - const int * indices_data = indices.data(); + const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; @@ -121,7 +121,7 @@ class Unpool2dMaxGradFunctor { const int output_height = output.dims()[2]; const int output_width = output.dims()[3]; const T* input_data = input.data(); - const int * indices_data = indices.data(); + const int* indices_data = indices.data(); const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); From bd561384bca825088417fbd69dc2282b7581bf3c Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 08:23:50 +0800 Subject: [PATCH 098/275] format code --- paddle/operators/math/unpooling.cc | 17 +-- paddle/operators/math/unpooling.cu | 87 ++++++------ paddle/operators/math/unpooling.h | 9 +- paddle/operators/unpool_op.cc | 134 ++++++++++-------- paddle/operators/unpool_op.h | 8 +- .../paddle/v2/fluid/tests/test_unpool_op.py | 18 +-- 6 files changed, 133 insertions(+), 140 
deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 35091e849b..b13d0104de 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -17,15 +17,13 @@ limitations under the License. */ namespace paddle { namespace operators { namespace math { - // All tensors are in NCHW format template class Unpool2dMaxFunctor { - public: +public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor * output) { + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -40,7 +38,7 @@ class Unpool2dMaxFunctor { for (int b = 0; b < batch_size; ++b) { for (int c = 0; c < output_channels; ++c) { for (int i = 0; i < input_feasize; ++i) { - int index = indices_data[i]; + int index = indices_data[i]; PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); output_data[index] = input_data[i]; } @@ -51,9 +49,6 @@ class Unpool2dMaxFunctor { } } }; - - - template class Unpool2dMaxGradFunctor { public: @@ -62,7 +57,7 @@ public: const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, - framework::Tensor * input_grad) { + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -89,12 +84,10 @@ public: } } }; - template class Unpool2dMaxGradFunctor; template class Unpool2dMaxGradFunctor; template class Unpool2dMaxFunctor; template class Unpool2dMaxFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 9f27e35d00..6017920873 100644 --- a/paddle/operators/math/unpooling.cu +++ 
b/paddle/operators/math/unpooling.cu @@ -18,36 +18,33 @@ limitations under the License. */ namespace paddle { namespace operators { namespace math { - template -__global__ void KernelUnpool2dMax(const int nthreads, - const T* input_data, - const int * indices_data, +__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, + const int* indices_data, const int input_height, const int input_width, const int channels, T* output_data, const int output_height, const int output_width) { - int in_n_stride = input_height * input_width * channels; - int in_c_stride = input_height * input_width; - int out_n_stride = output_height * output_width * channels; - int out_c_stride = output_height * output_width; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int bidx = i / in_n_stride; - int boffset = i % in_n_stride; - int cidx = boffset / in_c_stride; - int out_offset = bidx * out_n_stride + cidx * out_c_stride; - int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < out_c_stride); - output_data[out_offset + out_index] = input_data[i]; - } + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + output_data[out_offset + out_index] = input_data[i]; + } } template -__global__ void KernelUnpool2dMaxGrad(const int nthreads, - const T* input_data, +__global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, 
const int* indices_data, const int input_height, const int input_width, @@ -57,32 +54,32 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, const int output_height, const int output_width, T* input_grad) { - int in_n_stride = input_height * input_width * channels; - int in_c_stride = input_height * input_width; - int out_n_stride = output_height * output_width * channels; - int out_c_stride = output_height * output_width; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int bidx = i / in_n_stride; - int boffset = i % in_n_stride; - int cidx = boffset / in_c_stride; - int out_offset = bidx * out_n_stride + cidx * out_c_stride; - int out_index = indices_data[i]; - PADDLE_ASSERT(out_index < out_c_stride); - input_grad[i] = output_grad[out_offset + out_index]; - } + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + input_grad[i] = output_grad[out_offset + out_index]; + } } /* * All tensors are in NCHW format. 
*/ template class Unpool2dMaxFunctor { - public: +public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor * output) { + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -93,7 +90,7 @@ class Unpool2dMaxFunctor { const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; - int grid = (input.numel() + threads - 1) / threads; + int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax< T><<(context) @@ -107,13 +104,13 @@ class Unpool2dMaxFunctor { */ template class Unpool2dMaxGradFunctor { - public: +public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, - framework::Tensor * input_grad) { + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -126,24 +123,20 @@ class Unpool2dMaxGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; - int grid = (input.numel() + threads - 1) / threads; + int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad< T><<(context) .stream()>>>(input.numel(), input_data, indices_data, input_height, input_width, output_channels, output_data, output_grad_data, - output_height, output_width, - input_grad_data); + output_height, output_width, input_grad_data); } }; - template class Unpool2dMaxGradFunctor; template class Unpool2dMaxGradFunctor; - template class Unpool2dMaxFunctor; template class Unpool2dMaxFunctor; - } // namespace math } // namespace operators } // namespace paddle diff --git 
a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index bf79354ed9..0b969d8d82 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -22,22 +22,21 @@ namespace math { template class Unpool2dMaxFunctor { - public: +public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor * output); + const framework::Tensor& indices, framework::Tensor* output); }; template class Unpool2dMaxGradFunctor { - public: +public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, - framework::Tensor * input_grad); + framework::Tensor* input_grad); }; } // namespace math } // namespace operators diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 2505148764..cabf17401b 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -21,107 +21,115 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { Unpool2dOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", + AddInput( + "X", "(Tensor) The input tensor of unpool operator. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); - AddInput("Indices", + AddInput( + "Indices", "(Tensor) The input tensor of the indices given out by MaxPool2d. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); - AddOutput("Out", + AddOutput( + "Out", "(Tensor) The output tensor of unpool operator." "The format of output tensor is also NCHW." 
"Where N is batch size, C is " "the number of channels, H and W is the height and " "width of feature."); - AddAttr>("ksize", + AddAttr>( + "ksize", "(vector), the unpooling window size(height, width) " "of unpooling operator."); - AddAttr>("strides", + AddAttr>( + "strides", "(vector, default:{1, 1}), " "strides (height, width) of unpooling operator.") .SetDefault({1, 1}); - AddAttr>("paddings", + AddAttr>( + "paddings", "(vector defalut:{0,0}), " "paddings (height, width) of unpooling operator.") .SetDefault({0, 0}); - AddAttr("unpooling_type", + AddAttr( + "unpooling_type", "(string), unpooling type, can be \"max\" for max-unpooling ") .InEnum({"max"}); AddComment(R"DOC( - "Input shape: $(N, C_{in}, H_{in}, W_{in})$ - Output shape: $(N, C_{out}, H_{out}, W_{out})$ - Where + "Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where $$ H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\ W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1] $$ - Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 - /07/iccv2011.pdf + Paper: http://www.matthewzeiler.com/wp-content/uploads/2017 + /07/iccv2011.pdf )DOC"); } }; int OutputSize(int input_size, int ksize, int padding, int stride) { - int output_size = (input_size -1) * stride - 2 * padding + ksize; + int output_size = (input_size - 1) * stride - 2 * padding + ksize; return output_size; } class UnpoolOp : public framework::OperatorWithKernel { -protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); - } + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } -public: - using framework::OperatorWithKernel::OperatorWithKernel; 
- void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" - "should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UnpoolOp should not be null."); - auto in_x_dims = ctx->GetInputDim("X"); - auto in_y_dims = ctx->GetInputDim("Indices"); - std::string unpooling_type = - ctx->Attrs().Get("unpooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE(in_x_dims.size() == 4, - "Unpooling intput must be of 4-dimensional."); - PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back( - OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); - } - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - } + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get("unpooling_type"); + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = + ctx->Attrs().Get>("paddings"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling intput must be of 4-dimensional."); + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + 
OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } }; class UnpoolOpGrad : public framework::OperatorWithKernel { - protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); } - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } }; } // namespace operators } // namespace paddle @@ -129,10 +137,10 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_CPU_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CPU_KERNEL( + unpool,ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git 
a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index f618a7c0ba..8fad768e49 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -27,7 +27,7 @@ class UnpoolKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { const framework::Tensor* in_x = context.Input("X"); const framework::Tensor* in_y = context.Input("Indices"); - auto * out = context.Output("Out"); + auto* out = context.Output("Out"); std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); @@ -52,7 +52,7 @@ class UnpoolGradKernel : public framework::OpKernel { const framework::Tensor* out_grad = context.Input(framework::GradVarName("Out")); framework::Tensor* in_x_grad = - context.Output(framework::GradVarName("X")); + context.Output(framework::GradVarName("X")); std::string unpooling_type = context.Attr("unpooling_type"); std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); @@ -65,8 +65,8 @@ class UnpoolGradKernel : public framework::OpKernel { zero(device_ctx, in_x_grad, static_cast(0)); } math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, - *out, *out_grad, in_x_grad); + unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, + *out_grad, in_x_grad); } }; diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 292b9bc14a..321cd9fab8 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -52,14 +52,16 @@ class TestUnpoolOp(OpTest): c_start + arg % self.ksize[1] output = self.unpool2d_forward_naive(input, indices, self.ksize, \ self.strides, self.paddings).astype("float32") - self.inputs = {'X': input.astype('float32'), - 'Indices': indices.astype('int32')} + self.inputs = { + 'X': 
input.astype('float32'), + 'Indices': indices.astype('int32') + } self.attrs = { - 'strides': self.strides, - 'paddings': self.paddings, - 'ksize': self.ksize, - 'unpooling_type': self.unpooling_type, - } + 'strides': self.strides, + 'paddings': self.paddings, + 'ksize': self.ksize, + 'unpooling_type': self.unpooling_type, + } self.outputs = {'Out': output.astype('float32')} def test_check_output(self): @@ -76,7 +78,5 @@ class TestUnpoolOp(OpTest): self.strides = [2, 2] self.paddings = [0, 0] - - if __name__ == '__main__': unittest.main() From 2488b81fe53180f90d0772e490b3f97578442511 Mon Sep 17 00:00:00 2001 From: "Wang,Jeff" Date: Tue, 28 Nov 2017 17:26:43 -0800 Subject: [PATCH 099/275] Add write_docs_en.rst for English version Update the write_docs_en.rst and write_docs_cn.rst to include how to run the documentation viewer tool without Docker Add dev/contribute_to_paddle_cn.md onto navigation link --- doc/howto/dev/write_docs_cn.rst | 27 ++++++++++++-- doc/howto/dev/write_docs_en.rst | 65 +++++++++++++++++++++++++++++++++ doc/howto/index_cn.rst | 1 + doc/howto/index_en.rst | 1 + 4 files changed, 90 insertions(+), 4 deletions(-) create mode 100644 doc/howto/dev/write_docs_en.rst diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index 25a967da83..6e4e27dd00 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -29,6 +29,25 @@ PaddlePaddle的文档构建有三种方式。 之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 +如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 + +.. 
code-block:: bash + + mkdir paddlepaddle + cd paddlepaddle + git clone git@github.com:PaddlePaddle/Paddle.git + git clone git@github.com:PaddlePaddle/book.git + git clone git@github.com:PaddlePaddle/models.git + git clone git@github.com:PaddlePaddle/PaddlePaddle.org.git + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。 +想了解更多关於 PaddlePaddle.org 工具,可以 `点击这里 `_ 。 + 使用Docker构建 -------------- @@ -71,12 +90,12 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程 PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下,包含所有和前端网页设计相关的文件。 -如何更新doc.paddlepaddle.org +如何更新www.paddlepaddle.org ============================ -更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。 -目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 -`英文文档 `_ 。 +更新的文档以PR的形式提交到github中,提交方式参见 `贡献文档 `_ 。 +目前PaddlePaddle的develop分支的文档是自动触发更新的,用户可以分别查看最新的 `中文文档 `_ 和 +`英文文档 `_ 。 .. _cmake: https://cmake.org/ diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst new file mode 100644 index 0000000000..0e60e21889 --- /dev/null +++ b/doc/howto/dev/write_docs_en.rst @@ -0,0 +1,65 @@ +################## +Contribute Documentation +################## + +PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. +Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories. + +How to Build Documentations +============ + +We recommend using PaddlePaddle.org tool to build documentation + + +Use PaddlePaddle.org tool +-------------- +This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser. + +The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool + +.. 
code-block:: bash + + mkdir paddlepaddle + cd paddlepaddle + git clone git@github.com:PaddlePaddle/Paddle.git + git clone git@github.com:PaddlePaddle/book.git + git clone git@github.com:PaddlePaddle/models.git + + docker run -it -p 8000:8000 paddlepaddle/paddlepaddle.org:latest + +Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation + +If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up + +.. code-block:: bash + + mkdir paddlepaddle + cd paddlepaddle + git clone git@github.com:PaddlePaddle/Paddle.git + git clone git@github.com:PaddlePaddle/book.git + git clone git@github.com:PaddlePaddle/models.git + git clone git@github.com:PaddlePaddle/PaddlePaddle.org.git + export CONTENT_DIR= + export ENV='' + cd PaddlePaddle.org/portal/ + pip install -r requirements.txt + python manage.py runserver + +Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +If you want to learn more on the PaddlePaddle.org, please `click here `_ 。 + +How to write Documentations +============ + +PaddlePaddle uses `sphinx`_ to compile documentations,Please check sphinx official website for more detail. + + +How to update www.paddlepaddle.org +============================ + +Please create PRs and submit them to github, please check `Contribute Code `_ 。 +PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs `_ and +`English Docs `_ 。 + +.. _cmake: https://cmake.org/ +.. 
_sphinx: http://www.sphinx-doc.org/en/1.4.8/ diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index 76d3e0a009..8ea99ea40c 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -20,6 +20,7 @@ :maxdepth: 1 dev/build_cn.rst + dev/contribute_to_paddle_cn.md dev/write_docs_cn.rst 模型配置 diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst index 1b6034be4e..fbf0d2d3ae 100644 --- a/doc/howto/index_en.rst +++ b/doc/howto/index_en.rst @@ -21,6 +21,7 @@ Development dev/build_en.rst dev/new_layer_en.rst dev/contribute_to_paddle_en.md + dev/write_docs_en.rst Configuration ------------- From dcf3ffd98033ffa492932ed9ffb7880d0bf010a0 Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Tue, 28 Nov 2017 18:02:28 -0800 Subject: [PATCH 100/275] Adding log loss operator (#5854) * Adding log loss operator * Removing comments --- paddle/operators/log_loss_op.cc | 115 ++++++++++++++++++ paddle/operators/log_loss_op.cu | 22 ++++ paddle/operators/log_loss_op.h | 75 ++++++++++++ .../paddle/v2/fluid/tests/test_log_loss_op.py | 33 +++++ 4 files changed, 245 insertions(+) create mode 100644 paddle/operators/log_loss_op.cc create mode 100644 paddle/operators/log_loss_op.cu create mode 100644 paddle/operators/log_loss_op.h create mode 100644 python/paddle/v2/fluid/tests/test_log_loss_op.py diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc new file mode 100644 index 0000000000..257e5c8a49 --- /dev/null +++ b/paddle/operators/log_loss_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/log_loss_op.h" + +namespace paddle { +namespace operators { + +class LogLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Predicted) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Predicted) contains a real value, " + "so the 2nd dimension of Input(X) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Predicted", "Loss"); + } +}; + +template +class LogLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + LogLossOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Predicted", + "The input value (Predicted) of Log loss op." + "Predicted is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Log loss op." 
+ "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the log loss."); + AddAttr("epsilon", "Epsilon in log loss."); + AddComment(R"DOC( +LogLoss Operator. + +Log loss is a loss function used for binary classification. Log Loss quantifies +the accuracy of a classifier by penalising false classifications. Minimising the +Log Loss is equivalent to maximising the accuracy of the classifier. We define +Predicted as the values predicted by our model and Labels as the target ground +truth value. Log loss can evaluate how close the predicted values are to the +target. The shapes of Predicted and Labels are both [batch_size, 1]. +The equation is: + +$$ +Loss = - Labels * log(Predicted + \epsilon) - + (1 - Labels) * log(1 - Predicted + \epsilon) +$$ + +)DOC"); + } +}; + +class LogLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Predicted"), + "Input(Predicted) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")), + "Output(Predicted@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Predicted"); + auto label_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Predicted"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, + ops::LogLossGradOp); 
+REGISTER_OP_CPU_KERNEL(log_loss, + ops::LogLossKernel); +REGISTER_OP_CPU_KERNEL( + log_loss_grad, ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu new file mode 100644 index 0000000000..6c189ef341 --- /dev/null +++ b/paddle/operators/log_loss_op.cu @@ -0,0 +1,22 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/log_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(log_loss, + ops::LogLossKernel); +REGISTER_OP_GPU_KERNEL( + log_loss_grad, ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h new file mode 100644 index 0000000000..73404fce91 --- /dev/null +++ b/paddle/operators/log_loss_op.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; + +template +class LogLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* loss_out = ctx.Output("Loss"); + + loss_out->mutable_data(ctx.GetPlace()); + + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto loss = EigenVector::Flatten(*loss_out); + auto place = ctx.GetEigenDevice(); + + loss.device(place) = (-(label * (prediction + epsilon).log()) - + ((static_cast(1) - label) * + (static_cast(1) - prediction + epsilon).log())); + } +}; + +template +class LogLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto epsilon = static_cast(ctx.Attr("epsilon")); + + auto prediction = EigenVector::Flatten(*ctx.Input("Predicted")); + auto label = EigenVector::Flatten(*ctx.Input("Labels")); + + auto* dloss = ctx.Input(framework::GradVarName("Loss")); + auto* dpred = ctx.Output(framework::GradVarName("Predicted")); + + auto dl = EigenVector::Flatten(*dloss); + auto place = ctx.GetEigenDevice(); + + if (dpred) { + dpred->mutable_data(ctx.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); + dx.device(place) = dl * (-(label / (prediction + epsilon)) + + ((static_cast(1) - label) / + (static_cast(1) - prediction + epsilon))); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/fluid/tests/test_log_loss_op.py b/python/paddle/v2/fluid/tests/test_log_loss_op.py new file mode 100644 index 0000000000..2eeaa90758 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py @@ -0,0 +1,33 @@ +import 
unittest +import numpy as np +from op_test import OpTest + + +class TestLogLossOp(OpTest): + def setUp(self): + self.op_type = 'log_loss' + samples_num = 32 + + predicted = np.random.uniform(0.1, 1.0, + (samples_num, 1)).astype("float32") + labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32") + epsilon = 1e-4 + self.inputs = { + 'Predicted': predicted, + 'Labels': labels, + } + + self.attrs = {'epsilon': epsilon} + loss = -labels * np.log(predicted + epsilon) - ( + 1 - labels) * np.log(1 - predicted + epsilon) + self.outputs = {'Loss': loss} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03) + + +if __name__ == '__main__': + unittest.main() From c52ed8de37b922b8cc5d9ab1a4ff34a426667ed6 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 10:57:55 +0800 Subject: [PATCH 101/275] format code --- paddle/operators/math/unpooling.cc | 22 +++++------ paddle/operators/math/unpooling.cu | 59 ++++++++++++------------------ paddle/operators/math/unpooling.h | 23 +++++------- paddle/operators/unpool_op.cc | 15 ++++---- paddle/operators/unpool_op.cu.cc | 14 +++---- paddle/operators/unpool_op.h | 3 -- 6 files changed, 54 insertions(+), 82 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index b13d0104de..71928314ba 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -13,17 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/math/unpooling.h" - namespace paddle { namespace operators { namespace math { -// All tensors are in NCHW format template class Unpool2dMaxFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output) { + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -51,13 +49,11 @@ public: }; template class Unpool2dMaxGradFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad) { + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, const framework::Tensor& output, + const framework::Tensor& output_grad, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 6017920873..4c6cb7bbca 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -19,14 +19,10 @@ namespace paddle { namespace operators { namespace math { template -__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const int* indices_data, - const int input_height, - const int input_width, - const int channels, - T* output_data, - const int output_height, - const int output_width) { +__global__ void KernelUnpool2dMax( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const 
int input_width, const int channels, + T* output_data, const int output_height, const int output_width) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -44,16 +40,11 @@ __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, } } template -__global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, - const int* indices_data, - const int input_height, - const int input_width, - const int channels, - const T* output_data, - const T* output_grad, - const int output_height, - const int output_width, - T* input_grad) { +__global__ void KernelUnpool2dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + const T* output_data, const T* output_grad, const int output_height, + const int output_width, T* input_grad) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -75,11 +66,10 @@ __global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, */ template class Unpool2dMaxFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor* output) { + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -91,12 +81,11 @@ public: T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMax< - T><<<<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, 
input_width, output_channels, - output_data, output_height, output_width); + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_height, output_width); } }; /* @@ -104,7 +93,7 @@ public: */ template class Unpool2dMaxGradFunctor { -public: + public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, @@ -124,13 +113,11 @@ public: T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMaxGrad< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_grad_data, - output_height, output_width, input_grad_data); + KernelUnpool2dMaxGrad<<(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, output_data, + output_grad_data, output_height, output_width, input_grad_data); } }; template class Unpool2dMaxGradFunctor; diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 0b969d8d82..43e32bf4fb 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -18,25 +18,20 @@ limitations under the License. 
*/ namespace paddle { namespace operators { namespace math { - template - class Unpool2dMaxFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output); + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output); }; - template class Unpool2dMaxGradFunctor { -public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, - const framework::Tensor& output, - const framework::Tensor& output_grad, - framework::Tensor* input_grad); + public: + void operator()( + const platform::DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& indices, const framework::Tensor& output, + const framework::Tensor& output_grad, framework::Tensor* input_grad); }; } // namespace math } // namespace operators diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index cabf17401b..a51df3aa42 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -31,13 +31,12 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The input tensor of the indices given out by MaxPool2d. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); - AddOutput( - "Out", - "(Tensor) The output tensor of unpool operator." - "The format of output tensor is also NCHW." - "Where N is batch size, C is " - "the number of channels, H and W is the height and " - "width of feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCHW." 
+ "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); AddAttr>( "ksize", "(vector), the unpooling window size(height, width) " @@ -138,7 +137,7 @@ namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); REGISTER_OP_CPU_KERNEL( - unpool,ops::UnpoolKernel, + unpool, ops::UnpoolKernel, ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL( unpool_grad, ops::UnpoolGradKernel, diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index d8214fc687..8ee9e2b373 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -15,11 +15,9 @@ limitations under the License. */ #include "paddle/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_GPU_KERNEL(unpool_grad, - ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_GPU_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_GPU_KERNEL( + unpool_grad, ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 8fad768e49..243eb7e532 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -20,7 +20,6 @@ limitations under the License. 
*/ namespace paddle { namespace operators { - template class UnpoolKernel : public framework::OpKernel { public: @@ -41,7 +40,6 @@ class UnpoolKernel : public framework::OpKernel { unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); } }; - template class UnpoolGradKernel : public framework::OpKernel { public: @@ -69,6 +67,5 @@ class UnpoolGradKernel : public framework::OpKernel { *out_grad, in_x_grad); } }; - } // namespace operators } // namespace paddle From d2ee3c98df1203ca68e711a1fb04ddbd6d048b33 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 11:23:46 +0800 Subject: [PATCH 102/275] format code --- paddle/operators/math/unpooling.cc | 16 +-- paddle/operators/math/unpooling.cu | 50 ++++---- paddle/operators/math/unpooling.h | 17 +-- paddle/operators/unpool_op.cc | 114 +++++++++--------- paddle/operators/unpool_op.cu.cc | 8 +- .../paddle/v2/fluid/tests/test_unpool_op.py | 5 +- 6 files changed, 110 insertions(+), 100 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 71928314ba..9017ffaab1 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -19,9 +19,9 @@ namespace math { template class Unpool2dMaxFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output) { + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -50,10 +50,12 @@ class Unpool2dMaxFunctor { template class Unpool2dMaxGradFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, const framework::Tensor& output, - const framework::Tensor& output_grad, 
framework::Tensor* input_grad) { + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 4c6cb7bbca..f3a317b3b3 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -19,10 +19,12 @@ namespace paddle { namespace operators { namespace math { template -__global__ void KernelUnpool2dMax( - const int nthreads, const T* input_data, const int* indices_data, - const int input_height, const int input_width, const int channels, - T* output_data, const int output_height, const int output_width) { +__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, + const int* indices_data, + const int input_height, const int input_width, + const int channels, T* output_data, + const int output_height, + const int output_width) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -40,11 +42,12 @@ __global__ void KernelUnpool2dMax( } } template -__global__ void KernelUnpool2dMaxGrad( - const int nthreads, const T* input_data, const int* indices_data, - const int input_height, const int input_width, const int channels, - const T* output_data, const T* output_grad, const int output_height, - const int output_width, T* input_grad) { +__global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, + const int* indices_data, + const int input_height, const int input_width, + const int channels, const T* output_data, + const T* output_grad, const int output_height, + const int output_width, T* input_grad) { int 
in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -67,9 +70,9 @@ __global__ void KernelUnpool2dMaxGrad( template class Unpool2dMaxFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output) { + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, const framework::Tensor& indices, + framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -81,11 +84,12 @@ class Unpool2dMaxFunctor { T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMax<<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + KernelUnpool2dMax< + T><<(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_height, output_width); } }; /* @@ -113,11 +117,13 @@ class Unpool2dMaxGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMaxGrad<<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, output_data, - output_grad_data, output_height, output_width, input_grad_data); + KernelUnpool2dMaxGrad< + T><<(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, output_data, + output_grad_data, output_height, output_width, + input_grad_data); } }; template class Unpool2dMaxGradFunctor; diff --git a/paddle/operators/math/unpooling.h 
b/paddle/operators/math/unpooling.h index 43e32bf4fb..61eadcdcd5 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -21,17 +21,20 @@ namespace math { template class Unpool2dMaxFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output); + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + framework::Tensor* output); }; template class Unpool2dMaxGradFunctor { public: - void operator()( - const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, const framework::Tensor& output, - const framework::Tensor& output_grad, framework::Tensor* input_grad); + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad); }; } // namespace math } // namespace operators diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index a51df3aa42..a40aadcccc 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -32,24 +32,22 @@ class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); AddOutput("Out", - "(Tensor) The output tensor of unpool operator." - "The format of output tensor is also NCHW." - "Where N is batch size, C is " - "the number of channels, H and W is the height and " - "width of feature."); + "(Tensor) The output tensor of unpool operator." + "The format of output tensor is also NCHW." 
+ "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); AddAttr>( "ksize", "(vector), the unpooling window size(height, width) " "of unpooling operator."); - AddAttr>( - "strides", - "(vector, default:{1, 1}), " - "strides (height, width) of unpooling operator.") + AddAttr>("strides", + "(vector, default:{1, 1}), " + "strides (height, width) of unpooling operator.") .SetDefault({1, 1}); - AddAttr>( - "paddings", - "(vector defalut:{0,0}), " - "paddings (height, width) of unpooling operator.") + AddAttr>("paddings", + "(vector defalut:{0,0}), " + "paddings (height, width) of unpooling operator.") .SetDefault({0, 0}); AddAttr( "unpooling_type", @@ -75,71 +73,71 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { } class UnpoolOp : public framework::OperatorWithKernel { - protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), + framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); } - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" + PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" "should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), + PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UnpoolOp should not be null."); - auto in_x_dims = ctx->GetInputDim("X"); - 
auto in_y_dims = ctx->GetInputDim("Indices"); - std::string unpooling_type = + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = ctx->Attrs().Get("unpooling_type"); - std::vector ksize = ctx->Attrs().Get>("ksize"); - std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = + std::vector ksize = ctx->Attrs().Get>("ksize"); + std::vector strides = ctx->Attrs().Get>("strides"); + std::vector paddings = ctx->Attrs().Get>("paddings"); - PADDLE_ENFORCE(in_x_dims.size() == 4, + PADDLE_ENFORCE(in_x_dims.size() == 4, "Unpooling intput must be of 4-dimensional."); - PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); - std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back( - OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); - } - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - } + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } }; class UnpoolOpGrad : public framework::OperatorWithKernel { - protected: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); - } + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } - public: - using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); - 
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Input(X@GRAD) should not be null."); - ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); - } + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } }; -} // namespace operators -} // namespace paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); REGISTER_OP_CPU_KERNEL( - unpool, ops::UnpoolKernel, - ops::UnpoolKernel); + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + unpool_grad, ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 8ee9e2b373..29b393f474 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -16,8 +16,8 @@ limitations under the License. 
*/ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL( - unpool, ops::UnpoolKernel, - ops::UnpoolKernel); + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_GPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + unpool_grad, ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py index 321cd9fab8..e87f283042 100644 --- a/python/paddle/v2/fluid/tests/test_unpool_op.py +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -55,13 +55,13 @@ class TestUnpoolOp(OpTest): self.inputs = { 'X': input.astype('float32'), 'Indices': indices.astype('int32') - } + } self.attrs = { 'strides': self.strides, 'paddings': self.paddings, 'ksize': self.ksize, 'unpooling_type': self.unpooling_type, - } + } self.outputs = {'Out': output.astype('float32')} def test_check_output(self): @@ -78,5 +78,6 @@ class TestUnpoolOp(OpTest): self.strides = [2, 2] self.paddings = [0, 0] + if __name__ == '__main__': unittest.main() From 29262ab24d8675d5b50fe21dda59f4102db1bb7b Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 29 Nov 2017 11:56:29 +0800 Subject: [PATCH 103/275] Fix unitest. 
--- paddle/operators/nce_op.cc | 8 ++++---- paddle/operators/nce_op.h | 16 ++++++++-------- python/paddle/v2/fluid/tests/test_nce.py | 14 +++++++------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index bb9346b134..952da10434 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -41,11 +41,11 @@ class NCEOp : public framework::OperatorWithKernel { } auto num_neg_samples = ctx->Attrs().Get("num_neg_samples"); auto num_total_classes = ctx->Attrs().Get("num_total_classes"); - std::vector sampled_labels = - ctx->Attrs().Get>("sampled_labels"); + std::vector custom_neg_classes = + ctx->Attrs().Get>("custom_neg_classes"); PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]); - if (sampled_labels.size() > 0) { - PADDLE_ENFORCE_EQ(sampled_labels.size(), + if (custom_neg_classes.size() > 0) { + PADDLE_ENFORCE_EQ(custom_neg_classes.size(), static_cast(num_neg_samples)); } // set dims of output(Out) diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index 8df20f432d..ea92a797fe 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -33,14 +33,14 @@ void PrepareSamples(const framework::ExecutionContext& context) { auto label = context.Input("Label"); const int64_t* label_data = label->data(); auto label_dims = label->dims(); - int num_classes = context.Attr("num_classes"); + int num_total_classes = context.Attr("num_total_classes"); // for unitest std::vector custom_neg_classes = context.Attr>("custom_neg_classes"); // random machine std::random_device rd; std::mt19937 rng(rd()); - std::uniform_int_distribution rand(0, num_classes - 1); + std::uniform_int_distribution rand(0, num_total_classes - 1); auto sample_labels = context.Output("SampleLabels"); auto sample_labels_dims = sample_labels->dims(); @@ -84,13 +84,13 @@ class NCEKernel : public framework::OpKernel { } auto out = context.Output("Cost"); T* out_data = 
out->mutable_data(context.GetPlace()); - int num_smalped_classes = context.Attr("num_sampled_classes"); - int num_classes = context.Attr("num_classes"); + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); int num_true_class = 1; if (label != nullptr) { num_true_class = label->dims()[1]; } - T b = 1. / num_classes * num_smalped_classes; + T b = 1. / num_total_classes * num_neg_samples; // forward bias auto bias = context.Input("Bias"); if (bias != nullptr) { @@ -151,13 +151,13 @@ class NCEGradKernel : public framework::OpKernel { if (sample_weight != nullptr) { sample_weight_data = sample_weight->data(); } - int num_smalped_classes = context.Attr("num_sampled_classes"); - int num_classes = context.Attr("num_classes"); + int num_neg_samples = context.Attr("num_neg_samples"); + int num_total_classes = context.Attr("num_total_classes"); int num_true_class = 1; if (label != nullptr) { num_true_class = label->dims()[1]; } - T b = 1. / num_classes * num_smalped_classes; + T b = 1. 
/ num_total_classes * num_neg_samples; Tensor sample_grad; // tmp tensor T* sample_grad_data = sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); diff --git a/python/paddle/v2/fluid/tests/test_nce.py b/python/paddle/v2/fluid/tests/test_nce.py index 6cbf468e0a..8aeba69769 100644 --- a/python/paddle/v2/fluid/tests/test_nce.py +++ b/python/paddle/v2/fluid/tests/test_nce.py @@ -35,7 +35,7 @@ def nce(input, weight, bias, sample_weight, labels, num_classes, o = sample_out[i] cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b)) out[samples[i][0]] += cost * samples[i][3] - return (out, np.array(sample_out).reshape( + return (out[:, np.newaxis], np.array(sample_out).reshape( batch_size, num_sample_class + num_true_class), np.array(sample_labels).reshape(batch_size, num_sample_class + num_true_class)) @@ -43,16 +43,16 @@ def nce(input, weight, bias, sample_weight, labels, num_classes, class TestNCE(OpTest): def generate_data(self, dim, batch_size, num_classes, num_true_class, - num_sampled_classes): + num_neg_samples): input = np.random.randn(batch_size, dim).astype(np.float32) weight = np.random.randn(num_classes, dim).astype(np.float32) bias = np.random.randn(num_classes).astype(np.float32) sample_weight = np.random.randn(batch_size).astype(np.float32) labels = np.random.randint(0, num_classes, (batch_size, num_true_class)) self.attrs = { - 'num_classes': num_classes, - 'num_sampled_classes': num_sampled_classes, - 'sampled_labels': range(num_sampled_classes) + 'num_total_classes': num_classes, + 'num_neg_samples': num_neg_samples, + 'custom_neg_classes': range(num_neg_samples) } self.inputs = { 'Input': input, @@ -68,8 +68,8 @@ class TestNCE(OpTest): def compute(self): out = nce(self.inputs['Input'], self.inputs['Weight'], self.inputs['Bias'], self.inputs['SampleWeight'], - self.inputs['Label'], self.attrs['num_classes'], - self.attrs['num_sampled_classes']) + self.inputs['Label'], self.attrs['num_total_classes'], + 
self.attrs['num_neg_samples']) self.outputs = { 'Cost': out[0], 'SampleLogits': out[1], From 3e552cdcac5370a59152c60670008e575a80da5d Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 29 Nov 2017 11:31:15 +0800 Subject: [PATCH 104/275] Fix gru_op related code style --- paddle/operators/gru_op.h | 46 +- paddle/operators/math/detail/gru_cpu_kernel.h | 540 +++++++++--------- paddle/operators/math/detail/gru_gpu_kernel.h | 252 ++++---- paddle/operators/math/detail/gru_kernel.h | 135 +++-- paddle/operators/math/gru_compute.cc | 64 ++- paddle/operators/math/gru_compute.cu | 148 ++--- paddle/operators/math/gru_compute.h | 31 +- 7 files changed, 617 insertions(+), 599 deletions(-) diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index 1b18368e0e..564489d3a9 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -71,8 +71,8 @@ class GRUKernel : public framework::OpKernel { int frame_size = hidden_dims[1]; math::hl_gru_value gru_value; - gru_value.gateWeight = const_cast(weight_data); - gru_value.stateWeight = + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); Tensor ordered_h0; const size_t* order = batch_gate->lod()[2].data(); @@ -82,9 +82,9 @@ class GRUKernel : public framework::OpKernel { // to reorder. 
ReorderInitState(context.device_context(), *h0, order, &ordered_h0, true); - gru_value.prevOutValue = ordered_h0.data(); + gru_value.prev_out_value = ordered_h0.data(); } else { - gru_value.prevOutValue = nullptr; + gru_value.prev_out_value = nullptr; } auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; @@ -96,14 +96,14 @@ class GRUKernel : public framework::OpKernel { Tensor gate_t = batch_gate->Slice(bstart, bend); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.outputValue = hidden_t.data(); - gru_value.gateValue = gate_t.data(); - gru_value.resetOutputValue = reset_hidden_prev_t.data(); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); math::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); - gru_value.prevOutValue = gru_value.outputValue; + gru_value.prev_out_value = gru_value.output_value; } math::Batch2LoDTensorFunctor to_seq; @@ -169,20 +169,20 @@ class GRUGradKernel : public framework::OpKernel { to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); math::hl_gru_value gru_value; - gru_value.gateWeight = const_cast(weight_data); - gru_value.stateWeight = + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); math::hl_gru_grad gru_grad; if (weight_grad) { - gru_grad.gateWeightGrad = + gru_grad.gate_weight_grad = weight_grad->mutable_data(context.GetPlace()); zero(dev_ctx, weight_grad, static_cast(0.0)); - gru_grad.stateWeightGrad = + gru_grad.state_weight_grad = weight_grad->data() + 2 * frame_size * frame_size; } else { - gru_grad.gateWeightGrad = nullptr; - gru_grad.stateWeightGrad = nullptr; + gru_grad.gate_weight_grad = 
nullptr; + gru_grad.state_weight_grad = nullptr; } auto batch_starts = batch_hidden_grad.lod()[0]; @@ -193,27 +193,27 @@ class GRUGradKernel : public framework::OpKernel { int cur_batch_size = bend - bstart; Tensor gate_t = batch_gate->Slice(bstart, bend); - gru_value.gateValue = gate_t.data(); + gru_value.gate_value = gate_t.data(); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - gru_value.resetOutputValue = reset_hidden_prev_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); - gru_grad.outputGrad = hidden_grad_t.data(); + gru_grad.output_grad = hidden_grad_t.data(); Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); - gru_grad.gateGrad = gate_grad_t.data(); + gru_grad.gate_grad = gate_grad_t.data(); Tensor reset_hidden_prev_grad_t = batch_reset_hidden_prev_grad.Slice(bstart, bend); - gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data(); + gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data(); if (n == 0) { - gru_value.prevOutValue = h0 ? ordered_h0.data() : nullptr; - gru_grad.prevOutGrad = + gru_value.prev_out_value = h0 ? ordered_h0.data() : nullptr; + gru_grad.prev_out_grad = h0 && h0_grad ? 
ordered_h0_grad.data() : nullptr; } else { int bstart_pre = static_cast(batch_starts[n - 1]); Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); - gru_value.prevOutValue = hidden_prev_t.data(); + gru_value.prev_out_value = hidden_prev_t.data(); Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); - gru_grad.prevOutGrad = hidden_prev_grad_t.data(); + gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } math::GRUUnitGradFunctor::compute( diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h index 51af140cf4..4c67dec9cb 100644 --- a/paddle/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/operators/math/detail/gru_cpu_kernel.h @@ -25,393 +25,397 @@ namespace detail { #ifndef __NVCC__ template -void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, - T *gateValue, T *resetOutputValue, - T *prevOutputValue, int frameSize, +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, activation_mode_t active_gate) { - T rValueUpdateGate; - T rValueResetGate; - T rValueResetOutput; - T rPrevOut = 0; - T *updateGate = gateValue; - T *resetGate = gateValue + frameSize; - - for (int i = 0; i < frameSize; i++) { - rValueUpdateGate = updateGate[i]; - rValueResetGate = resetGate[i]; - if (prevOutputValue) { - rPrevOut = prevOutputValue[i]; + T r_value_update_gate; + T r_value_reset_gate; + T r_value_reset_output; + T r_prev_out = 0; + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; } - opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, - rValueResetOutput, active_gate); + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); 
- updateGate[i] = rValueUpdateGate; - resetGate[i] = rValueResetGate; - resetOutputValue[i] = rValueResetOutput; + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + reset_output_value[i] = r_value_reset_output; } } template -void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput, - T *gateValue, T *prevOutputValue, - T *outputValue, int frameSize, +void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, activation_mode_t active_node) { - T rValueUpdateGate; - T rValueFrameState; - T rPrevOut = 0; - T rOutput; - T *updateGate = gateValue; - T *frameState = gateValue + frameSize * 2; - - for (int i = 0; i < frameSize; i++) { - rValueUpdateGate = updateGate[i]; - rValueFrameState = frameState[i]; - if (prevOutputValue) { - rPrevOut = prevOutputValue[i]; + T r_value_update_gate; + T r_value_frame_state; + T r_prev_out = 0; + T r_output; + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; } - opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, - active_node); + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); - frameState[i] = rValueFrameState; - outputValue[i] = rOutput; + frame_state[i] = r_value_frame_state; + output_value[i] = r_output; } } template -void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue, - T *resetOutputValue, T *prevOutputValue, - int frameSize, +void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, activation_mode_t active_gate) { #ifdef __AVX__ - __m256 rValueUpdateGate; - __m256 rValueResetGate; - __m256 rValueResetOutput; - 
__m256 rPrevOut = _mm256_set1_ps(0.0f); - __m256 *updateGate = (__m256 *)gateValue; - __m256 *resetGate = (__m256 *)(gateValue + frameSize); - - for (int i = 0; i < frameSize / 8; i++) { - rValueUpdateGate = updateGate[i]; - rValueResetGate = resetGate[i]; - if (prevOutputValue) { - rPrevOut = ((__m256 *)prevOutputValue)[i]; + __m256 r_value_update_gate; + __m256 r_value_reset_gate; + __m256 r_value_reset_output; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 *update_gate = (__m256 *)gate_value; + __m256 *reset_gate = (__m256 *)(gate_value + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; } - opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, - rValueResetOutput, active_gate); + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); - updateGate[i] = rValueUpdateGate; - resetGate[i] = rValueResetGate; - ((__m256 *)resetOutputValue)[i] = rValueResetOutput; + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + ((__m256 *)reset_output_value)[i] = r_value_reset_output; } #endif } template -void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue, - T *prevOutputValue, T *outputValue, - int frameSize, +void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, activation_mode_t active_node) { #ifdef __AVX__ - __m256 rValueUpdateGate; - __m256 rValueFrameState; - __m256 rPrevOut = _mm256_set1_ps(0.0f); - __m256 rOutput; - __m256 *updateGate = (__m256 *)gateValue; - __m256 *frameState = (__m256 *)(gateValue + frameSize * 2); - - for (int i = 0; i < frameSize / 8; i++) { - rValueUpdateGate = updateGate[i]; - rValueFrameState = frameState[i]; - if (prevOutputValue) { - rPrevOut = ((__m256 *)prevOutputValue)[i]; + 
__m256 r_value_update_gate; + __m256 r_value_frame_state; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 r_output; + __m256 *update_gate = (__m256 *)gate_value; + __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; } - opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, - active_node); + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); - frameState[i] = rValueFrameState; - ((__m256 *)outputValue)[i] = rOutput; + frame_state[i] = r_value_frame_state; + ((__m256 *)output_value)[i] = r_output; } #endif } template -inline void forward_reset_output(OpResetOutput opResetOutput, - hl_gru_value value, int frameSize, - int batchSize, activation_mode_t active_gate) { - for (int b = 0; b < batchSize; b++) { - if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { +inline void forward_reset_output(OpResetOutput op_reset_output, + hl_gru_value value, int frame_size, + int batch_size, + activation_mode_t active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_forward_reset_output( - opResetOutput, value.gateValue, value.resetOutputValue, - value.prevOutValue, frameSize, active_gate); + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); } else { hl_naive_gru_forward_reset_output( - opResetOutput, value.gateValue, value.resetOutputValue, - value.prevOutValue, frameSize, active_gate); + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); } - value.gateValue += frameSize * 3; - value.resetOutputValue += frameSize; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + 
value.gate_value += frame_size * 3; + value.reset_output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } } } template -inline void forward_final_output(OpFinalOutput opFinalOutput, - hl_gru_value value, int frameSize, - int batchSize, activation_mode_t active_node) { - for (int b = 0; b < batchSize; b++) { - if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue, - value.prevOutValue, value.outputValue, - frameSize, active_node); +inline void forward_final_output(OpFinalOutput op_final_output, + hl_gru_value value, int frame_size, + int batch_size, + activation_mode_t active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_forward_final_output(op_final_output, value.gate_value, + value.prev_out_value, value.output_value, + frame_size, active_node); } else { - hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue, - value.prevOutValue, value.outputValue, - frameSize, active_node); + hl_naive_gru_forward_final_output( + op_final_output, value.gate_value, value.prev_out_value, + value.output_value, frame_size, active_node); } - value.gateValue += frameSize * 3; - value.outputValue += frameSize; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + value.output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } } } template -void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *outputGrad, - int frameSize, +void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, activation_mode_t active_node) { - T rUpdateGateValue; - T rUpdateGateGrad; - T rFrameStateValue; - T 
rFrameStateGrad; - T rOutGrad; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T *updateGateValue = gateValue; - T *updateGateGrad = gateGrad; - T *frameStateValue = gateValue + frameSize * 2; - T *frameStateGrad = gateGrad + frameSize * 2; - - for (int i = 0; i < frameSize; i++) { - rUpdateGateValue = updateGateValue[i]; - rFrameStateValue = frameStateValue[i]; - rOutGrad = outputGrad[i]; - if (prevOutValue) { - rPrevOutValue = prevOutValue[i]; + T r_update_gate_value; + T r_update_gate_grad; + T r_frame_state_value; + T r_frame_state_grad; + T r_out_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *frame_state_value = gate_value + frame_size * 2; + T *frame_state_grad = gate_grad + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = output_grad[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; } - if (prevOutGrad) { - rPrevOutGrad = prevOutGrad[i]; + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; } - opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, - rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, - active_node); + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); - updateGateGrad[i] = rUpdateGateGrad; - frameStateGrad[i] = rFrameStateGrad; - if (prevOutGrad) { - prevOutGrad[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; } } } template -void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *resetOutputGrad, - int frameSize, +void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T 
*gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, activation_mode_t active_gate) { - T rUpdateGateValue; - T rUpdateGateGrad; - T rResetGateValue; - T rResetGateGrad; - T rResetOutputGrad = 0; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T *updateGateValue = gateValue; - T *updateGateGrad = gateGrad; - T *resetGateValue = gateValue + frameSize; - T *resetGateGrad = gateGrad + frameSize; - - for (int i = 0; i < frameSize; i++) { - rUpdateGateValue = updateGateValue[i]; - rUpdateGateGrad = updateGateGrad[i]; - rResetGateValue = resetGateValue[i]; - - if (prevOutValue && prevOutGrad) { - rResetOutputGrad = resetOutputGrad[i]; + T r_update_gate_value; + T r_update_gate_grad; + T r_reset_gate_value; + T r_reset_gate_grad; + T r_reset_output_grad = 0; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *reset_gate_value = gate_value + frame_size; + T *reset_gate_grad = gate_grad + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = reset_output_grad[i]; } - if (prevOutValue) { - rPrevOutValue = prevOutValue[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; } - if (prevOutGrad) { - rPrevOutGrad = prevOutGrad[i]; + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; } - opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, - rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, - active_gate); + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); - updateGateGrad[i] = rUpdateGateGrad; - resetGateGrad[i] = rResetGateGrad; - if (prevOutGrad) { - prevOutGrad[i] = rPrevOutGrad; + update_gate_grad[i] = 
r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; } } } template -void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *outputGrad, - int frameSize, +void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, activation_mode_t active_node) { #ifdef __AVX__ - __m256 rUpdateGateValue; - __m256 rUpdateGateGrad; - __m256 rFrameStateValue; - __m256 rFrameStateGrad; - __m256 rOutGrad; - __m256 rPrevOutValue = _mm256_set1_ps(0.0f); - __m256 rPrevOutGrad = _mm256_set1_ps(0.0f); - __m256 *updateGateValue = (__m256 *)gateValue; - __m256 *updateGateGrad = (__m256 *)gateGrad; - __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2); - __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2); - - for (int i = 0; i < frameSize / 8; i++) { - rUpdateGateValue = updateGateValue[i]; - rFrameStateValue = frameStateValue[i]; - rOutGrad = ((__m256 *)outputGrad)[i]; - if (prevOutValue) { - rPrevOutValue = ((__m256 *)prevOutValue)[i]; + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_frame_state_value; + __m256 r_frame_state_grad; + __m256 r_out_grad; + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2); + __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = ((__m256 *)output_grad)[i]; + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; } - if (prevOutGrad) { - rPrevOutGrad = ((__m256 
*)prevOutGrad)[i]; + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; } - opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, - rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, - active_node); + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); - updateGateGrad[i] = rUpdateGateGrad; - frameStateGrad[i] = rFrameStateGrad; - if (prevOutGrad) { - ((__m256 *)prevOutGrad)[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; } } #endif } template -void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *resetOutputGrad, - int frameSize, +void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, activation_mode_t active_gate) { #ifdef __AVX__ - __m256 rUpdateGateValue; - __m256 rUpdateGateGrad; - __m256 rResetGateValue; - __m256 rResetGateGrad; - __m256 rResetOutputGrad = _mm256_set1_ps(0.0f); - __m256 rPrevOutValue = _mm256_set1_ps(0.0f); - __m256 rPrevOutGrad = _mm256_set1_ps(0.0f); - __m256 *updateGateValue = (__m256 *)gateValue; - __m256 *updateGateGrad = (__m256 *)gateGrad; - __m256 *resetGateValue = (__m256 *)(gateValue + frameSize); - __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize); - - for (int i = 0; i < frameSize / 8; i++) { - rUpdateGateValue = updateGateValue[i]; - rUpdateGateGrad = updateGateGrad[i]; - rResetGateValue = resetGateValue[i]; - - if (prevOutValue && prevOutGrad) { - rResetOutputGrad = ((__m256 *)resetOutputGrad)[i]; + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_reset_gate_value; + __m256 r_reset_gate_grad; + __m256 r_reset_output_grad = 
_mm256_set1_ps(0.0f); + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size); + __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = ((__m256 *)reset_output_grad)[i]; } - if (prevOutValue) { - rPrevOutValue = ((__m256 *)prevOutValue)[i]; + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; } - if (prevOutGrad) { - rPrevOutGrad = ((__m256 *)prevOutGrad)[i]; + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; } - opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, - rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, - active_gate); + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); - updateGateGrad[i] = rUpdateGateGrad; - resetGateGrad[i] = rResetGateGrad; - if (prevOutGrad) { - ((__m256 *)prevOutGrad)[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; } } #endif } template -inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value value, - hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node) { - for (int b = 0; b < batchSize; b++) { - if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { +inline void backward_state_grad(OpStateGrad op_state_grad, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int batch_size, + activation_mode_t 
active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_backward_state_grad( - opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.outputGrad, frameSize, active_node); + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); } else { hl_naive_gru_backward_state_grad( - opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.outputGrad, frameSize, active_node); + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); } - value.gateValue += frameSize * 3; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } - grad.gateGrad += frameSize * 3; - grad.outputGrad += frameSize; - if (grad.prevOutGrad) { - grad.prevOutGrad += frameSize; + grad.gate_grad += frame_size * 3; + grad.output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; } } } template -inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value value, - hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_gate) { - for (int b = 0; b < batchSize; b++) { - if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { +inline void backward_reset_grad(OpResetGrad op_reset_grad, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int batch_size, + activation_mode_t active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_backward_reset_grad( - opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); + op_reset_grad, value.gate_value, grad.gate_grad, 
value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); } else { hl_naive_gru_backward_reset_grad( - opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); } - value.gateValue += frameSize * 3; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } - grad.gateGrad += frameSize * 3; - grad.resetOutputGrad += frameSize; - if (grad.prevOutGrad) { - grad.prevOutGrad += frameSize; + grad.gate_grad += frame_size * 3; + grad.reset_output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; } } } diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h index 6441c648b0..f3983c5195 100644 --- a/paddle/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/operators/math/detail/gru_gpu_kernel.h @@ -27,174 +27,174 @@ namespace math { namespace detail { /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput, - T *gateValue, T *resetOutputValue, - T *prevOutputValue, int frameSize, - int batchSize, +template +__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + int batch_size, activation_mode_t active_gate) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += 
batchIdx * 3 * frameSize; - resetOutputValue += batchIdx * frameSize; + const int frame_idx = block_idx.x * block_dim.x + thread_idx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = block_idx.y * block_dim.y + thread_idx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; } - T rPrevOut = 0; - T rValueResetOutput; - T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; - T rValueResetGate = gateValue[frameIdx + frameSize * 1]; + T r_prev_out = 0; + T r_value_reset_output; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_reset_gate = gate_value[frame_idx + frame_size * 1]; - if (prevOutputValue) { - if (isBatch) prevOutputValue += batchIdx * frameSize; - rPrevOut = prevOutputValue[frameIdx]; + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; } - opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput, - active_gate); + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); - gateValue[frameIdx + frameSize * 0] = rValueUpdateGate; - gateValue[frameIdx + frameSize * 1] = rValueResetGate; - resetOutputValue[frameIdx] = rValueResetOutput; + gate_value[frame_idx + frame_size * 0] = r_value_update_gate; + gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; + reset_output_value[frame_idx] = r_value_reset_output; } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput, - T *gateValue, T *prevOutputValue, - T *outputValue, int frameSize, - int batchSize, +template +__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, + T *gate_value, T 
*prev_output_value, + T *output_value, int frame_size, + int batch_size, activation_mode_t active_node) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - outputValue += batchIdx * frameSize; + const int frame_idx = block_idx.x * block_dim.x + thread_idx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = block_idx.y * block_dim.y + thread_idx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; } - T rOutput; - T rPrevOut = 0; - T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; - T rValueFrameState = gateValue[frameIdx + frameSize * 2]; + T r_output; + T r_prev_out = 0; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_frame_state = gate_value[frame_idx + frame_size * 2]; - if (prevOutputValue) { - if (isBatch) prevOutputValue += batchIdx * frameSize; - rPrevOut = prevOutputValue[frameIdx]; + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; } - opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, - active_node); + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); - gateValue[frameIdx + frameSize * 2] = rValueFrameState; - outputValue[frameIdx] = rOutput; + gate_value[frame_idx + frame_size * 2] = r_value_frame_state; + output_value[frame_idx] = r_output; } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T 
*outputGrad, - int frameSize, int batchSize, +template +__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, int batch_size, activation_mode_t active_node) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - gateGrad += batchIdx * 3 * frameSize; - outputGrad += batchIdx * frameSize; + const int frame_idx = block_idx.x * block_dim.x + thread_idx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = block_idx.y * block_dim.y + thread_idx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + output_grad += batch_idx * frame_size; } - T rUpdateGateGrad; - T rFrameStateGrad; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; - T rFrameStateValue = gateValue[frameIdx + frameSize * 2]; - T rOutGrad = outputGrad[frameIdx]; + T r_update_gate_grad; + T r_frame_state_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T r_frame_state_value = gate_value[frame_idx + frame_size * 2]; + T r_out_grad = output_grad[frame_idx]; - if (prevOutValue && prevOutGrad) { - if (isBatch) prevOutValue += batchIdx * frameSize; - rPrevOutValue = prevOutValue[frameIdx]; + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; - if (isBatch) prevOutGrad += batchIdx * frameSize; - rPrevOutGrad = prevOutGrad[frameIdx]; + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_grad = prev_out_grad[frame_idx]; } - 
opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, - rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, - active_node); + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); - gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; - gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad; - if (prevOutGrad) { - prevOutGrad[frameIdx] = rPrevOutGrad; + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; } } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *resetOutputGrad, - int frameSize, int batchSize, +template +__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, int batch_size, activation_mode_t active_gate) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - gateGrad += batchIdx * 3 * frameSize; - resetOutputGrad += batchIdx * frameSize; + const int frame_idx = block_idx.x * block_dim.x + thread_idx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = block_idx.y * block_dim.y + thread_idx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + reset_output_grad += batch_idx * frame_size; } - T 
rResetGateGrad; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T rResetOutputGrad = 0; - T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; - T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0]; - T rResetGateValue = gateValue[frameIdx + frameSize * 1]; - - if (prevOutValue && prevOutGrad) { - if (isBatch) prevOutValue += batchIdx * frameSize; - if (isBatch) prevOutGrad += batchIdx * frameSize; - rPrevOutValue = prevOutValue[frameIdx]; - rPrevOutGrad = prevOutGrad[frameIdx]; - rResetOutputGrad = resetOutputGrad[frameIdx]; + T r_reset_gate_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_reset_output_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0]; + T r_reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; + r_prev_out_grad = prev_out_grad[frame_idx]; + r_reset_output_grad = reset_output_grad[frame_idx]; } - opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, - rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, - active_gate); + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); - gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; - gateGrad[frameIdx + frameSize * 1] = rResetGateGrad; - if (prevOutGrad) { - prevOutGrad[frameIdx] = rPrevOutGrad; + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; } } } // namespace detail diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h index 8a681d8d8b..acd84be01d 100644 --- 
a/paddle/operators/math/detail/gru_kernel.h +++ b/paddle/operators/math/detail/gru_kernel.h @@ -28,23 +28,25 @@ namespace forward { template class gru_resetOutput { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut, - T &valueResetOutput, activation_mode_t actGate) { - valueUpdateGate = activation(valueUpdateGate, actGate); - valueResetGate = activation(valueResetGate, actGate); - valueResetOutput = prevOut * valueResetGate; + HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, + T &prev_out, T &value_reset_output, + activation_mode_t act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = prev_out * value_reset_gate; } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate, - __m256 &prevOut, __m256 &valueResetOutput, - activation_mode_t actGate) { - valueUpdateGate = activation(valueUpdateGate, actGate); - valueResetGate = activation(valueResetGate, actGate); - valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_reset_gate, __m256 &prev_out, + __m256 &value_reset_output, + activation_mode_t act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); } #endif #endif @@ -53,24 +55,26 @@ class gru_resetOutput { template class gru_finalOutput { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut, - T &valueOutput, activation_mode_t actInput) { - valueFrameState = activation(valueFrameState, actInput); - valueOutput = prevOut - (valueUpdateGate * prevOut) + - (valueUpdateGate * valueFrameState); + HOSTDEVICE void operator()(T 
&value_update_gate, T &value_frame_state, + T &prev_out, T &value_output, + activation_mode_t act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = prev_out - (value_update_gate * prev_out) + + (value_update_gate * value_frame_state); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState, - __m256 &prevOut, __m256 &valueOutput, - activation_mode_t actInput) { - valueFrameState = activation(valueFrameState, actInput); - valueOutput = _mm256_add_ps( - _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)), - _mm256_mul_ps(valueUpdateGate, valueFrameState)); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_frame_state, __m256 &prev_out, + __m256 &value_output, + activation_mode_t act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = _mm256_add_ps( + _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), + _mm256_mul_ps(value_update_gate, value_frame_state)); } #endif #endif @@ -82,34 +86,37 @@ namespace backward { template class gru_stateGrad { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate, - T &valueFrameState, T &gradFrameState, - T &valuePrevOut, T &gradPrevOut, T &gradOutput, - activation_mode_t actInput) { - gradUpdateGate = (gradOutput * valueFrameState); - gradUpdateGate -= (gradOutput * valuePrevOut); - gradPrevOut -= (gradOutput * valueUpdateGate); - gradPrevOut += gradOutput; - gradFrameState = - activation(gradOutput * valueUpdateGate, valueFrameState, actInput); + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_frame_state, T &grad_frame_state, + T &value_prev_out, T &grad_prev_out, + T &grad_output, activation_mode_t act_input) { + grad_update_gate = (grad_output * value_frame_state); + grad_update_gate -= (grad_output * value_prev_out); + 
grad_prev_out -= (grad_output * value_update_gate); + grad_prev_out += grad_output; + grad_frame_state = activation(grad_output * value_update_gate, + value_frame_state, act_input); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate, - __m256 &valueFrameState, __m256 &gradFrameState, - __m256 &valuePrevOut, __m256 &gradPrevOut, - __m256 &gradOutput, activation_mode_t actInput) { - gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState); - gradUpdateGate = - _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut)); - gradPrevOut = _mm256_add_ps( - _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)), - gradOutput); - gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate), - valueFrameState, actInput); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, + __m256 &value_frame_state, + __m256 &grad_frame_state, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_output, + activation_mode_t act_input) { + grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); + grad_update_gate = _mm256_sub_ps( + grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); + grad_prev_out = _mm256_add_ps( + _mm256_sub_ps(grad_prev_out, + _mm256_mul_ps(grad_output, value_update_gate)), + grad_output); + grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate), + value_frame_state, act_input); } #endif #endif @@ -118,30 +125,32 @@ class gru_stateGrad { template class gru_resetGrad { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate, - T &valueResetGate, T &gradResetGate, - T &valuePrevOut, T &gradPrevOut, - T &gradResetOutput, activation_mode_t actGate) { - gradResetGate = (gradResetOutput * valuePrevOut); - gradPrevOut += (gradResetOutput * valueResetGate); - gradUpdateGate = activation(gradUpdateGate, 
valueUpdateGate, actGate); - gradResetGate = activation(gradResetGate, valueResetGate, actGate); + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_reset_gate, T &grad_reset_gate, + T &value_prev_out, T &grad_prev_out, + T &grad_reset_output, activation_mode_t act_gate) { + grad_reset_gate = (grad_reset_output * value_prev_out); + grad_prev_out += (grad_reset_output * value_reset_gate); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate, - __m256 &valueResetGate, __m256 &gradResetGate, - __m256 &valuePrevOut, __m256 &gradPrevOut, - __m256 &gradResetOutput, - activation_mode_t actGate) { - gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut); - gradPrevOut = _mm256_add_ps(gradPrevOut, - _mm256_mul_ps(gradResetOutput, valueResetGate)); - gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate); - gradResetGate = activation(gradResetGate, valueResetGate, actGate); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, __m256 &value_reset_gate, + __m256 &grad_reset_gate, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_reset_output, + activation_mode_t act_gate) { + grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); + grad_prev_out = _mm256_add_ps( + grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); } #endif #endif diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc index 125af449d3..ae4e47b014 100644 --- a/paddle/operators/math/gru_compute.cc +++ 
b/paddle/operators/math/gru_compute.cc @@ -21,29 +21,29 @@ namespace math { template struct GRUUnitFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frameSize, int batchSize, + hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { #ifndef __NVCC__ - if (value.prevOutValue) { + if (value.prev_out_value) { math::gemm( - context, false, false, batchSize, frameSize * 2, frameSize, 1, - value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, - value.gateValue, frameSize * 3); + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); } detail::forward_reset_output(detail::forward::gru_resetOutput(), value, - frameSize, batchSize, active_gate); + frame_size, batch_size, active_gate); - if (value.prevOutValue) { + if (value.prev_out_value) { math::gemm( - context, false, false, batchSize, frameSize, frameSize, 1, - value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, - value.gateValue + frameSize * 2, frameSize * 3); + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); } detail::forward_final_output(detail::forward::gru_finalOutput(), value, - frameSize, batchSize, active_node); + frame_size, batch_size, active_node); #endif } }; @@ -51,41 +51,43 @@ struct GRUUnitFunctor { template struct GRUUnitGradFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int batch_size, + activation_mode_t active_node, activation_mode_t active_gate) { #ifndef __NVCC__ detail::backward_state_grad(detail::backward::gru_stateGrad(), 
value, - grad, frameSize, batchSize, active_node); + grad, frame_size, batch_size, active_node); - if (value.prevOutValue && grad.prevOutGrad) { + if (value.prev_out_value && grad.prev_out_grad) { math::gemm( - context, false, true, batchSize, frameSize, frameSize, 1, - grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, - frameSize, 0, grad.resetOutputGrad, frameSize); + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); - if (grad.stateWeightGrad) { + if (grad.state_weight_grad) { math::gemm( - context, true, false, frameSize, frameSize, batchSize, 1, - value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, - frameSize * 3, 1, grad.stateWeightGrad, frameSize); + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); } } detail::backward_reset_grad(detail::backward::gru_resetGrad(), value, - grad, frameSize, batchSize, active_gate); + grad, frame_size, batch_size, active_gate); - if (grad.prevOutGrad && value.prevOutValue) { + if (grad.prev_out_grad && value.prev_out_value) { math::gemm( - context, false, true, batchSize, frameSize, frameSize * 2, 1, - grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1, - grad.prevOutGrad, frameSize); + context, false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); - if (grad.gateWeightGrad) { + if (grad.gate_weight_grad) { math::gemm( - context, true, false, frameSize, frameSize * 2, batchSize, 1, - value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1, - grad.gateWeightGrad, frameSize * 2); + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, 
frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); } } #endif diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu index 7b9e54ac02..0252bdbdb6 100644 --- a/paddle/operators/math/gru_compute.cu +++ b/paddle/operators/math/gru_compute.cu @@ -21,66 +21,66 @@ namespace math { template struct GRUUnitFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frameSize, int batchSize, + hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { auto stream = reinterpret_cast(context).stream(); dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - if (value.prevOutValue) { + if (value.prev_out_value) { math::gemm( - context, false, false, batchSize, frameSize * 2, frameSize, 1, - value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, - value.gateValue, frameSize * 3); + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruForwardResetOutput, - /* isBatch= */ false, + /* is_batch= */ false, T><<>>( - detail::forward::gru_resetOutput(), value.gateValue, - value.resetOutputValue, value.prevOutValue, frameSize, batchSize, - active_gate); + detail::forward::gru_resetOutput(), 
value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } else { detail::KeGruForwardResetOutput, - /* isBatch= */ true, + /* is_batch= */ true, T><<>>( - detail::forward::gru_resetOutput(), value.gateValue, - value.resetOutputValue, value.prevOutValue, frameSize, batchSize, - active_gate); + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } - if (value.prevOutValue) { + if (value.prev_out_value) { math::gemm( - context, false, false, batchSize, frameSize, frameSize, 1, - value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, - value.gateValue + frameSize * 2, frameSize * 3); + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruForwardFinalOutput, - /* isBatch= */ false, + /* is_batch= */ false, T><<>>( - detail::forward::gru_finalOutput(), value.gateValue, - value.prevOutValue, value.outputValue, frameSize, batchSize, + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, active_node); } else { detail::KeGruForwardFinalOutput, - /* isBatch= */ true, + /* is_batch= */ true, T><<>>( - detail::forward::gru_finalOutput(), value.gateValue, - value.prevOutValue, value.outputValue, frameSize, batchSize, + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, active_node); } } @@ -89,80 +89,82 @@ struct GRUUnitFunctor { template struct GRUUnitGradFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int 
batch_size, + activation_mode_t active_node, activation_mode_t active_gate) { auto stream = reinterpret_cast(context).stream(); dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruBackwardStateGrad< detail::backward::gru_stateGrad, - /* isBatch= */ false><<>>( - detail::backward::gru_stateGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, - batchSize, active_node); + /* is_batch= */ false><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); } else { detail::KeGruBackwardStateGrad< detail::backward::gru_stateGrad, - /* isBatch= */ true><<>>( - detail::backward::gru_stateGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, - batchSize, active_node); + /* is_batch= */ true><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); } - if (value.prevOutValue && grad.prevOutGrad) { + if (value.prev_out_value && grad.prev_out_grad) { math::gemm( - context, false, true, batchSize, frameSize, frameSize, 1, - grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, - frameSize, 0, grad.resetOutputGrad, 
frameSize); + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); - if (grad.stateWeightGrad) { + if (grad.state_weight_grad) { math::gemm( - context, true, false, frameSize, frameSize, batchSize, 1, - value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, - frameSize * 3, 1, grad.stateWeightGrad, frameSize); + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); } } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruBackwardResetGrad< detail::backward::gru_resetGrad, - /* isBatch= */ false><<>>( - detail::backward::gru_resetGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, - batchSize, active_gate); + /* is_batch= */ false><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } else { detail::KeGruBackwardResetGrad< detail::backward::gru_resetGrad, - /* isBatch= */ true><<>>( - detail::backward::gru_resetGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, - batchSize, active_gate); + /* is_batch= */ true><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } - if (grad.prevOutGrad && value.prevOutValue) { + if (grad.prev_out_grad && value.prev_out_value) { math::gemm( - context, false, true, batchSize, frameSize, frameSize * 2, 1, - grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1, - grad.prevOutGrad, frameSize); + context, false, true, batch_size, frame_size, frame_size * 2, 
1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); - if (grad.gateWeightGrad) { + if (grad.gate_weight_grad) { math::gemm( - context, true, false, frameSize, frameSize * 2, batchSize, 1, - value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1, - grad.gateWeightGrad, frameSize * 2); + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); } } } diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h index 1475fb3810..58ea59f68e 100644 --- a/paddle/operators/math/gru_compute.h +++ b/paddle/operators/math/gru_compute.h @@ -22,28 +22,28 @@ namespace math { // TODO(guosheng): refine code style in gru_compute template struct hl_gru_value { - T *gateWeight; - T *stateWeight; - T *gateValue; - T *resetOutputValue; - T *outputValue; - T *prevOutValue; + T *gate_weight; + T *state_weight; + T *gate_value; + T *reset_output_value; + T *output_value; + T *prev_out_value; }; template struct hl_gru_grad { - T *gateWeightGrad; - T *stateWeightGrad; - T *gateGrad; - T *resetOutputGrad; - T *outputGrad; - T *prevOutGrad; + T *gate_weight_grad; + T *state_weight_grad; + T *gate_grad; + T *reset_output_grad; + T *output_grad; + T *prev_out_grad; }; template struct GRUUnitFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frameSize, int batchSize, + hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; @@ -51,8 +51,9 @@ struct GRUUnitFunctor { template struct GRUUnitGradFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int batch_size, + activation_mode_t active_node, 
activation_mode_t active_gate); }; From 1ab03d49d5161dfeb5c8251d7609b78e3ff29440 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 29 Nov 2017 13:47:19 +0800 Subject: [PATCH 105/275] Fix gru_op related code style in gpu_kernel --- paddle/operators/math/detail/gru_gpu_kernel.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h index f3983c5195..d2edcb7f25 100644 --- a/paddle/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/operators/math/detail/gru_gpu_kernel.h @@ -36,12 +36,12 @@ __global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, T *prev_output_value, int frame_size, int batch_size, activation_mode_t active_gate) { - const int frame_idx = block_idx.x * block_dim.x + thread_idx.x; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; int batch_idx = 0; if (is_batch) { - batch_idx = block_idx.y * block_dim.y + thread_idx.y; + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; if (batch_idx >= batch_size) return; gate_value += batch_idx * 3 * frame_size; reset_output_value += batch_idx * frame_size; @@ -75,11 +75,11 @@ __global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, T *output_value, int frame_size, int batch_size, activation_mode_t active_node) { - const int frame_idx = block_idx.x * block_dim.x + thread_idx.x; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; int batch_idx = 0; if (is_batch) { - batch_idx = block_idx.y * block_dim.y + thread_idx.y; + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; if (batch_idx >= batch_size) return; gate_value += batch_idx * 3 * frame_size; output_value += batch_idx * frame_size; @@ -112,11 +112,11 @@ __global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, T *prev_out_grad, T *output_grad, int frame_size, int batch_size, activation_mode_t 
active_node) { - const int frame_idx = block_idx.x * block_dim.x + thread_idx.x; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; int batch_idx = 0; if (is_batch) { - batch_idx = block_idx.y * block_dim.y + thread_idx.y; + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; if (batch_idx >= batch_size) return; gate_value += batch_idx * 3 * frame_size; gate_grad += batch_idx * 3 * frame_size; @@ -160,11 +160,11 @@ __global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, T *prev_out_grad, T *reset_output_grad, int frame_size, int batch_size, activation_mode_t active_gate) { - const int frame_idx = block_idx.x * block_dim.x + thread_idx.x; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; if (frame_idx >= frame_size) return; int batch_idx = 0; if (is_batch) { - batch_idx = block_idx.y * block_dim.y + thread_idx.y; + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; if (batch_idx >= batch_size) return; gate_value += batch_idx * 3 * frame_size; gate_grad += batch_idx * 3 * frame_size; From 0c0ff2828ccedb51db23290d6df9e4c83839d6af Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Wed, 29 Nov 2017 14:38:50 +0800 Subject: [PATCH 106/275] Refine install docs (#5943) * refine install docs * do not remove files * follow comments * update --- .../build_from_source_cn.rst | 36 +++++++++++++++-- .../build_from_source_en.rst | 40 +++++++++++++++++-- .../build_and_install/docker_install_cn.rst | 2 +- .../build_and_install/docker_install_en.rst | 2 +- .../build_and_install/pip_install_cn.rst | 2 +- .../build_and_install/pip_install_en.rst | 2 +- doc/howto/index_cn.rst | 1 - doc/howto/index_en.rst | 1 - 8 files changed, 72 insertions(+), 14 deletions(-) diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst index 55665ac8ed..3c525bdad6 100644 --- 
a/doc/getstarted/build_and_install/build_from_source_cn.rst +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -1,4 +1,4 @@ -从源码编译PaddlePaddle +从源码编译 ====================== .. _build_step: @@ -7,8 +7,11 @@ ---------------- PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译工具。 -我们推荐您使用PaddlePaddle编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境 +我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 可以在 `这里 `_ 找到。 + +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 + 编译PaddlePaddle,需要执行: .. code-block:: bash @@ -22,7 +25,6 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译 cd build cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. make - 编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: @@ -31,7 +33,33 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译 pip install python/dist/*.whl -.. _build_step: +.. _run_test: + +执行单元测试 +---------------- + +如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: + +使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 +开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + +如果不使用Docker,可以执行ctest命令即可: + +.. code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + ctest + # 指定执行其中一个单元测试 test_mul_op + ctest -R test_mul_op + +.. _compile_deps: 编译依赖 ---------------- diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst index 9a3ed7dd57..76fbc43de2 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.rst +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -1,4 +1,4 @@ -Build PaddlePaddle from Sources +Build from Sources ========================== .. _build_step: @@ -9,14 +9,18 @@ How To Build PaddlePaddle mainly uses `CMake `_ and GCC, G++ as compile tools. 
We recommend you to use our pre-built Docker image to run the build to avoid installing dependencies by yourself. We have several build environment -Docker images `here `_. +Docker images `here `_ . + +If you choose not to use Docker image for your build, you need to install the +below `Compile Dependencies`_ before run the build. + Then run: .. code-block:: bash git clone https://github.com/PaddlePaddle/Paddle.git cd Paddle - # run the following command to build CPU-Only binaries if you are using docker + # run the following command to build a CPU-Only binaries if you are using docker docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh # else run these commands mkdir build @@ -32,7 +36,35 @@ machine or copy it to the target machine. pip install python/dist/*.whl -.. _build_step: + +.. _run_test: + +Run Tests +---------------- + +If you wish to run the tests, you may follow the below steps: + +When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build. +Set :code:`WITH_GPU=ON` Can also run tests on GPU. + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + +If you don't use Docker, just run ctest will start the tests: + +.. code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=ON .. + make + ctest + # run a single test like test_mul_op + ctest -R test_mul_op + + +.. 
_compile_deps: Compile Dependencies ---------------- diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index 07933b2e0b..f78b1fb0e1 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -1,4 +1,4 @@ -使用Docker安装运行PaddlePaddle +使用Docker安装运行 ================================ 使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 9b977c9c72..d7acc7aeb7 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -1,4 +1,4 @@ -PaddlePaddle in Docker Containers +Run in Docker Containers ================================= Run PaddlePaddle in Docker container so that you don't need to care about diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst index 41312da48c..b26bf4c95c 100644 --- a/doc/getstarted/build_and_install/pip_install_cn.rst +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -1,4 +1,4 @@ -使用pip安装PaddlePaddle +使用pip安装 ================================ PaddlePaddle可以使用常用的Python包管理工具 diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst index 4f295e14ba..113790e4e4 100644 --- a/doc/getstarted/build_and_install/pip_install_en.rst +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -1,4 +1,4 @@ -Install PaddlePaddle Using pip +Install Using pip ================================ You can use current widely used Python package management diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index 76d3e0a009..eb95356c67 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -19,7 +19,6 @@ .. 
toctree:: :maxdepth: 1 - dev/build_cn.rst dev/write_docs_cn.rst 模型配置 diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst index 1b6034be4e..1fbfcd260b 100644 --- a/doc/howto/index_en.rst +++ b/doc/howto/index_en.rst @@ -18,7 +18,6 @@ Development .. toctree:: :maxdepth: 1 - dev/build_en.rst dev/new_layer_en.rst dev/contribute_to_paddle_en.md From 4ecbab42d8831bcc31c7d29092fc5c07f39c318c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Wed, 29 Nov 2017 14:45:28 +0800 Subject: [PATCH 107/275] Fix compile on cudnn7 (#5982) * fix compile on cudnn7 * update * update * make silent --- cmake/external/grpc.cmake | 2 +- paddle/operators/conv_cudnn_op.cu.cc | 4 ++-- paddle/platform/dynload/cudnn.cc | 4 ++++ paddle/platform/dynload/cudnn.h | 6 ++++++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 1330ef82dc..219ea1b908 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -42,7 +42,7 @@ ExternalProject_Add( # Disable -Werror, otherwise the compile will fail in MacOS. # It seems that we cannot configure that by make command. 
# Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} + BUILD_COMMAND ${BUILD_CMD} HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install ) diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index a9763d4248..3f97dc7ee0 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -63,7 +63,7 @@ class CudnnConvOpKernel : public framework::OpKernel { cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); -#if CUDNN_VERSION_MIN(7, 0, 0) +#if CUDNN_VERSION_MIN(7, 0, 1) // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. @@ -180,7 +180,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel { cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(paddings, strides, dilations); -#if CUDNN_VERSION_MIN(7, 0, 0) +#if CUDNN_VERSION_MIN(7, 0, 1) // cudnn 7 can support groups, no need to do it mannually // FIXME(typhoonzero): find a better way to disable groups // rather than setting it to 1. 
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index d3e4cb567d..761d9edd87 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R7 +CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index b2d69da93b..61caac5450 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +#if CUDNN_VERSION >= 7001 +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); +CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + } // namespace dynload } // namespace platform } // namespace paddle From 3206094b5eaf919aac6cdcae46254055ddf98ed9 Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 15:04:56 +0800 Subject: [PATCH 108/275] format code --- paddle/operators/math/unpooling.cc | 4 +-- paddle/operators/math/unpooling.cu | 36 ++++++++++++------------- paddle/operators/math/unpooling.h | 3 +-- paddle/operators/unpool_op.cc | 42 ++++++++++++++++-------------- paddle/operators/unpool_op.cu.cc | 6 ++--- 5 files changed, 46 insertions(+), 45 deletions(-) diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index 9017ffaab1..b57d3dc141 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -20,8 +20,8 @@ template class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output) { + const framework::Tensor& input, + const 
framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index f3a317b3b3..058b82d9d2 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -20,11 +20,12 @@ namespace operators { namespace math { template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, - const int* indices_data, - const int input_height, const int input_width, - const int channels, T* output_data, - const int output_height, - const int output_width) { + const int* indices_data, + const int input_height, + const int input_width, + const int channels, T* output_data, + const int output_height, + const int output_width) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -42,12 +43,11 @@ __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, } } template -__global__ void KernelUnpool2dMaxGrad(const int nthreads, const T* input_data, - const int* indices_data, - const int input_height, const int input_width, - const int channels, const T* output_data, - const T* output_grad, const int output_height, - const int output_width, T* input_grad) { +__global__ void KernelUnpool2dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + const T* output_data, const T* output_grad, const int output_height, + const int output_width, T* input_grad) { int in_n_stride = input_height * input_width * channels; int in_c_stride = input_height * input_width; int out_n_stride = output_height * output_width * channels; @@ -71,8 +71,8 @@ template class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& 
context, - const framework::Tensor& input, const framework::Tensor& indices, - framework::Tensor* output) { + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -88,8 +88,8 @@ class Unpool2dMaxFunctor { T><<(context) .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + input_height, input_width, output_channels, + output_data, output_height, output_width); } }; /* @@ -121,9 +121,9 @@ class Unpool2dMaxGradFunctor { T><<(context) .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, output_data, - output_grad_data, output_height, output_width, - input_grad_data); + input_height, input_width, output_channels, + output_data, output_grad_data, output_height, + output_width, input_grad_data); } }; template class Unpool2dMaxGradFunctor; diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 61eadcdcd5..7077d7c227 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -23,8 +23,7 @@ class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& indices, - framework::Tensor* output); + const framework::Tensor& indices, framework::Tensor* output); }; template class Unpool2dMaxGradFunctor { diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index a40aadcccc..8bd596dbb0 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -75,36 +75,38 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { class UnpoolOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override 
{ - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); - } + } public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of UnpoolOp" - "should not be null."); - PADDLE_ENFORCE(ctx->HasInput("Indices"), "Input(Indices) of UnpoolOp" + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UnpoolOp" + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) of UnpoolOp" "should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UnpoolOp should not be null."); auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Indices"); - std::string unpooling_type = - ctx->Attrs().Get("unpooling_type"); + std::string unpooling_type = ctx->Attrs() + .Get("unpooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE(in_x_dims.size() == 4, - "Unpooling intput must be of 4-dimensional."); + "Unpooling intput must be of 4-dimensional."); PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); for (size_t i = 0; i < ksize.size(); ++i) { output_shape.push_back( - OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); } @@ -113,30 +115,30 @@ class UnpoolOp : public framework::OperatorWithKernel { class UnpoolOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( + const 
framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.device_context()); - } + } public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Input(X@GRAD) should not be null."); + "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; -} // namespace operators -} // namespace paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL( - unpool, ops::UnpoolKernel, - ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL(unpool, + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL( unpool_grad, ops::UnpoolGradKernel, ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 29b393f474..18aafb7dc7 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - unpool, ops::UnpoolKernel, - ops::UnpoolKernel); +REGISTER_OP_GPU_KERNEL(unpool, + ops::UnpoolKernel, + ops::UnpoolKernel); REGISTER_OP_GPU_KERNEL( unpool_grad, ops::UnpoolGradKernel, ops::UnpoolGradKernel); From 41bd1f9115c4cb8a9a9afcc656b6d0f00d9b1cb5 Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Tue, 28 Nov 2017 17:09:12 -0800 Subject: [PATCH 109/275] fix gpu test, clean code and add cmake --- CMakeLists.txt | 1 + cmake/configure.cmake | 5 + paddle/math/float16.h | 217 ++++++++--------------------- paddle/math/tests/test_float16.cpp | 8 -- paddle/math/tests/test_float16.cu | 90 ++++++------ 5 files changed, 109 insertions(+), 212 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fd3582a1bc..a2bb5d73bc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -56,6 +56,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) +option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) # CMAKE_BUILD_TYPE if(NOT CMAKE_BUILD_TYPE) diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 24ddb24399..2c202707ff 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -24,6 +24,11 @@ if(WITH_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE) endif(WITH_DOUBLE) +if(WITH_ARM_FP16) + add_definitions(-DPADDLE_ARM_FP16) + add_definitions("-march=armv8.2-a+fp16+simd") +endif(WITH_ARM_FP16) + if(WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING) endif(WITH_TESTING) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 65c0489e1f..778b48bce8 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#include +#include #ifdef PADDLE_WITH_CUDA #include @@ -71,6 +71,7 @@ struct PADDLE_ALIGN(2) float16 { public: uint16_t x; + // Constructors HOSTDEVICE inline float16() : x(0) {} HOSTDEVICE inline float16(const float16& h) : x(h.x) {} @@ -89,8 +90,7 @@ public: #ifdef PADDLE_WITH_NATIVE_FP16 // __fp16 is a native half precision data type for arm cpu, - // float16_t is an alias for __fp16 in arm_fp16.h, - // which is included in arm_neon.h. + // float16_t is an alias for __fp16 HOSTDEVICE inline explicit float16(const float16_t& h) { x = *reinterpret_cast(&h); } @@ -141,6 +141,7 @@ public: return *this; } +// Assignment operators #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline float16& operator=(const half& rhs) { #if CUDA_VERSION >= 9000 @@ -219,6 +220,7 @@ public: return *this; } +// Conversion opertors #ifdef PADDLE_CUDA_FP16 HOSTDEVICE inline explicit operator half() const { #if CUDA_VERSION >= 9000 @@ -353,27 +355,54 @@ private: // CUDA 7.5 and 8.0 do not. The arithmetic operators defined here are // for users to write similar CUDA code in CUDA 7.5 and 8.0 as in // CUDA 9.0 regarding the half data type. 
-#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && \ - __CUDA_ARCH__ >= 530 && CUDA_VERSION < 9000 +#if defined(PADDLE_CUDA_FP16) && CUDA_VERSION < 9000 + DEVICE inline half operator+(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hadd(a, b); +#else + float res = float(float16(a)) + float(float16(b)); + return half(float16(res)); +#endif } DEVICE inline half operator-(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hsub(a, b); +#else + float res = float(float16(a)) - float(float16(b)); + return half(float16(res)); +#endif } DEVICE inline half operator*(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hmul(a, b); +#else + float res = float(float16(a)) * float(float16(b)); + return half(float16(res)); +#endif } DEVICE inline half operator/(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300 float num = __half2float(a); float denom = __half2float(b); return __float2half(num / denom); +#else + float res = float(float16(a)) / float(float16(b)); + return half(float16(res)); +#endif } -DEVICE inline half operator-(const half& a) { return __hneg(a); } +DEVICE inline half operator-(const half& a) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 + return __hneg(a); +#else + float res = -float(float16(a)); + return half(float16(res)); +#endif +} DEVICE inline half& operator+=(half& a, const half& b) { a = a + b; @@ -396,99 +425,57 @@ DEVICE inline half& operator/=(half& a, const half& b) { } DEVICE inline bool operator==(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __heq(a, b); +#else + return float(float16(a)) == float(float16(b)); +#endif } DEVICE inline bool operator!=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hne(a, b); +#else + return float(float16(a)) != float(float16(b)); +#endif } DEVICE inline 
bool operator<(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hlt(a, b); +#else + return float(float16(a)) < float(float16(b)); +#endif } DEVICE inline bool operator<=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hle(a, b); +#else + return float(float16(a)) <= float(float16(b)); +#endif } DEVICE inline bool operator>(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hgt(a, b); +#else + return float(float16(a)) > float(float16(b)); +#endif } DEVICE inline bool operator>=(const half& a, const half& b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530 return __hge(a, b); +#else + return float(float16(a)) >= float(float16(b)); +#endif } -/* -DEVICE inline float16 operator+(const float16& a, const float16& b) { - return float16(__hadd(half(a), half(b))); -} - -DEVICE inline float16 operator-(const float16& a, const float16& b) { - return float16(__hsub(half(a), half(b))); -} - -DEVICE inline float16 operator*(const float16& a, const float16& b) { - return float16(__hmul(half(a), half(b))); -} - -DEVICE inline float16 operator/(const float16& a, const float16& b) { - float num = __half2float(half(a)); - float denom = __half2float(half(b)); - return float16(num / denom); -} - -DEVICE inline float16 operator-(const float16& a) { - return float16(__hneg(half(a))); -} - -DEVICE inline float16& operator+=(float16& a, const float16& b) { - a = a + b; - return a; -} - -DEVICE inline float16& operator-=(float16& a, const float16& b) { - a = a - b; - return a; -} - -DEVICE inline float16& operator*=(float16& a, const float16& b) { - a = a * b; - return a; -} - -DEVICE inline float16& operator/=(float16& a, const float16& b) { - a = a / b; - return a; -} - -DEVICE inline bool operator==(const float16& a, const float16& b) { - return __heq(half(a), half(b)); -} - -DEVICE inline bool operator!=(const float16& a, const float16& b) { - return 
__hne(half(a), half(b)); -} - -DEVICE inline bool operator<(const float16& a, const float16& b) { - return __hlt(half(a), half(b)); -} - -DEVICE inline bool operator<=(const float16& a, const float16& b) { - return __hle(half(a), half(b)); -} - -DEVICE inline bool operator>(const float16& a, const float16& b) { - return __hgt(half(a), half(b)); -} - -DEVICE inline bool operator>=(const float16& a, const float16& b) { - return __hge(half(a), half(b)); -} -*/ +#endif // PADDLE_CUDA_FP16 // Arithmetic operators on ARMv8.2-A CPU -#elif defined(PADDLE_WITH_NATIVE_FP16) +#if defined(PADDLE_WITH_NATIVE_FP16) HOST inline float16 operator+(const float16& a, const float16& b) { float16 res; asm volatile( @@ -681,88 +668,6 @@ HOST inline bool operator>=(const float16& a, const float16& b) { return (res & 0xffff) != 0; } -/* -HOST inline float16 operator+(const float16& a, const float16& b) { - return float16(vaddh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator-(const float16& a, const float16& b) { - return float16(vsubh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator*(const float16& a, const float16& b) { - return float16(vmulh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator/(const float16& a, const float16& b) { - return float16(vdivh_f16(float16_t(a), float16_t(b))); -} - -HOST inline float16 operator-(const float16& a) { - return float16(vnegh_f16(float16_t(a))); -} - -HOST inline float16& operator+=(float16& a, const float16& b) { - a = a + b; - return a; -} - -HOST inline float16& operator-=(float16& a, const float16& b) { - a = a - b; - return a; -} - -HOST inline float16& operator*=(float16& a, const float16& b) { - a = a * b; - return a; -} - -HOST inline float16& operator/=(float16& a, const float16& b) { - a = a / b; - return a; -} - -HOST inline bool operator==(const float16& a, const float16& b) { - return static_cast(vceqh_f16(float16_t(a), float16_t(b))); -} - -HOST inline bool operator!=(const 
float16& a, const float16& b) { - return !(a == b); -} - -HOST inline bool operator<(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vclth_f16(float16_t(a), float16_t(b))); -#else - return float(a) < float(b); -#endif // PADDLE_NEON_64 -} - -HOST inline bool operator<=(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vcleh_f16(float16_t(a), float16_t(b))); -#else - return float(a) <= float(b); -#endif // PADDLE_NEON_64 -} - -HOST inline bool operator>(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vcgth_f16(float16_t(a), float16_t(b))); -#else - return float(a) > float(b); -#endif // PADDLE_NEON_64 -} - -HOST inline bool operator>=(const float16& a, const float16& b) { -#ifdef PADDLE_NEON_64 - return static_cast(vcgeh_f16(float16_t(a), float16_t(b))); -#else - return float(a) >= float(b); -#endif // PADDLE_NEON_64 -} -*/ - // Arithmetic operators, software emulated on other CPU #else HOSTDEVICE inline float16 operator+(const float16& a, const float16& b) { diff --git a/paddle/math/tests/test_float16.cpp b/paddle/math/tests/test_float16.cpp index f5541d8f0f..74cc55aa37 100644 --- a/paddle/math/tests/test_float16.cpp +++ b/paddle/math/tests/test_float16.cpp @@ -54,14 +54,6 @@ TEST(float16, conversion_cpu) { EXPECT_EQ(float16(true).x, 0x3c00); EXPECT_EQ(float16(false).x, 0x0000); - // Implicit conversion to and from Eigen::half - /* - Eigen::half tmp = float16(1.0f); - float16 v_conv = tmp; - EXPECT_EQ(tmp.x, 0x3c00); - EXPECT_EQ(v_conv.x, 0x3c00); - */ - // Default constructor float16 v_def; EXPECT_EQ(v_def.x, 0x0000); diff --git a/paddle/math/tests/test_float16.cu b/paddle/math/tests/test_float16.cu index 941f266603..4b520feaaf 100644 --- a/paddle/math/tests/test_float16.cu +++ b/paddle/math/tests/test_float16.cu @@ -15,41 +15,38 @@ limitations under the License. 
*/ #include "paddle/utils/Logging.h" -#define ARITHMETIC_KERNEL(op_type, sign) \ - __global__ void op_type( \ - const float16* in1, const float16* in2, float16* out) { \ - out[0] = in1[0] sign in2[0]; \ +#define ARITHMETIC_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, half* out) { \ + out[0] = in1[0] sign in2[0]; \ } -#define COMPOUND_KERNEL(op_type, sign) \ - __global__ void op_type(float16* in1, const float16* in2) { \ - in1[0] sign in2[0]; \ - } +#define COMPOUND_KERNEL(op_type, sign) \ + __global__ void op_type(half* in1, const half* in2) { in1[0] sign in2[0]; } -#define COMPARISON_KERNEL(op_type, sign) \ - __global__ void op_type(const float16* in1, const float16* in2, bool* out) { \ - out[0] = in1[0] sign in2[0]; \ +#define COMPARISON_KERNEL(op_type, sign) \ + __global__ void op_type(const half* in1, const half* in2, bool* out) { \ + out[0] = in1[0] sign in2[0]; \ } #define ARITHMETIC_KERNEL_LAUNCH(op_type) \ void Test##op_type(float v_in1, float v_in2, float v_out) { \ LOG(INFO) << "Test " << #op_type << " on GPU!"; \ - float16 *in1, *in2, *out; \ - float16 *d_in1, *d_in2, *d_out; \ - int size = sizeof(float16); \ + half *in1, *in2, *out; \ + half *d_in1, *d_in2, *d_out; \ + int size = sizeof(half); \ cudaMalloc((void**)&d_in1, size); \ cudaMalloc((void**)&d_in2, size); \ cudaMalloc((void**)&d_out, size); \ - in1 = (float16*)malloc(size); \ - in2 = (float16*)malloc(size); \ - out = (float16*)malloc(size); \ - in1[0] = float16(v_in1); \ - in2[0] = float16(v_in2); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + out = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(out[0]), v_out); \ + EXPECT_EQ(float(float16(out[0])), v_out); \ 
free(in1); \ free(in2); \ free(out); \ @@ -61,20 +58,20 @@ limitations under the License. */ #define COMPOUND_KERNEL_LAUNCH(op_type) \ void Test##op_type(float v_in1, float v_in2, float v_out) { \ LOG(INFO) << "Test " << #op_type << " on GPU!"; \ - float16 *in1, *in2; \ - float16 *d_in1, *d_in2; \ - int size = sizeof(float16); \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ + int size = sizeof(half); \ cudaMalloc((void**)&d_in1, size); \ cudaMalloc((void**)&d_in2, size); \ - in1 = (float16*)malloc(size); \ - in2 = (float16*)malloc(size); \ - in1[0] = float16(v_in1); \ - in2[0] = float16(v_in2); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2); \ cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost); \ - EXPECT_EQ(float(in1[0]), v_out); \ + EXPECT_EQ(float(float16(in1[0])), v_out); \ free(in1); \ free(in2); \ cudaFree(d_in1); \ @@ -84,18 +81,18 @@ limitations under the License. 
*/ #define COMPARISON_KERNEL_LAUNCH(op_type) \ void Test##op_type(float v_in1, float v_in2, bool v_out) { \ LOG(INFO) << "Test " << #op_type << " on GPU!"; \ - float16 *in1, *in2; \ - float16 *d_in1, *d_in2; \ + half *in1, *in2; \ + half *d_in1, *d_in2; \ bool *out, *d_out; \ - int size = sizeof(float16); \ + int size = sizeof(half); \ cudaMalloc((void**)&d_in1, size); \ cudaMalloc((void**)&d_in2, size); \ cudaMalloc((void**)&d_out, 1); \ - in1 = (float16*)malloc(size); \ - in2 = (float16*)malloc(size); \ + in1 = (half*)malloc(size); \ + in2 = (half*)malloc(size); \ out = (bool*)malloc(1); \ - in1[0] = float16(v_in1); \ - in2[0] = float16(v_in2); \ + in1[0] = half(float16(v_in1)); \ + in2[0] = half(float16(v_in2)); \ cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice); \ cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice); \ op_type<<<1, 1>>>(d_in1, d_in2, d_out); \ @@ -112,6 +109,7 @@ limitations under the License. */ #ifdef PADDLE_CUDA_FP16 namespace paddle { +#if CUDA_VERSION < 9000 ARITHMETIC_KERNEL(Add, +) ARITHMETIC_KERNEL(Sub, -) ARITHMETIC_KERNEL(Mul, *) @@ -123,19 +121,19 @@ ARITHMETIC_KERNEL_LAUNCH(Mul) ARITHMETIC_KERNEL_LAUNCH(Div) // Negative sign kernel -__global__ void Neg(float16* in) { in[0] = -in[0]; } +__global__ void Neg(half* in) { in[0] = -in[0]; } void TestNeg(float v_in, float v_out) { LOG(INFO) << "Test Neg on GPU!"; - float16 *in, *d_in; - int size = sizeof(float16); + half *in, *d_in; + int size = sizeof(half); cudaMalloc((void**)&d_in, size); - in = (float16*)malloc(size); - in[0] = float16(v_in); + in = (half*)malloc(size); + in[0] = half(float16(v_in)); cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice); Neg<<<1, 1>>>(d_in); cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost); - EXPECT_EQ(float(in[0]), v_out); + EXPECT_EQ(float(float16(in[0])), v_out); free(in); cudaFree(d_in); } @@ -193,6 +191,7 @@ TEST(float16, comparision_on_gpu) { TestGreaterEqual(4, 4, true); TestGreaterEqual(4, 5, false); } +#endif // CUDA_VERSION TEST(float16, 
conversion_on_gpu) { // Explicit conversion to and from cuda half @@ -204,16 +203,11 @@ TEST(float16, conversion_on_gpu) { EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff); EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00); - // Implicit conversion to and from cuda half - half tmp = float16(1.0f); - float16 val = tmp; - EXPECT_EQ(val.x, 0x3c00); - // Assignment operator float16 v_assign; - v_assign = tmp; + v_assign = half(float16(1.0f)); EXPECT_EQ(v_assign.x, 0x3c00); } } // namespace paddle -#endif +#endif // PADDLE_CUDA_FP16 From 4ffb73fd3b353c3d2acfcea3106bfd1a4d4202ee Mon Sep 17 00:00:00 2001 From: sweetsky0901 Date: Wed, 29 Nov 2017 15:51:28 +0800 Subject: [PATCH 110/275] format .. --- paddle/operators/math/unpooling.cu | 25 ++++++++++++------------- paddle/operators/unpool_op.cc | 8 +++----- 2 files changed, 15 insertions(+), 18 deletions(-) diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 058b82d9d2..37c3c8b689 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -21,8 +21,7 @@ namespace math { template __global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, const int* indices_data, - const int input_height, - const int input_width, + const int input_height, const int input_width, const int channels, T* output_data, const int output_height, const int output_width) { @@ -71,8 +70,8 @@ template class Unpool2dMaxFunctor { public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& indices, framework::Tensor* output) { + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -86,10 +85,10 @@ class Unpool2dMaxFunctor { int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax< T><<(context) - 
.stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + reinterpret_cast(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_height, output_width); } }; /* @@ -119,11 +118,11 @@ class Unpool2dMaxGradFunctor { int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad< T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_grad_data, output_height, - output_width, input_grad_data); + reinterpret_cast(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_grad_data, output_height, + output_width, input_grad_data); } }; template class Unpool2dMaxGradFunctor; diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 8bd596dbb0..89c48e071c 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -94,12 +94,11 @@ class UnpoolOp : public framework::OperatorWithKernel { "Output(Out) of UnpoolOp should not be null."); auto in_x_dims = ctx->GetInputDim("X"); auto in_y_dims = ctx->GetInputDim("Indices"); - std::string unpooling_type = ctx->Attrs() - .Get("unpooling_type"); + std::string unpooling_type = + ctx->Attrs().Get("unpooling_type"); std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); - std::vector paddings = - ctx->Attrs().Get>("paddings"); + std::vector paddings = ctx->Attrs().Get>("paddings"); PADDLE_ENFORCE(in_x_dims.size() == 4, "Unpooling intput must be of 4-dimensional."); PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); @@ -142,4 +141,3 @@ REGISTER_OP_CPU_KERNEL(unpool, REGISTER_OP_CPU_KERNEL( unpool_grad, ops::UnpoolGradKernel, ops::UnpoolGradKernel); - From a5236265b752b9dfad32ae1188798b22eaba9a22 Mon Sep 17 00:00:00 2001 From: 
yangyaming Date: Wed, 29 Nov 2017 15:55:21 +0800 Subject: [PATCH 111/275] Refine doc for smooth l1 loss op. --- paddle/operators/smooth_l1_loss_op.cc | 62 ++++++++++++++++----------- 1 file changed, 36 insertions(+), 26 deletions(-) diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index ebf7b43700..50543fcc14 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same."); + PADDLE_ENFORCE_EQ(x_dims, y_dims); PADDLE_ENFORCE_GE(x_dims.size(), 2, - "The tensor rank of X must be at least 2."); + "The tensor rank of Input(X) should not be less than 2."); if (ctx->HasInput("InsideWeight")) { PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), "If weights are provided, must specify both " "inside and outside weights."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims, - "The shape of InsideWeight must be same as X."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims, - "The shape of OutsideWeight must be same as X."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims); } ctx->SetOutputDim("Diff", x_dims); @@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", 
- "The input tensor of smooth l1 loss op." - "The rank should be greater or equal to 2 with shape " - "[batch_size, value_dim1, value_dim2, ..., value_dimN]"); + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The input value of smooth l1 loss op with shape " + "[batch_size, dim1, ..., dimN]."); AddInput("Y", - "The target tensor of smooth l1 loss op " - "with the same shape as X."); + "(Tensor, default Tensor) A tensor with rank at least 2. " + "The target value of smooth l1 loss op with same shape as X."); AddInput("InsideWeight", - "Optional input tensor of smooth l1 loss op with the same shape " - "as X. If provided, the result of (X - Y) will be multiplied " + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the result of (X - Y) will be multiplied " "by this tensor element by element.") .AsDispensable(); AddInput("OutsideWeight", - "Optinal input of smooth l1 loss op with the same shape as X." - "If provided, the output smooth l1 loss will be multiplied by " - "this tensor element by element.") + "(Tensor, default Tensor) A tensor with rank at least 2. " + "This input is optional and should have same shape with X. " + "If provided, the out smooth l1 loss will be multiplied by this " + "tensor element by element.") .AsDispensable(); - AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).") + AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).") .AsIntermediate(); - AddOutput("Out", "Smooth l1 loss."); + AddOutput("Out", + "(Tensor, default Tensor) A tensor with rank be 2. " + "The output smooth l1 loss with shape [batch_size, 1]."); AddAttr("sigma", "Hyper parameter of smooth l1 loss op." "A float scalar with default value 3.0.") @@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Smooth L1 Loss Operator. 
-This operator computes the smooth l1 loss for input and target. -The operator takes the first dimension of input as the batch size. +This operator computes the smooth l1 loss for X and Y. +The operator takes the first dimension of X and Y as batch size. For each instance, it computes the smooth l1 loss element by element first -and then sums all the losses. So the resulting output shape -is [batch_size, 1]. +and then sums all the losses. So the shape of Out is [batch_size, 1]. The equation is: -loss = $$0.5 * (\sigma * (x-y))^2$$ if $$|x - y| < 1 /({\sigma}^2)$$ - $$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise +$$ +Out_{\sigma}(X, Y)_i = \begin{cases} +0.5 * (\sigma * (X_i - Y_i)) ^ 2 +\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\ +\frac{|X_i - Y_i| - 0.5}{{\sigma}^2}, +\quad otherwise +\end{cases} +$$ + +In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith +element of Out, X and Y. )DOC"); } From 1b6dcc2fe839a190a070ca2fd469b540c00ca1ae Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 29 Nov 2017 16:51:05 +0800 Subject: [PATCH 112/275] Feature/param attr (#5996) * Make param_attr as a strong typed class Fix #5819 --- python/paddle/v2/fluid/__init__.py | 3 +- python/paddle/v2/fluid/layer_helper.py | 71 +++++------ python/paddle/v2/fluid/layers.py | 114 ++++-------------- python/paddle/v2/fluid/param_attr.py | 61 ++++++++++ .../tests/book/test_label_semantic_roles.py | 10 +- .../tests/book/test_recognize_digits_mlp.py | 11 +- .../tests/book/test_recommender_system.py | 10 +- .../v2/fluid/tests/book/test_word2vec.py | 8 +- python/paddle/v2/fluid/tests/test_layers.py | 8 +- .../v2/fluid/tests/test_recurrent_op.py | 4 +- 10 files changed, 141 insertions(+), 159 deletions(-) create mode 100644 python/paddle/v2/fluid/param_attr.py diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 9677c9568c..c033b27bea 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ 
-13,13 +13,14 @@ import nets import optimizer import backward import regularizer +from param_attr import ParamAttr from core import LoDTensor, CPUPlace, GPUPlace Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', - 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor' + 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr' ] diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 7762b0d88f..5b384e5cf5 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -1,8 +1,10 @@ import copy import itertools -from framework import Variable, default_main_program, default_startup_program, unique_name, dtype_is_floating +from framework import Variable, default_main_program, default_startup_program, \ + unique_name, dtype_is_floating from paddle.v2.fluid.initializer import Constant, Xavier +from param_attr import ParamAttr class LayerHelper(object): @@ -59,31 +61,15 @@ class LayerHelper(object): @property def param_attr(self): - default = {'name': None} - actual = self.kwargs.get('param_attr', None) - if actual is None: - actual = default - for default_field in default.keys(): - if default_field not in actual: - actual[default_field] = default[default_field] - return actual + return ParamAttr.to_attr(self.kwargs.get('param_attr', None)) @property def bias_attr(self): - default = {'name': None} - bias_attr = self.kwargs.get('bias_attr', None) - if bias_attr is None: - bias_attr = default - - if isinstance(bias_attr, dict): - for default_field in default.keys(): - if default_field not in bias_attr: - bias_attr[default_field] = default[default_field] - return bias_attr + return ParamAttr.to_attr(self.kwargs.get('bias_attr', None)) def multiple_param_attr(self, length): param_attr = self.param_attr - if isinstance(param_attr, dict): + if isinstance(param_attr, ParamAttr): param_attr = 
[param_attr] if len(param_attr) != 1 and len(param_attr) != length: @@ -111,23 +97,30 @@ class LayerHelper(object): raise ValueError("Data Type mismatch") return dtype - def create_parameter(self, attr, shape, dtype, suffix='w', - initializer=None): + def create_parameter(self, + attr, + shape, + dtype, + is_bias=False, + default_initializer=None): # Deepcopy the attr so that parameters can be shared in program - attr_copy = copy.deepcopy(attr) - if initializer is not None: - attr_copy['initializer'] = initializer + assert isinstance(attr, ParamAttr) + suffix = 'b' if is_bias else 'w' + + if default_initializer is None: + if is_bias: + attr.set_default_bias_initializer() + else: + attr.set_default_param_initializer() else: - attr_copy['initializer'] = self._get_default_initializer(dtype) - if attr_copy['name'] is None: - attr_copy['name'] = unique_name(".".join([self.name, suffix])) + attr.set_default_initializer(default_initializer) + if attr.name is None: + attr.name = unique_name(".".join([self.name, suffix])) + self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr_copy) + dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True)) return self.main_program.global_block().create_parameter( - name=attr_copy['name'], - dtype=dtype, - shape=shape, - trainable=attr_copy.get('trainable', True)) + dtype=dtype, shape=shape, **attr.to_kwargs()) def create_tmp_variable(self, dtype): return self.main_program.current_block().create_var( @@ -152,11 +145,7 @@ class LayerHelper(object): persistable=True, initializer=initializer) - def append_bias_op(self, - input_var, - bias_initializer, - dim_start=1, - dim_end=None): + def append_bias_op(self, input_var, dim_start=1, dim_end=None): """ Append bias operator and return its output. 
If the user does not set bias_attr, append_bias_op will return input_var @@ -176,11 +165,7 @@ class LayerHelper(object): return input_var b = self.create_parameter( - attr=bias_attr, - shape=size, - dtype=input_var.dtype, - suffix='b', - initializer=bias_initializer) + attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True) tmp = self.create_tmp_variable(dtype=input_var.dtype) self.append_op( type='elementwise_add', diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 6adfac3a32..9dcc11d216 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -5,6 +5,7 @@ from initializer import Constant, Normal, Xavier, Initializer from paddle.v2.fluid.layer_helper import LayerHelper, unique_name import re import cStringIO +from param_attr import ParamAttr __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', @@ -17,9 +18,7 @@ def fc(input, size, num_flatten_dims=1, param_attr=None, - param_initializer=None, bias_attr=None, - bias_initializer=None, act=None, name=None, main_program=None, @@ -54,23 +53,10 @@ def fc(input, to the LayerHelper constructor. 
""" - - def _get_default_param_initializer(): - return Xavier() - - def _get_default_bias_initializer(): - return Constant() - helper = LayerHelper('fc', **locals()) dtype = helper.input_dtype() - if param_initializer is None: - param_initializer = _get_default_param_initializer() - - if bias_initializer is None: - bias_initializer = _get_default_bias_initializer() - mul_results = [] for input_var, param_attr in helper.iter_inputs_and_params(): input_shape = input_var.shape @@ -78,10 +64,7 @@ def fc(input, reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) ] + [size] w = helper.create_parameter( - attr=param_attr, - initializer=param_initializer, - shape=param_shape, - dtype=dtype) + attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) tmp = helper.create_tmp_variable(dtype) helper.append_op( type="mul", @@ -102,7 +85,7 @@ def fc(input, helper.append_op( type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) # add bias - pre_activation = helper.append_bias_op(pre_bias, bias_initializer) + pre_activation = helper.append_bias_op(pre_bias) # add activation return helper.append_activation(pre_activation) @@ -110,7 +93,6 @@ def fc(input, def embedding(input, size, is_sparse=False, - param_initializer=None, param_attr=None, dtype='float32', main_program=None, @@ -119,6 +101,7 @@ def embedding(input, Embedding Layer. 
Args: + param_initializer: input: The input to the function size: The size of the layer is_sparse: A flag that decleares whether the input is sparse @@ -136,15 +119,9 @@ def embedding(input, """ - def _get_default_param_initializer(): - return Xavier() - helper = LayerHelper('embedding', **locals()) w = helper.create_parameter( - attr=helper.param_attr, - shape=size, - dtype=dtype, - initializer=param_initializer or _get_default_param_initializer()) + attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False) tmp = helper.create_tmp_variable(dtype) helper.append_op( type='lookup_table', @@ -176,7 +153,7 @@ def dynamic_lstm(input, if not use_peepholes: bias_size[1] = 4 * size bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=dtype, suffix='b') + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) hidden = helper.create_tmp_variable(dtype) cell = helper.create_tmp_variable(dtype) @@ -471,19 +448,14 @@ def sums(input, out=None, main_program=None, startup_program=None): def linear_chain_crf(input, label, param_attr=None, - param_initializer=None, main_program=None, startup_program=None): - def _get_default_param_initializer(): - return Xavier() - helper = LayerHelper('linear_chain_crf', **locals()) size = input.shape[1] transition = helper.create_parameter( attr=helper.param_attr, shape=[size + 2, size], - dtype=helper.input_dtype(), - initializer=param_initializer or _get_default_param_initializer()) + dtype=helper.input_dtype()) alpha = helper.create_tmp_variable(dtype=helper.input_dtype()) emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) @@ -646,9 +618,7 @@ def sequence_conv(input, filter_stride=1, padding=None, bias_attr=None, - bias_initializer=None, param_attr=None, - param_initializer=None, act=None, main_program=None, startup_program=None): @@ -658,30 +628,15 @@ def sequence_conv(input, in the input parameters to the 
function. """ - def _get_default_bias_initializer(): - return Constant() - - def _get_default_param_initializer(): - return Xavier() - # FIXME(dzh) : want to unify the argument of python layer # function. So we ignore some unecessary attributes. # such as, padding_trainable, context_start. helper = LayerHelper('sequence_conv', **locals()) dtype = helper.input_dtype() - - if param_initializer is None: - param_initializer = _get_default_param_initializer() - if bias_initializer is None: - bias_initializer = _get_default_bias_initializer() - filter_shape = [filter_size * input.shape[1], num_filters] filter = helper.create_parameter( - attr=helper.param_attr, - shape=filter_shape, - dtype=dtype, - initializer=param_initializer) + attr=helper.param_attr, shape=filter_shape, dtype=dtype) pre_bias = helper.create_tmp_variable(dtype) helper.append_op( @@ -696,7 +651,7 @@ def sequence_conv(input, 'contextStart': -int(filter_size / 2), 'contextLength': filter_size }) - pre_act = helper.append_bias_op(pre_bias, bias_initializer) + pre_act = helper.append_bias_op(pre_bias) return helper.append_activation(pre_act) @@ -707,9 +662,7 @@ def conv2d(input, padding=None, groups=None, param_attr=None, - param_initializer=None, bias_attr=None, - bias_initializer=None, act=None, name=None, main_program=None, @@ -722,13 +675,6 @@ def conv2d(input, conv-2d output, if mentioned in the input parameters. 
""" - def _get_default_bias_initializer(): - return Constant() - - def _get_default_param_initializer(filter_size, num_channels): - std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 - return Normal(0.0, std, 0) - helper = LayerHelper('conv2d', **locals()) dtype = helper.input_dtype() @@ -750,17 +696,16 @@ def conv2d(input, input_shape = input.shape filter_shape = [num_filters, num_filter_channels] + filter_size - if param_initializer is None: - param_initializer = _get_default_param_initializer(filter_size, - num_channels) - if bias_initializer is None: - bias_initializer = _get_default_bias_initializer() + def _get_default_param_initializer(): + std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 + return Normal(0.0, std, 0) filter = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, - initializer=param_initializer) + default_initializer=_get_default_param_initializer()) + pre_bias = helper.create_tmp_variable(dtype) helper.append_op( @@ -774,8 +719,7 @@ def conv2d(input, 'paddings': padding, 'groups': groups}) - pre_act = helper.append_bias_op( - pre_bias, bias_initializer, dim_start=1, dim_end=2) + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) return helper.append_activation(pre_act) @@ -876,12 +820,10 @@ def batch_norm(input, attr=helper.param_attr, shape=param_shape, dtype=dtype, - initializer=Constant(1.0)) + default_initializer=Constant(1.0)) + bias = helper.create_parameter( - attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - initializer=Constant(0.0)) + attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True) mean = helper.create_global_variable( dtype=input.dtype, shape=param_shape, persistable=True) @@ -1356,7 +1298,7 @@ def lod_rank_table(x, level=0, main_program=None): def max_sequence_len(rank_table, main_program=None): """ - This function creates an operator to calculate the length of + This function creates an operator to calculate the length of max seqence through 
input rank_table(should be a lod_rank_table) """ helper = LayerHelper("max_seqence_len", **locals()) @@ -1594,35 +1536,33 @@ def conv2d_transpose(input, padding=None, stride=None, param_attr=None, - param_initializer=None, main_program=None, startup_program=None): """ The transpose of conv2d layer. - + This layer is also known as deconvolution layer. - + Args: input(Variable): The input image with [N, C, H, W] format. num_filters(int): The number of filter. It is as same as the output image channel. output_size(int|tuple|None): The output image size. If output size is a - tuple, it must contain two integers, (image_H, image_W). This + tuple, it must contain two integers, (image_H, image_W). This parameter only works when filter_size is None. filter_size(int|tuple|None): The filter size. If filter_size is a tuple, it must contain two integers, (filter_size_H, filter_size_W). Otherwise, the filter will be a square. None if use output size to calculate filter_size padding(int|tuple): The padding size. If padding is a tuple, it must - contain two integers, (padding_H, padding_W). Otherwise, the + contain two integers, (padding_H, padding_W). Otherwise, the padding_H = padding_W = padding. stride(int|tuple): The stride size. If stride is a tuple, it must contain two integers, (stride_H, stride_W). Otherwise, the stride_H = stride_W = stride. param_attr: Parameter Attribute. - param_initializer(Initializer): Parameter Initializer. Default is Xavier main_program(Program): the main program - startup_program(Program): the startup program + startup_program(Program): the startup program Returns: Variable: Output image. 
@@ -1663,10 +1603,7 @@ def conv2d_transpose(input, filter_shape = [input_channel, num_filters] + filter_size img_filter = helper.create_parameter( - dtype=input.dtype, - shape=filter_shape, - attr=helper.param_attr, - initializer=param_initializer) + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) out = helper.create_tmp_variable(dtype=input.dtype) helper.append_op( @@ -1675,6 +1612,7 @@ def conv2d_transpose(input, 'Filter': [img_filter]}, outputs={'Output': out}, attrs=op_attr) + return out diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py new file mode 100644 index 0000000000..86088fdd7c --- /dev/null +++ b/python/paddle/v2/fluid/param_attr.py @@ -0,0 +1,61 @@ +from initializer import Initializer, Xavier, Constant +from regularizer import WeightDecayRegularizer + + +class ParamAttr(object): + def __init__(self, + name=None, + initializer=None, + learning_rate=1.0, + regularizer=None, + trainable=True): + self.name = name + self.initializer = initializer + self.learning_rate = learning_rate + self.regularizer = regularizer + self.trainable = trainable + + def set_default_initializer(self, initializer): + if initializer is None: + if self.initializer is None: + raise ValueError("ParamAttr.initializer is not set") + return + + if self.initializer is not None: + return + + self.initializer = initializer + + def set_default_param_initializer(self): + self.set_default_initializer(Xavier()) + + def set_default_bias_initializer(self): + self.set_default_initializer(Constant(0.0)) + + @staticmethod + def to_attr(arg): + if arg is None: + return ParamAttr() + elif isinstance(arg, ParamAttr): + return arg + elif isinstance(arg, str) or isinstance(arg, unicode): + return ParamAttr(name=arg) + elif isinstance(arg, Initializer): + return ParamAttr(initializer=arg) + elif isinstance(arg, WeightDecayRegularizer): + return ParamAttr(regularizer=arg) + elif isinstance(arg, bool): + return ParamAttr.to_attr(None) if arg else 
False + else: + raise TypeError("{0} cast to ParamAttr".format(type(arg))) + + def to_kwargs(self, with_initializer=False): + kwargs = { + 'name': self.name, + 'learning_rate': self.learning_rate, + 'regularizer': self.regularizer, + 'trainable': self.trainable + } + if with_initializer: + kwargs['initializer'] = self.initializer + return kwargs diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 93987a2b80..bcd6f4d6bc 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -44,7 +44,7 @@ def db_lstm(): size=[pred_len, word_dim], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'vemb'}) + param_attr='vemb') mark_embedding = fluid.layers.embedding( input=mark, @@ -57,8 +57,8 @@ def db_lstm(): fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr={'name': embedding_name, - 'trainable': False}) for x in word_input + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) @@ -125,8 +125,8 @@ def main(): crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr={"name": 'crfw', - "learning_rate": mix_hidden_lr}) + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(x=crf_cost) # TODO(qiao) # 1. 
add crf_decode_layer and evaluator diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py index 8ca45134dc..fa18965aac 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py @@ -6,24 +6,21 @@ import paddle.v2.fluid as fluid BATCH_SIZE = 128 image = fluid.layers.data(name='x', shape=[784], dtype='float32') -param_attr = { - 'name': None, - 'regularization': fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE) -} +regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE) hidden1 = fluid.layers.fc(input=image, size=128, act='relu', - param_attr=param_attr) + param_attr=regularizer) hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu', - param_attr=param_attr) + param_attr=regularizer) predict = fluid.layers.fc(input=hidden2, size=10, act='softmax', - param_attr=param_attr) + param_attr=regularizer) label = fluid.layers.data(name='y', shape=[1], dtype='int64') diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index f8dc151857..db91ca4f9c 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -24,7 +24,7 @@ def get_usr_combined_features(): input=uid, dtype='float32', size=[USR_DICT_SIZE, 32], - param_attr={'name': 'user_table'}, + param_attr='user_table', is_sparse=IS_SPARSE) usr_fc = layers.fc(input=usr_emb, size=32) @@ -36,7 +36,7 @@ def get_usr_combined_features(): usr_gender_emb = layers.embedding( input=usr_gender_id, size=[USR_GENDER_DICT_SIZE, 16], - param_attr={'name': 'gender_table'}, + param_attr='gender_table', is_sparse=IS_SPARSE) usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) @@ -48,7 +48,7 @@ def get_usr_combined_features(): input=usr_age_id, size=[USR_AGE_DICT_SIZE, 16], is_sparse=IS_SPARSE, - 
param_attr={'name': 'age_table'}) + param_attr='age_table') usr_age_fc = layers.fc(input=usr_age_emb, size=16) @@ -58,7 +58,7 @@ def get_usr_combined_features(): usr_job_emb = layers.embedding( input=usr_job_id, size=[USR_JOB_DICT_SIZE, 16], - param_attr={'name': 'job_table'}, + param_attr='job_table', is_sparse=IS_SPARSE) usr_job_fc = layers.fc(input=usr_job_emb, size=16) @@ -81,7 +81,7 @@ def get_mov_combined_features(): input=mov_id, dtype='float32', size=[MOV_DICT_SIZE, 32], - param_attr={'name': 'movie_table'}, + param_attr='movie_table', is_sparse=IS_SPARSE) mov_fc = layers.fc(input=mov_emb, size=32) diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index b0cd1a518c..92d3629d42 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -23,25 +23,25 @@ embed_first = fluid.layers.embedding( size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) + param_attr='shared_w') embed_second = fluid.layers.embedding( input=second_word, size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) + param_attr='shared_w') embed_third = fluid.layers.embedding( input=third_word, size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) + param_attr='shared_w') embed_forth = fluid.layers.embedding( input=forth_word, size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) + param_attr='shared_w') concat_embed = fluid.layers.concat( input=[embed_first, embed_second, embed_third, embed_forth], axis=1) diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 62b2a0f9a1..b6906be60b 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -132,26 +132,26 @@ class 
TestBook(unittest.TestCase): input=first_word, size=[dict_size, embed_size], dtype='float32', - param_attr={'name': 'shared_w'}, + param_attr='shared_w', main_program=program) embed_second = layers.embedding( input=second_word, size=[dict_size, embed_size], dtype='float32', - param_attr={'name': 'shared_w'}, + param_attr='shared_w', main_program=program) embed_third = layers.embedding( input=third_word, size=[dict_size, embed_size], dtype='float32', - param_attr={'name': 'shared_w'}, + param_attr='shared_w', main_program=program) embed_forth = layers.embedding( input=forth_word, size=[dict_size, embed_size], dtype='float32', - param_attr={'name': 'shared_w'}, + param_attr='shared_w', main_program=program) concat_embed = layers.concat( diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py index 84548847f7..36e0c84c0b 100644 --- a/python/paddle/v2/fluid/tests/test_recurrent_op.py +++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py @@ -271,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1): temp_l = layers.fc(input=x_t, size=self.input_dim, - param_attr={'name': 'W'}, + param_attr='W', bias_attr=False, **self.p_info) temp_r = layers.fc(input=h_pre, size=self.input_dim, - param_attr={'name': 'U'}, + param_attr='U', bias_attr=False, **self.p_info) From 8a5a8637f9342f996ee0b92ff55cbe82ecced6e5 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Wed, 29 Nov 2017 18:35:49 +0800 Subject: [PATCH 113/275] fix bug in trainer/tests/CMakeLists.txt --- paddle/trainer/tests/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt index 9d33e20656..bd518d8598 100644 --- a/paddle/trainer/tests/CMakeLists.txt +++ b/paddle/trainer/tests/CMakeLists.txt @@ -20,13 +20,13 @@ if(WITH_PYTHON) add_unittest_without_exec(test_TrainerOnePass test_TrainerOnePass.cpp) add_test(NAME test_TrainerOnePass - COMMAND ${PYTHON_PATH} 
${CMAKE_CURRENT_BINARY_DIR}/${TARGET} - ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass + COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port + ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) endif() #################### test_config_parser ######################### add_test(NAME test_config_parser - COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} - ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py + COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} + ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) From ff8a6778483dcaff32e5e0acc056cf45d12148ff Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 29 Nov 2017 13:42:42 +0000 Subject: [PATCH 114/275] Revise comments in rank_loss_op --- paddle/operators/rank_loss_op.cc | 31 ++++++++++++++++++++----------- paddle/operators/rank_loss_op.cu | 2 +- paddle/operators/rank_loss_op.h | 2 +- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 061e82412e..87774a56f3 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel { auto right_dims = ctx->GetInputDim("Right"); PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), - "All inputs must have the same size"); - PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1), - "All inputs must be row vector with size batch_size x 1."); + "All inputs must have the same size."); + PADDLE_ENFORCE( + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensors with shape [batch_size x 1]."); ctx->SetOutputDim("Out", label_dims); } }; @@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Label", - "The label indicating A ranked higher than B or not, row vector."); - AddInput("Left", "The output of RankNet for doc A, vector."); - AddInput("Right", "The output of RankNet for doc B, vetor."); - AddOutput("Out", "The output loss of RankLoss operator, vector."); + "(2-D Tensor with shape [batch_size x 1]) " + "The label indicating A ranked higher than B or not."); + AddInput("Left", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc A."); + AddInput("Right", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc B."); + AddOutput("Out", + "(2-D Tensor with shape [batch_size x 1]) " + "The output loss of RankLoss operator."); AddComment(R"DOC( RankLoss Operator. @@ -65,8 +73,9 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of the input pair. 
The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label -(P_{i,j}), which represent the output of RankNet for the two docs and the label, -respectively, and yields the rank loss C_{i,j} using the following equation: +(P_{i,j}), which represent the output score of RankNet for the two docs and +the label respectively, and yields the rank loss C_{i,j} using the following +equation: \f$$ C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ @@ -74,7 +83,7 @@ respectively, and yields the rank loss C_{i,j} using the following equation: \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} \f$$ -The operator can take inputs of one sample or in batch. +The operator can take batch inputs with size batch_size (batch_size >= 1). )DOC"); } diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu index 779588ff36..5382e3a629 100644 --- a/paddle/operators/rank_loss_op.cu +++ b/paddle/operators/rank_loss_op.cu @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h index f184d6efcb..703c77a0b2 100644 --- a/paddle/operators/rank_loss_op.h +++ b/paddle/operators/rank_loss_op.h @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, From 4d1ee0ff126de91d7705f5587400466926ba5907 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Wed, 29 Nov 2017 13:56:27 +0000 Subject: [PATCH 115/275] Amend license and comments in reshape_op --- paddle/operators/reshape_op.cc | 7 +++---- paddle/operators/{reshape_op.cu.cc => reshape_op.cu} | 2 +- paddle/operators/reshape_op.h | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) rename paddle/operators/{reshape_op.cu.cc => reshape_op.cu} (94%) diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index ba774ec216..39bf2118d6 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -1,11 +1,10 @@ - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel { // TODO(qiao) change batch_size for (size_t i = 1; i < shape.size(); ++i) { PADDLE_ENFORCE(shape[i] > 0, - "Each dimension of shape " - "must be positiv except the first."); + "Each dimension of Attr(shape) " + "must be positive except the first one."); } if (shape[0] < 0) { shape[0] = x_dims[0]; diff --git a/paddle/operators/reshape_op.cu.cc b/paddle/operators/reshape_op.cu similarity index 94% rename from paddle/operators/reshape_op.cu.cc rename to paddle/operators/reshape_op.cu index 23dbe089d3..dca6c15007 100644 --- a/paddle/operators/reshape_op.cu.cc +++ b/paddle/operators/reshape_op.cu @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index 0e98c8b4f4..73fd1da642 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. 
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, From 7300655ffd5deb47e24e493524534c94570ed48b Mon Sep 17 00:00:00 2001 From: "Yang Yang(Tony)" Date: Wed, 29 Nov 2017 10:51:59 -0800 Subject: [PATCH 116/275] Update cpu_profiling.md (#6012) --- doc/howto/optimization/cpu_profiling.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md index 32d89a7c18..b3330b0b59 100644 --- a/doc/howto/optimization/cpu_profiling.md +++ b/doc/howto/optimization/cpu_profiling.md @@ -71,7 +71,7 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ``` -可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python与C++混合代码的性能分析`来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 +可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 ```text Called By: @@ -121,7 +121,7 @@ python -m yep -v main.py 1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 -3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟如果单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 +3. 
运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 ### 查看性能分析文件 From 35572355c2261c493aa782ba1255971f4dfa385e Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Wed, 29 Nov 2017 11:13:39 -0800 Subject: [PATCH 117/275] Edit float16 doc (#5851) * Add survey of support of half in different CUDA versions * small fix --- doc/design/float16.md | 45 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/doc/design/float16.md b/doc/design/float16.md index 078801ba2e..1ea95ed6b5 100644 --- a/doc/design/float16.md +++ b/doc/design/float16.md @@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). +### CUDA version issue +There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. +CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows: +``` +typedef struct __align__(2) { + unsigned short x; +} __half; + +typedef __half half; +``` +This struct does not define any overloaded arithmetic operators. 
So you have to directly use `__hadd` instead of `+` to correctly add two half types: +``` +__global__ void Add() { + half a, b, c; + c = __hadd(a, b); // correct + c = a + b; // compiler error: no operator "+" matches these operands +} +``` +CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp). + +Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows: +``` +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; +public: + // constructors and conversion operators from/to + // __half_raw and other built-in data types +} + +typedef __half half; + +__device__ __forceinline__ +__half operator+(const __half &lh, const __half &rh) { + return __hadd(lh, rh); +} + +// Other overloaded operators +``` +This new design makes `c = a + b` work correctly for CUDA half data type. ## Implementation The float16 class holds a 16-bit `uint16_t` data internally. 
From 36df67b17c0057725661f11065c87509a3cc898f Mon Sep 17 00:00:00 2001 From: Kexin Zhao Date: Wed, 29 Nov 2017 16:30:55 -0800 Subject: [PATCH 118/275] small fix --- paddle/math/float16.h | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index 778b48bce8..f805cad08b 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -735,6 +735,5 @@ HOSTDEVICE inline bool operator>(const float16& a, const float16& b) { HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) { return float(a) >= float(b); } - #endif } // namespace paddle From 00eceea06e0b7e7771c027bac190078f6ed4e77f Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Thu, 30 Nov 2017 10:32:03 +0800 Subject: [PATCH 119/275] Fix the problem that building for Android fails with WITH_TESTING=ON. (#6051) --- paddle/gserver/tests/CMakeLists.txt | 51 ++++++++++++++--------------- 1 file changed, 25 insertions(+), 26 deletions(-) diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index c295ea19c9..24e6cae8e6 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -62,11 +62,11 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) endif() if(NOT MOBILE_INFERENCE) -################## test_Evaluator ####################### + ################## test_Evaluator ####################### add_unittest(test_Evaluator test_Evaluator.cpp) -############### test_RecurrentGradientMachine ############### + ############### test_RecurrentGradientMachine ############### # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine # I will fix it. 
add_unittest_without_exec(test_RecurrentGradientMachine @@ -77,7 +77,7 @@ if(NOT MOBILE_INFERENCE) ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) -############### test_NetworkCompare ############### + ############### test_NetworkCompare ############### add_unittest_without_exec(test_NetworkCompare test_NetworkCompare.cpp) if(WITH_GPU) @@ -89,34 +89,33 @@ if(NOT MOBILE_INFERENCE) COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() -endif() + ################# test_CompareSparse ################## + add_unittest_without_exec(test_CompareSparse + test_CompareSparse.cpp) + if(NOT ON_TRAVIS) + add_test(NAME test_CompareSparse + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ./.set_port.sh -p port -n 6 + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) + endif() + + ################ test_CompareTwoNets ###################### + add_unittest_without_exec(test_CompareTwoNets + test_CompareTwoNets.cpp) + add_test(NAME test_CompareTwoNets + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +endif() +################ test_PyDataProvider2 ###################### add_unittest_without_exec(test_PyDataProvider2 test_PyDataProvider2.cpp) - add_test(NAME test_PyDataProvider2 COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle ) - -################# test_CompareSparse ################## -add_unittest_without_exec(test_CompareSparse - 
test_CompareSparse.cpp) -if(NOT ON_TRAVIS) - add_test(NAME test_CompareSparse - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ./.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) -endif() - -################ test_CompareTwoNets ###################### -add_unittest_without_exec(test_CompareTwoNets - test_CompareTwoNets.cpp) -add_test(NAME test_CompareTwoNets - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) From da62d6cc24e22b499204b415f8ab7d4ca96c71d2 Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Thu, 30 Nov 2017 02:54:37 +0000 Subject: [PATCH 120/275] fix the doc display problem in rank_loss_op --- paddle/operators/rank_loss_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 87774a56f3..912f88f455 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -77,11 +77,11 @@ The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label the label respectively, and yields the rank loss C_{i,j} using the following equation: -\f$$ - C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ +$$ + C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ o_{i,j} = o_i - o_j \\ \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} -\f$$ +$$ The operator can take batch inputs with size batch_size (batch_size >= 1). 
From e1b8c27acbba44a52b10b8593e95eb1279f60bf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=AD=A6=E6=AF=85?= Date: Thu, 30 Nov 2017 12:01:06 +0800 Subject: [PATCH 121/275] Add back print_operators_doc (#5970) * add back print_operators_doc * fix style check * fix style check --- paddle/operators/detail/send_recv.proto | 2 +- paddle/scripts/docker/build.sh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto index 962c7d5981..07ff9d2c62 100644 --- a/paddle/operators/detail/send_recv.proto +++ b/paddle/operators/detail/send_recv.proto @@ -32,4 +32,4 @@ message VariableMessage { bytes serialized = 2; } -message VoidMessage {} \ No newline at end of file +message VoidMessage {} diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index a2fdc5ce69..502637c881 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -183,6 +183,7 @@ EOF ${DOCKERFILE_GPU_ENV} ADD go/cmd/pserver/pserver /usr/bin/ ADD go/cmd/master/master /usr/bin/ + ADD paddle/pybind/print_operators_doc /usr/bin/ # default command shows the paddle version and exit CMD ["paddle", "version"] EOF From dc91c4e3a42b678ad14742af8845b94c4a0ac50d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 30 Nov 2017 12:13:12 +0800 Subject: [PATCH 122/275] Fix MacOS compile (#6062) --- cmake/external/grpc.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 219ea1b908..86122aec8c 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -24,9 +24,9 @@ SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." 
FORCE) SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) IF(APPLE) - SET(BUILD_CMD make -n | sed "s/-Werror//g" | sh) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) ELSE() - SET(BUILD_CMD make) + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) ENDIF() ExternalProject_Add( @@ -42,7 +42,7 @@ ExternalProject_Add( # Disable -Werror, otherwise the compile will fail in MacOS. # It seems that we cannot configure that by make command. # Just dry run make command and remove `-Werror`, then use a shell to run make commands - BUILD_COMMAND ${BUILD_CMD} HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin + BUILD_COMMAND ${BUILD_CMD} INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install ) From 82dd1653ae48a54a2ec8371f927812b351164820 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Thu, 30 Nov 2017 12:18:23 +0800 Subject: [PATCH 123/275] Fix python.v2.fluid arg parse (#6055) * fix python gflags init * format code --- python/paddle/v2/fluid/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index c033b27bea..dd25bc19ec 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -36,7 +36,8 @@ def __read_gflags_from_env__(): read_env_flags = ['use_pinned_memory'] if core.is_compile_gpu(): read_env_flags.append('fraction_of_gpu_memory_to_use') - core.init_gflags(sys.argv + ["--tryfromenv=" + ",".join(read_env_flags)]) + core.init_gflags([sys.argv[0]] + + ["--tryfromenv=" + ",".join(read_env_flags)]) __read_gflags_from_env__() From 35453df18f738c18a7c66d886296068d88dc1304 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 30 Nov 2017 13:41:28 +0800 Subject: [PATCH 124/275] Fix ShareLoD bug (#6084) Fix #6087 --- paddle/framework/op_desc.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 48cd131550..02a8253243 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, "The %d-th output of Output(%s) must be LoDTensor.", j, out); - in_var->SetLoDLevel(out_var->GetLodLevel()); + out_var->SetLoDLevel(in_var->GetLodLevel()); } bool IsRuntime() const override; From 5fc88244b5247c687694cc792eea0f20b8eebd49 Mon Sep 17 00:00:00 2001 From: Liu Yiqun Date: Thu, 30 Nov 2017 06:07:31 +0000 Subject: [PATCH 125/275] Fix the compiling error when seting WITH_C_API=ON and WITH_PYTHON=ON. --- CMakeLists.txt | 2 ++ paddle/pserver/CMakeLists.txt | 2 +- paddle/trainer/CMakeLists.txt | 6 ++---- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e76512166f..2d38f398ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,6 +83,8 @@ if(ANDROID OR IOS) "Disable RDMA when cross-compiling for Android and iOS" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when cross-compiling for Android and iOS" FORCE) + set(WITH_GOLANG OFF CACHE STRING + "Disable golang when cross-compiling for Android and iOS" FORCE) # Compile PaddlePaddle mobile inference library if (NOT WITH_C_API) diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt index ccfc0e7602..f75475a88f 100644 --- a/paddle/pserver/CMakeLists.txt +++ b/paddle/pserver/CMakeLists.txt @@ -49,7 +49,7 @@ if(WITH_TESTING) add_subdirectory(test) endif() -if(NOT WITH_C_API) +if(NOT MOBILE_INFERENCE) add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES}) link_paddle_exe(paddle_pserver_main) diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt index 3d471a0c01..72911695bd 100644 --- a/paddle/trainer/CMakeLists.txt +++ b/paddle/trainer/CMakeLists.txt @@ -54,7 +54,7 @@ if(WITH_TESTING) add_subdirectory(tests) endif() 
-if(NOT WITH_C_API) +if(NOT MOBILE_INFERENCE) add_paddle_exe(paddle_trainer TrainerMain.cpp) add_paddle_exe(paddle_merge_model MergeModel.cpp) @@ -74,7 +74,5 @@ endif() if(WITH_GOLANG) add_dependencies(paddle_trainer_lib paddle_pserver_cclient) target_link_libraries(paddle_trainer_lib paddle_pserver_cclient) - if(NOT WITH_C_API) - target_link_libraries(paddle_trainer paddle_pserver_cclient) - endif() + target_link_libraries(paddle_trainer paddle_pserver_cclient) endif(WITH_GOLANG) From ac596a3952a3f75cc12f1eefafb14a165a57ff95 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 30 Nov 2017 14:14:13 +0800 Subject: [PATCH 126/275] Feature/switch program (#5932) * Unify fluid submodules to fluid module Change books just use `import fluid`, not submodules * Remove g_main_program/g_startup_program Use default_main_program/default_startup_program instead * Typo * Add API for switch default program * Two functions: switch_main_program/switch_startup_program * A guard: program_guard. Users can use the `with` statement change default programs * Change unittests in `test_layers` * Fix CI * Fix CI * Fix CI --- python/paddle/v2/fluid/framework.py | 79 +++++- python/paddle/v2/fluid/tests/test_layers.py | 271 ++++++++------------ 2 files changed, 188 insertions(+), 162 deletions(-) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 1c42e4d44f..49c6d89834 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -3,10 +3,12 @@ import collections import numpy as np from . import core import proto.framework_pb2 as framework_pb2 +import contextlib __all__ = [ 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program', - 'default_main_program' + 'default_main_program', 'program_guard', 'switch_startup_program', + 'switch_main_program' ] @@ -659,8 +661,83 @@ _startup_program_ = Program() def default_startup_program(): + """ + Get default startup program. 
In startup program, Paddle will initialize + parameters, initialize nccl handle, etc. + + Returns: + Program: startup program + """ return _startup_program_ def default_main_program(): + """ + Get default main program. The main program is used for training or testing. + + Returns: + Program: main program + """ return _main_program_ + + +def switch_main_program(program): + """ + Switch the main program to a new program. + + Args: + program(Program): The new main program + + Returns: + Program: The previous main program + """ + global _main_program_ + prev_program = _main_program_ + _main_program_ = program + return prev_program + + +def switch_startup_program(program): + """ + Switch the startup program to a new program + Args: + program(Program): The new startup program + + Returns: + Program: The previous startup program + """ + global _startup_program_ + prev_program = _startup_program_ + _startup_program_ = program + return prev_program + + +@contextlib.contextmanager +def program_guard(main_program, startup_program=None): + """ + Switch program with `with` statement + + Examples: + >>> with program_guard(Program()): + >>> data = fluid.layers.data(...) + >>> hidden = fluid.layers.fc(...) + + Args: + main_program(Program): New main program inside `with` statement + startup_program(Program): New startup program inside `with` statement. + None means do not change startup program. 
+ + Returns: + None + """ + if not isinstance(main_program, Program): + raise TypeError("main_program should be Program") + main_program = switch_main_program(main_program) + if startup_program is not None: + if not isinstance(startup_program, Program): + raise TypeError("startup_program should be Program") + startup_program = switch_startup_program(startup_program) + yield + switch_main_program(main_program) + if startup_program is not None: + switch_startup_program(startup_program) diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index b6906be60b..33b0e54f42 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -1,192 +1,141 @@ +from __future__ import print_function import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.framework import Program +from paddle.v2.fluid.framework import Program, program_guard class TestBook(unittest.TestCase): def test_fit_a_line(self): program = Program() - x = layers.data( - name='x', shape=[13], dtype='float32', main_program=program) - y_predict = layers.fc(input=x, size=1, act=None, main_program=program) + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + y_predict = layers.fc(input=x, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + program.append_backward(avg_cost) - y = layers.data( - name='y', shape=[1], dtype='float32', main_program=program) - cost = layers.square_error_cost( - input=y_predict, label=y, main_program=program) - - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - program.append_backward(avg_cost) - - print str(program) + print(str(program)) def test_recognize_digits_mlp(self): program = 
Program() - - # Change g_program, so the rest layers use `g_program` - images = layers.data( - name='pixel', shape=[784], dtype='float32', main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - hidden1 = layers.fc(input=images, - size=128, - act='relu', - main_program=program) - hidden2 = layers.fc(input=hidden1, - size=64, - act='relu', - main_program=program) - predict = layers.fc(input=hidden2, - size=10, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + # Change g_program, so the rest layers use `g_program` + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden1 = layers.fc(input=images, size=128, act='relu') + hidden2 = layers.fc(input=hidden1, size=64, act='relu') + predict = layers.fc(input=hidden2, size=10, act='softmax') + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) def test_simple_conv2d(self): program = Program() - images = layers.data( - name='pixel', - shape=[3, 48, 48], - dtype='int32', - main_program=program) - layers.conv2d( - input=images, - num_filters=3, - filter_size=[4, 4], - main_program=program) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32') + layers.conv2d(input=images, num_filters=3, filter_size=[4, 4]) + + print(str(program)) def test_conv2d_transpose(self): program = Program() - kwargs = {'main_program': program} - img = layers.data( - name='pixel', shape=[3, 2, 2], dtype='float32', **kwargs) - layers.conv2d_transpose( - input=img, num_filters=10, 
output_size=28, **kwargs) - print str(program) + with program_guard(program): + img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32') + layers.conv2d_transpose(input=img, num_filters=10, output_size=28) + print(str(program)) def test_recognize_digits_conv(self): program = Program() - - images = layers.data( - name='pixel', - shape=[1, 28, 28], - dtype='float32', - main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - conv_pool_1 = nets.simple_img_conv_pool( - input=images, - filter_size=5, - num_filters=2, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - conv_pool_2 = nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=4, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - - predict = layers.fc(input=conv_pool_2, - size=10, - act="softmax", - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - - program.append_backward(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + conv_pool_1 = nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=2, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=4, + pool_size=2, + pool_stride=2, + act="relu") + + predict = layers.fc(input=conv_pool_2, size=10, act="softmax") + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + + program.append_backward(avg_cost) + + print(str(program)) def test_word_embedding(self): program = Program() - dict_size = 10000 - embed_size = 32 - first_word = layers.data( - name='firstw', shape=[1], dtype='int64', main_program=program) - 
second_word = layers.data( - name='secondw', shape=[1], dtype='int64', main_program=program) - third_word = layers.data( - name='thirdw', shape=[1], dtype='int64', main_program=program) - forth_word = layers.data( - name='forthw', shape=[1], dtype='int64', main_program=program) - next_word = layers.data( - name='nextw', shape=[1], dtype='int64', main_program=program) - - embed_first = layers.embedding( - input=first_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - embed_second = layers.embedding( - input=second_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - - embed_third = layers.embedding( - input=third_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - embed_forth = layers.embedding( - input=forth_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - - concat_embed = layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], - axis=1, - main_program=program) - - hidden1 = layers.fc(input=concat_embed, - size=256, - act='sigmoid', - main_program=program) - predict_word = layers.fc(input=hidden1, - size=dict_size, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict_word, label=next_word, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + dict_size = 10000 + embed_size = 32 + first_word = layers.data(name='firstw', shape=[1], dtype='int64') + second_word = layers.data(name='secondw', shape=[1], dtype='int64') + third_word = layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = layers.data(name='forthw', shape=[1], dtype='int64') + next_word = layers.data(name='nextw', shape=[1], dtype='int64') + + embed_first = layers.embedding( + 
input=first_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_second = layers.embedding( + input=second_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + embed_third = layers.embedding( + input=third_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_forth = layers.embedding( + input=forth_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + concat_embed = layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1) + + hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid') + predict_word = layers.fc(input=hidden1, + size=dict_size, + act='softmax') + cost = layers.cross_entropy(input=predict_word, label=next_word) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) def test_linear_chain_crf(self): program = Program() - - # Change g_program, so the rest layers use `g_program` - images = layers.data( - name='pixel', shape=[784], dtype='float32', main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - hidden = layers.fc(input=images, size=128, main_program=program) - crf = layers.linear_chain_crf( - input=hidden, label=label, main_program=program) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden = layers.fc(input=images, size=128) + crf = layers.linear_chain_crf(input=hidden, label=label) + self.assertNotEqual(crf, None) + + print(str(program)) if __name__ == '__main__': From 849bf9d0d0ebf7ab6509a588b8e1b28e9f4d3d67 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 30 Nov 2017 14:17:25 +0800 Subject: [PATCH 127/275] separate mkldnn benchmark as train and infer --- benchmark/paddle/image/run_mkldnn.sh | 107 
--------------------- benchmark/paddle/image/run_mkldnn_infer.sh | 68 +++++++++++++ benchmark/paddle/image/run_mkldnn_train.sh | 47 +++++++++ 3 files changed, 115 insertions(+), 107 deletions(-) delete mode 100755 benchmark/paddle/image/run_mkldnn.sh create mode 100755 benchmark/paddle/image/run_mkldnn_infer.sh create mode 100755 benchmark/paddle/image/run_mkldnn_train.sh diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn.sh deleted file mode 100755 index c78079fa45..0000000000 --- a/benchmark/paddle/image/run_mkldnn.sh +++ /dev/null @@ -1,107 +0,0 @@ -set -e - -function train() { - unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY - topology=$1 - layer_num=$2 - bs=$3 - use_mkldnn=$4 - if [ $4 == "True" ]; then - thread=1 - log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log" - elif [ $4 == "False" ]; then - thread=`nproc` - # each trainer_count use only 1 core to avoid conflict - log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log" - else - echo "Wrong input $4, use True or False." - exit 0 - fi - args="batch_size=${bs},layer_num=${layer_num}" - config="${topology}.py" - paddle train --job=time \ - --config=$config \ - --use_mkldnn=$use_mkldnn \ - --use_gpu=False \ - --trainer_count=$thread \ - --log_period=10 \ - --test_period=100 \ - --config_args=$args \ - 2>&1 | tee ${log} -} - -function test() { - unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY - topology=$1 - layer_num=$2 - bs=$3 - use_mkldnn=$4 - if [ $4 == "True" ]; then - thread=1 - log="logs/test-${topology}-${layer_num}-mkldnn-${bs}.log" - elif [ $4 == "False" ]; then - thread=`nproc` - if [ $thread -gt $bs ]; then - thread=$bs - fi - log="logs/test-${topology}-${layer_num}-${thread}mklml-${bs}.log" - else - echo "Wrong input $4, use True or False." - exit 0 - fi - - models_in="models/${topology}-${layer_num}/pass-00000/" - if [ ! 
-d $models_in ]; then - echo "Training model ${topology}_${layer_num}" - paddle train --job=train \ - --config="${topology}.py" \ - --use_mkldnn=True \ - --use_gpu=False \ - --trainer_count=1 \ - --num_passes=1 \ - --save_dir="models/${topology}-${layer_num}" \ - --config_args="batch_size=128,layer_num=${layer_num}" \ - > /dev/null 2>&1 - echo "Done" - fi - paddle train --job=test \ - --config="${topology}.py" \ - --use_mkldnn=$use_mkldnn \ - --use_gpu=False \ - --trainer_count=$thread \ - --log_period=10 \ - --config_args="batch_size=${bs},layer_num=${layer_num},is_test=True" \ - --init_model_path=$models_in \ - 2>&1 | tee ${log} -} - -if [ ! -f "train.list" ]; then - echo " " > train.list -fi -if [ ! -f "test.list" ]; then - echo " " > test.list -fi -if [ ! -d "logs" ]; then - mkdir logs -fi -if [ ! -d "models" ]; then - mkdir -p models -fi - -# inference benchmark -for use_mkldnn in True False; do - for batchsize in 1 2 4 8 16; do - test googlenet v1 $batchsize $use_mkldnn - test resnet 50 $batchsize $use_mkldnn - test vgg 19 $batchsize $use_mkldnn - done -done - -# training benchmark -for use_mkldnn in True False; do - for batchsize in 64 128 256; do - train vgg 19 $batchsize $use_mkldnn - train resnet 50 $batchsize $use_mkldnn - train googlenet v1 $batchsize $use_mkldnn - done -done diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkldnn_infer.sh new file mode 100755 index 0000000000..3081d5e7b5 --- /dev/null +++ b/benchmark/paddle/image/run_mkldnn_infer.sh @@ -0,0 +1,68 @@ +set -e + +function infer() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + if [ $thread -gt $bs ]; then + thread=$bs + fi + log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or 
False." + exit 0 + fi + + models_in="models/${topology}-${layer_num}/pass-00000/" + if [ ! -d $models_in ]; then + echo "Training model ${topology}_${layer_num}" + paddle train --job=train \ + --config="${topology}.py" \ + --use_mkldnn=True \ + --use_gpu=False \ + --trainer_count=1 \ + --num_passes=1 \ + --save_dir="models/${topology}-${layer_num}" \ + --config_args="batch_size=128,layer_num=${layer_num}" \ + > /dev/null 2>&1 + echo "Done" + fi + paddle train --job=test \ + --config="${topology}.py" \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=32 \ + --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \ + --init_model_path=$models_in \ + 2>&1 | tee ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -f "test.list" ]; then + echo " " > test.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi +if [ ! -d "models" ]; then + mkdir -p models +fi + +# inference benchmark +for use_mkldnn in True False; do + for batchsize in 1 2 4 8 16; do + infer googlenet v1 $batchsize $use_mkldnn + infer resnet 50 $batchsize $use_mkldnn + infer vgg 19 $batchsize $use_mkldnn + done +done diff --git a/benchmark/paddle/image/run_mkldnn_train.sh b/benchmark/paddle/image/run_mkldnn_train.sh new file mode 100755 index 0000000000..320206239a --- /dev/null +++ b/benchmark/paddle/image/run_mkldnn_train.sh @@ -0,0 +1,47 @@ +set -e + +function train() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + thread=`nproc` + # each trainer_count use only 1 core to avoid conflict + log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log" + else + echo "Wrong input $4, use True or False." 
+ exit 0 + fi + args="batch_size=${bs},layer_num=${layer_num}" + config="${topology}.py" + paddle train --job=time \ + --config=$config \ + --use_mkldnn=$use_mkldnn \ + --use_gpu=False \ + --trainer_count=$thread \ + --log_period=10 \ + --test_period=100 \ + --config_args=$args \ + 2>&1 | tee ${log} +} + +if [ ! -f "train.list" ]; then + echo " " > train.list +fi +if [ ! -d "logs" ]; then + mkdir logs +fi + +# training benchmark +for use_mkldnn in True False; do + for batchsize in 64 128 256; do + train vgg 19 $batchsize $use_mkldnn + train resnet 50 $batchsize $use_mkldnn + train googlenet v1 $batchsize $use_mkldnn + done +done From a5aac614108c4b2b6d88d0c3446e4184911a319c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 30 Nov 2017 14:24:35 +0800 Subject: [PATCH 128/275] skip cost when inference --- benchmark/paddle/image/googlenet.py | 20 +++++++++++++++----- benchmark/paddle/image/provider.py | 14 ++++++++++---- benchmark/paddle/image/resnet.py | 27 +++++++++++++++++++-------- benchmark/paddle/image/vgg.py | 18 ++++++++++++++---- 4 files changed, 58 insertions(+), 21 deletions(-) diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index 5b1f0ca006..d3dc0506d5 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py @@ -6,8 +6,15 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) use_gpu = get_config_arg('use_gpu', bool, True) - -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +is_infer = get_config_arg("is_infer", bool, False) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} define_py_data_sources2( "train.list", "test.list", module="provider", obj="process", args=args) @@ -146,7 +153,6 @@ def inception(name, input, channels, \ return cat -lab = data_layer(name="label", size=1000) data = data_layer(name="input", size=3 * height * width) # stage 1 @@ 
-224,6 +230,10 @@ pool5 = img_pool_layer( dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4) out3 = fc_layer( name="output3", input=dropout, size=1000, act=SoftmaxActivation()) -loss3 = cross_entropy(name='loss3', input=out3, label=lab) -outputs(loss3) +if is_infer: + outputs(out3) +else: + lab = data_layer(name="label", size=num_class) + loss3 = cross_entropy(name='loss3', input=out3, label=lab) + outputs(loss3) diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 4703944c87..a3a6b6fc4d 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -13,8 +13,11 @@ def initHook(settings, height, width, color, num_class, **kwargs): settings.data_size = settings.height * settings.width * 3 else: settings.data_size = settings.height * settings.width - - settings.slots = [dense_vector(settings.data_size), integer_value(1)] + settings.is_infer = kwargs.get('is_infer', False) + if settings.is_infer: + settings.slots = [dense_vector(settings.data_size)] + else: + settings.slots = [dense_vector(settings.data_size), integer_value(1)] @provider( @@ -22,5 +25,8 @@ def initHook(settings, height, width, color, num_class, **kwargs): def process(settings, file_list): for i in xrange(1024): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() - lab = random.randint(0, settings.num_class - 1) - yield img.astype('float32'), int(lab) + if settings.is_infer: + yield img.astype('float32') + else: + lab = random.randint(0, settings.num_class - 1) + yield img.astype('float32'), int(lab) diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py index f8c1c2df88..163394e566 100644 --- a/benchmark/paddle/image/resnet.py +++ b/benchmark/paddle/image/resnet.py @@ -6,9 +6,15 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg("layer_num", int, 50) -is_test = get_config_arg("is_test", bool, False) - -args = 
{'height': height, 'width': width, 'color': True, 'num_class': num_class} +is_infer = get_config_arg("is_infer", bool, False) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} define_py_data_sources2( "train.list", "test.list", module="provider", obj="process", args=args) @@ -45,7 +51,10 @@ def conv_bn_layer(name, act=LinearActivation(), bias_attr=False) return batch_norm_layer( - name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test) + name=name + "_bn", + input=tmp, + act=active_type, + use_global_stats=is_infer) def bottleneck_block(name, input, num_filters1, num_filters2): @@ -207,7 +216,9 @@ elif layer_num == 152: else: print("Wrong layer number.") -lbl = data_layer(name="label", size=num_class) -loss = cross_entropy(name='loss', input=resnet, label=lbl) -inputs(img, lbl) -outputs(loss) +if is_infer: + outputs(resnet) +else: + lbl = data_layer(name="label", size=num_class) + loss = cross_entropy(name='loss', input=resnet, label=lbl) + outputs(loss) diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py index 97f4dbe0e1..2d8075bcf2 100644 --- a/benchmark/paddle/image/vgg.py +++ b/benchmark/paddle/image/vgg.py @@ -6,8 +6,15 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg('layer_num', int, 19) +is_infer = get_config_arg("is_infer", bool, False) -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} define_py_data_sources2( "train.list", "test.list", module="provider", obj="process", args=args) @@ -98,6 +105,9 @@ elif layer_num == 19: else: print("Wrong layer number.") -lab = data_layer('label', num_class) -loss = cross_entropy(input=vgg, label=lab) -outputs(loss) +if is_infer: + outputs(vgg) +else: + lab = data_layer('label', num_class) + loss = 
cross_entropy(input=vgg, label=lab) + outputs(loss) From aef639448c67999e3bfc094c6d39ca528fe193a4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 30 Nov 2017 14:33:43 +0800 Subject: [PATCH 129/275] skip train list when inference, skip test list when training --- benchmark/paddle/image/googlenet.py | 6 +++++- benchmark/paddle/image/resnet.py | 6 +++++- benchmark/paddle/image/vgg.py | 6 +++++- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index d3dc0506d5..7059c13bd2 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py @@ -16,7 +16,11 @@ args = { 'is_infer': is_infer } define_py_data_sources2( - "train.list", "test.list", module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py index 163394e566..4a14363ff1 100644 --- a/benchmark/paddle/image/resnet.py +++ b/benchmark/paddle/image/resnet.py @@ -16,7 +16,11 @@ args = { 'is_infer': is_infer } define_py_data_sources2( - "train.list", "test.list", module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py index 2d8075bcf2..8d0a1e97a4 100644 --- a/benchmark/paddle/image/vgg.py +++ b/benchmark/paddle/image/vgg.py @@ -16,7 +16,11 @@ args = { 'is_infer': is_infer } define_py_data_sources2( - "train.list", "test.list", module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, From 
605b3e449911420e5a171085d457916d668268e1 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Wed, 29 Nov 2017 23:22:19 -0800 Subject: [PATCH 130/275] Translate the CPU profiling document (#6073) * Translate the CPU profiling document * Paragraphing --- doc/howto/optimization/cpu_profiling.md | 166 +++++++++++++-------- doc/howto/optimization/cpu_profiling_cn.md | 155 +++++++++++++++++++ 2 files changed, 255 insertions(+), 66 deletions(-) create mode 100644 doc/howto/optimization/cpu_profiling_cn.md diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md index b3330b0b59..e1d91c668e 100644 --- a/doc/howto/optimization/cpu_profiling.md +++ b/doc/howto/optimization/cpu_profiling.md @@ -1,42 +1,52 @@ -此教程会介绍如何使用Python的cProfile包,与Python库yep,google perftools来运行性能分析(Profiling)与调优。 +This tutorial introduces techniques we used to profile and tune the +CPU performance of PaddlePaddle. We will use Python packages +`cProfile` and `yep`, and Google `perftools`. -运行性能分析可以让开发人员科学的,有条不紊的对程序进行性能优化。性能分析是性能调优的基础。因为在程序实际运行中,真正的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。 +Profiling is the process that reveals the performance bottlenecks, +which could be very different from what's in the developers' mind. +Performance tuning is to fix the bottlenecks. Performance optimization +repeats the steps of profiling and tuning alternatively. -性能优化的步骤,通常是循环重复若干次『性能分析 --> 寻找瓶颈 ---> 调优瓶颈 --> 性能分析确认调优效果』。其中性能分析是性能调优的至关重要的量化指标。 +PaddlePaddle users program AI by calling the Python API, which calls +into `libpaddle.so.` written in C++. In this tutorial, we focus on +the profiling and tuning of -Paddle提供了Python语言绑定。用户使用Python进行神经网络编程,训练,测试。Python解释器通过`pybind`和`swig`调用Paddle的动态链接库,进而调用Paddle C++部分的代码。所以Paddle的性能分析与调优分为两个部分: +1. the Python code and +1. the mixture of Python and C++ code. 
-* Python代码的性能分析 -* Python与C++混合代码的性能分析 +## Profiling the Python Code +### Generate the Performance Profiling File -## Python代码的性能分析 - -### 生成性能分析文件 - -Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: +We can use Python standard +package, [`cProfile`](https://docs.python.org/2/library/profile.html), +to generate Python profiling file. For example: ```bash python -m cProfile -o profile.out main.py ``` -其中`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印一些统计信息到`stdout`。这不方便我们进行后期处理(进行`sort`, `split`, `cut`等等)。 - -### 查看性能分析文件 +where `main.py` is the program we are going to profile, `-o` specifies +the output file. Without `-o`, `cProfile` would outputs to standard +output. -当main.py运行完毕后,性能分析结果文件`profile.out`就生成出来了。我们可以使用[cprofilev](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来。 +### Look into the Profiling File -使用`pip install cprofilev`安装`cprofilev`工具。安装完成后,使用如下命令开启HTTP服务 +`cProfile` generates `profile.out` after `main.py` completes. We can +use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into +the details: ```bash cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ``` -其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 +where `-a` specifies the HTTP IP, `-p` specifies the port, `-f` +specifies the profiling file, and `main.py` is the source file. 
-访问对应网址,即可显示性能分析的结果。性能分析结果格式如下: +Open the Web browser and points to the local IP and the specifies +port, we will see the output like the following: -```text +``` ncalls tottime percall cumtime percall filename:lineno(function) 1 0.284 0.284 29.514 29.514 main.py:1() 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) @@ -44,23 +54,23 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() ``` -每一列的含义是: +where each line corresponds to Python function, and the meaning of +each column is as follows: -| 列名 | 含义 | +| column | meaning | | --- | --- | -| ncalls | 函数的调用次数 | -| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | -| percall | tottime的每次调用平均时间 | -| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | -| percall | cumtime的每次调用平均时间 | -| filename:lineno(function) | 文件名, 行号,函数名 | +| ncalls | the number of calls into a function | +| tottime | the total execution time of the function, not including the + execution time of other functions called by the function | +| percall | tottime divided by ncalls | +| cumtime | the total execution time of the function, including the execution time of other functions being called | +| percall | cumtime divided by ncalls | +| filename:lineno(function) | where the function is defined | +### Identify Performance Bottlenecks -### 寻找性能瓶颈 - -通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 - -将性能分析结果按照tottime排序,效果如下: +Usually, `tottime` and the related `percall` time is what we want to +focus on. 
We can sort above profiling file by tottime: ```text 4696 12.040 0.003 12.040 0.003 {built-in method run} @@ -68,12 +78,15 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() - ``` -可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 +We can see that the most time-consuming function is the `built-in +method run`, which is a C++ function in `libpaddle.so`. We will +explain how to profile C++ code in the next section. At the right +moment, let's look into the third function `sync_with_cpp`, which is a +Python function. We can click it to understand more about it: -```text +``` Called By: Ordered by: internal time @@ -92,72 +105,93 @@ Called: List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> ``` -通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 +The lists of the callers of `sync_with_cpp` might help us understand +how to improve the function definition. +## Profiling Python and C++ Code +### Generate the Profiling File -## Python与C++混合代码的性能分析 +To profile a mixture of Python and C++ code, we can use a Python +package, `yep`, that can work with Google's `perftools`, which is a +commonly-used profiler for C/C++ code. 
-### 生成性能分析文件 - -C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析 - -使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为 +In Ubuntu systems, we can install `yep` and `perftools` by running the +following commands: ```bash +apt update apt install libgoogle-perftools-dev pip install yep ``` -安装完毕后,我们可以通过 +Then we can run the following command ```bash python -m yep -v main.py ``` -生成性能分析文件。生成的性能分析文件为`main.py.prof`。 +to generate the profiling file. The default filename is +`main.py.prof`. + +Please be aware of the `-v` command line option, which prints the +analysis results after generating the profiling file. By taking a +glance at the print result, we'd know that if we stripped debug +information from `libpaddle.so` at build time. The following hints +help make sure that the analysis results are readable: -命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: +1. Use GCC command line option `-g` when building `libpaddle.so` so to + include the debug information. The standard building system of + PaddlePaddle is CMake, so you might want to set + `CMAKE_BUILD_TYPE=RelWithDebInfo`. -1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 -2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 -3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 +1. Use GCC command line option `-O2` or `-O3` to generate optimized + binary code. It doesn't make sense to profile `libpaddle.so` + without optimization, because it would anyway run slowly. -### 查看性能分析文件 +1. Profiling the single-threaded binary file before the + multi-threading version, because the latter often generates tangled + profiling analysis result. 
You might want to set environment + variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically + starting multiple threads. -在运行完性能分析后,会生成性能分析结果文件。我们可以使用[pprof](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 +### Look into the Profiling File -安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: +The tool we used to look into the profiling file generated by +`perftools` is [`pprof`](https://github.com/google/pprof), which +provides a Web-based GUI like `cprofilev`. + +We can rely on the standard Go toolchain to retrieve the source code +of `pprof` and build it: ```bash go get github.com/google/pprof ``` -进而我们可以使用如下命令开启一个HTTP服务: +Then we can use it to profile `main.py.prof` generated in the previous +section: ```bash pprof -http=0.0.0.0:3213 `which python` ./main.py.prof ``` -这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 - -访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: +Where `-http` specifies the IP and port of the HTTP service. +Directing our Web browser to the service, we would see something like +the following: ![result](./pprof_1.png) +### Identifying the Performance Bottlenecks -### 寻找性能瓶颈 - -与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 - -例如下图中, +Similar to how we work with `cprofilev`, we'd focus on `tottime` and +`cumtime`. ![kernel_perf](./pprof_2.png) -在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 - -在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 - -## 总结 +We can see that the execution time of multiplication and the computing +of the gradient of multiplication takes 2% to 4% of the total running +time, and `MomentumOp` takes about 17%. Obviously, we'd want to +optimize `MomentumOp`. -至此,两种性能分析的方式都介绍完毕了。希望通过这两种性能分析的方式,Paddle的开发人员和使用人员可以有次序的,科学的发现和解决性能问题。 +`pprof` would mark performance critical parts of the program in +red. It's a good idea to follow the hint. 
diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md new file mode 100644 index 0000000000..14eba0e2f3 --- /dev/null +++ b/doc/howto/optimization/cpu_profiling_cn.md @@ -0,0 +1,155 @@ +此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 + +Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 + +PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分: + +* Python 代码的性能分析 +* Python 与 C++ 混合代码的性能分析 + + +## Python代码的性能分析 + +### 生成性能分析文件 + +Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: + +```bash +python -m cProfile -o profile.out main.py +``` + +其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。 + +### 查看性能分析文件 + +`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 + +用Web浏览器访问对应网址,即可显示性能分析的结果: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +每一列的含义是: + +| 列名 | 含义 | +| --- | --- | +| ncalls | 函数的调用次数 | +| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | +| percall | tottime的每次调用平均时间 | +| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | +| percall | cumtime的每次调用平均时间 | +| filename:lineno(function) | 文件名, 行号,函数名 | + + +### 寻找性能瓶颈 + 
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 + +将性能分析结果按照tottime排序,效果如下: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 + +```text +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 + + + +## Python与C++混合代码的性能分析 + +### 生成性能分析文件 + +C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析 + 
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为 + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +安装完毕后,我们可以通过 + +```bash +python -m yep -v main.py +``` + +生成性能分析文件。生成的性能分析文件为`main.py.prof`。 + +命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: + +1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 +2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 +3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 + +### 查看性能分析文件 + +在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 + +安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: + +```bash +go get github.com/google/pprof +``` + +进而我们可以使用如下命令开启一个HTTP服务: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 + +访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: + +![result](./pprof_1.png) + + +### 寻找性能瓶颈 + +与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 + +例如下图中, + +![kernel_perf](./pprof_2.png) + +在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 + +在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 From 1238706d724687bea415d053111adee6cd0aa90b Mon Sep 17 00:00:00 2001 From: QI JUN Date: Thu, 30 Nov 2017 16:33:09 +0800 Subject: [PATCH 131/275] Refine unittest with setting gflags (#5476) * add gflags for C++ unittest --- cmake/generic.cmake | 8 ++-- paddle/memory/memory.cc | 27 +++++++++++--- paddle/optimizer/parameter_optimizer_test.cc | 5 --- paddle/optimizer/serialization_test.cc | 5 --- paddle/testing/CMakeLists.txt | 2 + paddle/testing/paddle_gtest_main.cc | 39 ++++++++++++++++++++ 6 files changed, 66 insertions(+), 20 deletions(-) create mode 100644 
paddle/testing/paddle_gtest_main.cc diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c917ca0ff4..9cf256fb6d 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -227,8 +227,8 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction(cc_test) @@ -288,8 +288,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(nv_test) diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 5eb1c44eb6..95cfe2525e 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -81,18 +81,33 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { } template <> -void* Alloc(platform::GPUPlace place, size_t size) { - return GetGPUBuddyAllocator(place.device)->Alloc(size); +size_t Used(platform::GPUPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); } template <> -void Free(platform::GPUPlace place, void* 
p) { - GetGPUBuddyAllocator(place.device)->Free(p); +void* Alloc(platform::GPUPlace place, size_t size) { + auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + auto* ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + int cur_dev = platform::GetCurrentDeviceId(); + platform::SetDeviceId(place.device); + size_t avail, total; + platform::GpuMemoryUsage(avail, total); + LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " + << place.device << ", available " << avail << " bytes"; + LOG(WARNING) << "total " << total; + LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); + LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); + LOG(WARNING) << "GPU memory used: " << Used(place); + platform::SetDeviceId(cur_dev); + } + return ptr; } template <> -size_t Used(platform::GPUPlace place) { - return GetGPUBuddyAllocator(place.device)->Used(); +void Free(platform::GPUPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); } #endif diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/optimizer/parameter_optimizer_test.cc index f29e531712..83757a3917 100644 --- a/paddle/optimizer/parameter_optimizer_test.cc +++ b/paddle/optimizer/parameter_optimizer_test.cc @@ -127,8 +127,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); } TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); } TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/optimizer/serialization_test.cc b/paddle/optimizer/serialization_test.cc index 4c416f55ee..940e941e90 100644 --- a/paddle/optimizer/serialization_test.cc +++ b/paddle/optimizer/serialization_test.cc @@ -46,8 +46,3 @@ TEST(TensorToProto, Case2) { EXPECT_EQ(t1[i], t[i]); } } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/testing/CMakeLists.txt 
b/paddle/testing/CMakeLists.txt index 4245df5ab7..2275c950ba 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -5,4 +5,6 @@ if(WITH_TESTING) add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies}) add_library(paddle_test_util STATIC TestUtil.cpp) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) + add_library(paddle_gtest_main STATIC paddle_gtest_main.cc) + add_dependencies(paddle_gtest_main paddle_memory gtest gflags) endif() diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc new file mode 100644 index 0000000000..a491322b7e --- /dev/null +++ b/paddle/testing/paddle_gtest_main.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/memory/memory.h" + +int main(int argc, char** argv) { + std::vector new_argv; + std::string gflags_env; + new_argv.push_back(argv[0]); +#ifdef PADDLE_WITH_CUDA + new_argv.push_back( + strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); +#else + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); +#endif + int new_argc = static_cast(new_argv.size()); + char** new_argv_address = new_argv.data(); + google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); + testing::InitGoogleTest(&argc, argv); + paddle::memory::Used(paddle::platform::CPUPlace()); +#ifdef PADDLE_WITH_CUDA + paddle::memory::Used(paddle::platform::GPUPlace(0)); +#endif + return RUN_ALL_TESTS(); +} From fe6af6b6aca67d8c204dfb3496ecfe09df9b850c Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 30 Nov 2017 16:45:24 +0800 Subject: [PATCH 132/275] Enhance the AvgPooling to support optional exclude-mode --- paddle/cuda/include/stub/hl_cnn_stub.h | 6 ++-- paddle/cuda/src/hl_cuda_cnn.cu | 28 ++++++++++++------- paddle/gserver/layers/PoolLayer.cpp | 2 ++ paddle/gserver/layers/PoolLayer.h | 2 ++ paddle/gserver/layers/PoolProjection.cpp | 8 ++++-- paddle/gserver/layers/PoolProjection.h | 1 + paddle/gserver/tests/test_LayerGrad.cpp | 16 ++++++++++- paddle/math/Matrix.cpp | 24 ++++++++++------ paddle/math/Matrix.h | 19 +++++++++---- proto/ModelConfig.proto | 2 ++ python/paddle/trainer/config_parser.py | 9 ++++-- .../paddle/trainer_config_helpers/layers.py | 14 +++++++--- .../paddle/trainer_config_helpers/poolings.py | 13 ++++++++- 13 files changed, 107 insertions(+), 37 deletions(-) diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 968ed4840f..706cc59a8e 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -68,7 +68,8 @@ inline void hl_avgpool_forward(const int frameCnt, const int paddingH, 
const int paddingW, real* tgtData, - const int tgtStride) {} + const int tgtStride, + const bool excludeMode) {} inline void hl_avgpool_backward(const int frameCnt, const real* outGrad, @@ -86,7 +87,8 @@ inline void hl_avgpool_backward(const int frameCnt, real scaleA, real scaleB, real* backGrad, - const int outStride) {} + const int outStride, + const bool excludeMode) {} inline void hl_maxpool3D_forward(const int frameCnt, const real* inputData, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index 3699b1e8ae..2d1bc4f6d5 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -210,7 +210,8 @@ __global__ void KeAvgPoolForward(const int nthreads, const int padH, const int padW, real* tgtData, - const int tgtStride) { + const int tgtStride, + const bool excludeMode) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int pw = index % pooledW; @@ -224,7 +225,8 @@ __global__ void KeAvgPoolForward(const int nthreads, int wend = min(wstart + sizeX, width); hstart = max(hstart, 0); wstart = max(wstart, 0); - int pool_size = (hend - hstart) * (wend - wstart); + int poolSize = + excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; real aveval = 0; inputData += (frameNum * channels + c) * height * width; @@ -235,7 +237,7 @@ __global__ void KeAvgPoolForward(const int nthreads, } int tgtIndex = index % (pooledW * pooledH * channels) + frameNum * tgtStride; - tgtData[tgtIndex] = aveval / pool_size; + tgtData[tgtIndex] = aveval / poolSize; } } @@ -253,7 +255,8 @@ void hl_avgpool_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, - const int tgtStride) { + const int tgtStride, + const bool excludeMode) { int num_kernels = pooledH * pooledW * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; KeAvgPoolForward<<>>(num_kernels, @@ -270,7 +273,8 @@ void hl_avgpool_forward(const int frameCnt, paddingH, paddingW, tgtData, - tgtStride); + tgtStride, + excludeMode); CHECK_SYNC("hl_avgpool_forward failed"); } @@ -290,7 +294,8 @@ __global__ void KeAvgPoolBackward(const int nthreads, real scaleA, real scaleB, real* tgtGrad, - const int outStride) { + const int outStride, + const bool excludeMode) { int index = blockIdx.x * blockDim.x + threadIdx.x; if (index < nthreads) { int offsetW = index % width + padW; @@ -314,8 +319,9 @@ __global__ void KeAvgPoolBackward(const int nthreads, int wstart = pw * strideW - padW; int wend = min(wstart + sizeX, width); wstart = max(wstart, 0); - int poolsize = (hend - hstart) * (wend - wstart); - gradient += outGrad[ph * pooledW + pw] / poolsize; + int poolSize = + excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; + gradient += outGrad[ph * pooledW + pw] / poolSize; } } tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient; @@ -338,7 +344,8 @@ void hl_avgpool_backward(const int frameCnt, real scaleA, real scaleB, real* backGrad, - const int outStride) { + const int outStride, + const bool excludeMode) { int num_kernels = height * width * channels * frameCnt; int blocks = (num_kernels + 1024 - 1) / 1024; @@ -358,7 +365,8 @@ void hl_avgpool_backward(const int frameCnt, scaleA, scaleB, backGrad, - outStride); + outStride, + excludeMode); CHECK_SYNC("hl_avgpool_backward failed"); } diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp index 87613a96c5..fceb389d06 100644 --- a/paddle/gserver/layers/PoolLayer.cpp +++ b/paddle/gserver/layers/PoolLayer.cpp @@ -45,6 +45,8 @@ bool PoolLayer::init(const LayerMap& layerMap, strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding(); outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + + excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true; return true; } diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h index d43292ad2d..9df672a935 100644 --- a/paddle/gserver/layers/PoolLayer.h +++ b/paddle/gserver/layers/PoolLayer.h @@ -38,6 +38,8 @@ protected: std::string poolType_; + bool excludeMode_; + public: explicit PoolLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/layers/PoolProjection.cpp b/paddle/gserver/layers/PoolProjection.cpp index d90b438448..6a9de394ce 100644 --- a/paddle/gserver/layers/PoolProjection.cpp +++ b/paddle/gserver/layers/PoolProjection.cpp @@ -36,6 +36,8 @@ PoolProjection::PoolProjection(const ProjectionConfig& config, strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride(); confPaddingY_ = conf.has_padding_y() ? 
conf.padding_y() : conf.padding(); outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + + excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true; } size_t PoolProjection::getSize() { @@ -141,7 +143,8 @@ void AvgPoolProjection::forward() { outputY_, outputX_, confPaddingY_, - confPadding_); + confPadding_, + excludeMode_); } void AvgPoolProjection::backward(const UpdateCallback& callback) { @@ -166,6 +169,7 @@ void AvgPoolProjection::backward(const UpdateCallback& callback) { 1, 1, confPaddingY_, - confPadding_); + confPadding_, + excludeMode_); } } // namespace paddle diff --git a/paddle/gserver/layers/PoolProjection.h b/paddle/gserver/layers/PoolProjection.h index 9a75f465f6..a0412714bc 100644 --- a/paddle/gserver/layers/PoolProjection.h +++ b/paddle/gserver/layers/PoolProjection.h @@ -28,6 +28,7 @@ protected: int confPaddingY_, confPadding_; size_t channels_; std::string poolType_; + bool excludeMode_; public: PoolProjection(const ProjectionConfig& config, diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index c5359f272b..2b6ba77470 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -1211,7 +1211,10 @@ void setPoolConfig(TestConfig* config, pool->set_output_y(oh); } -void testPoolLayer(const string& poolType, bool trans, bool useGpu) { +void testPoolLayer(const string& poolType, + bool trans, + bool useGpu, + bool excludeMode = true) { TestConfig config; config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0}); LayerInputConfig* input = config.layerConfig.add_inputs(); @@ -1219,6 +1222,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) { pool->set_img_size(14); pool->set_img_size_y(14); + pool->set_exclude_mode(excludeMode); setPoolConfig(&config, pool, poolType); config.layerConfig.set_size(pool->output_x() * pool->output_y() * pool->channels()); @@ -1250,16 +1254,26 @@ void testPoolLayer2(const string& 
poolType, bool trans, bool useGpu) { TEST(Layer, PoolLayer) { testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false); + testPoolLayer("avg-projection", + /* trans= */ false, + /* useGpu= */ false, + /* excludeMode= */ false); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false); testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false); #ifdef PADDLE_WITH_CUDA testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true); + testPoolLayer("avg-projection", + /* trans= */ false, + /* useGpu= */ true, + /* excludeMode= */ false); testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true); + testPoolLayer2( + "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true); testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true); #endif } diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 88e9180690..ebbbdfab1d 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1130,7 +1130,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat, size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode) { CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal"; real* inputData = inputMat.getData(); @@ -1153,7 +1154,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat, paddingH, paddingW, data_, - getStride()); + getStride(), + excludeMode); } void GpuMatrix::avgPoolBackward(Matrix& outGrad, @@ -1168,7 +1170,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode) { CHECK(outGrad.useGpu_ == true) << 
"Matrix type are not equal"; real* outDiff = outGrad.getData(); @@ -1194,7 +1197,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad, scaleTargets, scaleOutput, data_, - outGrad.getStride()); + outGrad.getStride(), + excludeMode); } void GpuMatrix::maxPool3DForward(Matrix& inputMat, @@ -2136,7 +2140,8 @@ void CpuMatrix::avgPoolForward(Matrix& input, size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode) { // The main loop size_t num = input.getHeight(); size_t inLength = imgSizeH * imgSizeW; @@ -2165,7 +2170,8 @@ void CpuMatrix::avgPoolForward(Matrix& input, tgtData[ph * outputW + pw] += inData[h * imgSizeW + w]; } } - int poolSize = (hend - hstart) * (wend - wstart); + int poolSize = + excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX; CHECK(poolSize); tgtData[ph * outputW + pw] /= poolSize; } @@ -2189,7 +2195,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input, real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode) { size_t num = input.getHeight(); size_t channels = input.getWidth() / outputH / outputW; size_t inLength = imgSizeH * imgSizeW; @@ -2211,7 +2218,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input, int wstart = pw * strideW - paddingW; int wend = std::min(wstart + sizeX, imgSizeW); wstart = std::max(wstart, 0); - int poolSize = (hend - hstart) * (wend - wstart); + int poolSize = + excludeMode ? 
(hend - hstart) * (wend - wstart) : sizeY * sizeX; CHECK(poolSize); for (int h = hstart; h < hend; ++h) { diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index e273f11236..c8e690e642 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -911,7 +911,8 @@ public: size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode = true) { LOG(FATAL) << "Not implemeted"; } @@ -927,9 +928,11 @@ public: real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW) { + size_t paddingW, + bool excludeMode = true) { LOG(FATAL) << "Not implemeted"; } + /** * Pooling 3D forward operation, pick out the largest element * in the sizeX of value @@ -1458,7 +1461,8 @@ public: size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void avgPoolBackward(Matrix& input, size_t imgSizeH, @@ -1472,7 +1476,8 @@ public: real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void maxPool3DForward(Matrix& inputMat, Matrix& maxPoolIdx, @@ -1730,7 +1735,8 @@ public: size_t outputH, size_t outputW, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void avgPoolBackward(Matrix& input, size_t imgSizeH, @@ -1744,7 +1750,8 @@ public: real scaleTargets, real scaleOutput, size_t paddingH, - size_t paddingW); + size_t paddingW, + bool excludeMode = true); void maxPool3DForward(Matrix& inputMat, Matrix& maxPoolIdx, diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 2fcdbbc8bd..2c131338c0 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -139,6 +139,8 @@ message PoolConfig { optional uint32 output_z = 16 [ default = 1 ]; optional uint32 img_size_z = 17 [ default = 1 ]; optional uint32 padding_z = 18 [ default = 1 ]; + + optional bool exclude_mode = 19 [ default = true ]; } message SppConfig { diff --git 
a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5b173694dd..ca4a66d30d 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1233,7 +1233,7 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf): bilinear_conf.out_size_y = bilinear.out_size_y -def parse_pool(pool, input_layer_name, pool_conf, ceil_mode): +def parse_pool(pool, input_layer_name, pool_conf, ceil_mode, exclude_mode): pool_conf.pool_type = pool.pool_type config_assert(pool.pool_type in [ 'max-projection', 'avg-projection', 'max-pool-with-mask', 'cudnn-max-pool', 'cudnn-avg-pool' @@ -1263,6 +1263,8 @@ def parse_pool(pool, input_layer_name, pool_conf, ceil_mode): pool_conf.padding_y, pool_conf.stride_y, not ceil_mode) + pool_conf.exclude_mode = exclude_mode + def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode): pool_conf.pool_type = pool.pool_type @@ -2303,7 +2305,8 @@ class NormLayer(LayerBase): class PoolLayer(LayerBase): layer_type = 'pool' - def __init__(self, name, inputs, ceil_mode=True, **xargs): + def __init__(self, name, inputs, ceil_mode=True, exclude_mode=True, + **xargs): use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0)) if self.layer_type == "mkldnn_pool": config_assert(use_mkldnn, "mkldnn_pool only support MKLDNN") @@ -2314,7 +2317,7 @@ class PoolLayer(LayerBase): input_layer = self.get_input_layer(input_index) pool_conf = self.config.inputs[input_index].pool_conf parse_pool(self.inputs[input_index].pool, input_layer.name, - pool_conf, ceil_mode) + pool_conf, ceil_mode, exclude_mode) self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x, pool_conf.channels) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index f6dc58b9c0..46fe09b947 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -21,7 +21,7 @@ from .activations import LinearActivation, 
SigmoidActivation, TanhActivation, \ ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation from .evaluators import * from .poolings import MaxPooling, AvgPooling, MaxWithMaskPooling, BasePoolingType, \ - CudnnAvgPooling, CudnnMaxPooling + CudnnAvgPooling, CudnnAvgInclPadPooling, CudnnMaxPooling from .attrs import * from .default_decorators import * @@ -2709,7 +2709,8 @@ def img_pool_layer(input, pool_size_y=None, stride_y=None, padding_y=None, - ceil_mode=True): + ceil_mode=True, + exclude_mode=True): """ Image pooling Layer. @@ -2773,10 +2774,14 @@ def img_pool_layer(input, :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details. :type layer_attr: ExtraLayerAttribute - :param ceil_mode: Wether to use the ceil function to calculate output height and width. + :param ceil_mode: Whether to use the ceil function to calculate output height and width. True is the default. If it is set to False, the floor function will be used. :type ceil_mode: bool + :param exclude_mode: Whether to exclude the padding cells when calculating, but only + work when pool_type is AvgPooling. If use cudnn, use CudnnAvgPooling + or CudnnAvgInclPadPooling as pool_type to identify. + :type exclude_mode: bool :return: LayerOutput object. 
:rtype: LayerOutput """ @@ -2790,7 +2795,7 @@ def img_pool_layer(input, pool_type.name = 'avg' assert type(pool_type) in [AvgPooling, MaxPooling, MaxWithMaskPooling, CudnnAvgPooling, - CudnnMaxPooling], \ + CudnnMaxPooling, CudnnAvgInclPadPooling], \ "only (Cudnn)AvgPooling, (Cudnn)MaxPooling, MaxWithMaskPooling are supported" type_name = pool_type.name + '-projection' \ @@ -2819,6 +2824,7 @@ def img_pool_layer(input, padding_y=padding_y)) ], ceil_mode=ceil_mode, + exclude_mode=exclude_mode, **ExtraLayerAttribute.to_kwargs(layer_attr)) return LayerOutput( name, diff --git a/python/paddle/trainer_config_helpers/poolings.py b/python/paddle/trainer_config_helpers/poolings.py index f45616551b..e0aeb311b3 100644 --- a/python/paddle/trainer_config_helpers/poolings.py +++ b/python/paddle/trainer_config_helpers/poolings.py @@ -16,7 +16,8 @@ __all__ = [ "BasePoolingType", "MaxPooling", "AvgPooling", "MaxWithMaskPooling", - "CudnnMaxPooling", "CudnnAvgPooling", "SumPooling", "SquareRootNPooling" + "CudnnMaxPooling", "CudnnAvgPooling", "CudnnAvgInclPadPooling", + "SumPooling", "SquareRootNPooling" ] @@ -88,6 +89,16 @@ class CudnnAvgPooling(BasePoolingType): BasePoolingType.__init__(self, "cudnn-avg-pool") +class CudnnAvgInclPadPooling(BasePoolingType): + """ + Cudnn average pooling only support GPU. Return the average value in the + pooling window taking into account the padding cells. + """ + + def __init__(self): + BasePoolingType.__init__(self, "cudnn-avg-incl-pad-pool") + + class AvgPooling(BasePoolingType): """ Average pooling. 
From 4c95301e98eb031a702831dd75312bf743c88c8a Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 30 Nov 2017 16:55:00 +0800 Subject: [PATCH 133/275] add WITH_DOC for print_operators_doc --- paddle/pybind/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index a54dc0d9fd..fd55f410d3 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -5,4 +5,6 @@ if(WITH_PYTHON) ${GLOB_OP_LIB}) endif(WITH_PYTHON) -cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB}) +if(WITH_DOC) + cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB}) +endif(WITH_DOC) From a38c1512437531d429d25254e774c2f9bc29e31e Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 30 Nov 2017 17:20:39 +0800 Subject: [PATCH 134/275] Add GetInputsElementDim (#6091) --- paddle/framework/shape_inference.cc | 6 ++++++ paddle/framework/shape_inference.h | 1 + paddle/operators/while_op.cc | 6 +++--- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc index 0af41b164f..2298507471 100644 --- a/paddle/framework/shape_inference.cc +++ b/paddle/framework/shape_inference.cc @@ -22,6 +22,12 @@ std::vector InferShapeContext::GetInputsDim( return GetDims(names); } +DDim InferShapeContext::GetInputsElementDim(const std::string &name, + int idx) const { + const std::vector &names = Inputs(name); + return this->GetDim(names[idx]); +} + void InferShapeContext::SetOutputsDim( const std::string &name, const std::vector &dims) { auto &names = Outputs(name); diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index 05dc47f06a..46f2ea84b4 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -37,6 +37,7 @@ class InferShapeContext { virtual framework::DDim GetInputDim(const std::string &name) const = 0; std::vector 
GetInputsDim(const std::string &name) const; + DDim GetInputsElementDim(const std::string &name, int idx) const; virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; void SetOutputsDim(const std::string &name, diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 68b4f77059..59460f6c87 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -287,7 +287,6 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { auto p_names = ctx->Inputs(kParameters); auto pg_names = ctx->Outputs(kParamGrads); - auto dims = ctx->GetInputsDim(kParameters); auto var_types = ctx->GetInputsVarType(kParameters); std::vector names_to_set; std::vector dims_to_set; @@ -295,13 +294,14 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { if (pg_names[i] == framework::kEmptyVarName) { continue; } + auto dims = ctx->GetInputsElementDim(kParameters, i); if (var_types[i] == framework::VarDesc::LOD_TENSOR) { names_to_set.push_back(pg_names[i]); - dims_to_set.push_back(dims[i]); + dims_to_set.push_back(dims); } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) { // not sure how to set the dim of LOD_TENSOR_ARRAY names_to_set.push_back(pg_names[i]); - dims_to_set.push_back(dims[i]); + dims_to_set.push_back(dims); } } ctx->SetDims(names_to_set, dims_to_set); From 4e564e4852f3733027e7eb382c9f4b660a9e0d3d Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 30 Nov 2017 17:26:07 +0800 Subject: [PATCH 135/275] make WriteToArrayOp supporting empty tensor input (#6030) --- paddle/operators/tensor_array_read_write_op.cc | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index ad09fb53ce..efde850143 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -37,9 +37,15 @@ class WriteToArrayOp : public ArrayOp { << " 
to " << offset + 1; out->resize(offset + 1); } - auto *out_tensor = &out->at(offset); - CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor); - out_tensor->set_lod(x_tensor.lod()); + if (x_tensor.memory_size() > 0) { + auto *out_tensor = &out->at(offset); + CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor); + out_tensor->set_lod(x_tensor.lod()); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << offset << "]."; + } } }; From 0d40a4dbc6f6a8c1b50d65fa096cadd55dae71c4 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 30 Nov 2017 17:43:11 +0800 Subject: [PATCH 136/275] Add lod_level for data layer (#6040) --- python/paddle/v2/fluid/layers.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 9dcc11d216..5a977978bf 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -185,6 +185,7 @@ def data(name, shape, append_batch_size=True, dtype='float32', + lod_level=0, type=core.VarDesc.VarType.LOD_TENSOR, main_program=None, startup_program=None, @@ -198,6 +199,7 @@ def data(name, append_batch_size: Whether or not to append the data as a batch. dtype: The type of data : float32, float_16, int etc type: The output type. By default it is LOD_TENSOR. + lod_level(int): The LoD Level. 0 means the input data is not a sequence. main_program: Name of the main program that calls this startup_program: Name of the startup program stop_gradient: A boolean that mentions whether gradient should flow. 
@@ -228,7 +230,8 @@ def data(name, shape=shape, dtype=dtype, type=type, - stop_gradient=stop_gradient) + stop_gradient=stop_gradient, + lod_level=lod_level) def create_tensor(dtype, name=None, main_program=None, startup_program=None): From cb5a7a8b4c86fc76224dac2b919f0d7aa8874a23 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 30 Nov 2017 18:15:28 +0800 Subject: [PATCH 137/275] Fix errors of GPU AvgPooling with the excludeMode argument --- paddle/cuda/include/hl_cnn.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index 89c1f48eda..8841806292 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -116,6 +116,7 @@ extern void hl_maxpool_backward(const int frameCnt, * @param[in] paddingW padding width. * @param[out] tgtData output data. * @param[in] tgtStride stride between output data samples. + * @param[in] excludeMode whether to consider paddings for size. * */ extern void hl_avgpool_forward(const int frameCnt, @@ -132,7 +133,8 @@ extern void hl_avgpool_forward(const int frameCnt, const int paddingH, const int paddingW, real* tgtData, - const int tgtStride); + const int tgtStride, + bool excludeMode); /** * @brief Maximum pool backward. @@ -154,6 +156,7 @@ extern void hl_avgpool_forward(const int frameCnt, * @param[in] scaleB scale. * @param[out] backGrad output grad. * @param[in] outStride stride between output data samples. + * @param[in] excludeMode whether to consider paddings for size. 
* */ extern void hl_avgpool_backward(const int frameCnt, @@ -172,7 +175,8 @@ extern void hl_avgpool_backward(const int frameCnt, real scaleA, real scaleB, real* backGrad, - const int outStride); + const int outStride, + bool excludeMode); extern void hl_maxpool3D_forward(const int frameCnt, const real* inputData, From 4d47683b1cd2ef2a6bfce70af34743dac56d1f5e Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 30 Nov 2017 19:23:43 +0800 Subject: [PATCH 138/275] Use protobuf v3.2.0 for MOBILE_INFERENCE compiling. --- cmake/external/protobuf.cmake | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7cfe1e6807..7ae0b16b08 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -188,14 +188,24 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + IF(MOBILE_INFERENCE) + SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") + SET(PROTOBUF_TAG "v3.2.0") + IF(NOT BUILD_FOR_HOST) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF") + ENDIF() + ENDIF() + ExternalProject_Add( ${TARGET_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PROTOBUF_SOURCES_DIR} UPDATE_COMMAND "" DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + GIT_REPOSITORY ${PROTOBUF_REPO} + GIT_TAG ${PROTOBUF_TAG} CONFIGURE_COMMAND ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake ${OPTIONAL_ARGS} @@ -213,7 +223,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1) +IF(NOT MOBILE_INFERENCE) + SET(PROTOBUF_VERSION 3.1) +ELSE() + SET(PROTOBUF_VERSION 3.2) +ENDIF() IF(CMAKE_CROSSCOMPILING) build_protobuf(protobuf_host TRUE) 
LIST(APPEND external_project_dependencies protobuf_host) From a0648ee449335ba4989a83230874089b8685e3ad Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Thu, 30 Nov 2017 20:37:23 +0800 Subject: [PATCH 139/275] Add comments. --- cmake/external/protobuf.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7ae0b16b08..fab2af362b 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -191,6 +191,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") IF(MOBILE_INFERENCE) + # The reason why the official version is not used is described in + # https://github.com/PaddlePaddle/Paddle/issues/6114 SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") SET(PROTOBUF_TAG "v3.2.0") IF(NOT BUILD_FOR_HOST) From e1358945d11a681f19870d4282b33f649c312e01 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 30 Nov 2017 23:46:21 +0800 Subject: [PATCH 140/275] Refine AvgPooling with excludeMode to make it compatible with the raw prototxt --- proto/ModelConfig.proto | 2 +- python/paddle/trainer/config_parser.py | 6 +++--- python/paddle/trainer_config_helpers/layers.py | 7 ++++--- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 2c131338c0..1fbdd5bbd8 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -140,7 +140,7 @@ message PoolConfig { optional uint32 img_size_z = 17 [ default = 1 ]; optional uint32 padding_z = 18 [ default = 1 ]; - optional bool exclude_mode = 19 [ default = true ]; + optional bool exclude_mode = 19; } message SppConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index ca4a66d30d..3fe844b883 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1262,8 +1262,8 
@@ def parse_pool(pool, input_layer_name, pool_conf, ceil_mode, exclude_mode): pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y, pool_conf.padding_y, pool_conf.stride_y, not ceil_mode) - - pool_conf.exclude_mode = exclude_mode + if exclude_mode != None: + pool_conf.exclude_mode = exclude_mode def parse_pool3d(pool, input_layer_name, pool_conf, ceil_mode): @@ -2305,7 +2305,7 @@ class NormLayer(LayerBase): class PoolLayer(LayerBase): layer_type = 'pool' - def __init__(self, name, inputs, ceil_mode=True, exclude_mode=True, + def __init__(self, name, inputs, ceil_mode=True, exclude_mode=None, **xargs): use_mkldnn = int(g_command_config_args.get("use_mkldnn", 0)) if self.layer_type == "mkldnn_pool": diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 46fe09b947..8c5cc25d6c 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2710,7 +2710,7 @@ def img_pool_layer(input, stride_y=None, padding_y=None, ceil_mode=True, - exclude_mode=True): + exclude_mode=None): """ Image pooling Layer. @@ -2779,8 +2779,9 @@ def img_pool_layer(input, be used. :type ceil_mode: bool :param exclude_mode: Whether to exclude the padding cells when calculating, but only - work when pool_type is AvgPooling. If use cudnn, use CudnnAvgPooling - or CudnnAvgInclPadPooling as pool_type to identify. + work when pool_type is AvgPooling. If None, also exclude the padding + cells. If use cudnn, use CudnnAvgPooling or CudnnAvgInclPadPooling + as pool_type to identify the mode. :type exclude_mode: bool :return: LayerOutput object. 
:rtype: LayerOutput From 79b17097f65a0c6a0b25eb7385b423c01129f003 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Dec 2017 00:27:43 +0800 Subject: [PATCH 141/275] cal FPS of inference result --- benchmark/paddle/image/provider.py | 2 +- benchmark/paddle/image/run_mkldnn_infer.sh | 22 ++++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index a3a6b6fc4d..927b175994 100644 --- a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -23,7 +23,7 @@ def initHook(settings, height, width, color, num_class, **kwargs): @provider( init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_list): - for i in xrange(1024): + for i in xrange(2560 if settings.is_infer else 1024): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() if settings.is_infer: yield img.astype('float32') diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkldnn_infer.sh index 3081d5e7b5..03a76c0540 100755 --- a/benchmark/paddle/image/run_mkldnn_infer.sh +++ b/benchmark/paddle/image/run_mkldnn_infer.sh @@ -1,5 +1,12 @@ set -e +function clock_to_seconds() { + hours=`echo $1 | awk -F ':' '{print $1}'` + mins=`echo $1 | awk -F ':' '{print $2}'` + secs=`echo $1 | awk -F ':' '{print $3}'` + echo `bc -l <<< "$secs + $mins * 60 + $hours * 3600"` +} + function infer() { unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY topology=$1 @@ -34,15 +41,26 @@ function infer() { > /dev/null 2>&1 echo "Done" fi + log_period=$((256 / bs)) paddle train --job=test \ --config="${topology}.py" \ --use_mkldnn=$use_mkldnn \ --use_gpu=False \ --trainer_count=$thread \ - --log_period=32 \ + --log_period=$log_period \ --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \ --init_model_path=$models_in \ - 2>&1 | tee ${log} + 2>&1 | tee ${log} + + # calculate the last 5 logs 
period time of 1280 samples, + # the time before are burning time. + start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` + start_sec=`clock_to_seconds $start` + end_sec=`clock_to_seconds $end` + fps=`bc <<< "scale = 2; 1280 / ($end_sec - $start_sec)"` + echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} + echo "FPS: $fps images/sec" >> ${log} } if [ ! -f "train.list" ]; then From d36db0d3ec9a5ef3cc30c2c55ec2e28541ca9b1a Mon Sep 17 00:00:00 2001 From: xuwei06 Date: Thu, 30 Nov 2017 09:40:38 -0800 Subject: [PATCH 142/275] Fix comments in sequence_rnn_(mixed/matched)_inputs.py --- paddle/gserver/tests/sequence_rnn_matched_inputs.py | 2 +- paddle/gserver/tests/sequence_rnn_mixed_inputs.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py index e2635b4400..59e8c91733 100644 --- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py +++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py @@ -41,7 +41,7 @@ nonseq = embedding_layer(input=label, size=word_dim) # This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_multi_unequalength_inputs.conf +# sequence_rnn_mixed_inputs.conf def outer_step(subseq, seq, nonseq, encoding): outer_mem = memory(name="outer_rnn_state", size=hidden_dim) diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py index 84a66e2944..6fe9dca6e2 100644 --- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py +++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py @@ -37,7 +37,7 @@ encoding = embedding_layer(input=data2, size=word_dim) # This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_multi_unequalength_inputs.conf +# sequence_rnn_matched_inputs.conf def 
outer_step(subseq, seq, nonseq, encoding): outer_mem = memory(name="outer_rnn_state", size=hidden_dim) From 6dc5b34e5b9fc462f6cbd5c96883981fe8258264 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Fri, 1 Dec 2017 03:25:04 +0530 Subject: [PATCH 143/275] Polishing the cpu profiling doc (#6116) --- doc/howto/optimization/cpu_profiling.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md index e1d91c668e..1775374cf6 100644 --- a/doc/howto/optimization/cpu_profiling.md +++ b/doc/howto/optimization/cpu_profiling.md @@ -1,13 +1,13 @@ -This tutorial introduces techniques we used to profile and tune the +This tutorial introduces techniques we use to profile and tune the CPU performance of PaddlePaddle. We will use Python packages -`cProfile` and `yep`, and Google `perftools`. +`cProfile` and `yep`, and Google's `perftools`. -Profiling is the process that reveals the performance bottlenecks, +Profiling is the process that reveals performance bottlenecks, which could be very different from what's in the developers' mind. -Performance tuning is to fix the bottlenecks. Performance optimization +Performance tuning is done to fix these bottlenecks. Performance optimization repeats the steps of profiling and tuning alternatively. -PaddlePaddle users program AI by calling the Python API, which calls +PaddlePaddle users program AI applications by calling the Python API, which calls into `libpaddle.so.` written in C++. In this tutorial, we focus on the profiling and tuning of @@ -82,7 +82,7 @@ focus on. We can sort above profiling file by tottime: We can see that the most time-consuming function is the `built-in method run`, which is a C++ function in `libpaddle.so`. We will -explain how to profile C++ code in the next section. At the right +explain how to profile C++ code in the next section. 
At this
moment, let's look into the third function `sync_with_cpp`, which is a
Python function. We can click it to understand more about it:

@@ -135,8 +135,8 @@ to generate the profiling file. The default filename is
 `main.py.prof`.
 
 Please be aware of the `-v` command line option, which prints the
-analysis results after generating the profiling file.  By taking a
-glance at the print result, we'd know that if we stripped debug
+analysis results after generating the profiling file. By examining
+the print result, we'd know whether we stripped debug
 information from `libpaddle.so` at build time. The following hints
 help make sure that the analysis results are readable:
 
@@ -155,9 +155,9 @@ help make sure that the analysis results are readable:
    variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically
    starting multiple threads.
 
-### Look into the Profiling File
+### Examining the Profiling File
 
-The tool we used to look into the profiling file generated by
+The tool we used to examine the profiling file generated by
 `perftools` is [`pprof`](https://github.com/google/pprof), which
 provides a Web-based GUI like `cprofilev`.
 
@@ -194,4 +194,4 @@ time, and `MomentumOp` takes about 17%. Obviously, we'd want to
 optimize `MomentumOp`.
 
 `pprof` would mark performance critical parts of the program in
-red. It's a good idea to follow the hint.
+red. It's a good idea to follow the hints.
From 3a8311f819977acdbfe35e884846e0201d9211cd Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Fri, 1 Dec 2017 10:23:29 +0800
Subject: [PATCH 144/275] Fix compile error for gcc 6.3 (#6112)

---
 cmake/flags.cmake | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 2b125cef6a..1120677a37 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -111,6 +111,8 @@ set(COMMON_FLAGS
     -Wno-error=sign-compare
     -Wno-error=unused-local-typedefs
     -Wno-error=parentheses-equality # Warnings in pybind11
+    -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
+    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
     )
 
 set(GPU_COMMON_FLAGS

From 8ac02279f28e2e944b983793bc91da8e58a75e94 Mon Sep 17 00:00:00 2001
From: Yu Yang
Date: Fri, 1 Dec 2017 10:23:47 +0800
Subject: [PATCH 145/275] Fix the performance problem of enforce (#6085)

* Fix Performance problem of enforce

* Fix missing `;` in code

* Fix CI
---
 paddle/operators/concat_op.cc              |  4 ++--
 paddle/operators/elementwise_op.h          |  4 ++--
 paddle/operators/elementwise_op_function.h |  2 +-
 paddle/operators/sequence_slice_op.h       | 10 ++++----
 paddle/operators/sum_op.h                  |  2 +-
 paddle/platform/enforce.h                  | 28 ++++++++++++++--------
 6 files changed, 29 insertions(+), 21 deletions(-)

diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
index 5f05268925..6134ac78b1 100644
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -25,7 +25,7 @@ class ConcatOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                      "Inputs(X) of ConcatOp should be empty.")
+                      "Inputs(X) of ConcatOp should be empty.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ConcatOp should not be null.");
 
@@ -45,7 +45,7 @@ class ConcatOp : public framework::OperatorWithKernel {
       }
       PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
                         "Input tensors should have the same "
-                        "elements except the specify axis.")
+ "elements except the specify axis."); } } ctx->SetOutputDim("Out", out_dims); diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h index 56e5eb69bc..ea533503e4 100644 --- a/paddle/operators/elementwise_op.h +++ b/paddle/operators/elementwise_op.h @@ -35,7 +35,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputDim("Y"); PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input.") + "Rank of first input must >= rank of second input."); ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -120,7 +120,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), - "Rank of first input must >= rank of second input.") + "Rank of first input must >= rank of second input."); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 488a35aafc..8aa35b2c46 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -106,7 +106,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { auto x_dims = x->dims(); auto y_dims = y->dims(); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), - "Rank of first input must >= rank of second input.") + "Rank of first input must >= rank of second input."); if (x_dims == y_dims) { functor f; diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h index 6411e0a466..428ef556da 100644 --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -54,10 +54,10 @@ class SequenceSliceOpKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); 
PADDLE_ENFORCE_EQ( n, static_cast(length->dims()[0]), - "The size of input-sequence and length-array should be the same") + "The size of input-sequence and length-array should be the same"); PADDLE_ENFORCE_EQ( n, static_cast(offset->dims()[0]), - "The size of input-sequence and offset-array should be the same") + "The size of input-sequence and offset-array should be the same"); const int64_t* offset_data = offset->data(); const int64_t* length_data = length->data(); @@ -78,11 +78,11 @@ class SequenceSliceOpKernel : public framework::OpKernel { for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_LT(0, offset_data[i], - "The offset[%d] must greater than zero.", i) + "The offset[%d] must greater than zero.", i); PADDLE_ENFORCE_LT(0, length_data[i], - "The length[%d] must greater than zero.", i) + "The length[%d] must greater than zero.", i); PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], - lod[0][i + 1], "The target tensor's length overflow.") + lod[0][i + 1], "The target tensor's length overflow."); } out->mutable_data(ctx.GetPlace()); diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index 4afec03ece..a1eb3b014e 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -84,7 +84,7 @@ class SumKernel : public framework::OpKernel { int64_t offset = 0; for (int i = 0; i < N; i++) { PADDLE_ENFORCE_EQ(out->height(), - in_vars[i]->Get().height()) + in_vars[i]->Get().height()); functor(context.device_context(), in_vars[i]->Get(), offset, out); offset += in_vars[i]->Get().value().numel(); diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index 415020ab96..97338a4ce6 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -234,16 +234,24 @@ inline void throw_on_error(T e) { __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) -#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) 
\ - PADDLE_ENFORCE(nullptr != (__VAL), #__VAL " should not be null\n%s", \ - paddle::string::Sprintf("" __VA_ARGS__)); - -#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ - PADDLE_ENFORCE(__VAL0 __CMP __VAL1, \ - "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \ - #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ - paddle::string::to_string(__VAL1), \ - paddle::string::Sprintf("" __VA_ARGS__)); +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ + do { \ + if (UNLIKELY(nullptr == (__VAL))) { \ + PADDLE_THROW(#__VAL " should not be null\n%s", \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ + do { \ + if (!UNLIKELY((__VAL0)__CMP(__VAL1))) { \ + PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \ + " %s\n%s", \ + #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ + paddle::string::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) } // namespace platform } // namespace paddle From 42708ded549cf4c731abd75df8e7b3ef797a4052 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Fri, 1 Dec 2017 13:04:08 +0800 Subject: [PATCH 146/275] Enable the case N != ldc in EigenBlasGemm. (#5976) * Enable the case N != ldc in EigenBlasGemm. * Use MemoryHandle instead of direct calling of posix_memalign to alloc temporary memory. * Use Eigen's slice() instead of a temporary memory. * Add if-else for different cases in EigenBlasGemm (for N ?= ldc). 
--- paddle/function/EigenGemm.cpp | 36 ++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp index b3e666e860..644098a9e7 100644 --- a/paddle/function/EigenGemm.cpp +++ b/paddle/function/EigenGemm.cpp @@ -21,7 +21,7 @@ template struct EigenBlasGemm { typedef Eigen::TensorMap, Eigen::Aligned> - Matrix; + EigenMatrix; static void compute(const bool transA, const bool transB, @@ -56,14 +56,13 @@ struct EigenBlasGemm { sizeB[1] = N; CHECK_EQ(N, ldb); } - Eigen::array sizeC; - sizeC[0] = M; - sizeC[1] = N; - CHECK_EQ(N, ldc); + Eigen::array sizeC = {{M, ldc}}; + Eigen::array offsetC = {{0, 0}}; + Eigen::array extentC = {{M, N}}; - const Matrix a(const_cast(A), sizeA); - const Matrix b(const_cast(B), sizeB); - Matrix c(C, sizeC); + const EigenMatrix a(const_cast(A), sizeA); + const EigenMatrix b(const_cast(B), sizeB); + EigenMatrix c(C, sizeC); typedef typename Eigen::Tensor::DimensionPair DimPair; Eigen::array dims; @@ -72,12 +71,23 @@ struct EigenBlasGemm { dims[0].second = transB ? 
1 : 0; Eigen::DefaultDevice device; - if (alpha == T(1) && beta == T(0)) { - c.device(device) = a.contract(b, dims); - } else if (alpha == T(1) && beta == T(1)) { - c.device(device) += a.contract(b, dims); + if (N == ldc) { + if (alpha == T(1) && beta == T(0)) { + c.device(device) = a.contract(b, dims); + } else if (alpha == T(1) && beta == T(1)) { + c.device(device) += a.contract(b, dims); + } else { + c.device(device) = alpha * a.contract(b, dims) + beta * c; + } } else { - c.device(device) = alpha * a.contract(b, dims) + beta * c; + if (alpha == T(1) && beta == T(0)) { + c.slice(offsetC, extentC).device(device) = a.contract(b, dims); + } else if (alpha == T(1) && beta == T(1)) { + c.slice(offsetC, extentC).device(device) += a.contract(b, dims); + } else { + c.slice(offsetC, extentC).device(device) = + alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC); + } } } }; From ade6c8327812c52c91066bf8eeda3036a001d0dc Mon Sep 17 00:00:00 2001 From: QI JUN Date: Fri, 1 Dec 2017 13:07:15 +0800 Subject: [PATCH 147/275] open test_word2vec (#6104) --- python/paddle/v2/fluid/tests/book/test_word2vec.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index 92d3629d42..1b441e15c7 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -58,10 +58,6 @@ train_reader = paddle.batch( place = fluid.CPUPlace() exe = fluid.Executor(place) -# fix https://github.com/PaddlePaddle/Paddle/issues/5434 then remove -# below exit line. -exit(0) - exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): @@ -79,6 +75,6 @@ for pass_id in range(PASS_NUM): 'nextw': input_data[4] }, fetch_list=[avg_cost]) - if avg_cost_np[0] < 10.0: + if avg_cost_np[0] < 5.0: exit(0) # if avg cost less than 10.0, we think our code is good. 
exit(1) From 1a852861b287c4b7c1bc8bcd8610acdc073e3164 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 1 Dec 2017 13:30:12 +0800 Subject: [PATCH 148/275] add switch for distributed support --- CMakeLists.txt | 1 + cmake/external/cares.cmake | 2 +- cmake/external/grpc.cmake | 2 +- paddle/operators/CMakeLists.txt | 5 ++++- 4 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e76512166f..3bdb3b7388 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -54,6 +54,7 @@ option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) +option(WITH_DISTRIBUTE "Compile with grpc distributed support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) # CMAKE_BUILD_TYPE diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake index e05111ee18..ac456933bd 100644 --- a/cmake/external/cares.cmake +++ b/cmake/external/cares.cmake @@ -13,7 +13,7 @@ # limitations under the License. # -IF(MOBILE_INFERENCE) +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) return() ENDIF() diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake index 86122aec8c..abee6698e3 100644 --- a/cmake/external/grpc.cmake +++ b/cmake/external/grpc.cmake @@ -13,7 +13,7 @@ # limitations under the License. 
# -IF(MOBILE_INFERENCE) +IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE) return() ENDIF() diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 937441b318..54d3881b8c 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -212,6 +212,7 @@ set(DEPS_OPS send_op recv_op) +if(WITH_DISTRIBUTE) add_subdirectory(detail) op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) set_source_files_properties( @@ -225,6 +226,9 @@ set_source_files_properties( PROPERTIES COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") +cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) +endif() + op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cross_entropy_op DEPS cross_entropy) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) @@ -275,4 +279,3 @@ if(WITH_GPU) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) -cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) From d4fcd2a59fc3cdc6a750e695ea90e3a867c09a77 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Fri, 1 Dec 2017 14:01:34 +0800 Subject: [PATCH 149/275] Fix the doc of LSTM operator. --- paddle/operators/lstm_op.cc | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index 4cbb60f3fd..fa8e5f2da8 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -181,7 +181,7 @@ class LSTMOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Long-Short Term Memory (LSTM) Operator. 
-The defalut implementation is diagonal/peephole connection +The defalut implementation is diagonal/peephole connection (https://arxiv.org/pdf/1402.1128.pdf), the formula is as follows: $$ @@ -198,27 +198,27 @@ c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\ h_t = o_t \odot act_h(c_t) $$ -where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix -of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$ +where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix +of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$ are diagonal weight matrices for peephole connections. In our implementation, we use vectors to reprenset these diagonal weight matrices. The b terms -denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$ +denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$ is the non-line activations, such as logistic sigmoid function, and -\f$i, f, o\f$ and \f$c\f$ are the input gate, forget gate, output gate, +$i, f, o$ and $c$ are the input gate, forget gate, output gate, and cell activation vectors, respectively, all of which have the same size as -the cell output activation vector \f$h\f$. +the cell output activation vector $h$. -The \f$\odot\f$ is the element-wise product of the vectors. \f$act_g\f$ and \f$act_h\f$ +The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$ are the cell input and cell output activation functions and `tanh` is usually -used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state, +used for them. $\tilde{c_t}$ is also called candidate hidden state, which is computed based on the current input and the previous hidden state. -Set `use_peepholes` False to disable peephole connection -(http://www.bioinf.jku.at/publications/older/2604.pdf). The formula -is omitted here. +Set `use_peepholes` False to disable peephole connection. 
The formula +is omitted here, please refer to the paper +http://www.bioinf.jku.at/publications/older/2604.pdf for details. -Note that these \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$ -operations on the input \f$x_{t}\f$ are NOT included in this operator. +Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$ +operations on the input $x_{t}$ are NOT included in this operator. Users can choose to use fully-connect operator before LSTM operator. )DOC"); From 1fe5acb25a2cedd765da28642510b2ce497dc659 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 1 Dec 2017 14:47:15 +0800 Subject: [PATCH 150/275] Expose sigmoid_cross_entropy_with_logits (#6147) Also, change the `labels` to `label` for api consistency --- .../sigmoid_cross_entropy_with_logits_op.cc | 24 +++++++-------- .../sigmoid_cross_entropy_with_logits_op.h | 6 ++-- python/paddle/v2/fluid/layers.py | 1 + python/paddle/v2/fluid/tests/test_layers.py | 10 +++++++ ...st_sigmoid_cross_entropy_with_logits_op.py | 29 +++++++++++-------- 5 files changed, 41 insertions(+), 29 deletions(-) diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc index d9e4054652..782f4c7936 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -25,20 +25,19 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("Labels"), - "Input(Labels) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Labels"); + auto labels_dims = ctx->GetInputDim("Label"); 
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(labels_dims.size(), 2, - "Input(Labels)'s rank should be 2."); + "Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], - "The 1st dimension of Input(X) and Input(Labels) should " + "The 1st dimension of Input(X) and Input(Label) should " "be equal."); PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], - "The 2nd dimension of Input(X) and Input(Labels) should " + "The 2nd dimension of Input(X) and Input(Label) should " "be equal."); ctx->SetOutputDim("Out", x_dims); @@ -53,26 +52,25 @@ class SigmoidCrossEntropyWithLogitsGradOp void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("Labels"), - "Input(Labels) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) shoudl be not null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@GRAD) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Labels"); + auto labels_dims = ctx->GetInputDim("Label"); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(labels_dims.size(), 2, - "Input(Labels)'s rank should be 2."); + "Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(dout_dims.size(), 2, "Input(Out@Grad)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], - "The 1st dimension of Input(X) and Input(Labels) should " + "The 1st dimension of Input(X) and Input(Label) should " "be equal."); PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], - "The 2nd dimension of Input(X) and Input(Labels) should " + "The 2nd dimension of Input(X) and Input(Label) should " "be equal."); PADDLE_ENFORCE_EQ(x_dims[0], 
dout_dims[0], "The 1st dimension of Input(X) and Input(Out@Grad) " @@ -97,7 +95,7 @@ class SigmoidCrossEntropyWithLogitsOpMaker "This input is a tensor of logits computed by the previous " " operator. Logits are unscaled log probabilities given as " "log(p/(1-p))."); - AddInput("Labels", + AddInput("Label", "(Tensor, default Tensor), a 2-D tensor of the same type " "and shape as X. This input is a tensor of probabalistic labels " "for each logit"); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h index 41c619f181..2a9d9bbc77 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h @@ -25,8 +25,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = - context.Input("Labels"); + const framework::Tensor *Labels = context.Input("Label"); framework::Tensor *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); @@ -52,8 +51,7 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = - context.Input("Labels"); + const framework::Tensor *Labels = context.Input("Label"); const framework::Tensor *dOut = context.Input(framework::GradVarName("Out")); framework::Tensor *dX = diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 5a977978bf..e41bfae285 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -403,6 +403,7 @@ _create_op_func_('sigmoid') _create_op_func_('scale') _create_op_func_('reshape') _create_op_func_('transpose') 
+_create_op_func_('sigmoid_cross_entropy_with_logits') def cast(x, dtype, main_program=None): diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 33b0e54f42..a9d9d369c7 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -137,6 +137,16 @@ class TestBook(unittest.TestCase): print(str(program)) + def test_sigmoid_cross_entropy(self): + program = Program() + with program_guard(program): + dat = layers.data(name='data', shape=[10], dtype='float32') + lbl = layers.data(name='label', shape=[10], dtype='float32') + self.assertIsNotNone( + layers.sigmoid_cross_entropy_with_logits( + x=dat, label=lbl)) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py index e53856b38a..c42f578f72 100644 --- a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py @@ -2,11 +2,12 @@ import numpy as np from op_test import OpTest from scipy.special import logit from scipy.special import expit +import unittest class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): - '''Test sigmoid_cross_entropy_with_logit_op with binary labels - ''' + """Test sigmoid_cross_entropy_with_logit_op with binary label + """ def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" @@ -16,16 +17,16 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): 'X': logit( np.random.uniform(0, 1, (batch_size, num_classes)) .astype("float32")), - 'Labels': np.random.randint(0, 2, (batch_size, num_classes)) + 'Label': np.random.randint(0, 2, (batch_size, num_classes)) .astype("float32") } # Fw Pass is implemented as elementwise sigmoid followed by # elementwise logistic loss - # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - 
sigmoid(X)) + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) sigmoid_X = expit(self.inputs['X']) - term1 = self.inputs['Labels'] * np.log(sigmoid_X) - term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) self.outputs = {'Out': -term1 - term2} def test_check_output(self): @@ -36,8 +37,8 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): - '''Test sigmoid_cross_entropy_with_logit_op with probabalistic labels - ''' + """Test sigmoid_cross_entropy_with_logit_op with probabalistic label + """ def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" @@ -47,16 +48,16 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): 'X': logit( np.random.uniform(0, 1, (batch_size, num_classes)) .astype("float32")), - 'Labels': np.random.uniform(0, 1, (batch_size, num_classes)) + 'Label': np.random.uniform(0, 1, (batch_size, num_classes)) .astype("float32") } # Fw Pass is implemented as elementwise sigmoid followed by # elementwise logistic loss - # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X)) + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) sigmoid_X = expit(self.inputs['X']) - term1 = self.inputs['Labels'] * np.log(sigmoid_X) - term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) self.outputs = {'Out': -term1 - term2} def test_check_output(self): @@ -64,3 +65,7 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): def test_check_grad(self): self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() From 02e0b5f9eabcc183f4c05a139270a49fbe725852 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Dec 2017 15:00:31 +0800 Subject: [PATCH 151/275] follow comments, unify picture fonts, color and sizes --- 
doc/design/mkldnn/README.MD | 67 ++++++++++++++------------ doc/design/mkldnn/image/engine.png | Bin 36180 -> 17102 bytes doc/design/mkldnn/image/gradients.png | Bin 57433 -> 31247 bytes doc/design/mkldnn/image/layers.png | Bin 57028 -> 14414 bytes doc/design/mkldnn/image/matrix.png | Bin 19755 -> 22085 bytes doc/design/mkldnn/image/overview.png | Bin 9884 -> 16329 bytes 6 files changed, 37 insertions(+), 30 deletions(-) diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD index 7c863197e7..287ee620e1 100644 --- a/doc/design/mkldnn/README.MD +++ b/doc/design/mkldnn/README.MD @@ -5,7 +5,7 @@ 充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。

@@ -28,9 +28,7 @@ Figure 1. PaddlePaddle on IA - [Parameters](#parameters) - [Gradients](#gradients) - [Unit Tests](#unit-tests) - - [Protobuf Messages](#protobuf-messages) - [Python API](#python-api) - - [Demos](#demos) - [Benchmarking](#benchmarking) - [Others](#others) - [Design Concerns](#design-concerns) @@ -41,10 +39,19 @@ Figure 1. PaddlePaddle on IA 同时,为了进一步提升PaddlePaddle在基本数学运算的计算速度,我们也将MKLML即(MKL small library\[[1](#references)\]) 作为另一个第三方库集成进PaddlePaddle,它只会包括生成好的动态库和头文件。 + +MKL,MKLML以及MKL-DNN三者关系如下表: + +| Name | Open Source | License | Descriptions | +|------------|----------------| ------------| --------------| +| MKL | No | Proprietary | Accelerate math processing routines | +| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning | +| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks | + MKLML可以与MKL-DNN共同使用,以此达到最好的性能。
-
+
Figure 2. PaddlePaddle with MKL Engines
@@ -84,15 +91,19 @@ PaddlePaddle/Paddle - `WITH_MKLML` 控制是否使用MKLML库。 当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 +编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。 +MKLML的库目前都是动态库,主要包括`libiomp5.so`和`libmklml_intel.so`。 - `WITH_MKLDNN` 控制是否使用MKL-DNN。 当开启`WITH_MKL`时,会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。 +编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。 +MKL-DNN的库目前只有动态库`libmkldnn.so`。 ### Matrix -目前在PaddlePaddle中数据都是以`nchw`的格式存储,但是在MKL-DNN中的排列方式不止这一种。 +目前在PaddlePaddle中数据都是以`NCHW`的格式存储,但是在MKL-DNN中的排列方式不止这一种。 所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
-
+
Figure 3. MKLDNNMatrix
@@ -102,29 +113,30 @@ Figure 3. MKLDNNMatrix 子类只需要使用定义好的接口,实现具体的函数功能即可。
-
+
Figure 4. MKLDNNLayer
-每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`的`MKLDNNMatrix`, -分别代表input value, input gradient,output value和output gradient。 -它们会存放MKL-DNN用到的internal memory,同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory)。 -他们主要是当数据格式与PaddlePaddle默认的`nchw`格式不匹配时,用于转换内存的工作。 +每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix: -必要的转换函数也会在`MKLDNNLayer`中提前定义好(具体包括reset input、output的value和grad), -这些函数会根据输入参数重新设置internal和external的memory(当然这两者也可以相等,即表示不需要转换), -每个`MKLDNNlayer`的子类只需要使用internal的memory就可以了,所有external的转换工作都会在reset函数中都准备好。 +- 内部存储(internel memory):`inVal_`,`inGrad_`,`outVal_`和`outGrad_`,分别代表输入数据,输入梯度,输出数据和输出梯度。 +- 外部存储(external memory):都是以ext开头,比如`extInVal_`和`extInGrad_`,它们主要是用于, +当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时,转换内存的工作。 +需要注意的是,PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`, +所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存, +如果不需要外部存储用于转换,那么对应的内部存储也会与它们共享内存。 +- 转换函数(resetXXX): 包括`resetInValue`,`resetInGrad`,`resetOutValue`和`resetOutGrad`, +表示对输入数据,输入梯度,输出数据和输出梯度的转换。 +这些函数会根据输入参数重新设置内部和外部存储,当然这两者也可以相等,即表示不需要转换。 -一般来说,每个`MKLDNNLayer`中的`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存, -因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`, -如果不需要external的buffer用于转换,那么internal的buffer也会与它们共享内存。 +注意:每个`MKLDNNlayer`的子类只需要使用内部存储就可以了,所有外部的转换工作都会在reset系列函数中都准备好。 ### Activations -在重构前的PaddlePaddle中,激活函数是独立于`Layer`的概念,并且输入输出都是公用一块内存, +在重构前的PaddlePaddle中,激活函数是独立于`Layer`的概念,并且输入输出都是共用一块内存, 所以添加了对应的`MKLDNNActivation`来实现,方式类似于`MKLDNNLayer`。 ### Parameters -对于有参数的层,我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer公用一块内存。 +对于有参数的层,我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。 如果存在数据排列格式不一样的情况时,我们会在网络训练之前把格式转换为MKL-DNN希望的格式, 在训练结束的时候再保存为PaddlePaddle的格式,但是整个训练过程中不需要任何转换。 这样既使得最终保存的参数格式与PaddlePaddle一致,又可以避免不必要的转换。 @@ -138,18 +150,15 @@ Figure 4. MKLDNNLayer 所以整体上,在实现每个子类的时候就不需要关心分支的事情了。
-
+
Figure 5. Merge Gradients
### Unit Tests 我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。 -测试分为每个Layer(或Activation)的单元测试和简单网络的整体测试。 +测试分为每个Layer(或Activation)的单元测试和简单网络的整体测试。 每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果,小于某个比较小的阈值认为通过。 -### Protobuf Messages -根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。 - ### Python API 目前只考虑**v1 API**。 @@ -167,11 +176,9 @@ if use_mkldnn 同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 -### Demos -可能会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹,里面放入一些用于MKL-DNN测试的demo脚本。 - ### Benchmarking -会添加`benchmark/paddle/image/run_mkldnn.sh`,用于测试和对比,在使用MKL-DNN前后的性能。 +会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image),用于测试和对比在使用MKL-DNN前后的CNN网络性能。 +测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) ### Others 1. 如果在使用MKL-DNN的情况下,会把CPU的Buffer对齐为4096,具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。 @@ -189,8 +196,8 @@ if use_mkldnn 3. 创建`MKLDNNBase`,定义一些除了layer和memory相关的类和函数。 包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`,和未来可能还会用到`FPGAEngine`等。 4. 如果MKL-DNN layer的后面接有cpu device,那么就会使`output_.value`与`extOutVal_`共享内存, -同时数据格式就是`nchw`,这样下一个cpu device就能拿到正确的数据。 -在有cpu device的时候,external的memory的格式始终是`nchw`或者`nc`。 +同时数据格式就是`NCHW`,这样下一个cpu device就能拿到正确的数据。 +在有普通的CPU layer时, `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。 ## References 1. 
[MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。 diff --git a/doc/design/mkldnn/image/engine.png b/doc/design/mkldnn/image/engine.png index 65bbb41fbb389ff5f7906b0284ada77ac2dc4ec9..a60b7ad5553bd6d7d5e255fabc14467ef8a57c88 100644 GIT binary patch literal 17102 zcmcJ11yEekwk0kBf`xSBnkEVEt|3T*1PSgC+zIXu!2`jaBtU@R(zrIR!QI_mo4Mq@ zKT?1G{P$;Sx~i*-=AO&3d-mFEtxebmc`2MHBu@|!5O8FqC6o{l5L1A6Fa{d%jW~f+ z0Pun6s3awh@OzMK8~6dmR7_3`0ii4!>&^fb_!-kqTEh_m;c3U?8}S>%Q&OOvk&J|x zimUE^x>p2wXL5xsOb1)0ih9ygj6uJJN%Na3s)1t%%;<^ zGY_Rd2ON%rvXg0lHt#43RJ?vQ|EmABs5h=P?nM$6jgLtki079?*d>lZErCbJ34Hj| zWaEot;KyefagxCC#Y1=Uva_jmpS;1#E_-uh@B0UXIxqw9gFn695D&;o>LwF% z9}8+~5}vH*YchD#P=zNR`sG;fVcnh0+S*#t!^4Bq2ku!DaWq`W2GUeDo)Hl@^PD6W zB9{1BQ^UjWuqNH`4QTRi^v@sVEtFxl(q)`oR%5vs5VxGw*8FfB+hsEF>t3UFlBVt{ zusa6(;fT7VSJ|>S#^aRWUVvwGrX%w64`9f#fWjU|pDiPYH{?Wxn@H~)5*f9x}^&N`Pxr&TWOnN0gG za+^6A%oaR`zJgV4lg^6PAu4kFoOI8$k$kcXf{CzMaZ1QMQ_piVhKyV5WM27On>hVp zCEP~9hjMgO{q*dQ4XNSp){~)YdVy%XZ<6A)7GL-V2AEBTy*txgBqOmvb&DO?J4RV* zBIhcVOKn)W37`7r`1P}z&x{~kemhGl{c!9=WH<+jI#!Ahd z7&ktPicUgwJM`HkuVIwn0-kv;OrZn^(@>m)MM{=$LtP@*J(w_x5mvid;tOW^2am%a zPXhN7pS?s@RfBvfT50NpRuvCq?R$s6fdvqBG+ewG!vPhPyDVUbGeB z_)Iz8=5zrj)`SlA2s}3_aoU|PgW)XBY(~7;9cpbcv+5sKbKWvP&zXoF4y~iJ#?D(k z&5TZL8$Io1V73uD?H_CPrnbNVR)e94r%Zo;rY$@f?4g^H#*{l$#Qu}}l)0ZTT10p` zNM3hzjfap3LwpPg=%LzP!@V<4cT}ut0x?H}YvH2qQ5w@i^SP>vFogE#93j=Z+NSI2 zWop=y+^(ojEv#5uI!+q32r{9al@S_cPp0|m_iai27Yx7c4oj0thwHcIn}lZE;n&Pe za(0Z?c0&{(($3LH!(F$okW)5QV?8{CYI@t-Ojnv2@9TB@9bRRfdc5&>?Z<2@>%^@F zl9G}++1c6oB_%BUo>z8b^Yi;ieczdy59(YkfkWWiN`Mq)eqmu>(BR-; zeq*DMg^f+r#$Y-r7bj-2d+c;d2|--Vu9 zX9irDb>9II;E5P;G_4=+mxswpm-UR@#3^FIgO2F=BO@b^RUc19%!?#nYe>>kAVDhC zJhq;8N=nL{NOe_ZWrrnjHx+_CHfQ2nE5kU_?6M4OHsVcj;9%c`v&T~f;^mwq$-#t; z^0uD^?u~PjZ4SXXeB1GYLqc*}TU#F^!_NMGfHOeRAI)FJ8mvp_IS(&;BRA*{M% z`|OxrHi+_y0chPwjIyyclFQ7h8gNDSFq%js69EkJDLrK>H=m-pt*wgz{O+1r()BJ3 zQ6siscV%$ciHQwZZZ*y;=|o=W!UxO@qQLwU%&bcAL&5gGTGtfx3`J9 
zI~TO^6`6PhDI|{vPOj5olNKwTN#$hIvE*kMxl4!E$Bd01?{x(8s>VAdZ&SFKZM17k zf1RYlP_=60YrDWJO4n__b%MNJrB1^R>c72t#i(90xG(k3z@|)_U4YNp3$E`}(;=$L zaGF3tjvBGiYvk9o-%OVe)KvQ&F{wh?I9m;My#kgcp#H$2@09|ZyYC2yi!x8oI8 z^E--6x5LKclAdg~;c_dDJ4>s@s3_Nhy6x%m7bN?KE(VK6(VYe2vXOy5(OT@NX}|*5 zXK?zA&}toLjWh8#0a%A~u0~@(v=|hn5)UtF;$Nt^xmq3IsTXV>8z;(B35K3-*1O6B zh=|@u|I!9jli_9}L(_B=uJ`$PsmY6Z5DPui4Kgz~vC)(8i1Ra~JoA z4TO156lbN)90sRFZels?z|8J!al|YW_cqrjgxX5nEm*Y?R-PgDD9B<)vTc-F`U2YT z<8ZJ!Y7Y+`M;036q8gv2(&-Ny@ji5d=I!r{z9yhu8<9r!aj{YuPKFe+K`EOxCZXPq zw~^s}gGOC4tKEGp^;7N)oxIn*3_4cKZH9Ky-Gx5s+!NdyC1LkpjXux$xHon#rR)jg z^U|*;1e5c)QT8?t$#iGxDZGQK#!}L8@&%)lsDa5mFQ(0FVyt~ZMlEdCo?S3}X|4M) z}@mNeGt7Uwjd^#I(TU(jKzW2rVNrMSgzbDk!4aX@zwAnc(CU2N(Q9OPt9*^L{+ zqZCX1qTZ;;!q|Jf;pCG8rkdKMhG&GdC$MXSuJg%yUYrQQ%TVk4-O94+s*>gm3*UjD z6tdh^IEQnhs$@L!y6HK_V`m}U&U0&Y{>gKuC_#7E=N-)xhL_4~=$LX*8V>VL$4YB+ zpEyvBT51u6a+wNMp=IQ($Su6M=J2WLNDjtp7>~9Ul{^@?<%g+}q(B!o^=(osQ^;wC z#~`uT#U^h;bhOW%5f8kh?jx=UnXM51Zya#5A?5E9ms{t=VBfELuJ>Qx&)u}A^3*TZ zlI#lx8X6IX3Ql6?d0o~s%Nx3_HpFo~+3-4v`yKM-_7A0H#cqdhrq`m^@xh@IDG~wTx8*mf`1m zP=n5Sm--|l8@}7Uwj_KTfk8!^ z4D)ds*957>`z{p!#%RHn1Z(%t1lYulPYI2G+up7x)_*#B>rwalVS|U6nfq1C$;Bpp z=4X30)(hESs*A3f+Q9{$#m{~Fn;E-??IsP>5%BvBeyB-QBIIj)fRD)ZP;b2MTP5(% zoVnWu{UUk4X51LH6V!MQ=r!JW5b#v;3iE<6{MHf``{Q9<~7!?wS;AZ6Er1FRbR>{j=A3x%t{8JRmXXfibGC4oV0% zFHc}M$+EK)Ozh5|u0`%f-!n54^-{Zu?CT;O4#vOXyG-WqZ4}=dl^9-R2@m$oCnHGV z|33BJy;GuHAKO<<56wcZ5=`44oZ=Q(l~p!nOqpMY>;?Ypmn=e{Mg z31jiIEqpe%07M5L{Y7nM89cTvk6UZdrr&QCpXtt8!Fz|Zfbm>L5~Lgl;EC?<2EtKw z$ViB4#dSSXKMHAS=UWrWVv<&;_VA!@hLb{+rzxN`>#dtPL{L}(+GUXJI-F*GT8~z^0m@fO#%vQxWJRpS`s-#DPOnp1Qe2W zx)QF_U&N8h$V$XLWqS~tY-rO6dmFB;J#cmhzhpL1LTDr~yxIy8`FyOBl+V%Xhj^mY zfvM81$>DJ=@$`Qn9SC+2>?8VJ?*Axb(rD2xZA2<>&p77J-QHkKEWOnv5jI1ZF zJ2lPZ53eIL)O793h!3Z8vI7-{>BPub?`j)dr9PD-9`;C+a##wVkGY!DAXED68CpK0 z%eML6D5WeT-0?x>BOc`Rvb{b z_av2)ZvS~?DB+3Nkrg8qSVne7tg|Tyzd~v6D7zkzKox&JMTd4`KH2ZZ__5oXq&zZ{ z+#T))O7uTR!#ApLPD{0XFHg&JaeueTyx}=#>mverrp{83l9gw2IF)uXU|Yw(v#56< 
zFxj?7_y|q85DsgW26Koa4NQj@nuKMuK3WHdSdZ$?rADOJEb5wNbM!?RIe40gCig)w zk<27fXlnp?TSIA^7AFOx=v}|pvSrey?JC<{ow|czL{kVnw-w^Jhz_gLN!8)FEE~i9 z>M(>Iwc1R}PIN(mMPs^dW_zgM+13xwJD0~Bs%b? zq`q%PvZMVqx%TNC*P}yV<_4S zyPl2Y7Z)?_Y;A?s)zxWl?d?U(%+BU+Y;2ev%t003wD;?V9VtftKjiBFA#(a3i~uHt z6YfJ?nzrsk`pC`*%XpB@2&3=zCVIakGdrfqg+h;e3zN8i+^% zw@QP#{BNOp|Mh6%O7UYp@+DR-H7&ogrl!6n5Wz5zzQfp8HjjsvbAzHiD7deG&J^Px zoDN}C>A+{C(n;>3Sj9ynB(a5Dw(PQF5|og?czemI4MNNAn(dj5DYhX*(w85G1M-`~ zdt3sFy|T#Ekb?oy7>dYGNiC>JTfjQz3!22e!HU3_Fl!m|+zQ77_8Whb9Jdzvs60Ah z3LOFIV=`hgOEHx4NC@RC{NGVQ|9*Q(%$GKmu*}%qxmG@sL(lL@4!ADUJXm%>m~!NH ztrNuu2K0~FJTDjdtj)-eMA)mU_OuCA5D$ya&Pu@*fZv}TbiMVslq_aYgIUV2VbwcB z_XnqI+2rx{FJ}%KAA9N~db-kitn{p^M3+1rpni@^RscZGtouuEX0j+O5q7Zr^~K1q z5XvYYqC5afto%sMUP#P;0W|F1u9+iX33J%eZuXvVcVE9~3d}bFzLz+>*nb55ADA=P z!BdR_B*LG&nH~{2^K&4F3H%ua7ItObaXbH5)38;j=VZ^vDrpZ7XQ?}W;jI^SQGJt-7blm!P@4 zm^62LvEeexWin|{RC_uoo-$8gPpR!g5(k~{t|w=~jKmeFM)d|+#>Y@rFAHifcFxZq z@Yuh)?4}fWRt01NgRZ+NmN_GE@5#bf5Ipmw?j>QvhBp1=${w^nQUU&zA7TCJB8#1l zDXBVE>h6D5I!AW$GQMagFgZ5c(@)o8y0s>i4N{tITo&-mt4vgVYooMnnm)JE2k>oO ztqSWHhUBh>DbdNhe%q*zHir&&K)|ZW^!Od_#d_r&^8|miqO}?_I;5e9)1`04{^Cv*Q@{geEB>5n4C?RCTB%@yHy#Jk} zz;n>!fRFw)I6vJ;ay=**4fM!jrQ3buSJ!wXwj(Zs5l@j~5=MGd3H<39E;gR$y&yvIk^rUi4sUg6nIH zZ$VJrNPy7$@qq0N?lBMmKZ8OV#*$Skzapw7;4DA_ zV%KkZy3o;Ae|$D0=Wt$9kUu&lK+?RPXv?$^4_eICtTE~e6x0;m%AVp?WqaCuAdF|sy4(>Qu|3SRQ~3++2y!JiV_NzUHS{UNSZ>we>*hOGq=5eOflrg?1p5Fcd&pZ z;rNrRB4b8$X*Eso@FX0kG|t7xq0Q)9wZVYLhKD6MKYpZ7cB|04EfV^N)DR6!WP1tP_AymOC}wnoJ^6cl_ZVpbyv;G zw*ihAd0;Jys^)9WV#65*PG)T!*(Af0)qh0b8q2*Re0|~3n9k@PA!Fs)6*vepjR11$ zx};YFs5P35^1;xN6nXxhPV=k7)vHNPa%a!e)PtrGtaTw0MM`P29WoP)>7+Mh$toL% zrZ6QwG{?0qMNl3wTg0Sn9#vlC@|7eT9oQ}PL#q=f-2y67wHWr_?5S4!?r`kI^_G-~ zgMD8?o;)~{t&Ydq;n;u3i}|KwvS}m1!~4MlM0)Fe!2zO8>YVfKgD+ER%i{Kkz&~g4 zm{mL@KE*nAD9#tA<7|uMLtt9rw5wvZZz0a@qRiAMGm|?_t?AewaB2qf$bZo>#j^6c` zdTwzC(uitkPEl;jRfHI={{pi3n8VFB^FUzd5_qDo6{q{Y(t)NVz5qH`bSa>muP{-| z_o*m^FsFw87ZzqDn9?y+ub%;STeF}EM%WziNagCZdhKl#AsI~`UPBSGnj&T}ykMd` 
zef4BF4>`Ek{R#M0A9`2k*4LrwHi*hIWVUh--zoj>{RK8=SP{oT0cN;?4a)(;q$KYu zx`>^*Fn1*J$a-CzjfA#2Byg^x(qpjFgpiS**`AMYB5M`>Vq(|uos8{)g^kW$|bT!3|> zoamf;G~n2MrtI?}AgrEnAMI4I{_3*;kBizVeNyS4pxk&*D)%PoLyx^O*KHO%+$(>? z>k2`H%UG`-M;e%9b1<$J=iF4)AER680h~@g;sHO|e22T7r1Lr2=X(iQ$?M!ZR zXuSKU%b8}Ir8I$u?M!WcnxyePA;t9n5YNufGndNFoLmXKe?)Cv=+VL?RaI4Q$;rtT z)zlKP($mu)C9<)(IkJIt0UispZX2{b_y0*J{4cS_|Bx2{vnKCYvyUJ8eAgKtW<@cm zJI-WgR);V6%%X$)hmtw+{sEX4nNzz1pQXwcix5{W>0vxfOiWFIZC?QV@K^n4%qbIp zO!=2(n4<$noj(c+dhWT*-oJm}pbF5^FIp*#63d1l+)#Xy-5$I+duHQ>2DgLe0?dEV zpT>I}&MER@-MVuBM#VoO-I1jw{jtSG%JYj0NiQ!i<``j4PL(P7P>U{~tv%|qvbsHV z>R-f#e>V6zTFdks$(j#L#Z1JJb8~Y?y1PROQ~dl8=~*$J_Jv~aV!Bhktun$*f%-93 zheY$s>eTSto@+X3sc@*~h2aBewz#;MS^W^EZLUEeVTSk3s4I#u7En3_4aCK*aLI7L zqk13|bd8xK;V|q7weD2!)aFa(PeQmMoEQ;dIO0Z;zZfxh_yG#J?>nKD0QD~-4?MRi zSG?kJsfc<+JR*`0W7Z@e$J};mNR)C?%7WwN#!kA12b^I1gx`xWXgCF3G5SYxrF8U* zg3*t!5}TfF<3Dp-D-yXc7+yhkXAkYK2cp@c_!{aXeGRRfTUa<~ySqNsi8!L-+|cd& zB5RsX4FKa)El(7b5GzM<)Sw2`-AH8IEUsUp)gM3PZ}^f|hiOikINod8&h5A**e&$4 zWm6BIys^XR!BYLu{XR=RV6m5$$yy6F$O6|jy|icJbOi6k0x3_x@Q7xJ3;(pIZ zy}K#mXF7?{^C=b%Df-K3Q^@{*Lec-G!6tEsGdFT-E>3R#CAyUhLjZL%@-N^0h@s!A zAYsY{AC)Q=6dwB#R`sRiWjdcn`5L6lJulJ%4VYUy8T3?5!V1RU0DzepprmIybJ(TL z*aJgwQWBY+%J3@000hzb-U#iSBQq?qyj8+#zWxyQitvA>oh!OjBI^MjOJ}a$t3Leg zrX_Koyllu6L1Y4|xiO4f2ulz${G%=YZ7|%!5vM$~+Z5m92yp4+II}3O&mk zq)_HM(ABEv24r698TSLI{g-?cKoW82H($K}9gf?4rZiS>ovNjK1Ra8^hgyhPcM$#h@xBg}Xkr3~nruMvo%&;t;A0Tz5^tQ#sSt#x~|8`OK#s^h}_77NGs61oi|J&$h1b4|T9Ng)Y+yy8~`7-U@V4{9NNx z|EJd6*51xtP00oe5Ip{1FWYyS@UbulAdRJR0|Jg046C?fn;-mgA@xHPAZ50V_q7Un+=?McSy5Zez>A!HX-e!cdHH-D+Yua1eKjc zfLgKoMhJI39Z)<>wXIuswmmu-QV`xYd|e7H#nuc4`_N^3>CKoZ-Z8Toifsv(> zuh*|8#?|WGlRLAT!YJKjFP@C;nFaI{yCt<|hvpui884K;0nFqP@SWljd0chu=`+9ToeDXrUg-Ezf9PDM&}W_popKk>&WfU@ zi}{@SV|=by4W;D|=LR~Pz!`j-8~vbyUb zh$>GVbANJS6ro!=wTF}kG}SCR+e?3e4vp?G#^1`yCU$0s%1(HY^4qm^LIydq>AhnU zEDcT&epeYQlq3K{M|cfpB6UZtEV}dSHhfgpLAT8Ki;sWyUC_Pj@^2NhV(aFWFKvCc zR|RUqS6Lj`1ZmR&CwUYImy5Qdc;F?p9khVGZ!5W!@Zrv;+amS!^g)hTdb_e|FnDt4 
z3XhO%=SRrn<|yR{PceLN6SUS zU2w^?91^%54I3G`>U>PRkgm_Ro=G<)We|sS<;Qf$fmlz0nR%IDYenB4scJS!$XS+9 z@dTsSw4p;%g4UP%v56O?2p0?QHakJ}yqK#t=7&z4&#qAhRHncfNJef7g%y&}p+E!5 zAx2aZnV=r7<3ysbtg0Q2tHM665BK&K)k#wh3KrzRKv_Xk1&i>Dz~ko&AF7}0kyM@( z1$Fo7#iTi*fQc#SDB>L{d_4U#GbWgt`P9m4euTGDLFPT1iMo-^Ib2^KD|WiCt}&DV zx{OPKLOBoOtvB2eq-+xUMehM@S&^}G5U17x^FUX$I zvxQeQ3MkFq&;M$HNxSRelg{!NHDxsVcs@TH8B&*898E7HR~?IEoJ9bKha3VpLhk7> zLB5sUQbC?}fBX9v#^eEYf*&?!UIsrXL7t^@ox+Q2>WoG-#p4-qcb)g_n&+WWK+?b* z$;ZSrn74;3I*S}enx(EQ;XFle`p$k+T_?M!IBiRY)r*|9-248GC+pR1V6cRf316Q1StXz15~XsDj)n^+T`PYp@aLBvA#&e%hXG#hiMy2QOA_ji3uz z=Bb@9Y20h>&bow{VdK!Mxu5@e+IM90N~PIEA`^4>2@!{MY0{0btdz03;XMBwdSuR?7P=DZp1IuEIl*Le@H_vT&0zm}dl} z0!sga74F~jpO47xMbhw9W|9eCpMpG=BIZ{oO#pHhHD~WS4pZPHXL6XzQ)9%cjY>ey z+Fwb$^sE;oJ?B&2Rgh%50<~psadLCRv@l~u_WWegs82BddR?y01yBHX9l|UpCwH(o zqp7K>y;LGdWI=!Ry}0B@j3-`L!K(S}#JeLIt&*5UeQv=1;t$wgsN)1jpp6CM`xP_0q&A^6YWZR+!0f@(#!1L}RxreHU7b;fTCcSi&m%(TpAMO4sljFF zA*X}jr*FW-#9@Hda?a=1~J7vvz5d_M_xKITo!d}n_&nBPOQ*g8Lrmc zSWzn8U!jGH2A$2z#J`f9GyFkW-Yxq8#U}9L-NvR2Cs) zCpcCS9eW0!!mS~q(q)(jc3jzI;rLa4AKx)(Vai!&pA=iaRPLSXonHB1W~}h9+ez$4n;a+R=gG+hU0L}by|&-F8C0IPb^E~cZdC~urE-d{6LkLQJygv~u4QJ& zpLng^zgS)5@4Lzk$JMEMZDC3A0Gp`7ZQfqtw0W2%8u_I>Gk4P1JWX_OaH&|+55Iqt zJ95blAGr_iW8s|+!mUTo@$uYg+-mrAntSN;$*t1JR`2tr&t61LN{Ua@O3~KY?}V#K zJ!yfcbYxxcL+P`YQ_8we?NOg5)ekN-9PX!YbKizPf2*UTGvbemHP^*;9s0{lg4=Q| z7hhZjhsWpm%ynBLfypB52CYkvOlWsa6$C<#HPDiPV6k32$KyjeEZY@X?VyPaAKVE| ze3v`7e;N$faux02^)B6*?Y+vQs3?dzlNGm1W0bZ(LXdgxtU|&of-~aJ*=^t}=FAt_ zAk?##ciBL@(8qQ}cLpxDXJHpV$8?d90>R{a>36FcwW)Y??&R66^PSKwq3g4!5w#5| zskX0s=Y^vmkGGG0k&4=lHeum7f#`_t23^^_FUZ|dBFw1}xV;I#E;b2dWRB-m*H3LO zde?V#V(7h!ulHI2(m(27Xmip%tuBG+H8U)_w~lEJG#L0oNNC$}0Xx~OY?iis$3t!E zrjtrmWLp%FQ+c+fFCgsS1C3p5qIGSu0o}PJ)Fof;a512*(96PFK2o;K8>Z*aKEYx$}>2eu~O%1+E)Pv zqx*L-Pj(xJIm9^o(^a2f1fD;Ec+k^h4Qfps%sDBC5yTPV4ZhuvwC`O5t_KkDS}5Qz z)4{9i0y*!0;YNWcwl|0#q9R9i%W`2-;kcvj4ZV-ahFRK`#ELvvqI70;bvTuJJx&}& zclvNzg?+9ERh+@2@}(tV!)KjZW!r#n3$WiZ3hhykniA|4y#wG$ljUuq?9NM~2rcko 
z@v>*CvU01hGiqf{-%TVZd7Twk&@5?+8si0hU%p!UU9Z&H z7GljR|Cnl9uYc(^G>qkhV?3cz#{O1}H!jB<7(DSpGH_yD7JRiu3t|DBcAXV|&>L3Y zruQ!gtc?ybj7zQYwH6~)2FqF}wDKYS5y1cWq0`njvU`Tk^U00w>^PqN*RXF%?$AFaZ47 zbPS~K^nb)YX*GUgkYEn{S(ZynBFuR#AjSQ@3pY*-D>fuPgmu7Fgz&kZ(|Dlg_bPt2 z*N}LyvbMyj>fO3!G5DQ`tyLdaDq?y>L6p~Ha%gldIgsF$`%z}Jw^l`^dR(Vke9a^R zf*H`{jm3k=z z_x+vOWg1RR+1OR!2t&h z%=5vaCEa65VV4;-s!Qk-V^?#?7`ewQ&O(Y#l`#)s50GZJ+6@zQTyd=b@zM8l`r{{q zN=CS9)Mb=h@V&DMBZh7uG+tm8y-c=Fl(m7(<0l$sK<}8Q4?_CUve)H6N*NQX=Jgr} z-$p5*_@ z1jcVuddtG8@?1PrVR($j?X^~>#XSnn;)`3j8cc*EKJYMl+Vr9VDSP>S@4DwCBkmR< zr&Fk$fm{^*lxt9=Ug%wRIZlAr*eI?>IQEhKpeAARJF4sgBs=@1U+?Qg?5LMbjTY!?aD$#j5HDA&zm!Jl!bpu3(>E9p|WbQgSB`MpurU@ES;%b)SgS3BAQc z;j7K5qhY5)$X!W45F9_AGzSGs{N|tz>_$LY79ENq#KacNS$gQK2*b{4<-S0-3{_7t zyOMRX(kBpm7|x#!Zn4Mj`QQUn9e9i%P}*yB6YLrKV-KM-8sR;Pq@&Aj1_g5`A$&rx zp?7AbenTQVu`>DXMKigN%p{1hdNaEJw{8ucd;ws$6c3&A%dW{lqAo^vD0Si0CCPK4 zs&AIc*qI6?%J~*ZA#dq;pRDs35}4ggZJQn4k{PgEz|At5uZray=gRJML4K9)kHszd z?}0FD&$bE)MG$|ZuK$E_t89(=^rsUw!0j4ujPMFPnY{A7P{nR=;y0&)@FFLhxjAg^9TJ4=>G<++byp?-@7RA}{;hD8wPBqSt=zn}n&WeB}7JYfR}iq&_j(-ZPpvahsILY6WXS970m z5xi8$&?6-6zM^F#?7k!9Y@wF=TZKS=i&?Rj zB`==3l!m6pDU`s!mq+AkJ$(5XHJFoO5s*B-**=v?U|9#s6n2E!BxI=~otln}WdL=D zAukMvO*{ajOcy}jAFlKBA>3eEe@q-$$O+`9=PRvmw%%D}_v}BdUYU-*lp+K_On%UBSeAJ;Q{hB3c+2)p(j%5$1rs4^JMgBEyt2@WtM}*}i zQxQ^I>XX3+fCpdF6f?66gRd0_88rtCCV|9_J065DLrL?fkJJ#kRgH$Ys2SL6p8VA6 zK-rz9;s?+JR9TS15Y+}=uB(xm_G@D z_!^dn^P6yNvo$~f+rc9Q5vIf#(*CXT`_H7f|EDUmf2P*Pg9CbkLhgGvL$y<`TXi2t z?$vx=!(Z%RV#-OT0p6dH;1-{d*T3Q(F+K6eHW590)wGnSu!Mcj0L$B3Nn+mT+eMc? 
zpa7%t47-K$Yx|wBB#f&rv_rh5I2#7Y#ih$qA<%0YjXF6ZuB3r|2dK}1b9sH8ls-DORd4z_*?)dL*?v=L z#8`Kxlqfvz6n4VfSolJhA3wrf>vS&P){f4?I^}NU4L0vYRHN6(U(l?{@8hx{b37VW zB=}=|9WlZA`Ou-P>(VuDt|g`SpHA^LNi)`rNOJNKEF7F5R@=@mD)z}=mfwUx$V``V zqm?3=VYQ#8+3Be;WZ0&D19)uWe0P~}4=Tz{|9Us7>)9o$Hsi@M7rgJ_cE>C5e*?w`R zBpWIOlp^Lpm^(92s5jS%*1TAEvvW0hHVx=F^0M;H>j}J{($;I{H%9UC0>6W;Lf9Sv zoVOBhwaky72~@{{^CAL79dw&b6D3#Mf<~4HQgh$x-CLy5i2EMHy8jqS`!ir6KMXFw zd4KSRB6eN>Jvn<2E`-6rYpXTIkTEx zSpjM%p2VQA;RewyK;p-dBgmby53-E+t(vPw`A+X}Xe&<^#{e1(QoN)`RqtyYU4;Otq*TK*BJF^l zThty3^4sgxj}W{<1#6eGq2_yMg$B;2Hr;Di#D)?>$GG*;uvawCpv1Y!)?Wi|&s?_@ z>2QQ*mnZnI`Z*42&fT}fz${KqL=z!P?Y9SxzZ;d@xqj5nbo5J;a5)~zjywDedit*aRx%fyVLr$m>clNvKr>}V=!fPYwbV>T->k!5NJm8uEfF#5FR>hs#rDVFIt zQO3K~;avlP)Qkvx`P6ljiXprwfdTuY5>|km2z8u2?#Wwkg(+Lj4?fFYuC2~MFrSQF zyFc_1(M=(9tO~PBQp}ZN8>jQ-q;U22Jn54anHIywr!l zefw()9#O~&AVZ}7!SdA?VzOJPFQTz()0!T>tp{00R63UXXhydWW*V38yD8ASc?SB~ zzYI*=W&e8FrET1!Vd^Y`9W`(T>Z-W2e>!K^`7`8Cs+ieml%B2jaTvLxsj2BBr7iwQ zZBI-*=i=ge>$uTR>=9An5%q%ePaq2sYiR~qW&sGO7>!O+U+yiMIQ}(l$Q=aX`CVZV z@T%|pHM{s4(e0pCV|c&SV`Lsftv#_jmox2`{U?~r0=36)If$sdHUUaz|8-{Wzg#Px z|7MSpGpyo{XMocmD^f>d3k$yjYrSW&aYnC+0E_zM%?T98JU2iSo!th+;c_s?`*92j zDC_(!i^)g`dm^CA5SZ`mYBIpB6~OkaH4p|OX3mE~`E+S55PK^CBMxeUmZs(kmw2dP z3rAmtrN0rk07^1eQ%g%n9}36=y0n4a$QRgJxv?2?FU!my z^@ucLGKWV`J_SV-8X8&-91aI+WQRH*$!V|8pDRm?^KFV&GzvrB$sqe9#M2v?nRS25 z>jes@+kK+&iFOf^meg_SB@O``jfY1XfxF_rl++z#GaKruPZX^$^wCgMz35nf{Z~0r z&W*HiJ4QIAc;qC2^_EJ(6uZ+&Ez-s(;UATn%f*%SRjS2RSm39Y8JH-*H~~aRM1lpT z;gIa8)?LGDKDquj(mVZO`ibKF)M}M0hktV>J3Y^T&^GxsP3= zy(YVDTVXMk`;g7{H7^Fgm#6b}Ztus?yCsn>fXPiFdc_Ay3wtYtA*-n)5e*^Lc5Y zr^RuY|M0$j`#7|3-MG7N-+lu4m(0Qp{vyH}SqA>|2lB4g^?d~$f-~S3CcA68*Y@oz zj$+$_9|XU%KDqS(xo;mQ_!cMOk0#gb$NTn0%xK@ZcF)IheyBe|)a3o{HlBYWRAB7) zyz*$aWKgi$!=gW8haRfjo%C(wNm;t_w2@i=yRejv)2N6m%0{x(-M8t?&D%K*OfczTt#12a zf+CZa27Y}!JhFawdvS=88n_eL8Y_7RJf_^r_qL61LuSRE^9xdA`6pEHnu6wYhpQpO zooe)a`kdA70BUn4Yv^36|4L+(`i?31&H$_<=CQ8yxqg}J?jhrl3u^D7gNGZgJZt=3 
zbVPlth+e--uV0W1+_*#Ap)Uqb)X|3D0%5(5_DAPyPn~%heXZrs$6Xftm+c|8Hr!|= zmsor^7e95OX3;BfvpcoAV%RT_P7vAoUboyRAW&GpQ_WZ^kNa7^)pq%j!N+ z-4kcu?##Oe(#5#6ASwJW&V`Oaj@yn)6Mkeok<+`rJ0Y@L=`h+VSwFbFL#to3^cf3X z*r22?raWD!inL2b8H1H1h_9UKkB-nLok1Rb-SQ!VKXB>up~S0#qF<0&Vdui&(Q-U7 z+QTBK-#^4yWR7ml<`uf|(`rVx3!bjDoU3UP39Nnk`-fJImauQe)Ag~iv)a)a*f$qi zVxm;nOleAKI1;ZAq-+cdg&{q75Jl zo%uCbGSiNzEM4_ptfP5(kD$D1BD=nOW|t{B%l9^4`>NP6nMZ9OE_aK?mA48$D!C}b z2NAM7!>+}Jxp0(Sz4Vl3Xng5s1FDg}D$d~;NA30+xd!Z%-XZad!LC7W?dUTO5_=>0 zcB;}m>Nnb5<${YJ8%`ZVrW^*Ak(PdV`qPmPP#AcRh^;Jv8V(n_j)T$HqNLOSJ zSf)9<+?QjlQw~tOV8b!x#aiUsK8K&UUQ6;e*s`t*FMx5c@B$-jv|8=B%dGn~33+=Q zQT=T|``xYYs%Fr1k5%oKma*!4XxrfjQX{87qx((@*uMG_d_=7hNC>t_&0IlE0lmL7 zfMns>bH82a4H$9iv{|%NOM?916N0Tr+y+7Cz0b_YSh- zP>v6$uDqDx=9Ms?F2HZjE;Om{xFNN`5E)g?F^_dh#ALW7ENd&{Gg}YM#L4SGrZhaB zvJsq?0(WVF@*I`BJ9K%DKjFRR$ab-+-Dxd#$A7P=#e+5T_7Yraieq=JqJAg@j(_Cr z3XeNA&5w5A;z$ot-)PaV7TH+}`yXWXk6R#_xvDCI2OSATKoC8){^*>eZ31%az+oNk zvlD>wVd8rq8Mi_&14(p)RXBb7RPj!~Kq`fIe zD9VtDeBwRexzQRZ;4}$MI2!LNH&*7%gWgI{TBwPd@=UlrP+iF#~op)q-dG&~}Urm+&Xh2!$kqVp| zSMy?OAg%NV^%swGrtSv8cPn+fZiz^5r#l57k!UO)^O<@zaor3?8#+j~8S+hZS{x4- zxe^=e5wIu6oe=nS%K|~YIp-=CLtoWV0inJC4j=2Aqe(nFES|DO7?e!o_kgYi!FBqp z`K8{)6+g-gs1i=05n0|;veBo$OvYHxW_@Th9hq43Zb0m zX{K=tWc`8UV5(_x6mq}g2NYR2)j8^pil&ipVzq=eB@yVX>In=Uw{*o=mc5tiWM(Z| zYkHC0RbaHLGig!Q1?TOe%lZWi)ltq0*(PnCNw#8Wslg!xJ`C#|dpQ~#3?V;`SL!S) z2=i^&!b~Eb{Pae3A6!RSmVd;JY!%>~?C0$`8l04)HcLA2GdxC7GGqATm8`jfy@ER% z>h*|F6Oz>0^)zALaznCy&BXFUD__AZL8LY%oFUrx!=XkRuolBj?3U?Wqb##qO1ljEhIX?M zsv7rC9fhU2I@Mseg=_Rj0`xexus;Th>9~fSq^7Ln8DW`rWtcm}K}u-GYCH7ujJbVi z>c@sBJ7Ukfi34F5tjNZQM5v10&%_^tFl$k)?6kg+E~id-wrp8D9@_TqJ!vJH1pIHt zTmh)dK5_MrVhbl3zv_G7G72fyRgR6zG^kJaoxXY73X)Qq8cq8A_p`N&RUdI{$I)i3 z967SY4r3o-`aW|uk>{=V>%WR3gkp<*8>2}re?L43Te#ArbOc^0i~SQf7&owxmUDtE z$g4f~7!utAqRp&C8y8`%0 z4K^=^)Gj!;U!6LOj)@be9g_FHZM?2{12XIx;DiNI`$`$imF@B;f#u%Y+pu7+wo3(` z!VS>b3dE8#7gDp00v40-ZBMV4;FVO}qg&J}Qptl$EJl>G>=kQK7tUgBV*f*FK=K0~ z*4K(XQ?pl(qWKQmUa340{D~% 
z*^KGygc&tJH+N3944#HmkVO8|4vmBAO@bo@PlgFl7&+C{4zUxFW^~zP}L1c17sHC+_?@M|=6m0=ZMf!jkDL zWE9>Mw^l)lcSf4;9j(!N+V7cVZ=+8zd*v zWWmyR^8G@*UCKH;9;B(O5nqF|G=)8;pQA+)2CfF@8x`euy~a)+Chp zM}JmoK=o6QGsqvnvG=9;L3YR^J0uYCY7Nya;>mSu1BfAHr)sb2=5%_ZqQDsseyx+X zA(c2$Fj|-S<2%w*k@CZp8~FpOw83uGm1fbQ#Q<8(LXU~n{ID-2x&4{=fr`f@BUXY{ zz{aE)d*Ie`(-3{Jeq?*>NZ_=k|MJ)NrQaGyREa|<+EC(E|5ot|r|WKr{|LaQEWC$s ze8+9^OKg(0dpW_-)5W?9zM`{vN6{?>;UVyg`=#LPtDWkLYTLiN+nH>pY@zqZ`|VQa zg~4pPK-#zqJQX7WPyZvV(+pv406}`6?Y+1QN0-N{^wl;pYWz1=#K4)9#RPW*Hm{6V zgh;u@Ii>X;HEmNx;)7kluw zt>p!@TeSI4sf$Tjw=VOeFBqQ?`O|tBBo|D2NA0SaBGU~02~0@&TXP>iGi+9V+9X @@ -42,16 +42,16 @@ Figure 1. PaddlePaddle on IA MKL,MKLML以及MKL-DNN三者关系如下表: -| Name | Open Source | License | Descriptions | -|------------|----------------| ------------| --------------| -| MKL | No | Proprietary | Accelerate math processing routines | -| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning | -| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks | +| Name | Open Source | License | Descriptions | +| :---------- | :--------------- | :---------- | :------------ | +| MKL | No | Proprietary | Accelerate math processing routines | +| MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning | +| MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks | MKLML可以与MKL-DNN共同使用,以此达到最好的性能。
-
+
Figure 2. PaddlePaddle with MKL Engines
@@ -103,7 +103,7 @@ MKL-DNN的库目前只有动态库`libmkldnn.so`。 所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
-
+
Figure 3. MKLDNNMatrix
@@ -113,7 +113,7 @@ Figure 3. MKLDNNMatrix 子类只需要使用定义好的接口,实现具体的函数功能即可。
-
+
Figure 4. MKLDNNLayer
@@ -150,7 +150,7 @@ Figure 4. MKLDNNLayer 所以整体上,在实现每个子类的时候就不需要关心分支的事情了。
-
+
Figure 5. Merge Gradients
diff --git a/doc/design/mkldnn/image/engine.png b/doc/design/mkldnn/image/engine.png index a60b7ad5553bd6d7d5e255fabc14467ef8a57c88..1f5f65c2cc765a514a3ba9e7b7f468e1dc4b0c3b 100644 GIT binary patch literal 13586 zcmb`ObyQUEyXXNC34sBmb7*8pk&=*+E(HaKE@{b;E~TWA4hbodo)(zdRK9g9+DEfMS#l(X9gu*j6%XG8h=h7=kO)`@k}ulcFvh1B0mZ?v0tqO+<@< zAyl9wEAz&~Xgk9@l5RX>cev4*Vde?dGn^R@dow;H&u8a!gb55w1524#14Js1reTo}l*sPwLjkG@q(F7szMWC# zr=A+=o@2W#F89;-I;Rr07njuZ8AY!Pifz@$&nI?Ii;G@AEqEdd4pPJO?@d#}s#USq>-;7fip_-yt zpq_CMU{M#A41#tFm==|mx&kjL0ky~4m85t{VPn63eWsr%Im;V)<~G51=I>JeSu^ETS62~iL>U6BM)KdSrOFX!E3ZqD z1cD|gi*-j~tKi1cu)?6BV@z$N&^#QxBoOmcMMMy&P$qyaNRf~fI7!>jCU{`Yz<*yg zXQ^bqX{L02+ciUE7LY3a_-!(jzB=$rr8aVKu5cJ|A?ou*sCj!P`w%H|leKg<1;+I8GZLtY9bl&@lDgbRD zB3&yOidE`T(i9P$xVHLw28Zpz4ijo3b#t2&b)_oF7jzVoXJ0q}AUo_Ewgh*f4432O z2A8YJ4Hmk zZkDoa2oGg!9@e?fE~c#w9=f{EdgMOan=)@UP?cnbEzzZ|4ZAdJcTW!6qtObX4^?u>TH-eGs6YQ~N4z1q#}2|(u_j?~D$(~po8Xg%BL=4GuCAV&g;Y4ij8Z)pi>Jp=tmNTV zvl44Bcm9wirUwgFle?dL&HKEDYOw*=LoA(vD)d`7BmB^^L^nQ4NjC>V+0oQAl8~~Y zh>#rt=~sM$s>wPr_5_@ z0=Ck4Z{VIWOnkk?sf3-sw*Ow6ghn+_FlMbr^~@EEN^3a<&-% z@}l+Gu+N{%&=rZ%4=Rs~DDjf0+MJAyg`Sc!STU82yM0ycJdW zC57^WkFOH3l@pH(i(G!v{xi=;P12z@>w7w9Jc5?*k_wVIs zM(Les2FZ=zVPqT?I>z9jpIzAnS#I0g+hw`A9jkr1 zSd#Cm43G$OVxxe(Ru-QB^=l#R%a`1IC~hq6`*wLtreHIzNNZrkFZO9D2@k2?jmBJ4 zy^*o8mHPy_55h^fIXxvx78gxLgoG04$%<=gCJUOIrHs1;i@Ae9COAZe`T1Q{)zufy zh@T{UjY4n|0*yEm%zJ0Cz?{^imemO!^F>gEEjGeUK1#=eV%`}PWJohdXKwPio*tnP+MHvN0%fBWIblo7Q6_IDN${v`u6#8hbThVFgdGx(pMM_)s zy?5AKMMNKlhF+~-6Ybf;y_&1Gk)zT-%6Hnf0?Gh*7-R4bT{*YsXrppB+ zei)*4aN15b-tltVdGAsr`W2dbK*zzH9Lm^1}<}74^eP zL1j3OPral1EUFIqE?(TZzvm>`_iM9#%mISe6(NL4(6`Zbdi}EHEVw59o3XRp8^Jgg z;dgu5<0F*Dk;+5&Pf3$!s@vZ4PClE@yehvU-oGW0y7|fS$7^LvL9xuOQm|U>8rmXq z=Ii6{LT?<)h^LK3;kWB8C4JM!Eyb5K-7T*ZuLPq~F!E??pl;Z0p5e=7A+l0ms&Q!8 zj?_;j`S6W^*C|=zJkQv{Dd5n1wuD|}{%yoLw-1#-YCH( zY#@@m{3zgL$k1JmjU~H71X&u{KX`F|i;0YM_t~iST_U^j&*is(H6JcLh z1AW*0yH;^dZm0VBF4qZI79T+9rcO|^nA@uru!T6?&u^-A?K^2Bw40V$08Y3pN^yjUmp}hCe8@GHhl%I?j`x@alo3~t++37ZV1TBC&5u2OF 
zr=G~8n?+(N;=t?etgF1?(-p+h(k~)YeOSW;5K0e6IM+_ArP;MZ2K(C|GvH|lFRVP# z2_0*3%xdvkW9FsE|IqbX{dt=2k}V^pPJ~R<1BzTdCzP*LK?ieQCw9LL?L8fe$H+2e zlS?v3cpN7oa05cr)Au~fL{&8r!no~vuaaajydlq3UpN!K=1ndfT~Ib>b5Q^}g@Hz6 zGCjM8(f3SQgvoK{dR|-iv*~!pJ88)|Df!3!Y#ni?Eic6HcX< z-I~G5bYGNz9XFuPeHJ-;v?HUjxxwYe{!NrT4yMcBQZSszyKGzW+tav0%1ZTpcg)2` zv{HKL5bR4^w`l*9^HGk>pA$Wa1IIz0ytNTse*4}HEI-Bn&erBC$yn<;0-WXNTlwjl zuZ~I7-dV~|NA&~UCMoyW32m~^e7&om_*?gH4l=e`IIx(e*6U>;kAl`)1BY7rdB=FpYSW!2b8#~e8M)=C90D1a-8_B`dcZrVXKW8 z%d);aG7j}OkZ&wODQR?G7dE$^7tzr~lYjoKTMnhKPft%9un==ij@M03w+kmg{u^ii z%lPzvg_F-P1dtL{DB*&`!DCUZ(;7*^7PPB`cBCCroLpR7t18(r-N(VQ0puvA4(QnA zBzjhc4S1;#5)vA5^E;`Qs49emM?7GGfq`WJP`J9gUkV@?7pK$~psF4_1`;G#}1Rj*qWCINEHv@xn_4F>@$tbnyAF z-?zOuS~o&~%9E0kkYoV2ES8vnHABhUgS*C2sNIl|kP6pe5Eh8pK*j)yZ8u$MGhff_ z=-_Y?Kwckq&dC+h)dynkhYL)D3ZS7ISWWzf{4&fnHk=c!N-Lyha>vKV7Co*#?Pm2m z^_w|qP+Vu;7ko`CJdhIZSttZz0ELvB8MMo;f!ZyWEChI@RKWloJm>gD5Xlf{ql9EV zK~3TAL&zX;Am}6JN08s3T&l&#{q4-Di56{c05g8l)y7T4dm{j~NFWCji%>nz2XeFa zpvS5v=hg|Uq}}nrcJ1$(3oR2YOaa&{OfaAQAJeD*dRsfcmjJ0$)_j9V8d)$5z={TW zN8dkG}TWl+#)p@wuh76TiT}Niy85>1xBc`~ zSuJX&{(T{>l;^1XXsxx3R1N2j8}fQBn4qcahxeR#i;)ZLXd$r7s)wj%yQ2R}IOcjo zr|kNmf9@_$_QzLTZxqk{LYm8kRZ)g-&luk2Ug$xQ%WIF@#ZT9ce7@`X-;>}ix`@{8 zFzn5YKwk@;h~J?Oip6F395sM>8fUP6OzvI1280) zh<-$U<1Vm2Yq_#;%pbgq*SeaaT=YImc&I`c-(7Nf=3D7mJ$O?M(OW_HFa9Lp^FY$6 zUA+@y{&d#L$mr3kZ?okN?(do+!}FR6CzpCK{V17kQUaoo4s%2`?8>tZ86K>}mpKp7 z)oj`sAI{*MtR8QcpKkk)xHm8NLqQ@Nzul@)&IsE&3?}UvL1r!>B0cSP>I#t~mRvbB zFz`5?laj8ti615u?*Kx2*?Is$PTX6Nju*PDA*ZfAePg2!XSg2|d}J1j!d4KX5Uc>|=!M*MB<&WPTdG)WzWynUT*Zl7=rQYJBF8XHXY(dK7#ouFLHmUYF8m ztpoK{su@-5^DT6UeDX&-%npXpSEUVfq|sm?y}-H z4&u_co>?vK!;(^zi829CiLUk;Z`#nElK3)_-DXcrr)Q^W#=k?}L$UfPw%E0xOoW{@ zlM$|KWOmPg!LLzyX3J8zWUxm3;-cLQ_eR>t=c(i>vv|ceB?A3kHr?cuiNF9P`eoxp z__vl`OiI|hs9^>TVD+Vs#PjC^;Ybq^y#4R24J_F|IVM7t)b2A>=SBH`y^YTr=q@L^ zKI`$L#**6#mo4cFIFirywT_&1j<6>uI1W>)ki^=}5mZEU;~Cr&$8m1~;&&UdkJNIS zosUuIU1&E#{aJ_Sj-i`WMeh>#`{n|x?ez_Qx88gc*hR}nO{bZ=W5G)XT}!2Pk*(>c 
z!F(S36}J(vN2?Sq6--UBj*LDZqz<3)3M(zhV^LyY?bb2mk>|6t(Qx?mL}k7VH_u2D zr5xV)rSaj}TZs;`a1z&bZ0zE}!6gw~a7HEU;&k`){Jds%30@t>_DU)Fm&a!HOd||^ zfl~YFoUs+nbP0WcPW=ulq1ZS`!Epsq*>N*{9ugk9T%!DO%ecU6+5g0zl6ddarAdyQ z!?uw+UEhw-0Zd82T|&;36}J4dtlWPN6@>-A7d0suaCnyt*>LOBk2knw;7QzGqZ)3S zysvkmf3a)DDDoWj5UmZavx~IR_hZ%}bql-OE^R_T z{BhrTWOTLH@y8cz?cn?s#4!Bb&hgZ~a_f)PHN8~ebMZ}u_nyrI{O%*kR=gO_?-)hd zOX7W*tg<+*>I9q?o(m;@@whx}+4bF%;Y&Gex!(T%hBbffJsxT{`?0a-8vM7Uq~y-o z{_>9X+G9m==Ox(_Tb>|Lo#@fg?=nev+xp+XP8k^)4r>FcW&dBS#Xnl_g;`rsVPP*j zIEXXkl}Mbm0^kM#9ZBKotbq&0|ET>rE?k?rEu28h*S84}1_F<*gY#M26JZEdE|Q{< z5S+=`SvnqaXz1)$vcJL9V(faa{@q>t;ar|VkO6sjr*&mz&w2*Uf=UJpf3RhCy(jdOHH9PFj{Dkufqj}%qj*xfy4wlMpL zIAA428o!lveW&oOvawti3=oGQHW{?z!MYmhWT9jM7G@T_W**Itn2{#nSM0Ft;9yv+ z?6E1-VjqvxNy=#ma16?oMIbWnp@?ta*!f_;zo{M8;ZdN~yY5F6t&uXraJ6)FCUqzG zii*?KUzn!fPTNV{TCXZ+MQnUq9oK1k4%TL!h069rN=9a76pu*o|6O|chqlWWpZv&* zvc+GVtwh~Spy#qK5{BhmpBybk80DSeb{(z%_Jy`?-Fwu9tDVzB11fvf7;0tej1&Zb ztoB2mL4QIKUp9$#a4=5$FAby{*(MjYwgd99@VoY4smENp6>yeG!Tb#sYiztZ{@ems zw%|V^L}-zexs%rxia}$FmgUsW57jnns~`+f3I%15F?SAqayiO@5U7Mg6X0gyg-Uhk z{}8(Wl}P>v_X~Yw($yXHj~?k^^kjWZEqzBNsFObXnH*9LtpHa5PcFz7?yx{vHcc0x z8R<+Q($gfeb_8KMr{SJSPaK`d!*f6t2-(LrfEcNQ7;RmgZo>hQ;WeV9ygXYcRSsw* zH29fA<4BX{>*e{n&9d)#2V;PuyJMam$O8uN0msWKt>PR~a2sji%AL$OpFy344D(%m zY1yw!nN~!YIjyo=e)#h7%k%cPyN!z>-$UR!bP_3(L1=e~2n+?tYazEoK>(-o4`?o(Kv#m7!n|hP~Z(&ptBc(JH}Y z`RVT`6D7S9$Ypy02mwoi%hn%xk69Q)eMU+A!MF3n>&EeAA56HGnDNr9?&>X(#oI^i z&6*?t6wv>Hn=aD8ea!Q%UQOUQq<#4WxLxoO?ZsbGNQG$ z9M`)oyXYpEJ!*%>a?s_OWAmT#-Rnq;Uzq~?0E(~~_q6hS?`4I2V&Q9C*xYHpbZCjR zUoK1W9tD!Tx^vV`wXt^8FC-0mz*$nGXUJXRgCMre-%h+5*L6*lUyqUBpTQwI$InED~$3KY&zVx|?w409A9eX+Cp(d3x#F^0ouV7#%|_v5*XIG?|&N z_NmJf(RoS~&@I+^8*Aup*+A)Y_yyg$IHfbx0hp{!B8i*Ya{n!z78gJ)`9*`iElR|j zbWM5fcR5^+>sY;nsTVZ^!i$9C&*k;*W#ckD>#)wWKTNXq-=u}FVQMk0&iL7Qzk9m~ z>P|{+qR*pTvBmRC16Q}Ksw(!pQHSS?S=R&TPktA#vwC74FsTA=-IL@i;m4Di(UHXh zm9A)e@2w6Rc=G{y){IN^6EkZhU-k98Ro@dFYXLsj4!V$F5+9f;9Y=j=5Wz?fe=N0w^V>yIuUiBZobh;cBEzDL8k)Qddf0_ 
zzus0C{WrR~>p{8%Op}5P@m3(-ptCFLoH)y?Pv1D@vXdN=sTpNr`fI3G2MlX7>AAOs z6&PWxB-*_dsQYs*C#1)(q*X8A%oB?vXSm4q&fcql|M_|x4n_ei0-t!??rtP^dg`g$ z!Ok5xIGB9Xf5)83KNx}eJH6hp<+drN0PHb-!Jn;5)nn}0g$@}UZamS;ieYn$93)JK zUv5jqFnXJ5nIf`;Qk6P9+1v12>5NXcen=^*JIw|Ko4lhdQ>>*~3;8@_$bC;F__-IG zNmD0RwV_+8yCiO!Hf?i8Zo^0HlowzV1_Y>kI=?prTk!IpL1hYYj3nmvpPKm%^_@TQ zgeUQR7I$)_di(RbhQT0|NGWCO?kZuR7)6K#Dlja-wfC^GC!I;69sew$31<_O-I-1X zcVx9_YrW|hAsA#I`2PfV1su@5;H5Ejs?!9KE=-x}!jToAk0Xsv*~4{$p1)@?T2C zx>D=%m0|H(3BV-UWMgWfGaOQ}P2@&uDVI&?2Gp@9U4u6*PG{Yh?k;Q*RJ2Xwwb~Sj zdgIrGwg{tJ3E#z#yAOl(iG)1n3GThkPRW5_$xPE2<*B+jf>~immafQLLX-fq{K~&m zMfxE_3s*4kc4^rqB&Cq?)67T*$IA<=w`Y>WzY&9zq64KFy+qQC?$F52@5 zIL>uIcuEwQGY%NM*_fX3rEkY#_i`KRT!j7Lk!7AlqRvG|vq1wOc&0#BP7bhYAk0gFy#4HAv`e^pAu?nmhGLhxbwCX8>V(M>vrtA|L|NiIf5w83`jiE!pXXH4%xca#I5 zSR4Q+SlI;_lmdcuWyY^u}{sgP!1$7XlTW3}9IMvmLtT8l3Zb@J=#u|3 zw$NdgAg|MvKOm!|82jNJCm$u=ncL}C+tpEdZ+Y;qkuL=)g7sDa162l~YZmv|4|A>1 zI5Su^bOukjv$uC>N=ex)Rg`kNYkH7~DJ;-ZCsnB0&?5(o&8}|HsK^du?|wDVtNXa* zcU(qtB~S0r2qK!wTQsZA>9h4Mf9fk4WKS+D0l^l2m(mmpl@(E4 zGamMqmN{7Gr3MwB9`&NmOcYOO!nDT$s0K0k#iSladdcXB2;Q&<+w-s!JsTFI$hJ;| znze;zVTgmnKPBZfwX;5cIaOx69LC`%5l0u_89*NQetA5yDs|dn>#Jd1QGa_H-J2zY z@=%~4+3E4GtN(@MCH+Regx+&((83F|nZE!L^&y zo_}2RFAAy{0fp4IvGUk|>m+|9{qXl#gk!SHhPtpO7wrAlJ26zwtAE!&(6G|WkxOHB zwpUnC(Z4x8bJr9QniL9>wE~!wWqK3v9g)`M%U?vSk~20oiu6ol4hu46YTS;~0c=EF zw?kVVQe>f)4neXHW6#RnU)G|HE;%3bWX^wly~8c-_2di2B3|~8r#)dVKJl?sOXHKY zFSB{-h*}qs!K~e9>Ob!civ9ZE(6ympqB~JBl8&2Nr73KTcKa4}tu!5KGZ)evIdOf? 
zc~+chGZEc%94?XnrKfK1V6vWZXUTxW@UOGjUi;r%ayefRksnv9D{TYy!T( z3%mZC>&wIXh;dv8dhLe-T%-XB!{Z$lKy87&mZ;W)0v6NreW|mQJJ7dxmABh-F^a`# z>R?Vbod{(Y zGMu>=NWp$c;|qdL0=yv|0^E_ODpvL5{=#!FDY5KuF!6RLR+k;Kd{5ICyre9vL{_RQ zLJLgiR>_{W1`oQEnO1xjZ-aXLH8EI^hkjsaZ!nqVdVwqLwU+ul&ubwBo8pnZ3BTch zp%z&qTZ|M?6ei)DaQEp?`u=t!FyfO%Fz34|FV4q<#4;{agBx5F6CEccwFXS`xa-zs zi!C=7A8r{DR2(dY@9~`Pg=MEiK-;f#m&pl|u&^vzjDa$+zfgN;N!mg*)9o{RlWm#w zB3Mn#a+^8JiN3cibK|DyxWH!6(M9vl_e%bl#zs`c3o1m97Nq;W?5lP!o~Wu^Jrj0I z-@x-8V{??V?qchHd(x8ZtAmuCRww$%m7foA&h&!?|;4m>JU_=S|<9 z5#M6sFT~qaWt~T6Q6!rG%}>6nU2Hvlz<}~?F5d6(DmW0&@VYt8Vsr+qLm3PF<8r?k z-4{)1y<8TKf7u!IpA$({<+gBQb}O!%3xz`dkk?P6m){0%b-k|JtElUy;(o~TB#77} zuFtnE<$;>+@LNB$V@b*w#Y8l%<3MwZ{L<6UE_X?Zloq$Ceq3KL$Lulb(cirlk25Kg zx{kg*!6%zsCt(+bvKt1_AI(%hhathW<8N0@u}Uz1;3C8kp!V#fm$N)pZ=vz4zGjPx8`FmoZd$C?ls@aS6xriFPTb30fy?>}eksl2TR{ zLbKgWrfg^n+d}qwMJl*^J|g?RtIQl?Ro3m(QBMDwCR+;7j^ekTf5EqNOhLxSaXYRB z_Z7=&kLat8bbq9qIV^52Y1yVRX;fq>xhU*`)&Ttc9%-pz?6TSZiAXbvni$_J;VS{Y z#p&n1C(Q86jd7{r;5wn&FV~kRbraIJaOsx?7Mv!58g!g0PbQM`YGEsOpPYn7Ur65| zhY!0*sjJ#T_OrKB3$_bmQknp*=|{~U_X3c?BeP>Ti+DWgpjkQiHr0N1{-SDf`m@|# z;_p`dRB8#7>4$AhzBnDi`i8VS-4lT(T?&e7Wbt~4cB%<$Bq{A!j6g~WOG@IaKI^o_ z+TF#!_f-q%?Nmms1F7|%F^SVp8lnr(vwjW3Dq(DCdomuFX78hH!3-Ubg9ta<74cNR zMEr#5_})8L&9+D%&_1OnQ|sR~8Z<&pG}z7! 
z>EJRQ$9dO7+Gj1F54v(528Dd^E^O()9}~Y|8M2vLBqFfXX%6d(6jXM~Sd8pIbxDWo zJ?tzAbxh5--(RYHMx1XD>fvN`WpacwMcm1p2eU|vzu`K13oXy4fI9f z*B_D*)RXbE@6FH6ZJ+p?tr$kd=}6oUV{@cyx`)f!T4`lN#42DatO(A9h$&Jpp?NK#xX()2~s){HX zgd#O(5or^HNVttR%G#Da4fk(=;ywJIvnRUU#H|%G@Je zo+Z}Axpn=m>D5N-Xubxa08OcWO*EqHW;fa<_1?Fs@RaG2h7u9l4rnciAG@EIzw7f~ z-t4d>SJ5Ty91oC}3M?P`=APTo7ZI7q+dAZ}SqQaYb;IKk86oX5Uqj@xK2Q&>{P^n6 zdFE02%Xg;KtM3~1kixGU!fbPA;VtjlNLRrlq~lPwA=iKJJyi}GE}hZ&77SXBz%Ir< z>*6>l?BJ=uHx>O{%4Ikdy)itKeL?kY~!62AY-J}VLdBeDL7>xWdv z&)2+zp`l@>ns^jg?GISniQNyB3N>4}f7Tpro%$R^@KK;Ia2ObMo__#3SdzZy@%v!i z=2~Tjtp$OW2lGfEvUr#>CeSmeN?bW1EwH|8#m=iUMEx1oL##)BQx z(S(qH^UR&?I?tqk*7$)Oi(7T)D~PN=kDxw=+;w|^FP}35)QJp8t6)b}U*F-$OEm1Cp@PccXM(yf|)_Fy5ozB;Z!X z6+o)ew6%M+ZPeYNbKDar_hATQ(y)Hb&`a7`Ogc4H?RT?(cX28RCxSglLWy_fU91Fj z_9S1mA z&S5}RSL=ak+b#BeB}}-jmg-@GXTkoGd zF(P`wcKDS30;_|S@`ac8Pf?SduSQ*5(bC5cfjm}diAfvcXfxp32ln70tU}|yk%Cd? zP*`nmi1|^kOvh--q<{p-w17qm1pWtFMIV(%3aC~DWyG!Pddc^dhcK=L^VJosMVY^M z;ph9?_w!Oy=rwm?Ik=UnWck)(AF^+c5cPf+$FYpKIDL>*eoab($lFgYvQ%fc@No{S z`6P3^1KP_B8Bbp4GI>^5PR<;{J_A#bzjxdchMygE9s@3x#5H)w)oTbvmFKV zOH*P4*%Dq--W|h9BPt$ezpUxBjVnO|;}gf;9zFT7+v4Ii`Kn=)ek5r2cauF?F(sxdZ8DwiEo^o>vaRy_o(vr)z_|#^ZuK1HQxDh9}jp6EY%e*82 zDmA8?U_Q<4OX}{uOz-Vn5{I2;mh1Ur!t+UE#?Ws=W4rXCiT2VZ<9XXp$blZ?@D~N0 zwiwvK=RPL7rZTL~o(SE!mQ)h<#v2HSd<K{$5F}uDzGB_1yq^M^ zq14>GEqD_@@EB}L@68J#TC#No^+RLKm~uKxN~GL1BQWhqHjAcCiYdNz2Ihtm?cNKC zVcJuUi7M|}#?$_EO5~Z?lNlXzOJTYsXJdrIn^X|%80pL^dfEQQ$LY-*idl1r*zC{3 z_5>fNF$jZ-M59uL%{`gEr{Uk*ibWd(eyofr03)BybMgM|hB_D|6r@Ixn8Jx$-A^x( zti%SxC7l?`{J8`+KKk`yWOP)W87AW|rBB=H^IkOYzhczZc>q$)4byg`XJ_th;#W!C6F{>jp<>adBVj&Kn9JJj#oxF zjZ?jgX?^B?zn3ROZnhU^&3sIwlRbJ)m(|peq?Ye(#(pRkRPizPCH94FG>9w$XYUKf z5na-Ed7%$kpq!%r^q2#X%)F{L86O)vV-4?_R<-9S3R6x$1^i1LZMn8J-%dD;SN5q1 z+DFVQ_Iu5gd}+t{E&vRmtD?IUc~{ zBG-o;NiWv+&J98f!!r)q4cPVaadmVlC}`zM?i#__a;Wt5^?kp@)>Ku=*jQhS`~xs1RQ7dg45^rVgh?vecdnn z6d$M6RN1y>(TzRzwl+5Vc#DE|{SUw1Jq+_9+M@g~lV0Pu7x-O$npNfxI^yss>#DNx z67M!q4f?rl7Nr=As7NVb{0zi-6WxLeA_ZuP>d%sB(H9;Is+EziN}g}Cf1`05iU;wM 
z@vW7&^fb_!-kqTEh_m;c3U?8}S>%Q&OOvk&J|x zimUE^x>p2wXL5xsOb1)0ih9ygj6uJJN%Na3s)1t%%;<^ zGY_Rd2ON%rvXg0lHt#43RJ?vQ|EmABs5h=P?nM$6jgLtki079?*d>lZErCbJ34Hj| zWaEot;KyefagxCC#Y1=Uva_jmpS;1#E_-uh@B0UXIxqw9gFn695D&;o>LwF% z9}8+~5}vH*YchD#P=zNR`sG;fVcnh0+S*#t!^4Bq2ku!DaWq`W2GUeDo)Hl@^PD6W zB9{1BQ^UjWuqNH`4QTRi^v@sVEtFxl(q)`oR%5vs5VxGw*8FfB+hsEF>t3UFlBVt{ zusa6(;fT7VSJ|>S#^aRWUVvwGrX%w64`9f#fWjU|pDiPYH{?Wxn@H~)5*f9x}^&N`Pxr&TWOnN0gG za+^6A%oaR`zJgV4lg^6PAu4kFoOI8$k$kcXf{CzMaZ1QMQ_piVhKyV5WM27On>hVp zCEP~9hjMgO{q*dQ4XNSp){~)YdVy%XZ<6A)7GL-V2AEBTy*txgBqOmvb&DO?J4RV* zBIhcVOKn)W37`7r`1P}z&x{~kemhGl{c!9=WH<+jI#!Ahd z7&ktPicUgwJM`HkuVIwn0-kv;OrZn^(@>m)MM{=$LtP@*J(w_x5mvid;tOW^2am%a zPXhN7pS?s@RfBvfT50NpRuvCq?R$s6fdvqBG+ewG!vPhPyDVUbGeB z_)Iz8=5zrj)`SlA2s}3_aoU|PgW)XBY(~7;9cpbcv+5sKbKWvP&zXoF4y~iJ#?D(k z&5TZL8$Io1V73uD?H_CPrnbNVR)e94r%Zo;rY$@f?4g^H#*{l$#Qu}}l)0ZTT10p` zNM3hzjfap3LwpPg=%LzP!@V<4cT}ut0x?H}YvH2qQ5w@i^SP>vFogE#93j=Z+NSI2 zWop=y+^(ojEv#5uI!+q32r{9al@S_cPp0|m_iai27Yx7c4oj0thwHcIn}lZE;n&Pe za(0Z?c0&{(($3LH!(F$okW)5QV?8{CYI@t-Ojnv2@9TB@9bRRfdc5&>?Z<2@>%^@F zl9G}++1c6oB_%BUo>z8b^Yi;ieczdy59(YkfkWWiN`Mq)eqmu>(BR-; zeq*DMg^f+r#$Y-r7bj-2d+c;d2|--Vu9 zX9irDb>9II;E5P;G_4=+mxswpm-UR@#3^FIgO2F=BO@b^RUc19%!?#nYe>>kAVDhC zJhq;8N=nL{NOe_ZWrrnjHx+_CHfQ2nE5kU_?6M4OHsVcj;9%c`v&T~f;^mwq$-#t; z^0uD^?u~PjZ4SXXeB1GYLqc*}TU#F^!_NMGfHOeRAI)FJ8mvp_IS(&;BRA*{M% z`|OxrHi+_y0chPwjIyyclFQ7h8gNDSFq%js69EkJDLrK>H=m-pt*wgz{O+1r()BJ3 zQ6siscV%$ciHQwZZZ*y;=|o=W!UxO@qQLwU%&bcAL&5gGTGtfx3`J9 zI~TO^6`6PhDI|{vPOj5olNKwTN#$hIvE*kMxl4!E$Bd01?{x(8s>VAdZ&SFKZM17k zf1RYlP_=60YrDWJO4n__b%MNJrB1^R>c72t#i(90xG(k3z@|)_U4YNp3$E`}(;=$L zaGF3tjvBGiYvk9o-%OVe)KvQ&F{wh?I9m;My#kgcp#H$2@09|ZyYC2yi!x8oI8 z^E--6x5LKclAdg~;c_dDJ4>s@s3_Nhy6x%m7bN?KE(VK6(VYe2vXOy5(OT@NX}|*5 zXK?zA&}toLjWh8#0a%A~u0~@(v=|hn5)UtF;$Nt^xmq3IsTXV>8z;(B35K3-*1O6B zh=|@u|I!9jli_9}L(_B=uJ`$PsmY6Z5DPui4Kgz~vC)(8i1Ra~JoA z4TO156lbN)90sRFZels?z|8J!al|YW_cqrjgxX5nEm*Y?R-PgDD9B<)vTc-F`U2YT z<8ZJ!Y7Y+`M;036q8gv2(&-Ny@ji5d=I!r{z9yhu8<9r!aj{YuPKFe+K`EOxCZXPq 
zw~^s}gGOC4tKEGp^;7N)oxIn*3_4cKZH9Ky-Gx5s+!NdyC1LkpjXux$xHon#rR)jg z^U|*;1e5c)QT8?t$#iGxDZGQK#!}L8@&%)lsDa5mFQ(0FVyt~ZMlEdCo?S3}X|4M) z}@mNeGt7Uwjd^#I(TU(jKzW2rVNrMSgzbDk!4aX@zwAnc(CU2N(Q9OPt9*^L{+ zqZCX1qTZ;;!q|Jf;pCG8rkdKMhG&GdC$MXSuJg%yUYrQQ%TVk4-O94+s*>gm3*UjD z6tdh^IEQnhs$@L!y6HK_V`m}U&U0&Y{>gKuC_#7E=N-)xhL_4~=$LX*8V>VL$4YB+ zpEyvBT51u6a+wNMp=IQ($Su6M=J2WLNDjtp7>~9Ul{^@?<%g+}q(B!o^=(osQ^;wC z#~`uT#U^h;bhOW%5f8kh?jx=UnXM51Zya#5A?5E9ms{t=VBfELuJ>Qx&)u}A^3*TZ zlI#lx8X6IX3Ql6?d0o~s%Nx3_HpFo~+3-4v`yKM-_7A0H#cqdhrq`m^@xh@IDG~wTx8*mf`1m zP=n5Sm--|l8@}7Uwj_KTfk8!^ z4D)ds*957>`z{p!#%RHn1Z(%t1lYulPYI2G+up7x)_*#B>rwalVS|U6nfq1C$;Bpp z=4X30)(hESs*A3f+Q9{$#m{~Fn;E-??IsP>5%BvBeyB-QBIIj)fRD)ZP;b2MTP5(% zoVnWu{UUk4X51LH6V!MQ=r!JW5b#v;3iE<6{MHf``{Q9<~7!?wS;AZ6Er1FRbR>{j=A3x%t{8JRmXXfibGC4oV0% zFHc}M$+EK)Ozh5|u0`%f-!n54^-{Zu?CT;O4#vOXyG-WqZ4}=dl^9-R2@m$oCnHGV z|33BJy;GuHAKO<<56wcZ5=`44oZ=Q(l~p!nOqpMY>;?Ypmn=e{Mg z31jiIEqpe%07M5L{Y7nM89cTvk6UZdrr&QCpXtt8!Fz|Zfbm>L5~Lgl;EC?<2EtKw z$ViB4#dSSXKMHAS=UWrWVv<&;_VA!@hLb{+rzxN`>#dtPL{L}(+GUXJI-F*GT8~z^0m@fO#%vQxWJRpS`s-#DPOnp1Qe2W zx)QF_U&N8h$V$XLWqS~tY-rO6dmFB;J#cmhzhpL1LTDr~yxIy8`FyOBl+V%Xhj^mY zfvM81$>DJ=@$`Qn9SC+2>?8VJ?*Axb(rD2xZA2<>&p77J-QHkKEWOnv5jI1ZF zJ2lPZ53eIL)O793h!3Z8vI7-{>BPub?`j)dr9PD-9`;C+a##wVkGY!DAXED68CpK0 z%eML6D5WeT-0?x>BOc`Rvb{b z_av2)ZvS~?DB+3Nkrg8qSVne7tg|Tyzd~v6D7zkzKox&JMTd4`KH2ZZ__5oXq&zZ{ z+#T))O7uTR!#ApLPD{0XFHg&JaeueTyx}=#>mverrp{83l9gw2IF)uXU|Yw(v#56< zFxj?7_y|q85DsgW26Koa4NQj@nuKMuK3WHdSdZ$?rADOJEb5wNbM!?RIe40gCig)w zk<27fXlnp?TSIA^7AFOx=v}|pvSrey?JC<{ow|czL{kVnw-w^Jhz_gLN!8)FEE~i9 z>M(>Iwc1R}PIN(mMPs^dW_zgM+13xwJD0~Bs%b? 
zq`q%PvZMVqx%TNC*P}yV<_4S zyPl2Y7Z)?_Y;A?s)zxWl?d?U(%+BU+Y;2ev%t003wD;?V9VtftKjiBFA#(a3i~uHt z6YfJ?nzrsk`pC`*%XpB@2&3=zCVIakGdrfqg+h;e3zN8i+^% zw@QP#{BNOp|Mh6%O7UYp@+DR-H7&ogrl!6n5Wz5zzQfp8HjjsvbAzHiD7deG&J^Px zoDN}C>A+{C(n;>3Sj9ynB(a5Dw(PQF5|og?czemI4MNNAn(dj5DYhX*(w85G1M-`~ zdt3sFy|T#Ekb?oy7>dYGNiC>JTfjQz3!22e!HU3_Fl!m|+zQ77_8Whb9Jdzvs60Ah z3LOFIV=`hgOEHx4NC@RC{NGVQ|9*Q(%$GKmu*}%qxmG@sL(lL@4!ADUJXm%>m~!NH ztrNuu2K0~FJTDjdtj)-eMA)mU_OuCA5D$ya&Pu@*fZv}TbiMVslq_aYgIUV2VbwcB z_XnqI+2rx{FJ}%KAA9N~db-kitn{p^M3+1rpni@^RscZGtouuEX0j+O5q7Zr^~K1q z5XvYYqC5afto%sMUP#P;0W|F1u9+iX33J%eZuXvVcVE9~3d}bFzLz+>*nb55ADA=P z!BdR_B*LG&nH~{2^K&4F3H%ua7ItObaXbH5)38;j=VZ^vDrpZ7XQ?}W;jI^SQGJt-7blm!P@4 zm^62LvEeexWin|{RC_uoo-$8gPpR!g5(k~{t|w=~jKmeFM)d|+#>Y@rFAHifcFxZq z@Yuh)?4}fWRt01NgRZ+NmN_GE@5#bf5Ipmw?j>QvhBp1=${w^nQUU&zA7TCJB8#1l zDXBVE>h6D5I!AW$GQMagFgZ5c(@)o8y0s>i4N{tITo&-mt4vgVYooMnnm)JE2k>oO ztqSWHhUBh>DbdNhe%q*zHir&&K)|ZW^!Od_#d_r&^8|miqO}?_I;5e9)1`04{^Cv*Q@{geEB>5n4C?RCTB%@yHy#Jk} zz;n>!fRFw)I6vJ;ay=**4fM!jrQ3buSJ!wXwj(Zs5l@j~5=MGd3H<39E;gR$y&yvIk^rUi4sUg6nIH zZ$VJrNPy7$@qq0N?lBMmKZ8OV#*$Skzapw7;4DA_ zV%KkZy3o;Ae|$D0=Wt$9kUu&lK+?RPXv?$^4_eICtTE~e6x0;m%AVp?WqaCuAdF|sy4(>Qu|3SRQ~3++2y!JiV_NzUHS{UNSZ>we>*hOGq=5eOflrg?1p5Fcd&pZ z;rNrRB4b8$X*Eso@FX0kG|t7xq0Q)9wZVYLhKD6MKYpZ7cB|04EfV^N)DR6!WP1tP_AymOC}wnoJ^6cl_ZVpbyv;G zw*ihAd0;Jys^)9WV#65*PG)T!*(Af0)qh0b8q2*Re0|~3n9k@PA!Fs)6*vepjR11$ zx};YFs5P35^1;xN6nXxhPV=k7)vHNPa%a!e)PtrGtaTw0MM`P29WoP)>7+Mh$toL% zrZ6QwG{?0qMNl3wTg0Sn9#vlC@|7eT9oQ}PL#q=f-2y67wHWr_?5S4!?r`kI^_G-~ zgMD8?o;)~{t&Ydq;n;u3i}|KwvS}m1!~4MlM0)Fe!2zO8>YVfKgD+ER%i{Kkz&~g4 zm{mL@KE*nAD9#tA<7|uMLtt9rw5wvZZz0a@qRiAMGm|?_t?AewaB2qf$bZo>#j^6c` zdTwzC(uitkPEl;jRfHI={{pi3n8VFB^FUzd5_qDo6{q{Y(t)NVz5qH`bSa>muP{-| z_o*m^FsFw87ZzqDn9?y+ub%;STeF}EM%WziNagCZdhKl#AsI~`UPBSGnj&T}ykMd` zef4BF4>`Ek{R#M0A9`2k*4LrwHi*hIWVUh--zoj>{RK8=SP{oT0cN;?4a)(;q$KYu zx`>^*Fn1*J$a-CzjfA#2Byg^x(qpjFgpiS**`AMYB5M`>Vq(|uos8{)g^kW$|bT!3|> zoamf;G~n2MrtI?}AgrEnAMI4I{_3*;kBizVeNyS4pxk&*D)%PoLyx^O*KHO%+$(>? 
z>k2`H%UG`-M;e%9b1<$J=iF4)AER680h~@g;sHO|e22T7r1Lr2=X(iQ$?M!ZR zXuSKU%b8}Ir8I$u?M!WcnxyePA;t9n5YNufGndNFoLmXKe?)Cv=+VL?RaI4Q$;rtT z)zlKP($mu)C9<)(IkJIt0UispZX2{b_y0*J{4cS_|Bx2{vnKCYvyUJ8eAgKtW<@cm zJI-WgR);V6%%X$)hmtw+{sEX4nNzz1pQXwcix5{W>0vxfOiWFIZC?QV@K^n4%qbIp zO!=2(n4<$noj(c+dhWT*-oJm}pbF5^FIp*#63d1l+)#Xy-5$I+duHQ>2DgLe0?dEV zpT>I}&MER@-MVuBM#VoO-I1jw{jtSG%JYj0NiQ!i<``j4PL(P7P>U{~tv%|qvbsHV z>R-f#e>V6zTFdks$(j#L#Z1JJb8~Y?y1PROQ~dl8=~*$J_Jv~aV!Bhktun$*f%-93 zheY$s>eTSto@+X3sc@*~h2aBewz#;MS^W^EZLUEeVTSk3s4I#u7En3_4aCK*aLI7L zqk13|bd8xK;V|q7weD2!)aFa(PeQmMoEQ;dIO0Z;zZfxh_yG#J?>nKD0QD~-4?MRi zSG?kJsfc<+JR*`0W7Z@e$J};mNR)C?%7WwN#!kA12b^I1gx`xWXgCF3G5SYxrF8U* zg3*t!5}TfF<3Dp-D-yXc7+yhkXAkYK2cp@c_!{aXeGRRfTUa<~ySqNsi8!L-+|cd& zB5RsX4FKa)El(7b5GzM<)Sw2`-AH8IEUsUp)gM3PZ}^f|hiOikINod8&h5A**e&$4 zWm6BIys^XR!BYLu{XR=RV6m5$$yy6F$O6|jy|icJbOi6k0x3_x@Q7xJ3;(pIZ zy}K#mXF7?{^C=b%Df-K3Q^@{*Lec-G!6tEsGdFT-E>3R#CAyUhLjZL%@-N^0h@s!A zAYsY{AC)Q=6dwB#R`sRiWjdcn`5L6lJulJ%4VYUy8T3?5!V1RU0DzepprmIybJ(TL z*aJgwQWBY+%J3@000hzb-U#iSBQq?qyj8+#zWxyQitvA>oh!OjBI^MjOJ}a$t3Leg zrX_Koyllu6L1Y4|xiO4f2ulz${G%=YZ7|%!5vM$~+Z5m92yp4+II}3O&mk zq)_HM(ABEv24r698TSLI{g-?cKoW82H($K}9gf?4rZiS>ovNjK1Ra8^hgyhPcM$#h@xBg}Xkr3~nruMvo%&;t;A0Tz5^tQ#sSt#x~|8`OK#s^h}_77NGs61oi|J&$h1b4|T9Ng)Y+yy8~`7-U@V4{9NNx z|EJd6*51xtP00oe5Ip{1FWYyS@UbulAdRJR0|Jg046C?fn;-mgA@xHPAZ50V_q7Un+=?McSy5Zez>A!HX-e!cdHH-D+Yua1eKjc zfLgKoMhJI39Z)<>wXIuswmmu-QV`xYd|e7H#nuc4`_N^3>CKoZ-Z8Toifsv(> zuh*|8#?|WGlRLAT!YJKjFP@C;nFaI{yCt<|hvpui884K;0nFqP@SWljd0chu=`+9ToeDXrUg-Ezf9PDM&}W_popKk>&WfU@ zi}{@SV|=by4W;D|=LR~Pz!`j-8~vbyUb zh$>GVbANJS6ro!=wTF}kG}SCR+e?3e4vp?G#^1`yCU$0s%1(HY^4qm^LIydq>AhnU zEDcT&epeYQlq3K{M|cfpB6UZtEV}dSHhfgpLAT8Ki;sWyUC_Pj@^2NhV(aFWFKvCc zR|RUqS6Lj`1ZmR&CwUYImy5Qdc;F?p9khVGZ!5W!@Zrv;+amS!^g)hTdb_e|FnDt4 z3XhO%=SRrn<|yR{PceLN6SUS zU2w^?91^%54I3G`>U>PRkgm_Ro=G<)We|sS<;Qf$fmlz0nR%IDYenB4scJS!$XS+9 z@dTsSw4p;%g4UP%v56O?2p0?QHakJ}yqK#t=7&z4&#qAhRHncfNJef7g%y&}p+E!5 
zAx2aZnV=r7<3ysbtg0Q2tHM665BK&K)k#wh3KrzRKv_Xk1&i>Dz~ko&AF7}0kyM@( z1$Fo7#iTi*fQc#SDB>L{d_4U#GbWgt`P9m4euTGDLFPT1iMo-^Ib2^KD|WiCt}&DV zx{OPKLOBoOtvB2eq-+xUMehM@S&^}G5U17x^FUX$I zvxQeQ3MkFq&;M$HNxSRelg{!NHDxsVcs@TH8B&*898E7HR~?IEoJ9bKha3VpLhk7> zLB5sUQbC?}fBX9v#^eEYf*&?!UIsrXL7t^@ox+Q2>WoG-#p4-qcb)g_n&+WWK+?b* z$;ZSrn74;3I*S}enx(EQ;XFle`p$k+T_?M!IBiRY)r*|9-248GC+pR1V6cRf316Q1StXz15~XsDj)n^+T`PYp@aLBvA#&e%hXG#hiMy2QOA_ji3uz z=Bb@9Y20h>&bow{VdK!Mxu5@e+IM90N~PIEA`^4>2@!{MY0{0btdz03;XMBwdSuR?7P=DZp1IuEIl*Le@H_vT&0zm}dl} z0!sga74F~jpO47xMbhw9W|9eCpMpG=BIZ{oO#pHhHD~WS4pZPHXL6XzQ)9%cjY>ey z+Fwb$^sE;oJ?B&2Rgh%50<~psadLCRv@l~u_WWegs82BddR?y01yBHX9l|UpCwH(o zqp7K>y;LGdWI=!Ry}0B@j3-`L!K(S}#JeLIt&*5UeQv=1;t$wgsN)1jpp6CM`xP_0q&A^6YWZR+!0f@(#!1L}RxreHU7b;fTCcSi&m%(TpAMO4sljFF zA*X}jr*FW-#9@Hda?a=1~J7vvz5d_M_xKITo!d}n_&nBPOQ*g8Lrmc zSWzn8U!jGH2A$2z#J`f9GyFkW-Yxq8#U}9L-NvR2Cs) zCpcCS9eW0!!mS~q(q)(jc3jzI;rLa4AKx)(Vai!&pA=iaRPLSXonHB1W~}h9+ez$4n;a+R=gG+hU0L}by|&-F8C0IPb^E~cZdC~urE-d{6LkLQJygv~u4QJ& zpLng^zgS)5@4Lzk$JMEMZDC3A0Gp`7ZQfqtw0W2%8u_I>Gk4P1JWX_OaH&|+55Iqt zJ95blAGr_iW8s|+!mUTo@$uYg+-mrAntSN;$*t1JR`2tr&t61LN{Ua@O3~KY?}V#K zJ!yfcbYxxcL+P`YQ_8we?NOg5)ekN-9PX!YbKizPf2*UTGvbemHP^*;9s0{lg4=Q| z7hhZjhsWpm%ynBLfypB52CYkvOlWsa6$C<#HPDiPV6k32$KyjeEZY@X?VyPaAKVE| ze3v`7e;N$faux02^)B6*?Y+vQs3?dzlNGm1W0bZ(LXdgxtU|&of-~aJ*=^t}=FAt_ zAk?##ciBL@(8qQ}cLpxDXJHpV$8?d90>R{a>36FcwW)Y??&R66^PSKwq3g4!5w#5| zskX0s=Y^vmkGGG0k&4=lHeum7f#`_t23^^_FUZ|dBFw1}xV;I#E;b2dWRB-m*H3LO zde?V#V(7h!ulHI2(m(27Xmip%tuBG+H8U)_w~lEJG#L0oNNC$}0Xx~OY?iis$3t!E zrjtrmWLp%FQ+c+fFCgsS1C3p5qIGSu0o}PJ)Fof;a512*(96PFK2o;K8>Z*aKEYx$}>2eu~O%1+E)Pv zqx*L-Pj(xJIm9^o(^a2f1fD;Ec+k^h4Qfps%sDBC5yTPV4ZhuvwC`O5t_KkDS}5Qz z)4{9i0y*!0;YNWcwl|0#q9R9i%W`2-;kcvj4ZV-ahFRK`#ELvvqI70;bvTuJJx&}& zclvNzg?+9ERh+@2@}(tV!)KjZW!r#n3$WiZ3hhykniA|4y#wG$ljUuq?9NM~2rcko z@v>*CvU01hGiqf{-%TVZd7Twk&@5?+8si0hU%p!UU9Z&H z7GljR|Cnl9uYc(^G>qkhV?3cz#{O1}H!jB<7(DSpGH_yD7JRiu3t|DBcAXV|&>L3Y zruQ!gtc?ybj7zQYwH6~)2FqF}wDKYS5y1cWq0`njvU`Tk^U00w>^PqN*RXF%?$AFaZ47 
zbPS~K^nb)YX*GUgkYEn{S(ZynBFuR#AjSQ@3pY*-D>fuPgmu7Fgz&kZ(|Dlg_bPt2 z*N}LyvbMyj>fO3!G5DQ`tyLdaDq?y>L6p~Ha%gldIgsF$`%z}Jw^l`^dR(Vke9a^R zf*H`{jm3k=z z_x+vOWg1RR+1OR!2t&h z%=5vaCEa65VV4;-s!Qk-V^?#?7`ewQ&O(Y#l`#)s50GZJ+6@zQTyd=b@zM8l`r{{q zN=CS9)Mb=h@V&DMBZh7uG+tm8y-c=Fl(m7(<0l$sK<}8Q4?_CUve)H6N*NQX=Jgr} z-$p5*_@ z1jcVuddtG8@?1PrVR($j?X^~>#XSnn;)`3j8cc*EKJYMl+Vr9VDSP>S@4DwCBkmR< zr&Fk$fm{^*lxt9=Ug%wRIZlAr*eI?>IQEhKpeAARJF4sgBs=@1U+?Qg?5LMbjTY!?aD$#j5HDA&zm!Jl!bpu3(>E9p|WbQgSB`MpurU@ES;%b)SgS3BAQc z;j7K5qhY5)$X!W45F9_AGzSGs{N|tz>_$LY79ENq#KacNS$gQK2*b{4<-S0-3{_7t zyOMRX(kBpm7|x#!Zn4Mj`QQUn9e9i%P}*yB6YLrKV-KM-8sR;Pq@&Aj1_g5`A$&rx zp?7AbenTQVu`>DXMKigN%p{1hdNaEJw{8ucd;ws$6c3&A%dW{lqAo^vD0Si0CCPK4 zs&AIc*qI6?%J~*ZA#dq;pRDs35}4ggZJQn4k{PgEz|At5uZray=gRJML4K9)kHszd z?}0FD&$bE)MG$|ZuK$E_t89(=^rsUw!0j4ujPMFPnY{A7P{nR=;y0&)@FFLhxjAg^9TJ4=>G<++byp?-@7RA}{;hD8wPBqSt=zn}n&WeB}7JYfR}iq&_j(-ZPpvahsILY6WXS970m z5xi8$&?6-6zM^F#?7k!9Y@wF=TZKS=i&?Rj zB`==3l!m6pDU`s!mq+AkJ$(5XHJFoO5s*B-**=v?U|9#s6n2E!BxI=~otln}WdL=D zAukMvO*{ajOcy}jAFlKBA>3eEe@q-$$O+`9=PRvmw%%D}_v}BdUYU-*lp+K_On%UBSeAJ;Q{hB3c+2)p(j%5$1rs4^JMgBEyt2@WtM}*}i zQxQ^I>XX3+fCpdF6f?66gRd0_88rtCCV|9_J065DLrL?fkJJ#kRgH$Ys2SL6p8VA6 zK-rz9;s?+JR9TS15Y+}=uB(xm_G@D z_!^dn^P6yNvo$~f+rc9Q5vIf#(*CXT`_H7f|EDUmf2P*Pg9CbkLhgGvL$y<`TXi2t z?$vx=!(Z%RV#-OT0p6dH;1-{d*T3Q(F+K6eHW590)wGnSu!Mcj0L$B3Nn+mT+eMc? 
zpa7%t47-K$Yx|wBB#f&rv_rh5I2#7Y#ih$qA<%0YjXF6ZuB3r|2dK}1b9sH8ls-DORd4z_*?)dL*?v=L z#8`Kxlqfvz6n4VfSolJhA3wrf>vS&P){f4?I^}NU4L0vYRHN6(U(l?{@8hx{b37VW zB=}=|9WlZA`Ou-P>(VuDt|g`SpHA^LNi)`rNOJNKEF7F5R@=@mD)z}=mfwUx$V``V zqm?3=VYQ#8+3Be;WZ0&D19)uWe0P~}4=Tz{|9Us7>)9o$Hsi@M7rgJ_cE>C5e*?w`R zBpWIOlp^Lpm^(92s5jS%*1TAEvvW0hHVx=F^0M;H>j}J{($;I{H%9UC0>6W;Lf9Sv zoVOBhwaky72~@{{^CAL79dw&b6D3#Mf<~4HQgh$x-CLy5i2EMHy8jqS`!ir6KMXFw zd4KSRB6eN>Jvn<2E`-6rYpXTIkTEx zSpjM%p2VQA;RewyK;p-dBgmby53-E+t(vPw`A+X}Xe&<^#{e1(QoN)`RqtyYU4;Otq*TK*BJF^l zThty3^4sgxj}W{<1#6eGq2_yMg$B;2Hr;Di#D)?>$GG*;uvawCpv1Y!)?Wi|&s?_@ z>2QQ*mnZnI`Z*42&fT}fz${KqL=z!P?Y9SxzZ;d@xqj5nbo5J;a5)~zjywDedit*aRx%fyVLr$m>clNvKr>}V=!fPYwbV>T->k!5NJm8uEfF#5FR>hs#rDVFIt zQO3K~;avlP)Qkvx`P6ljiXprwfdTuY5>|km2z8u2?#Wwkg(+Lj4?fFYuC2~MFrSQF zyFc_1(M=(9tO~PBQp}ZN8>jQ-q;U22Jn54anHIywr!l zefw()9#O~&AVZ}7!SdA?VzOJPFQTz()0!T>tp{00R63UXXhydWW*V38yD8ASc?SB~ zzYI*=W&e8FrET1!Vd^Y`9W`(T>Z-W2e>!K^`7`8Cs+ieml%B2jaTvLxsj2BBr7iwQ zZBI-*=i=ge>$uTR>=9An5%q%ePaq2sYiR~qW&sGO7>!O+U+yiMIQ}(l$Q=aX`CVZV z@T%|pHM{s4(e0pCV|c&SV`Lsftv#_jmox2`{U?~r0=36)If$sdHUUaz|8-{Wzg#Px z|7MSpGpyo{XMocmD^f>d3k$yjYrSW&aYnC+0E_zM%?T98JU2iSo!th+;c_s?`*92j zDC_(!i^)g`dm^CA5SZ`mYBIpB6~OkaH4p|OX3mE~`E+S55PK^CBMxeUmZs(kmw2dP z3rAmtrN0rk07^1eQ%g%n9}36=y0n4a$QRgJxv?2?FU!my z^@ucLGKWV`J_SV-8X8&-91aI+WQRH*$!V|8pDRm?^KFV&GzvrB$sqe9#M2v?nRS25 z>jes@+kK+&iFOf^meg_SB@O``jfY1XfxF_rl++z#GaKruPZX^$^wCgMz35nf{Z~0r z&W*HiJ4QIAc;qC2^_EJ(6uZ+&Ez-s(;UATn%f*%SRjS2RSm39Y8JH-*H~~aRM1lpT z;gIa8)?LGDKDquj(mVZO`ibKF)M}M0hktV>J3Y^T&^GxsP3= zy(YVDTVXMk`;g7{H7^Fgm#6b}Ztus?yCsn>fXPiFdcrZ1w5>p6;}oUY58Hy8u|K5vs*Fi79L(Rq=V64&%F zI_*UINull5?AO={H7Z>EzSyqCs&C&yy0-j{s_{)(5S9d4*9+x46;6KtB+Ra$*cnAr zPn5HIHo&-YC$zqTZFSQtvHNNrXm#_wlf?#LVFO(Rj!1!-0A~|IE`A4DUT4`h4$P)R zAtppaNery#H75Ut8br9R{hVn1$ZwTq&|e3%@SNg&X#Jc~SUvC?ufKCjy!pRgz>va) zW~R>Vc&Ti2a+oUqv_!&0n}m|ftDIokPtVRQTx;o_mPZ=G#Z+boPIRcOiQBJ!%Xk)o zU0ygWJJz03G``rN@y>QA6s66CC@}$-t<(9Xz~b^*OIJOOgj~m#D1W$ycX90aFI}sh 
zn&-F4yMK|HsR!h3K^@Dy&;A^7^D(m5{^68}t*I7bSA37N7ekMW>#7*R^Zyxt{W^Mo zT9VQbE41KdV{T!E!(_f#UPhu2uXylq%RgEH(jzZmutuNV8HhuQ&#o)4W*_C`W?S>o z?OIXAxshFMuH+$^_gW6{lDBu%c6aE=ca{_Gy1zdpudXDOClzbvxhW&UF^s^&L;Lko zT?PH9;>TGs>{gVIShYMS^>m>|H@3`p5u4s3yW+&GiP|x zNxbC5Q|vMb+02+JlD4a^In>W@@_f)@6Y)HSbLOUU3HLNcRWG3b#IAl#phPmIo#J)( zq_FMq`tQlZpFt##m^0=i`pW_q#$lb&^-!sj>k4c3k?7QODrs4MO7AK?BhiKO zfR)_=P9Y|BBY_1PF!WPdUXY@H1kKPG?HcKxoMAKWh)p>=2lYBykRfxxtkx~n}<>q!7 z?V{oNMyB=LGq5}bI+R~5(u1#kyW9&G4PT}rFqoEXBj#;Ed_ASsUbDdU^Dcj9k$J2iE(?yCPV4x7NJ@niXJu#msd{a`G+>1dz&;rVn#YCHpF%VB z9RKAC+&(jHHsE2D=B2Pr+_Eb}KiR>iB;PL{W7Dd6{&BHn{RtWCpvvNOt{73X^k;8l z!7sy@il1Bg&^wpr$v6PV^c>swk<$1P2wud$nQs#1?{Q;lM%!=bLB^4a^@)mN`qUxi z=!k1-UhC8YHK+wpHuDY5`?BpF)0|JB$Rkg$QBW_oNeU@p;ppE055n@{258B!MfdWD z+jIu8EgjlVRX+Ala#-Rca$yO~Rnw%h`q_0{1gmXGdj}2+BQL(NmdANq3;X*am@}dB zb{Cf0UGKQoWuO0kKFp5YHZix;=yXvx*i9DL{>ZY$l>pa&2|S_Di59N^~*)~ z&HQsF&ey+P-v__+6qic^Y7@$6hgm?vF_zeHzW?``-oo>%AmC2B`?;04-T?c4ewi5- zWn`QRe|AKMgH#FescD33J&pSxb_v&GKnr^}4%2qKUduAt}uL+9Oltn(!tRtP`>jqop z!Wcm+oxi%V0q(u4?aJ?*c7xk?N^BhVl_utLZmHIfS#XoD*_6DcCFYcghDUn`Yxlu2 zZey@FN-G|cl=yaf=qQ93DzJ_0Rlb?}W$54jYZgT{f=p~2KnwOMc|Y^^<_~u5%E|?; z=Vq9L@ApO_f(r+=Sl?epG@)4n}EbQEw8i zk!5pPh-+&{y2JEEKtKiR|BZ~DUN zuoV2hs}{8L5xsY1c_V{jF){a`eCXZrt@awdTjYwQ1RdLEWrxhYucdQNnEasRR!HnYJG1N{lzeRwXt& zchEOs72x<0L|!2V^}U|z64&i6kw;HQzCTPBxGV#snro>gJaqBJ`K!u_N}Og6yC*De zfr142BgF)G4bF^mn2cN7J*-`QhP#y0;eIKsw~=RCUsISY2!Hgto#Rn0_;IG`@;*^epTt9pS1%E z56mZFr>mA3%7lseP25{oW@rB$dcf^sl=4&2NIGOE;&zJPVRv}Lu}ltrcf!o67aDYk zJSrFEETbf?Q3JLz7N8wnjP4c~q7<7Ju@HBi{-ofO`icnM?Kh_s+MyX~Pncvh7m254!V z_-JYttxus{^M}$@+LUzE$em{!i<2aGieFnTdG^ zq*&V1@=gaiN?A4G7C(-)|NEfF+GZzI#pacM;MVuLYDpioCshPW@~sN|w{a1O*hXX7 zF=4}8QpN8NY>&17!6^oMhQoYu8_DKzBP%pfd6&l&+>5x;uout}319P*-0|Q{BYl7ky<4${u@r9N3-Q)aE z^s~uJq(0o@5>lhAIpOaM3_(gj{hWJu9^7?XMz??P3X=X0Yl$86)M^bi(9obR$(G)&HE>8oKr$v>=N7>zlx}6W%lug6jk&Z2ZlY(dZHs^ zEm}!Ku;<`W1rwXlFuItk$l>abzD1I;LgLCpC~P!M98SsNV$R+|fhx8Wo9`tDObhz6 
zN=q1|6uy3MOxhbBzg&@5&mv`v$2P&S?DJKtV|^|7?8GWLcxCn{$*JzVV~M2Ty;eK) z`X|SVq$RhCJfx1Rp01%IEN93GaZ*OH(MnB7oF8WH${dx~*y_gmsf_iv0DGdL!r*(^ zFzO}%`Uo%nd9Ii~(?e{tqLOBsIi0M$#K~YLdQ_|*R@>Bcn3G}-`-07{hKQlTpmt~V zh0J|2;1QSod(C1W5#ucD6Q^I#%eHRnTimb`t?07g|A1T^(Ffuxnh>OWSeNBNa&uWN z?Kx%l?=PY*(@L%GM*|W`xc)OYVt%Gwj|W?EN{?F?mkW&U!UmQpGV(cK7%jc z*{aNzTbyy&rUrtt=N4V|Aw{H#l>b1vnflxx5h^g0S6&Aq4|KCCIN_foSf;tW$4r*< zR4?qPFVQtx-~0g4%RJ?7!j}9se8bksUKn*wV=aFXe^y~UQ%WO$my_ZETE}CmI8qPH zm%N-Su&}hx8n`T~PIJ78Xvq4uJ9sAW=M0vEqD*K$G;)(_F|w#Q2WsZXiBo>iFO&|B zKYz&TgSVNx#C8?_KBbsFG8P+lE=q!fi+|R!W8YbQqkmjf-MU0ywvlHc(Up>$NkA(U z+cX*oX4#u!X|RCKx-%_$S~AFv{vMw*q#=Fl0ULm(cs^~P3?E1cP%eF4nqyS8t4zk+ z(4<;oSDtT+?5M98C6*MWk}i%!R%#sYAreZ#vm%m_+l9rxE|226*F_@> zRZ09yn?+^#n_t-yG&mkWjN&{$ro=QAnTYR_02&(SHY5__p&w&IW_+j64P~p~wwRre zV%+9&{%b&NOIYX7_<;A}Z%*Z}2?6i?{?8xAViF0e<`soV=oQC1)ML}dXlE@}caNrW ztKDYmnK&eyY0quKI^J`;rRCr=LJSf?XviV)PLKGg8ua?n?{2AC$Pb15Qs3WX9bIs1 zL-h+DW+s1L)VA=;gEJVeB5~-L*d;by+^R{4Ju$CW_rW49D;t3x+7vSI*wNQ}a^bP) zVr&)JGY_-4hI3PLZ?7Q59~ll`A@Td%@=CkYt z_^f`@ud&*I!nbH>k~n6Wz5*)${BUMwcNr@ta+1P+(?;ACoGOGvgt_HgH~}2U#N-|4 z$fe84B*cx@6TgN;si;WBhQB4YUBvhapas1DRB=S-|a}5-^Kf zw~Q5;LGf^#ci;#oA@FV%7j*f6{t2f*=50vdpFpm?yAnn>sGF|;$5jrvHpdfypy?h+hZexGD|LGMMtC0%d9S#aE&@x;!*O|EdUru*%MW6r+U1#^S&Y%m563wSx=@*H z*7Rh$rP9#IZ9l_c$aC!UMSZUuDMck_pXY7_8k>+_*LmR6S(8+$r^_bPFqK6gBS#SO zeCdAwY_43`EJs&*+ilQ{Hm zC^+vi`Zm`9bB0TfL?tmx&mggk3M0S2L85lwqcBllYr#cRpyN!B-+l_t{mE$q$KmBD zsnyuA?h&_!bavN4gFn*gBtgjwwz)Tz_chxCmXU)-Y;L&CG9?Eew`m5hPDoTXI>YbY zt0|S}1l4tc{+nbocuw>x0zUY7|8Icsvp4Vrvrn;G$qJIT-4#5FrUosj^Px7Mj^-H_ z#Ik%hYO0I0jWpo){E;w63H>}ipUr~+%=@AzyCHc;;ejeXGA&|3uu*ZU@86UR{}Clb z`0qSE-l1ONTBwQ7AsAX7&{?D6GXx*^s(i3D81xiZ3&1rd2#Vws9ku>+TsF<3^aUcY z6?}Na0u|4*Iv5^5)ixi$8RrQpmbW-uzwNiH_jT!H&CI5w{HmJf>|mHpKZ|X0Ox=t@ zpM|i?xKQ;KLae&n%!PcK_)+!+n!735AG~#hONp_8n8nA&G)I4>hM^X`etpUVUxcXaOtNF&jhoe!o`ulH@nTQ0ClxHW2$siCkdt`o)9|fBhw>HY6x8p04ixVAt=Dyn zgvuyvGNw~z3rTbk(Wa-=%VulWEY0yK_!B%jJZI{2z5nj!SK`JOwru+%iFjrW*m`qU~=8;qVHGs 
zIXQi8?B*oGNNeIvzS$BKwutWkUH2fI6D7=9QRmnK7Akc1YUw`$ z&LkTK(`;cY*z^+T#mcmjsF0x&Oi!2hakCE4@6P{0iw5=EZl>R`*eyxE;+D?tY;tml zX#9~`>3nbVZ$YKiQ>hfI9)xUE%nVDMy_SiwL9|xw=G#%Y*woR401x?aAMk`t2sZOc z`AHIBLNPv*pb+6t6y`rDD+4lp@2R--Q@FlBT+kEgv>``EE|G|wy26TKETH4AC*#88 zAF?YoQ2Lj=AIqfQ|1qD`KmS3p5R;%c|H=|k2==@G&>KWD$p3KQ|MLRQvv3O%AsEO* zBlcl6-Xw@-$7G(4i@`vl{xACqM0#z2JA})Ps5!6Ovrx;otM2PB+8=Fxv>(%pU&fWg z6TBQ)lfupwuLStS`J&4DTM(WGUu%c2L!$R{HV?ZlWDwczAN_Q2%Qv11$zY0SID8F| zYcxGBPpqLvo}#6F8GZvzn9wQ}F2E|C=Tv1l;RLmP2pR8kmL=Z;L!lhtJquOkhv%2v?_0%qj9G`q{5ujdN zGAQ@j4pnX~(cvq1atp1YK6JV_YUPN{u%e$nC1m5#JCNBXTDeh%@Lp{7{$RG7zv6BA zVKa5gVJ8wO3F_U%&Ay!2WB~g5C2BTZE2uap4afv5D%<9sZPBQH->@tkR*+C?AzOTS z9=Y{g0?p2SqI&H&6NqrBt2W^5yH7!)|AvS>4$MWsxHnHm)ub4Q`!B&xSX zFT{d<(%X}waZ+)%FjS#9w)ncgfN>L0U*~C|CPHI5QahRa`){W3L z$M@@-@)356B&Std+ZWEMC3Z;P5uqS0)bdvjp}H{$_H6YI=W?^Fv|VEUamC0st05uS zRqXxv8tBvzJ>y~c?9NpCrhCUmxFD4#rb7k5?5)OM8H`iv?~7HPQ)`C0NgrM*Hh(rV zD<|;)1EZ|HHfD-7UB3)R+2GLDdTcKm!~*3i(rToV?}p(4XI~vCFUVP3k%SAEo+A0a z?^)F>jX;gd6WrQKi^^9b`5&);e<$pdZP~FlY$hdqyxwFe(|`+r&1Uypu!?Ou_Mz}0 zqR8|%JY<6SGQoS|Y@wj=4RlFtl>W4cyCL>S2o`+52XGa)cxb<~_uK+mWJ*oDfRit z(0)>OdrA!Q8>J91sI;iYK6{}d8 zlg6M%<*tbKsqev8-`9O#`4THBNquM@k9+r%MDRGUOwDS?iS1#l@g#opet7ZxpyTCy zWOhDfDW(zr_WWBZs5z%1>E+3__0MmU!wpd9<7Ba(RU_?LPgl}tz1Eh+)*F8HC3>;d zFZ5y>v>UN4JDg}+4$L#?~HjH*o07HY5JP{Nwf`P>Dj=K`dl;#k4}*wggDt87(7$MfeCa?PPz zl!e|~8VvA4KZDI#$#FiU=IqiEG{>#}mu5jH%)L(;?(-bQ&TZ6yoVJ)$cT?3?@Vr0{ z2qej-{%3}k9ENjWI4C=2&{;$GW^U3^es-1la&0TZ`?`R}?}Cs9Qv`1rOG-C=1klam z{BTN>`&2nH6zz{bWDc*I45+N|n*nv8&AX%yO8|&vc=aOkF2y#fqpqKO6VI~yh(4Py zr4){IWbw}~&b{_-4A|mi7&%UrO@=I%bfb(lERN%S9q_4f4|h``Jw}`N0&Rv{;$zTG zo3Vk+5)9E3MW3*D_FDY~hS6v}a@0&|Pfu;Nn1(TDo|}4|@+eEHdt6GStDC+Mm~2~f z{=n-R?Z89Z9)_dW&m#6bWE`5GlnZkZDo_FdH_YF3xP zIK>S&2@XP_)mElzw!el#g#1?a3XR};vsPc9^Yli>x@{E<^?Lhk%aAC@;8AN2 zHt0*gPCiJ~SFRzsoUTKU$&QZx{+|WCky3g7Uml`S=dQigBzb<#TH{-%3ou`P{ETGV zMyDPfji9$>!4BfmuX*m9l5G54kx5OTgPAWW|dny|f*7zF;ru4mAaAH*z z0g+3#N+IO_)qX3JF|BUQ+?LDXMdg&+$g?)bq3UKdkz+5JbJhpce}}?A4*F|9SYQ@#eVB 
zhkZ2Fq_c+H#?5CbE7PQHO>6rJ#+7iG+|O41)ccbvDpsxU0#vPRpK6Go<23sVafV>y z`8tPzPk!`8{XMC z`$L!)%cw_s3`7|CCo}TKRzA zkg(wt{2gmhI=EF1VN*!vy@lDHD^l=H)EjzwqnEKj%p4pGVQ8n{Oy8sq2sk*(*d5#yKNBR)_dP3%FpOUVeP--pXVjtK8#4H~~IJ{pd0R_D9gk>)P? zOKm?;)kA|A?)~zxlqghSI_>kqQWfs~n0AM=E``VEy=3ic>Z8jkhpc1|{4mx}?njUz z&I9tkG=_y}F42UaSc^0qh_mK*$M=U#yW~-7y8(_qwTsipX$U|R70vM4d!HRz@rTfY zz8T%D&@QzBKf0EwwZ*WP?PIe0^9-_93$-@$3bQZer&xAI<(@AdW6G<%Gctek;u?p4 zPs_!+$G^{_y?iAnJ6Y-t76P7qLT&>bKE~-^*Zl@Mul)^IjwKm0s1NW3I@RM@4Ybkm zvkG+i`c$)WD7o%j^2PifqxKO8#$xroD$nC$!rrdd;#B!s#@Er`e{a5wa5d+k{IE11 z7%7=bpp!Ywz|`1A$wh}XT|~*UoCd?OT}~b_uiwAikM6Rg$q+>lf|fGBgIC8 zz9!tBeXS7^W}OJen#WNgra*q;{zS8!xtGp?P<=`Y9yjElBtoR{(90M}VJi{7mJF53 z$hbD?FxVQ%+e&joKJj}lVQeA`9QIiGtnoS&HhrW}IFbPG#U;)| z1053z`fGJ$5<0S1{?b*jGbuT`Z1Ys&2YP?>nBM9ZR1~?9yGasSFHLtrkPNYft`Q`h zg?+w@<;NC|LYzb=Y-pTS%l_LGei2tHo{>_H*oLs9V$#GrS4bI})Fg z!UTy5Tt+AGhEe6_NKr94B_g2ha#hRsu)JCfWdF&@Z~!K03VnVi=bR74mgTp1hZox0KUeBzJ>PbmeI>rol})}+stfQu zYSNM(bzX_019Xy&L&i=3O0NArR0FcDKZCO?qY>jVe!)HezTgAZqf3UHwa6ZrCMoeU z2y2j0a6J7DTC8c6(QhV;*EYsyV#cv?fC@WyoW^<&fF&U_A$U&o*U#%NMrq#D6!DC? 
zc)-6X93~jpe$aMl&jlS}Ujxb$ym(b_2|7M)d`a~A)3r`BNUN;*?T_h1tL|185%Cug zhqRHmn)#I_M^J2!2BV(-FL=cicxH$R&tNu&&z{uZ_t)t%-{c7BX=JR^KM+AU?Z%oe z-5K5QnYh}d+M6z=Y-_-){qa;C(bS(cquKd{ZT9VYKI;@fm}!Wj{5F61lmu9%(ZwfE zoHly+KyNn8teH0X?T`ci_(Ug@%`YIEB@KAlJU zJu2v91yMZY3yLlf+c3R8x zU!7+yj_FlHA}VNiS|&v7OG5`pfF`x@)3#aW`k9eHy?%ix^g|UHX#; zh_cDJ^<#KORG$h?yx(gm&}sl}2SACVcVsmjXDJR3#C+@UyS~iIdQA~Y-(aiNp*EI4 z+=O(f`Cpwx=0+4uheT~rR3C&ji*0;7U<@RueTxKFK*>QCONuG>#X?q(Ol_J!wN(W# zkkdP!ge0o8_rAyfTbtd`!o$+g;ag`PhIfg#Co>P=kiDauxGk5`rZo^=NA?Ce;j1KS zrYVQh;pCg)?QM4^pz30q^%gd}bw$C!6)kED@3qs{Gem`D(zkV8glox2W^BZ+uFIru z46!JWgzoY1hf|!HXU6Ly|sesp#GC7HlJi0cAffYDZzaF)L(3AQS7W z=1-y|aIF7jYm7Th)Bq%dcl0F3NqWTE$K09`C`S(2UF&gFa%G(6}KAL{MF6lHe zHLvzNl+5YzJGI>u6t-YU-+@5psVeyC9-h6&fJvv@gMj19+$us)^|~kJr(ed!wDUd z#_H0sb2Kkf2Z9vHcKTgib^1EJ=yX`YR{edQI{-NAACLn5mjIi>s^`sixc9j)7ae{f zm33kvZ=Vj6RJ%o<&ghOzu!dX8U+JVZ_$bdN}ObSR%54R5*D|w@Nh^g%`kEWB@Qi7 zn`WWBbcO77c3vvPzrbqdOMHrAvrQaclZ-u=mZz#N_w|Ue*CK8&qesE4FV3T1Jl=tLSZJrP1?w9(Jk{mxXw6q;fcofK%aelezI9cwBbr zI7IYt7HbDbKC!d&{$LC0f!p$_G^FsV`ufMN(&fbteLlFLxJ)}|ZIw#Q(3olR6pmfb zREhmg+VcUasI5+Rb9sqIOi!D=8+Z})V|D9qhwFGK5jj~%$RE)aja7El?+=%gFHFgd zo2AKt$Di-w(P=+&ig$;dQ*iaGo9XBN5LjkzS6u5rA3_LS=~A$5137&vM+VIylO%?| z#Xsp!&c2MaQXQGjtx96XcY;kv*Y%QvDTSG)9c<%XeA^^)v?K}4Mv&3j)2B;R6s}ME zNWDEIqu+Z*Ho9g#FBbjNDg$)sP++k)db4av07H=F-3!d*+gjGoSeZa=L! 
zUzM|JPfPRQX2)}@fnWZa2;`j-7Jgrk8gClz!P?Vt+sf=rDeZ|+qyn#4vy)kmJe9pw zE)B$T#~?aut1P4#ykjn2V9?+b`dLFkBx+)bE6$wxo%b+SY5Dte$?4PN`%`IiUg2cep}FpeIEKu;1m14!L&#> z_v+;(ke>Z*t#ZwSH)#G_t-Gt2#-HLNx7zauJ$+xN+Ub&zH`hoMHAe}C!>ffx!E{7H z;A485gj8(Whq2Yrl#?WD!(4Wk62`0xkbY_Owh+g3!mBK$%s=ek1p=oOwLdrSC{4#` zQBNl%yCfP?ym6!B$wmD8XE8m2yH}Np;0Vqrix9&7607N^05pts{Vw^@h+P}IMGrD6 z$Jv*s2lmbD*a-eGZ5gH6d^=KHpnQ4P^!@&{m>KA5`}@6TAX%k-D(&y=`tEXNyCxL9 zkExu^yP}fKeHrSXy-Ed(((TEoWmDJsQsjw~8y9s{3pQ4psFC{W)wrF^{>jqM}*0lsj zs_^ng8v-EmjCUPj9#!6s-&Rh{<-=p+0IbuF=yR@ic(o^^CsWF?PNSSWja+@O={dq| zOs)Z`P#ei4s!QeHBC|er;pQT3$lfCu-QaREyBR?c zA4gW4HuPg_5PhE~&{GsA?R;DDau*IX*mCuaY+^ws-LrP)3+?Jj`RFJ0aG4zi^rwr9=okYva-fYri2&3f5&R3OD*I{m(77|v z;fN=8X{d#Y_N(W0*JYLMmZZbcaVqYli2CL-Te9*-eT9Kvd%+~kAyYq=aVjZMn)t%A zrf-9hlq1B-OSCnOLU7u$h$Osh7P}tk7x_b*V`zyCO)Ce0*5T~4nF4mvNdrCY3%8G6 z#4`sx*~`($>_lZP=O2iQFwH&fb9oB)5f?!aa~v|N4_fzDOu8|TT~uXSB5MzUnQMz_ zGcIO@Ms=KvzNBMc@{`8P|0U$IdUD@Hh8w3e+5%ncSLy3Vx-rf|Voh|;;@U!|n zeEbmC2`BDG&u@Rsh4N^Tk-)$yn8w#m|WbN|aoH@)QM#}XlJwYlLte0lu@!Oq@BYEVki9UGQ zoBaCcBT@0dmZu2Cjqa>(uIyjDvd4df8f6|x6uK;O_Q>VMx9-#{wW@c9WPY_AsV+F3 zn-!s|I4Bumgk&7u8j5h}l9uq>HMVC1p1Vak6(g<|KdW)cIKDhi5g)r1w%ewc+(=F6 zyOKhGI-ggcD=goJY#zjJ=us*D-kDwuam^utlXu#2R)YAn=kTR@d?ILRhFg`Rm~}BO z9(J)4(rU@w(BJ?=7skM$nVKhU8?eefYhB$eW{7bj$I`@j(zs=D%NBH78mI4SbP(@a z+-C%mbyL zkN*|fA(r4|X(Y>u7Sg)I0^_@=uRkOzTDEi%62(oR_d?ptzb3GR0=B{30_ETV;$CDH zTCK6~Z3}5rNq{4viEg5zd)OJPc5j!bQ(I*1h4-qw9x{G>sxB5G!22v}H zNRZ(tLsK!te_{6_=TWZ!I{Hxr33z=Rw){HQfjP+Yy_XpG$I~&T9@6~rzdDe7Jx45B z=d5Rmc7U||A4!B_8e6#qFx_CO#uRQwO3BZEV!OVwV0NqTlCbyu)G3c9-5i|H1cR8X zuKsxz#+6xkEc_!xHa}9k94lB}*5r!qpal>Dc-x|U>yvDH>KDJ|l1@IZ4@Ext6KT<6`Xr9@R7YIS2B!z-iw9 z>-~L$mh^lCRp^F`Uvk8EJB=Ea*r&9Iw*>SsXwHgZMxHa58}Dcp$-$Nu^Z}CkaqTIH)Uz5pw=tPBB^%36(yxG zYt4yG0(yJKUV0UJHOAhAK|Z3~?>}af0eFmi?J>XGp=|1%XXZYsHiSeOfbnA`Z4Agx zc`K88nW<9xgqm6wDTn;=Z_29Ug!Y;#Cm>6=N45Gb`1x6cKTk$bjNEQO%tpUf75((~ ztT+E4=Qi^cRqOhx%XPQUxeI<{DBqZ__WA;&`#Ld@JxIi0ru!~_Ws1cEs9hl3SteTh 
zHLF?C3^AzeBEt{r{M+`m5}Ja*eyv|#{b6wIl}eP4D4#mIp223X$VG`TgkwGxwicVz zy4!tSCoosm7yB94*=#FUSa(NaKj@kigqszq6yMWQ81cqD7|ldeQHA z3(#Imlk;dQ!bgFIJ_uvGsm7L=5#ZQB3;Kj@H5JZ3{{fr$8yQ!k;g?tkF@ycSKe--D zXiGhhkLe2cNL3t(VOTVK!aM>hq{0*?@Z{5nN0V-s>0S|r1*0OpU#MIa=>9UQYPCEJ zyFHC_#KKqqt)jw{mgd4q=bQ|lyJx@dSeMc@Dm2pftR#(%F10HE62;5=0D65dYBx1z zeBHpAy!Z??z(zS2%PD9t2bjyU-br-2(d7+&q)^}7tMnLq29?9(?Wef)Wz+Mn&|@o#l@6YXP2L% zqA(o4UhZjY@WtgeOo1bzc=)Qiqpf{Z8lUs zctpSH^D9~Fc7d4Qi**a_qgigg#Fod4k5Eu|JFAuR$hB1Oyz-`JzT#_SF+Aek=ga)p zWh$S$v4i4C6guELz9(#Z%h~UMtN^TlzVS!$>KDul*}bWS{dzVJp+7=T)*XjvxsoHL zsOX5MKD=$ecW57#8-G@sNo;y4u4VIyZ+PPdmodusx0{ zR!3vC#&g&qOWPLxcc-n)i+TyuB#$0WnA(+bJpeXEh22wr)$x1=_?&9>?3_%+?) zjSuS#-J>gxPZJw3){D%mmv`q%n0{v{LbU6`nG+-+J8HF0<8+b;DT9~zcs^%jDu=MGR}?TWF>XfC(4)TPiINNsp9XjhId5X z#=q->X~EFAay%^UD@i3ZR%T8~J|B;8Y7a6`3qD+rxN^V0oMZbdj9n{9_Cz z+ry-(&9_JDv_)kNs(S4(Lt>BT=ZeoGot0gRqCPHltjtO&2>0?D?xRW(DF@KSaoL&- z_Sbq&I5P=!F9y>@w@zR$?OM=&Ld|GxRen8jfv-@0>{u1r{Kr;-XqFL^I z+VmRWl{nf1#ueq{PyfiZ&n%iogpff{TiLEIC1XSj3=|zlHoz2p=RO+f zVZ5(5_nW%W;5h4bS$p)t+NHr*^o&kkRXG5QjA|a_8P-I5?Jy3$g>bfVy7sDBWrCmj z%zO^1-*9kq2os2H+H(etnilkB5yk6BCZOtWAn!#Ak(eLM@-ofu9(w6lqdr+&uW^Zw zDKg&%ja)3-m>(V#WB-cHHm5EYDC%Iq-`95MYaNQ#&b;~fF6g4}%;tmjGL|@e24d_? 
zRxTdW!b_SbX&B_Y$fl}<84o&XxDpXAGH<@xH#tlT=dQs9978jC+UtNnnNRZezp`|J zPSs-ZzJ3yybojrm3t(*#{0E?JE{qm}p%dIy%dK(3!j`(8s72Z9BL|0h@Rr8v4(t=L zNbPsu2p;Cww>oH({8mozP$Hw^GH%81A5R<1mE|+&Dnb<`<&1iqK2bEKG>tUX6A0)n z?F@+UNuv`!DcdAryM|ON--x$|QT`YPA6XP2j#GHrHBmHI%(s5MQ!=Q(jC7hNuK%xCL(oBZ z)Tw&^Ralpp0qW9ca;}eDSVt0qg@|Gs+7B_!Lc92qDYrj;S_x$A&)F}@8Zh9pynd=F z*uRIqkt8TU*bQ_K!2aN?PNvEL-OixjY#cUH&#I_)dRr%Yjr9hTJpN$^7euQPBVMoP zYToi*;#+V4_RrEgGj$+wXkWa)aROo*C+&_TeEiMz>CWPI^5gZ<`{rC!!wPNX-Ckc0 zo+K$QVf;$SLcNYEQJh^EJ~)7qgw<1uWap9+VO6BVx#LYpAC~la!}qe(2mYWl$QVji zMTs#fQ57G}$x`6Uu#SQ7k127T9CL9W*X+>!Ez(<%_s4{F+<~87gYLpu6fPlqtuA1R zfQ}DULLk2=jtSDZ*t?oH%@&t# z?icg>mXF%0YH|4euH6`t%(3}NdmmKx*4^LwOaIXWNA9ZwOeJ+mt5W_~AT;+l9m4SE${6@K2FRg%K z_&*3@OzcY4(o}jQ$9#Z+kqdTW)8;g^uSvCyjF@?eq)2v;~YySIU+n4Z{x3p}VjEi1NAb zV>I>71`>n<_>}y%pSo7eqHZemAT1;<{T^BTPgx%PG~V#vaQ7B{x#eH6-nwe_teNqw z>BpT-u24lo{h9@^MBsM{*qx1*gX>1MjIIceyrub2+d)zms;`%?t*X?{nH z^zl#75-~)%@n}(G@=SM!|8|G}W#u-P8cpIw_olh-4jgbX4W^v#1NytTP(0rd`?3=> zT}9D!_!2+!{<%N+CiaZsEB5FU2o-X}?0^o6#rUmI7!XVgrC*sJ;2MU_S*@;KM)u!d z&PI*q_zv2~* zt+pOHgA;zVS~#V2xah{Lckr)CD@0xF{9-RBO=6E1hT)%R`cS2G?2Ec|Yzd27*Ab!K&f)3&(y1Z4h9siX>}9E^!nTx!jfbOA((x0|VgRO&&mC|d1*baGZ< zQT^Q>H%I}AK^a1j7Q~??BqXFmLPF^j^iN94FajeXokI*I4FXaELk!(AG@=ZM%+TG< z*?8Xb-ks+>@41<)d1hbi-(GvoTA%Od=8x4NuIrJ=+e$VO2kz_gLt>K+e0%D5oG7qn za2#n#ymHW-8qGb0SE^Zfi5hN?6`;1qU*l(byhMWq%R+yPE4*wR0<%7D3+g%!>dGXF zfKXoKIcIbfzB06}$H}0hvCSbHt`#RkAu!5&084Fcaof#~&E|-XG|@wIhHhs~)92x! 
z#f$ckls_sH?O?X+Q4Thba-eQ5%_l>eL)@fiw?ZZ9i{ah98k|A3`SsltQ>;Y;$>LHa zu7cEYv9vrXE2RQLAHtlIOEQ*7!W5co?E^OiQwZ*PgG}OHZf33SE{waNqYq(S0(Iu( zr8yLHGM%gLgURnfyS+iAz9en@t#2J7YLtG^ui$CCH3l$GycfLx>?I0gIn5+9BKQ|9 zfS;QJu$IF%nCO=`^8-}4-8A0@!+1*29#uD4%_?Q?q33I(R9CYsGKF<6;`p}_2WP2S z?==hni237BEEp2Zd5Y<~5R8FIo_M9i8?*Mar z07ZTm9nQJr0noOY7(>-JhabX?=#>iFyx5)e1eM3Q0|7KU0{H0eq~AW2!f(tD@OkVFht(sZHt&GYhuiP$HgaKCeC`@B5cLqE{u9 z{7X}v6dJ9XF>Y&U8}e?SLd{>T8m^gk_s0shq}xlhC!9D9uL7-og`fpNUI8I1LQ^r# z!|z(3+$Im!nz^P-esWP;&TKPA*YpXaAlm&J5n(Q?erLN*;$v*iOa2!$cc{7QQE1FU z<>T7Tg%MAOWrmoxJ+L8qB-lwz74vO=9q@dZwe2I<8njmXP_G|x%dOJWGjnw_zPCA_ zQ8K)YfIkph?hQzV9G`zfjUJ?}>5CqsPJ5yL=dVlF$j|JAN33tY*I)*Kc@In{axIJk zBSHz>v3VvQ=M~pK-)f8X4$_wr;@j=EEexK(eY9VMFqN*xogDbn52b_kFc$z5ekBL_ z)%OdTtQ@QzU$^9PCpEt8EjgiG5994$RQQa5c-`c?^8j?F2o1kgr43-`Ohu~(+l~mp zRA3|UT#h`_nsIoav+w5=kM6suQ7Bi@WOMF)oekqyeA4Pdo?4af9OLcU zy4@2!__I}??G{0%C`p}z6)~DIVYz;+*JprK7v5xQO%?}*p6ZCwdX`+T(j&|yv?jV4 zd>T0X!LQk4|SXBf@P9b0B8Q;LxUM;iWfQ= zS*U}lsrr1C7a?!oTEIqylOqiFkIZC#@I5-=zaPra^g8a(fKe^Rg=$)DzuK#WZmCsF!6^$qv2OGVN`7OT+?+GI~>L{7a1cM;xpZe^82!+=ofTI2B-Ux^-Pv@DZwL?ib zLBz+~`wN*rjQKD39c%n9@w~@_;fXs~x)PIQM(d#Ghe73r4GvTpYBF|1pia9YkMWE zPhm$ut=m~r!!hTQ{H*odjBuJ19=3F__8u z18;#0YA3*{39zgDl%QPr1$K9i+X{4Fs~xR`D3(|AW*+7%-AqdTHcXgK(Aa>mN0kwj z&Bs6%`>vT)Gf)7yKn$^Y$(1OkPKEH0AOxvcUVWr8DcuT-4^=)&_g^<&kwb{SYmmB0 zb0E&Fo&MGLBZHd%k$Ji!cU$-l2stS|(j?6YoCeRsYwY&p1G(9X9nagZPK|X$ZpSm! 
zTi17%<{kxPjFUwcxB{goo5?ZRW23OwUT6p_$o%@y+mT`sap-2QK(sjEF~PT#NOr6qvoEM;gO zM^0LGETMD++IypNcuKAGp-2zH@a$ba%1cuj}92O5{)Jr7H=U2n4zNVzIX8e?3) zBWdX%7}vn%an2VMEE_sVfQy{7SpfxeQ^%abYrjjV8a4dmifZ(-O?njyrb1AsjF3-tzP zURr?}+kc&*_9-&nO)?gdslMH7jh|zl&eUd+^steJo{WxzluhXSwQN2F((5YQ+&@Bb z3rR&0v<3kSvwqE*k+Jbj8WKS^DxZzbBlV2w{}#{OYC#Sm9HJamk59uGkW+*!#7u5FF#PI6R`B2_A>k8hWE!0>mJXj zzfd?78B5Mj(l2Cq7;K3o=1oK4H~pIoo4qGZQVTrowRPKg#GUT&+gy)@ruPw)NGYCDQ}40s&eAcm0>bQ`4}P(=NU~7m!s?M zx0VvG3>>+@Xc&JZ2R%I0`M>_>aTv}qc%7acB$tUlX9Co3J<3Ij2a_wbIS8z!G zyN}TriVPc?9_N1oa07sL#O*YW9B^fbWEc4f-cHt`ylKI)-3CrZfHvc{0aJ$h&?DSc z46pGY0lj8IUd!JWRW$x>%-FIIzrN+)VfJZBbt(}7vVlo2C;SghgIkGqV+EgsePPQqxqR5=0 zz3#pSkz8i{y+x^=jAl8W^7SNOHSAVhtQWO98|rKQ3hkS{P!vtDTf*I`tYV;6jmuY7 zcQtEFz;*VLi^h$_A9*&Ao_2O!8Q13H0x*_;5`lzm4DqC4A0@Laii{gTBoM14esp3dYWv`@uL|MNCdhh=a~eEu@)RhJBor;+dUa}v?G#8#u9O*oe! zQoER!*PrlsnI^KcG}S*fz^Ie1LFZJpKgD61(MA_@IqoQSFw07f%BHWzp&IwRAV*$GoF zXjZ=!6i!BmFaQn;MPXjH0g7e$x?PX=Nw*v`m=8n5=;}HIpKAP)JWc-{YvI}1Cc?cG z%9_2302XjKDL4}~7b+%uMszXni_N1nH=a!2=aT~oua}43|B~Ix&s}Z?-yCzE*(2zk zgaTTYnlroF^oWbW3^}ijV_}+*GEN{ozqfo4y5*2F6ZDWS&(YM~CW=8WvJ_h8WWqlQ zk8DBPG`g1pVN#**LO6&x zw5^UmJjd_8DD#uYm2Wu2dH6k2`0ovq?Jh-Im#-Gq5(LxfZ@m<*IoX{Oie5Iq= zr|CNR$TZyR<2K5J2J&qcq0aV3!P&Ll054pTGIB+zpL$?qMJvqUCYwSNUh&R%v3(fy z+__KoVlRfNXe9waFJjRC7>#9}Z0Vv?_ksQ=X}D~P@g0h$Z?nz%>!^DRv>p8bsKYV3 z2Md)@b&5voJX}w|=}UQ6&HH&Bi}j+qti*luqpi1eT?N_Y&KvOswPU@)X{+N;Hr-EI z7pivVar_&6arKTG7EEBeyNuuLz+Hsds61t6@1+kQNSBEDgDqr1L3eLi(>-0iS$1h? 
zflzHAVY3pO1)Qy2~niUTg)d;Kt#fC?dq1 zbX*pDkF1#?E9~^bYSBY)VXjZhVSP(#j}g-ELgsgjrtBpy!^9^#iu|?(F~I!aH__U3 zNV;}>JsL*2(LbMW#mb6OeSJbw0y{Xb(l68&k3DR%j~m-Cisw{Ui?elhICTirQnbq0 z=*9`i`hGT<;L}LVd$~BW0`c+HYw|mt>8w{A#ZE*@HNSwT$=wtoiuy{ecQahkoU{Krl08=$%e6?NI0;2 zu~0>o8@h7xVe|ed`#icXNK8=iDv}JXRbH&KU1eXQ-@}OXVq}?5N6eG@fFRCeLL^1# zO_a%TN4hZXrWj7Wn1KU+E=aJ|SIg`eyVAv6G(t$ttmWnhCo7oGyMnn)SzxYCNRF;& zg>UjN3;MfSq<{OFh{;EK3#)yAg}IS$Mk}Mk7@jK)Ia=Scx|%D{hlP$mbZ0O4p;6PZ)y8vp0`$0rcBlX_rRlxVDRn9QWzAX81SK zJQhGCMbB#oZ!I^A+c@xm?!TnXI9xmPKK@PN<90YM;q*zLg{$@$6VhiQd&x3uu+#^g zt?|*yB5j%v@Y+YdvtEE|ElJQzb~R@uNH^jl+kBXx8rPh!>OTl z{scvgC4T*Q%_+|C@NqrgAqH00k?6j8`U!v1FElL;Oj-$r)=4YeDVMs-b*!@#%kfn( z0ri;*DQM4AHYzH>E9!o0K3&kA)tw_^H=47P{z{js1GrQR`ta5OVC28Ak~wX(*l2A5 zPIfHszoopU!q+~^nH@PKc)l1)51sJ;117a^k%_2RI-0j?Gp(c_qGAiRLC0T16)Z}h+dBfW3$(~0x1U?_Mc^&4?H|HniGDwf44GHCkXvJxx?elZ% zeXyEC{6tB3CqQY`-fg@AtJc_PDtc=5qBR5ilArRQ!fmM~clZ=8>AP0aIiLU~jz$nq zFno}!)bBhC62mpD9fXDo?EHxc|M1rfwnHV!MXrI~yhEmo&reJ-W7#sUoPi!BeCyx+ z#bAPCAf=im(a@5Aazj4}Kv&^JHVO4)}B8r7!%pAbFehkMP9XgGN)r555!V5{p})59zW^*{&91MDiCxnB0P8> zXhEq*A~=?whClNZ4>QF!%@{eudughbyIF2;#^L|8r5id~$@pBkY}s?^HHV%kL* zTw3!{b}TF8#{0W=jFYG;;j4o#AHj*I9v zs!D@hvyZV8lE!*Sj#@}%OaoxGP%(X#d3IWrI6@;rh6L0U4%@Mi>E>@7692aGT9W&Q zpZ@stSZ6|wo|W8`b$fV@hA}zCrSaeXiz~U?eeGe2A{O-Rm)aUG~K`1D` zUsV)jb$m>Y+A+;u%==a$h}yk_P`XxuL?1ZbwX1XMH6#*) z8o9eV{q5e1$&P~uVZ^g~y{oT->HSX99(~7-6Ngz|MzLV8gT4_3EoKP_*mzm5-Y#W% zO_|GKF@zGz#R|A>6#UOePU7r;pBPz{ZE^lyjLf7jZ}yU?_z zcy}i*Luh8ui;DgFtvKkM%$Mb6LPqdc=Dx-IZXM1+O`LH`T5Jtuk#+yQG<)EYa*S08 zYv>*pgMy>&$$n#F1xGbC*BM@tP9z7UE98m%A0SxZ!=bI$ z#l>>|ke9>ZL1TdTJtfDfFcaILs>gdr1D`BgdqW^h;eF$5c>$Snuf5H#2{{65Ea`3?V^dzILxX3conr3V3NMvCkUUd!T`cX{TyJw z>&kZ{cAxbXOB}##mhGJaJtIh3+f5*^j=tM_1DbnIT!Kl0IHFF|Q(c!9Z(t9sozJeK z3Kid{Y*s*yczaM3&*CzEz|xqw(+NdJbw#^y$D2J1A`KcBXg>Z}R%BlDZypk+llFkl z8wE;tl&^ZdaPio*&*tV-p*~w+rH(ul_m0nBT{h#Tu?I`BEw%sN^>2SScQ8(}DZ#xz z5|H~+@}%P<)=}6-MsiZ(p%E4Xgrgjxgz z&A)KKD%os@olZ)(XWT3A_NAq#wFP*G+4kH+P`hRodzx3ubYng*E64HU_&Atly5=~H 
zTvW@orvn5^YE--y?~1uR-H{*N(9%lDf8i2F88x?lwO10Np}RWq{K3+4%m(^hS!*q* zQPN^(*Z(J%20+0l& zz9yH_xxg)JYYs~8)tQhkFjq;n+*15mxMm<=rP^f0JDvY#csv8A~`q9Sr5>2ECH+sHZ!-3{*6IrIkD zPk**%(;oBUwHVy;AO^U8Bq$Sznp}RjYX6hQ?i2t(qVi_b;4RuOG}6wlg1I%$@P#=VY=BIZr5w$&uHkjy!LD4O9fZ zrek>alg_DOG$QAyn&dv|E_Ms*|83Fa6F|`QNZ^Ep?W|yN{`cM_Pq&AuRck>bG%65` zMk9rG^+Q5yy!m$g#Nljiz|2(9KPR%Nrsp!jut3s|7&4VSma5`${=OaMXkTW(hv=(- zlH%Ve2MH~sPl271x>QOUtnF8NF&_nK9lLJE!v2hqRsfk|dO;tKOe_Hmz0u9)=HSst zw^F|J+M6c308@f_F+Q>qz*O=U?BJ<>Kvtfn?U6g9bA9f3OOM+?)ZEx=0B`qNAQVpq z+E_Jl{haaQdo|G4S@y?s(Hg`3F;sxZue=nLP{#h;wB&W@j1GZMT@f^(xo)+8Vzpg- ztq~qS9OF0&Q)e@1ZJWK*)4@W~9Vi`>?mRLxtuoEEc%%6%f$WFK#YNav9Ace?p15LH zOf!gh{?OF=tDus?zqWm)XAb9q>G$#aHS8|`Ft8nF~8c=Z;ddfd>tvJN*r{T=QYV!(* zHZhAx_B+0Nmp$rMRKTywJm>nta(|pxwPE!EHISBZBkW606eO#eWA!eYGhldW{ zlH>*^QGJ5dukXvDyK~j~9pPAh32WPatEYgc;@+Q@`bt=BU+2F;`HJG^aXux<#PovU zImGVjDzDz+8MLE~fJLA1CDUiKUe9Q?n=(TOt!6vkr@NvIVLN2xJ_7)X2QyN1-L>h% zzCPv%ikPZ-l|W+NXe(d%sjkDx{x#2)NbWZUOI}`$>(j*CMlzApP1oUCNjf@eLJ_Cz zl2$9_+-_|e?9(bvH37%o*tb!vZg&5G2tYe%;#ZhSYMSk(CIzFO>)^3#QFth!QNAx_ zluffdCYra{pbSKvSf;7*$5!qTPF&b(2WqEOg;mpJL~WL_kjyffhSqsa3|2!;Y^(cK z%Lc(1ntjT{pUZ>y*rMvpGlV@9*qeD#Pkv+mkfZar3ee_i5xbeV8+9|=)!lr*hqXJXr{u%hZRFU zC?hM$$mrY~d-1OIm(4co@^cRX1lDUy#?FARRx2x`@}FPH+wO`^zTqe-Ox<0%f4fN% z_0sa&$tSslcgc!9`iCtK4B%6vLYJtF6~jfMd;!6dW)*2&fiVsOu3qSIEQ0 zvb65oJNhL9O2S%$(&fH4V#PS0DL7Lr8RMTX{9OHp_*FZKi#J@-0;062noRwliTOFn zGd=AqaG0#;eTRF~Vn*TMr_H#L(;DUr`sic5q!Z%~ODz%!3fywm?r29F0NbSK%nhVn zCO2F*a?ArBahb5W!OR_(KHy15yVWu&StB9nN9Yu=-a=Vc%DHEi7hQ=*zg+MKgSd9U zuRm8+QJ+xVbclj7$9#*aFobQZzYd7#o)BkeMvTH_mKOF%Z}M{`A{k_C3Lu1w!CIZ3;ZjEwRkqYlZefZkUgaB2{C-YEv|;`>%=m>wv@ zn<0UfGvGVPbfg&0?1}diQ3JwfO4}_E`*W-d74UC}&tn+P35px$;xYXL2}fb7#Ux#p5GCymbbelH3$fkr@qb}mqza}GM^vvR zDg8=i(`c~+V>|}0QO3^Vk1R@QV0Lzr){zrM#)9Inu=_t(2Bx98mL-Q|y9-txwcQ2y zTAxWAnLFvDUF8$3*++xkAGQQq7>gvQDl~SGgeIM4CUok%{Cwd|DfsyyHPPddT}#1Q ztPo!3u(-rgnkg=j2`Nk0%bK z(pQY4%E2;K#A>mY1@}Ic;12(vLEl<>vnb3d(@UI7_*y4vg+#1-*tRP!Cnw)lgnRVT 
z3g~*NZJJWZRnvsp+ghU-;Q79U4$7L@K||ZiIuK9>BQ7QZ9hFQEBP`Y88V@Qz!M(>1 zo|q;O^9fU(K?4|qYUWrW{mmdDQ(8Mg^uw_+Em|*Q06S$M!r@(ipYrRbd&W#4ax4S+q5_7*Ny$94y4RQyLf^vSMg>&z#uW z7>I2s@JSk_0ohH%Vbjz8v23x{#sTKji-SbEb1}7vUN)`RZs}?|>1>Aey#+ zC5I>ax>~6C#1m%lFry-t1$F{5wG`pazWOFNBz#0SFeWRnFrZkI$IkDu9HYIz{G-W? z5a`=hxa>YfeE%u|;La5DkIz_6l9D_fLX)Z~?=nHcsbLLh2jjR+S3l0C1P6GHEIx$> z-)MJ6qupCW)+e4Kw#Gn^>ouUn-`lK?&wG%qjS;K zowb7}G#ZNa2!bzsx@M+Uy~m0n>O!?s#9RVWT{yKVj=JxDRAy(`-5y^i2E1han+`}xTzyCZ z|7;r#xRWbsuP>i8khtqEh{S!WyLH@w3STUJ3G`E|)N0W#O3SqW?+zglI)%|gU=A!z zDsFCNj?-c3%@c25Cy#{o>f$bwl|{3-N~zT;E{M)Q!plktLZE@mD`rhR=8s-^{ct^^&=1(Kh$2J&h@ z11SjsFaJS@6m$hv6?R_7pN&0i;@do(kl$W=yFN}CQgsch2Zq2MIr8^U{*ij-w-k~L z0ohJs6OL>Lt=%5u+h;6dS5enowXHvSYGJXEKPxfg@RMw-!0Sn?tKLVK+3)*(p10{a zVgc}*PTEWx0wYtvNABSo?^VGb7ZMlAL{~7d zcmZGF&XiHk$7C{-vwK+`&2f!0A@1uJPM7Ju>23pkrh)>{RglAX<95k;er=L&56rU0 zz+W$+YAC!<@NPLEmTdnImYvJqqtWq&zISzD-qDbt9*+Y+=#1RBZ!10>*pFr9+jHh( zy(C;* z8jZbm{P(W0_W_Tdy&c9}DVmte*let5wJUe!yEaQe=R7+Jg2tt-pR-G)BGj<;DEtCQ z>b}BqRLjJf!>-LO$W&aS=g1>_$BNA_DLO12WSxBs#+Kz^OK*2{1H-v`=z8v07CWHw zRfxIxfMGYd0v|Ax?Qm-DthrE{-97kIHhg~CdrG&-*CYD9)-!I(MNE%8+fGC@V=LlJ zZrB^1noR2A^^trLjXI{CU4hV~Q+!Q3Wvim9xu5rO1GpzzlfHv)o=##; z835(X*zq%u{H0)4=>fRd=e`VJE&^6%9q)~Dsk;SpL3dlR$))53da?EF{V?cPosA6F zdzS(wb#|{Y2}d=Y+OKw#bjzI%YG$36Pr3k2jPvQEUPTK|A_Cl9?nbTN;z^AoQ8EgQ zPZB{3#&ZW@Jn1@e=w#{EHwfMlm0G5>Jl(Y^3*42{fbB{Te^$|*FI5KlhtU1uUCX;G zvS!Z#d>fqtZ>VICaK%U+XZp^9DqHL#=3NlNa9`nf-v{50BiMxk=YZy!t`&eUf{8(T zENU%v!sj>nuUamM51(!!$9yv~$3^T1$Ef&GIE-6X6ZE?a`fwV!V#j-2;l254)Q}$y zr!M7aFXso$!gRqd@VY%jH#fsk&8Ez7_0?&L9{%G&4yI#&q4i1W!^1Qi>?F|jCbm}k zZ^S%=FcuNN6L}F?^MY;r;Q<~2(|QCZAmH*!N3B}<)d|fY>&}jwX{LajLo#b1?gr*@ z{!Z_CG4K2WTW5Vg9XWzuujyBKS~IvSY9ZwL(Urs_slChNsW2zEGB&Td$%?bYf@9!i z9NJamnkQ&I6gH(#dDT6J0Ut|wZlWjM(!3;nU5RGW7LP91QCVkd6%YBU^q0-CqPl6y zFcfAl-&F+07LTWX@LZTSJh=ZRWc49$ZgSQ|CF&HJ%ZOp*iqlgi`qY$!_VMyizNNvf2RS zt2oi}#L}S!15s_~g3;DBusX@NI(95S-~MB(6C%aou;6iCwbj@7Gx^GV< z-`~7&J4PW4g-IYj>m5hw$q-lM`g{FRXq7tkt1RfaR0u=BdS6PIEbnI&R%){S3IUl# 
zj(-hp^6&zBtXy#Pb3-iuQPaLCXusjn=o#7JK3&t7_XFMik+0d>t(@9qby!(L(TSeK zkJul)_B-7THTt@)?$uew9Ctjay*A|lA6{-h%&#~XWot2kPFp5)8dv}m8hp#w-X<#7 z`N@`QwJ`8_?j_9$5W7~B5j$k(C$CrLl#^TDZZloUP|b&#yw(-DSQt0`ab=MraO*Sw zLvy}tYpuMVBBzc8QWEx;9K@1Dc08S)J}Pb;Y5g+&L(ErLK&Ge4B-Q3tf6EliK8;=f z)?F)mTwBTZ*h4PyD>c`*rUW;Eb1@ZBOA}0bGNMH0!Nl)0%{I(S^juMKZHQ>O|u)!uKD}tMa=lvn3Mh7p@D$HtLpSE)8RjDygGE6;{78 z0^d1X&E|EqO_c=#YsiCQM|wy{-y0slPCq)dn;`%+@+%e6ngrB-kJE-_PQw>%88`*$dk?>Y<4ieTQA^MsmPWnz+SW=6_ zUPtbOPirC!{rG-Uu;49(KLAVfTECej(=i0*10J=3kvMzc)O&xrY(~VLlMP!q>&K_b zU2)M-dZ+-NyA`|<83KIid^|@h`pfJ-ChLE zYcvYs)^SZ@YcjCn6o9^tRAuH^c~_0#{*N|6!NyZMp|IY*RV%k#+qu|rHhlK}>C8Ch z$_|!>(SnRWc#3U5@^mBQdIWqpzip@zaQ4NmSkCB2><8NeA=SSzF7o+GO{`Ifr?X^j zR+OQ?r%Q2rZP}n9TkmbiB?;#%zRFNBWy|Qne`HvlIP#WhBTTO+c4it|Y!8Pep#wgH zE%IUEbM(YFsm*)(e9zllb*OR%MIzg0y&&ar~cip%$ILc|#PoJ5ftwS7PJ zlN>U5M<)jaGO$S7_nrp%6+$Ln3cpu>{ts6te^&F_hv6-+Ls*{<_x6TOX(6}2h0F4D z$JsV@(>)#W|64HJeBljb)cE7`ymx!rMxul%Gl!OfC5H3lOG2Dy_iJ+MEEjR)Nb--x z;}IUx=J%!w^m}#D#BnIv%bdXkyl@G}Jjt?{uNrDdwUl!2osnPjLpJp5y4<*`TKz0; zEvyS@y(~2B0jks>0xI7(ds$T>x>|SPuQ2+IKN^+(Z$SQ3KV&%9oSrh|z9X(TVC1X; zw+`_tdP-`G00cT3k_7h03Bd9lN`XmPN!eet6#+y4;qn(Nzx9OY#XkG?r@fJOKEL9O zhGp;gg1-qu$$f2doukHUWTO}rn0x2RN7Wk-b$1pk4{>3~<;IUDCbs2!Hn*ATm)|po zY%4q%LXq=lS8opu&6>cO3wxY0`}60NxwXJ4!-XM-R@f*Lx7WH)>#Xp#o(+<5dDX6Q zGbyqPaIf-f%gK4j+GSUo<~l7>Mx}3}f8)vo)bh{OjQyj4xtUmU0>sA;&~~)BX_577 z2_BxYWa82+!KvS0P6&4(CIF~<@};&qYTrx29dTqobKn$VdHX7=wt1tbutV0GfI@MS zCCv|j3zj<##`lq3f0N^p9C({pBD`+7wU+VExUw#2QWQ!FL;lSmB#8wC1O8I0+$KG~ zY|~{OyGkhH3hWuSA%+y(s2}!_nGTAS58xY4|6^nZ#z*nbRZM<(N?eaBb+ykLM>eyyk)mOdXK zg2NE!PR;?UK2@URe;;*IJ^Xog4ngBW*(!39KrDBajmtm>h1C;Wi4&5@?fGvB(nOa3 zFCuU4SXkyJ`=S4Rlb-&bVLW*0bGGK;tlpknB*k9=$HCfq;l6w3Pwqz6lw1naPsih{ zVtUVu3&Z<*yACvR4XwIVNMT&NlQ9B(sE~8n4D~W^!YKlcyA1a!UCiqJG(ujR|{i)4#%*@9H{s_ z_sA&#wsGX?Rb4v0+R-slm0xhwYaqBB-*C5c_0d~TS2ehVwbz${g774u4Ec22XtEJz z_&)C5As|GFs>QzV@w4q!p{HP^{>Rc=Kvv(m^uYzGgPxHkVXg#JWM%Cy`#75xgMc1s zp5jDNLH^|u`NMnGc?{uVCW^`Su5x}`n=e|Od^^Z%d1)ML>Du3Iq5oEaL!>-}4n|m_ 
zLw9f7YC?BEihNiKYTc7;s$u@OYI21#kU#zT;4K&R<0YnNstgIsG7t})LdFIYFOlkee~5xjlbdeJ8LmD$~l z;hwLhBXE;eai(V2@cA z*NFN>H_@Syul&cG6oKz9Ck@5N0!G7n_tnh!m|M!7eqR@b>Flq_I$1O=JI;sp(+iSp zr^8J>l^0m&lvk$JzpI>HZ5*mOK`YeD?T^7od3%FJD`i{3?K=s$B8D1a3tjIQp>PcG ztog$D)~y%m9Y6xW&`@g%qsg!=mGq=!vxr zuL#j=4TjvH3DlMQra%3-o0{G)T+YcB2lj0Y9oh{uoaP!X7t}N>9@iV}PsUshr9qCb zQLxN$WuV%J%Q)X$`)?q};Q~t~&z7G3nri*|14;{hXVbZ$zh{aQaO2ki>gR@UiP(?X zYW?lSVkJ2Nb%1}aR%%v$nyrSWC14V1b9fd=^`ue!+%3ygnr#@japN`KMxXTCX>*pv z!M_2ul6@2_Q2ZegQvWMT9~=|YTD&(xh@x z7Hlid*d&*j_0L(CH65ql1h&PJ74TwoSW`XysjjhikkVo6iu|<5I>Fzk?w}|0 zykn;-f;FMqXnGo$7@7vf%Uazij9I%6&|0o>GP06e!XTP{!}dRnQFhFwVrhOy-7gGb z5U$hjdV)mTlgkvGaYvxlv;_8t-*Ez38jU{(>YAGDt}gNwkrV1PU5|fNAbG2e1^m5u z-cQBGn_fIBu!Ost4w>Sv>>ceni@JfFsfWKf3fKoY>=J; zfQ={_3qr3+(iKk6AT7WvWv=r^mB_uMnIiytwBw|^6?zyJyhRa@OeX|-}MXRbqhjPCK6t#6sL%-7y;y;GbDvxtJI$@lKo+!8RKWO;lzei+kG zL(18!{l!xHHm;`=Y;6X5oOKl{l!8^S<>A*EH}3^%cQ{RYk|cj=+U)V(vKg_CR36VULOes96ty1jbegYUz3H39Y;*lG_;9|s#l&-+ z_wmR59&;T=8%4wM)mhg(;+X!h!>zuzIBQlKhSAZ= z>~1~uB75-#`VfB7tOge&&!;M0bjqTyf+CA#WN}=AHL=gFSj62@4=RgYQX6h~9V-~p zi5jKkoY=po=A@g|oxgAMOpFPqR@E5(9s25tpQcIl31&BziP*}f@$w#8aAKy)fhPuu zrxCVPB>F~F8i|9Yhf?Ryqrn#7%Vp>qsKbkGgMgWwX!mulwLGvuYRfq}$tlGW(#-OX z^KLh5{f?VOHY1FX1l5G>`;+eWw7b3)T>M$Do2Iiudcp8*896V#2%kT5)ixL>Sl2X5 z$k=(FAGzMs8eA4bzJPLk5@9cXFU3YWDJ_a;FUuu*&0=zaT>{9?%Gs0w-j8AcErC$V zsOVo;`178(An1q5fFJ56g8kGjAVyA=a?}bnw<9xSN!gI9!z(F2W4Yp?0|}6fTR~nl zeOutZ38Bx}X`a~cvGZH1kEM)$MT(z=9Fb5h;j)KDEg457g5+3ET_g5drJwRs3S)0ey;Z;Q(F z>8@z3eKaLh-ZT$)<))g{_1s2>IoU;d$#>Z*4BRBj-8N(AlgZ^4oUFhQPcA}y%wrES z5#t@Ux!z*65i_Bx4qB~iS&vmHHuF1~?e!WBhPe*QMMmNAW`&N;G-aqC+q79K_8^F2 z80bb_uYV;$7+R$#$1UUhlr~Y?E_nB3}bDu2p$mGaehq8jcTp3o;jwS+VM? 
z*%SoEhjM)@W^yRWu#6tx(gT4&gG%i0^%w$#3 z#(-+vwTke{FLvVbXWK@$hsSkZ-U3A^56g;4K|OnVDAl4KXkAO{Tt{5qb@<^kkj=KQ znKiRp_s^`WG2;mV?tJr6GLocN*#*v(p1i+o*2cdv%ZWbQkk=`oP~mb9WB{M($v6J| z;14Z&VfZea_Tkh0y2+?i&JPBfc7zQ}kz8r(JOG9QaSv%diAy&#j`yxOjK`>`hPVeD z6!w&rcHUZL@sg9j9++Bd_C*2?l6(&;qQb@iMf#N_zIoI}% zC`U!o!df+$+{z-Xsq=%tG)UGHvK}a7y1^TJ8Z0UwJWZKH>n$}X?db7Us<%DhyjIfG zpSJI8PAPbFNk8jhG{5mrO$Q&w+s@Rerb2Q8;oA;;h}7{9TT#~dI3M+NmceK}HLj-E zfb9%7naDjz+Uho~IE3Y>LbQ%Mx{lE zuuJ{YqNS2q(Vp@3>$ z@1H~0M{!TDLR`^0)b6)k+>ZZF$*S|GVRk05ZnBA|lF4DNqP8*A=ue_z(Chpy+gMrs zYNrE_c~n_2PO>+6fkt{B z=R{6DfkK2C7npNz?xHu~R)=hprPrvjr0_ z>7=A+;SdUpiT8<~nA>NiIicRop$;Og4@Y;mHF=jdj;&Lp#h~bUTn1KYq_Vn7((4)5 z;4mI_^bf^$9@w*Ci(FnTFK7mcyTlfoGsDSy&d?JG)HuDi!VtsD_mIDooO;9dQ#` z(yYlu*RChzEkFq}3TL=3wy13z^nhR=8X1?+tMu`KKX1l~a)Z>! zKleY4eEV`h{o?a}Q$TRkq+^qhCU@f>*iK}KzJB&`WeOzO3=|jI=v|Sk_aVwSHgs!! zVc`K|{_3td>wz^E>uD%^R?-Wt3ypM7G2{wO@9}&dsdE)F8Or1B#5l*45Ho0V+Uiu} z0=xdySP-eLz;?9K6xuNke)`5PotB8MC!axZk;4ZD(7;U^2yss zBh?~qv?RJa3s6h&*?oVhk!TV?TJPhRuS_XVnDc(c$@bSwK6^s<>bEghjFQa?v_$le>3gLDU}osNfpbG*Tf!7 zfFQ$mA!glQ0X!O7E@dm=E>_WM;aR8tsmWZXu5r-WBwJ$J+x9Kkkq^EXC=5UB6owCK zt*CtWauMo19;_2=`$gaO(6YGpP`SAF*uMB)tGUcKVsR&8~dJm`Z4(;y_3Mari4l1@$jx+B)!7T^ND8s3+^+l z*58cL+2CXC37=Ua`7B%Yt_ihAjX~0m-@ujgt=&Y4c!kRcR|SY|6AV4Xn32eALN-}5 z&$WSqsP4>wr<>ZAKI3GH&5P^B!uYs=;_+90u9}(3ZH&zkd#o7LSD&5u6E;Bmyj+qD}UY80#S%~3?K@u%ZA$D71vC#w^Jr0j(mrd~sH zQ8jqiuMQ6d$}68CNCCO@9E9aGyl=4xE36;}+6*BXriv8Q&wTVL>?sIFk^FNtjr!0p zxhaw8L)(mp3$@W))*7z~jItuwXGU4%LOBuQ+t;#jyVs*u75I1>Wl=`#nyFmGn8lV% zsX3C8su0U@sU~KU{Fdmu58fhgH|9-l=J3HBEqtV>jO24Bdv*t0q1E#;*U%)W>5sB) zH1@#YL=L!>VJkiSKn61GWX?Jv$a$YnO|;q2#2Ch z0^_RhlfH(AohbdB$IJdMO%YNFlzx)Ub!Kxd-EeaEdT{s>e8gsiHb7MH>F^`GGv{cT+#p_bD)5f2PRoDuVlQd> zW2ZgXM$17$=*XK!dwGzBDXPb?|q7epu~kufw<4~BF|Ccdt#el;7h zyG~Y$u~YR8hinTqy`5&qNg6d0k_KojNGc|HhLQhbbAo8@C#~ISlt?j)yD|Snf3341 zbJ3K@1@Fe&E}hgYpC8*sL5(xk%zlkx$3SqL&mQgAuq=JW_oo_}Xn}04z49M%gB7rQ@(k*7<+CfGu$xa z>r1Zu$$rHVP9-5J2|+6AK+FQE_OdP;M2Z;QGi=?+fzTV#H5t*tsP82eCe}eBlK4H 
z^kf*O2ar_bF-}##tTM0?wotSU5NsN$c{$bBFHKhKFZV8K@#HaJfGWDxf4X$y9;KqV zaRuN7_1vJu2S>e0g%o>toPW}ls44V@KEI<;kpyx?JR|LgP8@yDW( zfZCGX8TL02reqLp^Gb%P=tdYWQ*ry_5@m5!2p%(DWqy{OS`1On>FN)Pqbk~HGh?ot zo=P1UByH3-ZbuMa^}LWsSK5>!rdz8hX7jLg-1Ug!Gj4I znRmxAv(uOWpVB<$Az#9PhVrgX75@WnkN-rSp-lgSLZ+)l_#Sl+ERKpRJXNH7G!A}u zcPQ^{19r~b$WBH+NpSAAZzm6bKw0`7c2jI%((Hd+nZ@cQ$7?zHyupRj5JwSXb(P?q@nIN>{ zThln5FLV8Uzl`%$7eP2WSJiDNyfsk`apXeKn}p%&*tE9S0^Hb>Cakw-@A&>m&{NsY`JEll_=$`sB*Ghn0PqwnU*{2a&GLF7$O||&*DE7}?@t5r9J<9yC znOk*}c~~^xr2tYyaeAxlh(2+wn!c+JS>tiNJH2r6T+$>u`%F^>lL^{Gg>GRnj4$^T zPxy#ckIg8*DzdYL>R}LJP9C~TCRGwN^1!2o(mN>d4#-F(R7@wvlluEkUu{wwckeMf zoe2avJ?z!>SGLtG9~&i|V9rxi)Kl$+@W;SxCOTm@?%jgCfRN;nF84_3JGW|Z-;?Fy zuSEUQI?AZCShcxWyDRy=eXKHnMf42pw*Tacfug@_ptg2ocbq^UEy_s&W?sU4OzgmV z{71h|F1_2<6-2@t<(V|%X*4IxKUQ!bwP=d?pv=4(e-h`XXJZ{c#PUX4kAnCIYo0?AlTY>1v*JW_F-j%6l7zHVHnO) z+ittS+QY;DCPw}B%!Z$}A;b$qj;Lrp&KcC?StH2EjJa&ex`YR@xTtZHykO$3x{oEz z3ZhqlOr7A(@TI=m_OH$xc-ZmoW@J1xz#MjS9{Xtd1VQXiz?i`4&X`X24tPaLGWpka z*+&pc&Eo>#R~hRXjT_GDhIKw1TKmekp{8&_LiOV*)aR6Z6g4fu0_OTJqE=QCFG<{# zrwDaFR^&i}oS{_i4HnqF+zGDFHl^K?lw5K^ec66J&*V=!2?0891*q-GAZ>`0xIkygyrpxWpD6&vz3!5$j!vEF{-Ln=Y?hP3!P$ zQbkrN+#3S-By6+_?ot;U=EFZyg;!O*wPWH+!Ww+MAAeg*7@Xp?{zQpXz7050<8*Vh zCJf&s$6OPh8~v5X8!F1CT(sy}G-?=J+OQ~L zF2s7ItpZm3L>{gU8oouFAFUo=ky%6PdIQ!wyx@Y>V33LSG6yYpt zBf3BFn<0;|aXDu={1{im#aE5p?f0aTy#Ah^q1zzSC%2zl1_|gju|gMyYdGVCa=Dz$ z=Y;Q7R9FEFw-V{q&o;#tS8Z9-$+u#Vz9xWwm)uR2cb?`wgKVVFK+@~qh(4E0s9*Ua ziB>58_~Y*cO)Wg*s8$IODT_wrC^Kqg?7P}gq7LshnrzpjEnT*h+#`C$?P4j0~Kk~%2aV!*vFY;Bd*!Yd(Dw@C@1p{!;N+dhnq8Y>*# zln#ejMh81_7#MOE%I9?@?c+bFS}QB-#<4YUyb9_cy67+&aU4zD+@}50eY63&J73xK z%r$GRnhS!`c>2P{8F;8HxLuf$iaY6EZrE-K9*>v3XYzK>pHw|rl{-}8&~J}}%CgsM zE?0#Fc|7=m`9?qFrtOsbRUAAwli!&j9o4XjD6eH;#i-_tr@n>5PxAaYr;vGHQk!H# znNfFsVs1{P|5Jd(3F%Si&Pp?nu`UH7&j_Nk^=_1@xJIi)9w3(~x?5q>QjH05j7o*= zYx$5xf&jwW7C?z*eUS>@)3VkUc;&Q)A->>pAmaWTVleQiCJ=6y_1A`XB)MY0BV;@` z!;>Ut;F>nyc%i57S>`0M07!7I0tY^vn!^#i$F;mNej}H5F^!}GfJDOgG)Vh2WIIH3 
z9Fq>O_XF{v3h~>w5h+Q*ghjE1W3H*6OFW1-aV;7zm&)PUFuFh=$X~DDkt^dj8lo8q zvvv?0f>q7HPb5R4SKfTQ+K8 z6Q1}PO8??Jta=of4a@jBvw(D*RTeg|z38Ba%phST9Rc^Zu^bTw6_F%FXp`O zHYd5vd3Pl7SL_JA&0YJN0DN@qMT^fp^RJkZJ04(f6wS((A%P&!A%|xJ7Y({HF=gQb zbE#D_#gDpQDs{f>%uHsdRK~+u!%;0DobL_IiQ`f$(4aI?G-^pSjD7lao>pjESzX!l zJwR?ne-Jx`pYk`O8wWF*+p6Ib@MuuP9;#!586OcbPfGqQIh59Q&$=htNKH%QEmDk` zU_Ug<75U5p`vQD@B(rLbw4r)cv`oJ|Y^|&-TZBo5+NMtYg*B&xXThsmtr zBcB*pUlM0;=5|5J(CcOH+~eBy=K!7>Qkq8iHx(Z$vL)*;7FsT36&Zz7QZ~7SI99b+ z#)DzZhSF@)$I!Qy`gTzd0<=NI*<1Z@$g}6k3?o`~A2;9;AZ?t}%+0AFcjd+^-s8A9`S57@_#SZj!;_bYi-{G}hcSB@4vRyKdNtMdbiRsKi`WI5ESs)6FB3HA zxvT|k3iYNU4;}ct3vM|3gLI*b{#oD@GRb_!m6s_==%E%oV1cx#Z#j$WOdfO3`mJmD zZ9ApHlNkm zk83LyNJsucXwEYZq~_I9zfYHZ z?1R|_;mS&XeKuK!j^5*FxL{Uj-|`J0i`X?%UbOSKU))Z zZRlnCX2*N(JkZrr2dl#&j3CgRmARzN$)ctmJ~#EQ;KU>BmDw?x|KoHxVJvgm^A?9c zLsrdKhTdbf1Zxug(yrHz@Gm*icXMLGjm)FG9vzltY)gpaCb~8oaAjtW`MGhcOJ_hh zbX(!**5u>`)4a(1rLyr5mO6ZBxuQKr@A{8A1FP6+fmdjtOo)6#o);3yIqMPA(&?!i z5Mo0trRypK)cK5=Q`-)ADX*4~QBSLK)Q(miN|*my@P}^ob)geZ$io^vY9`qg*#&kd z|Es&VjB6_X|Gy0+LPU z`7bK{CM&5Lwh!DVZjou++>xywmqrY@1PPhUhOQDUh@=Zk&k-1Mp;Z&Kw7Y~2v(=cY zl^>`zw=QFY_V~F?4!Z{zQd-w-T%(7X+`+}yTF^Fvmg9vTcNgsfMWpMUsR$K-j+xcQnDtv7k zhj#j5>eVi1|LUpUjm%cxr!o#5>c1G&ezeel$**99t$zJJV;blc;yZXEd)c&KM5FT- zVQqTRCE(Jh?nyfIE0Wh`LR7aryJ@MLM(0b46(q6xbP84>hZXBMErJm{g;=JY&zv%I zc?MCw_AzaxE3vN=NfexRt3w$#pHGgDH~>)4oVDS^s8e1D{BD+RI=zYIKlh5{P^?ZZ zBG=N%yBs_~A5vqZQsHs9X9QWPyp5viN;Ryls`}B$KGdH4E^fr?3VfYg_EnC2M{-fY|mK8+|pF~ z(1g;lJ^JWVXIpQ%;Ln(NHX-#I+Y6g$K}-DMuD*(PHs9TRc8k>fD2IdVOl@wUjA3V4 z68oXl2Q@~CyB|(|?x_9M28c@{RTJyTLKo5L4XQIBzd9Slf-u!2K*CsYwSGP+ ztpxY+H>+|8%J{s4QvSgyESLo*tr+zMi}`1_8cJSV?AG4q3Ao;P9*ul>yi% zjwN(?VUHPUU8^n8vGjvJV{yT@0DWO+Z!c~%H59%P^Y&p8T(+b z_R1WLoa&Fu3_uQP5=@>yvL<=^-AQA8B`|nT5P`Tuwz)$nN#2tqFAcrQdy|=7aaB%* zkR-nBBn$#ifet}jzx0>6PLngZwM3}5MRcjB64U!~U*x@2z`RpmB zqi-|wyi-DDD9V17eUKU6&NB}G2fv_WBUBDl)(r%%O+ImR!HW6FRlLQ~=?vjQ#%aD* zI8j2yOL{ULj@jU`oZ&pv^nb`dtb{_fM^l?oSxxn`{?*g3U)TPr9LQ-k$@-dRqh@P0 
z^KTj>xmGk&w#4JT`Ii~?n)LAS-KvE?!2M|&SR1FI-Q^Ouci$)&3W}8V?V~JbdkA~Z zdk|>%BcI>6M1fyq`!c>mv<%K}VBp*@zw=|`LLz85My};D&el3S!U{sA5tw#D&m`S& z5$m3OZc7Mv_c>og6Y4Eac8x}}eaRQNohGv~K^CU>RWsG8VfqX;BQ6b@A(x=e9BS42zHYJmH*Ik0QaJU?k6}SHN4dVUOt0 z2~U3+KT*I5Ie&-`BNowuJ|P8Fd`agYAnIlf8T;du=v&?(%o!IRaqDLKlJlUC$@}F9 zvwTDPJ!I#Nq}cht>csl0$>fXKz3P>-@d_?|W+wn-=PvgQX(Rd|+!fLu!%L4}zqRcu z=g>l7-FEvXtQW1}%k;)AWtXw*)!TXc8K zZfowX1bY-ylt(**)vLWOz1V_jxD{JSCzo-S{T`%0t>Dhd5~Q>Y2r(+MHp!pF&&k0B z`D3+WvL}1({sHEu&F3F%Yow|m-B_LKnU6SsZr;Ii2dXs$-S1*VvqKyj`+m#`oxSC% zwrC+2zbbgNI6NinP4xm!&nF_|m;blr@_>l%FPTALU2%g$!VpSw^dC=zp12>;evEjD zJjV!Q_s>?`8FqLl&R3Y4i6DuJh%cdO3LubtM|y?YY8Y7WFe#h_vkV?*V}s8EfB5)W z3P>(a%8v`|j`mAgpUmDfeF5^_MQT_DUuIAQw;~{?_z7{#(ls5%VO80LGRR3)8Pc1N z+s@Wzn?vP}2_4F;O~AWPhdBt(7^DL_7&cwYf|dR)O1jM^r(bcEVqNXO6q&HzN3Jx) z(Na+MGG^|NH+`W`7ed)A>P5UjOR~GrO)ux8-nbrGK-vXM|oeT|CkU-rM6{g9IpOYDSc{CvluvOi{aLh+GqNjp>qsw28K zf-hN^l=xNdW&WkO{n)eKzI@0+%&XpG0*MIG@Otq8q_=xZ*Ci@oV|nE>wC09M%PloJ zkg^G;8%D0orMKKrY}6v!1J5{HH(ZudN*lIx>tS3fw26!WzdXn@LOlR z$vkp*Om_?$W3-wAUeKNh)d>L&C>pKtjwdU}{PL_-cc!PMyp!12qGUDgJA&bS%^8KS zdZW%rXPqZB5&HU2RE@^!*X)ufD$bTBe1eR`$Z@h;%K`ldXK;`XZ$C?z!=W zQ9=qPeb_VVPren0H!tf8B(bcrN2?P8>HqA)jlaNBu)$epU}z7%0;lIXf}I_Psg;h! 
zOcu}m+=m~AT;xq)A(u^K0eplG3ySOQ5F4BWo`VFXuSp9%(g>}_(Toi0DQU)iH3i`U z+5zg(S_bv7GE%AYFx2p|?lGJ}24l(YH{J2+vIJ#d!3S9zUivfXzYxWBR(M%I2HCy2 zDZ?aclvUrO+N-|~<&dKm2KQFw)dvjznUx~^+zvoNd-=EZ=-dW8=sZjp*|3l6xEHru z8KloUx(kz5q8Tnbwz-dX=lEKu*KvsflQmrXXl={|G-w`l7$IGi--cKS($#`&GRM608>?ob1qCiN**F^ZTjA}8b!fG*6`K3v%Xirb}^9My8 zYK(LEvqyHac*2O8Whl!Lm&vTP<8yaNDyER@JqFYnqAYl{ieF-kB_kmA z$%0{QH$+~D9OUNS46$6nAGksolNgJV^7P)^Ju^m2pE!z?`huu1FEVndk41#oM9h6~aydy628YpeKA zUO5$ZLSNqSNMGm4?8a{!YkPNXdy`*3*t;R9s-aVE9c}xv08Y10(P_ncZE;TyMRp&4 z=|5^N*7RLawGmsqmw%ieusQ{(sxI!3x*G12$Lg^~p2A&n7btMdUXM^gi}dycn@?*5 zWjwbX1A0i<=A<;K2WbT6A9yxWDYZmF11yEQGXKI}5WOTs{EOtx>sJog5}|~(gA>9( zzwwXJ6jhqt0f!OX^f^=c!rBF+7uc*vs$x3#BpwpJ9vl-qa`}RMU@^)Rx0=fC*~0;P z0AsOMP*VFu;Zv{XmqI|#%L?uHpvd?c%@6(v4f#>bdVH*iDbai9JHgNrA@5)PBT`HXnKr6AG+dUJ?-}OB7 zYgFD%y^Zo(Zdcrp9%f5|f|M(q=|@Jn-C$1cD#Ld@7QdItcH!2KLL=F311ll?J|e3k z-i(}$TEr&q7ME2SBJEp3TY>83&bMx_$frGW%SY`_q_P7jJxQ_qMj;bZL>h!eMr#qIe`*IL+s__W`pe%&%&<#aanb4`Q$oPzgbyER%{NifHc zXZ%ZlHNb3(xHiuOl|M)Xs@x26&v$$e?bWH@H(kR_cUtG{BRfBJ$4zEM8J-`8nE{Qj zz5A}`<_pl7AvJ2s0jDT~CY3lJ6cKmIc;j+_tok!J8(apCk5~<$)+28ys`#|W3?pWN zvru;4m@zXY3vHlLy_hMa11W}U{fgPlNE5;063&ZZKPv9Q?;hQpZ2uL*&vp*$nNP0u}T7X>?7w@7$_u3gMdp2e_f1x7a z{&&wT2yb$3gZ>q%#DkdMwDn0B69k~+dM@YnXl5g&%Ofs;?q!9g`j)7}@i!+SPlR~+ z!B1H^nY?|e7j@WM?zTrgTKcupAPsxC$v&ygcs~`a8ICq>35m&5&XF`Sg2pR#xOxJ5 zj768zo05`K+k9s`UwoY28j`}1q_f_)gO^sdAa2GNoLu~Jhnt_j2Bb+lJ&=gG18P}b z(A`vEC)KMob>rD{Wim-15I&DmN!*f2&UlJR9M0aH!Ohl@0$IX_qF&p^+@G6RXIxj+ zXpoa^a#(q9;6Ev3D+CHS|VN z&F8m43nq-|_uti&B--Flop~!^H9z~?3u{bY6gie3^`no<)?zW5F zClK)8w?4U;S&@#CFpZ9Z>RDb1Z>Zex7AaU__#YGN;k(^gcy^z$C62mSMdT5scx(Xf zK`_!0>R$NK#Q=VXU^Hb4TfEm(>qnr<1~3*t}@N{+}d>;zui0$gZk6_ z&boCI?;yUHKbqgu3H9=)>Bd?tP~&zBX-11qT6v+RT3V$!^8I4tm<03Y#@0=TD`)X9 zQAhVrJcQ|0>=I>sKU~0X@5kr!S7hH67@__>{4Ty|$71DF$WMBNO=~9}pZg%%BqBmXEQdR!4aH5>pfL}!g=|snryu7CuXE5}fMMJ10{16Fkj^Ia7*Yjx#`Fg% z?H9xNc~ygYitI+sKy|@Onl|@OnteTh)X|BV-_X@y(|E_Fe>ES?S(a;1pcY1YQ*yJ_U-)mqM|e+zrQ%T 
zVv^{Oj3by}RB56qOFz|{zY{rSVbTjdk#`h7^w?#8`d^5d>M`LRiwkO^sCubnyl&XD z=Q=94z6rC|uy!4{pDZ_#Q}}PLM6?*)BW)6RT+ziwxJ}sfl;XW01M2e`~;!`#icKRkRS7%it<}6=i z0XMGKaCNFQ?a}dmP=yIQ^jLn1_z}I%;Lw2(`-2?dKbu?ZvbqLS|J6Ct7I+U7u8UuK z@~19>H#o5pE?Eu?UY}AP`1KCv>ZksLt*@_&y6vpJ5Nc0LEl^^rxfycgwC^|<|4zn~ z*<-K$urKZoi^tnBt+nIPk7*whFps}aH5;v3{K^M=xKV~aNgS0+U16FI|DGbdXAfmd zkTA!zvL1bUdFz(ELFB&&_T}$&3iKHZ_BI%%UZY>xlb(Q7OW@3N-%!Rf45tNMv5vW$4+FKyzHs7&2>-P6) zo#1#w$}p`dZ|yMNGx9ftIp!3q-~cC={StDs#g4Xy;xFfo5!1nhYWQDz2MH4%vqKe> zRuZ7~pVA0I`}jXM%4l$l8rN&MO>E4*<`LbFw~=3j=jc*SH+D@2N55(q<-2$ALVn+* zg0%;OD5Z9V_x78v+3%GrC)(rGAsjBVR#;Eo20I(*&g1s^P_0w+^)1jI%RR9U?dX!k%jjR^r_z%81 zR5vkA66b-QTxkFt%|Ip!`cV1mJyFZ;11N6ohFO)@QL-E$QK?`Nks84me09<7AGr4) z^t(tDXmP>1$TE>Zwr!Rb4d-=eL+K8mk_`xCkKr@J5T^}3ZceQGF<-y z$7&M-qt!-(x zoq%DxV|gR{#kUR~pi;hVGZ*W?b=!{= z6~xB87FjL|FCrN^kw~nYF1r01fZ!pQ0gCpG*DR@~W|aYaPQ<0_8)2>)^(?2_C6V-Xj-HhPq!MXGowk2w7JAvtdaUTrX*-^fKP(YER>rs6BR3GgG_G7ybg3vB>;JDYakfXQaV{n?@;5(F$x+| z@!@;GG}6_m*C6gcSo=|_!@ap|V06~%NRGcVPOAu^yL?hn$S`MOi}EcQ#lDNgbG>n^ zaRN<&Gg!gW-JE)C{+7h7wc^+F)u0=jqcX*lUUDL%kDq+(dhTz`?GN5ugP@AZX^&DM zPq?{+=7_d#h2+k;DJEI}fFdq_-4F0KO+kC|aa(v^rE0>{cNlMIHio<((EEm;mKo>` z&WFWy0D5d008ut{ptg=(_w_a{YmaDEJ3JBZcK={ z65SXfM!Ef1!OB^B+>yXRyu%@zU5sfEK#jLtUl*gjVGG@KJjIixUYohph(_(Caj35$ z0IWLeB|88K=e;!|;yI*-r~7@2PO3L(fymc z7(BTq3gW80WaWo((fxoL`AQhD>DtQ2qgn^ZO$^+0w0vXsi}_#lm3doGhy)pEHBn^I zncPzf`;v>PMa+$D^Zfq$8o8jU#O{uCTCp_r5ge~M^iOJ9vD++(A&ZEOMw{8&%)LE& z@%Z%Q3q?CvR|mdPTGb{tYl>Vr60`FO@I&aYQ8@?%MMAPX`o-J#lF$jD_*7qrrF9Ec zRy#4#+G?Ys7-;i?9qR>l(y+ezrOT=Nu2O!V0WA7_2Ei4DgBzluD4-BTe}nDgdg;(ln%GRP%CQE+Q{?2I>0hi&WL0k)eR` zoG0N1tUXU4Tg8SSwoG%M9Zg!EK7pU_c01z+SD)d!ff6y2uhqAsPL{fph{GaGdeJCk zf8`bKZ`>nn7~Ek?*(G}}jfzw-=aH?SG;5b;6`n}U$hC3AqC+{t-8ii8~Df#7htcNX58Lze*ht zRo5t=#J%~b0vK`8KLaw_7HPL&2)HIdh*9ZjCS|A+k!KYuUK^~#34BER9^c7OmG-lN zjexXyEyxW3zRcz8fRxUcB!Es)_6}P5q}W@&74V05*COS(N$(Ri)FARJnUAOE`cpad z+{7Je^>5vD#M8TlD}eyP^Ly2L`g}9iacR+rFEs<`WZ80l2m>_8y+(PuU;afH_*uef 
zm^1yN2WZ6lKpWVq{usrM+vV@dvc7I0FRCGJMCT4vE60{?&*N4^$e~=YK`+F8Y=ra* z#I5CvLIhh)OEVlYv_ICv7x-VF5MYlE@IB8~cafENgIrL&`KHV!i-Pyu`NEMy&So{y zapCT(6VIcZ1O0XI6Ng^uywzH@jsPos-D^A zbQHYf+qp#+7u^Is7Tt6^?_uA|X|ts<8hZaa|2y~W(V4<y~9H)Z}Kb z(kAyotEoeLkd&YTT&KYCEEev z{|yJWA5pHKwYv$`*e7KdPs(kQmqV?etL#wUpIwfQVdv1{^a{H}0yE^6?YFq&>ED-} zGynMSeP#5};ufoEQ4IHCMQUfFiO6U~P}wjX@@z(fZJvK5xLTBt(HwRdWU2hrFIsOv zUMNS}k3&jqP^U3SO*3{sMmJhQr+DZ{CszX_ORFvPt*6B8uc6|n^diu+G2c&ymw9Ch%9_}uKPb~Ht2Vzz7j#OceFYliSv~1eG#00#*4}nP5mAvi$n0eJnQT=C>PCu zAEghnb!*A*um@EEohnt7k$*~^Tg5wd39KJ7KjQbjUn@hK~m zZ^Pdftkz1&MolGU$bIJa)0p>PEm;MKA51R~K7hRy_y`ULgT|kr-C*Oi zpzz#BFP?89bsyoVThs0flNw~&O4|S1+W+(*ZfS92qZ4GxtbXISK?=q26$|;#K-u)k z3wPwzXaKx}2Ka;(l;wF>Oy3#clU&dpAM?J~; zo)+|;PTC$XDJ3J5zbKsEsDkibte53Mr+eLrtH<%!?(N@<*1m}nyNZF9F%+L114PSz zy&X$&y;0fH^PJvqxe@Zr)jTiYwaV<@mmvgz{2XeaM5@0*Q9EnP4HNVGpc$swXyMoK z@Ke`VHkoyLf6K*92JA~CTPs9+YATr_(SC2Dc)$vfuj3Hm?yJbIt_TVNipJOP5g+fd zTmqB=99C~iv{HD%R=5=Z2yf`MM!iLa{5t_}QI`E|NY#mYFKCMD zXb;f@z0@MlBXy5vj=jbi0SS$$X5zT_)Iz2=#LIf*i+<-4Nkz2efuE=*+fx^LH|!TS z{!T->+sDndr9Fn7uCTcZW)#QWn`IWLH!UM?k=)^W6de*jx z0^=*p-H-i5|9cm^MA$KGOSs(u9{%#pF^YLBROm}9O}?G7;&09BjEz&E{I2wKOAGG~ zP*826`?*mq!Zj1L>oe0mk^A+r%TX!UL80tT)a`Hvgq}`vu#TZ+sO5r0c~4E+d<$P{ z-7}w_$Fzq0C%(o$D#ST?p14V{?BNF_7s$A-RB>-s4z#y@p{}MX)j#vsAbg&6F8f{a(*#fms}aG11`}YajC3z4oPME2e&$cu-P@Q)tNH!-mCq;afoXpI;AL=G2Yk2*MWhsoipMz2 zOD1tchrWf_gKEc^?^Y=D_FO14tm%CtiDaO2v?@_qoMrcQBRX|>wmYr-w`t}4KSPMF z7cWcEzz(oedt}@xFQB@g=*%A5cFGL#HrKckgadWFY=13a^h`Jz)twJXbo$=(%F=Nr?$9z0qj6 zAUNl%IUou<&F8dKl@Ivyx#GL)rOKFqg#lu)x#%Y~?X|C)wqe|O?ILpfBI@Ob&8_{y z5ymgk0sA#i_{KFf$;r~0lB5TyVun-_>cYd{fPXYNdH38}>674i0Hd^sy zDL1b!X7$lPEW4`CrH!~xO4$%S(fnFxVi?`Urt|vw=YW7dD82NUPBzK(Kj}gV&cRVF zxq_?W3{O_dhJtPFBZ-l|wb3k#%5T@Z`(nbsHD=9$%PSv5CZ^O}gaIGw0H7c@_ZpAb z+Dhw{RCdkr@jMC`F#$qzF7);_KzZuQjy$Dl`m>q?^|K}tM89p4_Fh?C@qem2_UQF0 zCmXWD~D}|491aZ6I~sJ^zbhSRfjL* zf=<#_bl$?Pn_1#LnSm`KMS3{T+1}U39MWupUE^}sYFrxDi>BJTzT7zjTjk&4fwcL{ z16^Tck&Q*ph$hGgu|}jrs`>xe7RhB*l`CO&3?Fu5ILG@QQZaNJ@%`D@v#6?U;bIl; 
z%p@Yu9Eoa36^m7M+Xj5|v|eG!H+dd8iZ;he_TSMfV-EjRNi ziNb?}U5n}Lds#1}CV-%gE0Y7mMVPI$1_5kf`&9S>FL=&?DT6LO-o$*b~X$?LG#oAt_0A+buQY!IyOV zKtiCV?PBLw`U8G_NpQ(Ta& zErce$k6e+&*+#}}Q(0T^`KlJ;z@}NMvZ4E`l4$%F>dt&R{}q`aoe|AiQ0K&r-QTN9 z=4Rn(KRv|<=hwi4clwE7eJbU^ruf`74t4gv!3T+|7e-NniUJ;ViPyJjCoh7FN)T7t zaUdgFp&i5TNP6?MEX!s}cQ)`9|J#@JTpT)YW^uI2CN{~)a`vI*lttur%;X04P{BCO z2k?9-nDB4iulT%mw28|+6UH=E_LdoemJmRG*$0=okqBuwnVYbSA#(FZoZiCCEcFB# z!i*Va7eKa#+4U4&Ib}IErlk9|Fk)^Np?2UjIS+nIT9y9;9(=q-WC)063{|NbEX!Qb zyDRbZbMKrCNriDik7EghLG|9mPJs11&NkkUiJkB9yple*S_|hQ<0@u2X-)4Mp!L=`SV07Ge0fEz9puQy>|~&) zGxY40*CN+gOs|!T*nzx}_(;s}^yg|x%i<%w-m39TWe<~x&Tn&^d#l)Voc6c_LgH!s zI);8*g~Gp)yqD9m99vU=czyak;*b@h3;ivOW@3qg9G|A0K@`!P8b7yYZ@S z{g4})CH{oYJfOZ3S=Ym#tY&NLnCwMj zcHE5R3U&uJf!Ve<2`8bFi6OqU&HB|e1+|E-Nfu8|YJApL(|J5k__lH-X2M943wdQH z=jgusd9J%K%b`i=utemaBd(n?VBa;4p*06h#vq0uGQO8BQp&S~7NA}B$px1e*G>A5 zNazu+>^0ETddt@G$%oG)mx}D2WfeSH!FTQW5@3%SUO+oISu zvAR;`=T#Wbu8$Mmfnp@<&aU!Z7wdpm_BK3i>eHy_C5>4PbWd;dj&a`ANCHE8PAdkI zNe%97K*czH>1X?t#{Ohho$no-yVw2J>vIJ=A_OM^aKr zQtAML(d*7ZB5SwiKJ)%ia5)UfBoV)8ZmOHPpKDiXiusAfthGbOe*pIZqXj~7qkR{8 z=bFo7zljHPB?TaxD+%c8+ z@dhv;Vm>M{zNTCfDq0z60}48;J;8=PS)B@_vNMq8g4-+P`pk5vD#Bdg!D)sl&pFgszoGUv;A&68Jwv+HIZ ziRc%3D<5*^?-eO=cv`;SkrrZ`v1-73;2* z)23Fbe^LU<%USN4KYl*Yqkgf9et|Cg9Sa7s;yuqX&m3x&YH~mza@+0Z((#a%Z-N7- z%y<^ofb3KJtxEDBTK6Ho|2RdpJl5OIxw6?>zQSb*p1lsWPEeC_{i@vQ9rI*hKz(oX zD(4KTyq$~G*|uc-WBk=&+Z~;Bo+uGy_GLA?dDf$PAWiQs;O$;69C^u1KB@m9buMVY z(~y6unmFX?b6_Rtr}g~He~&dQ6X4~4wU^8@a!z}1zVaE`yY={hw_YY{Za#*oyzDhW|}HG?oCi=*)qaAr6s$m83Tc{04FNJpZzq zpU)c3+2Cfwy#waZEGwy}rvo;WKaN*SQHU)VX~pYBsT10YIJ4kEBJG*0Pv_moF7s~0 z95UN7?9(RgC^5B|#xV&5lxhxN0{xYHQ~jS(UvU4H4?Rrf_GP;r)0cSF3V`Bq{4DfC z{R#~Dpq4X)z5nV?4rwaxB+Q3*2h_zNZMfm^CU$orvgQN}0uT`Z50Ng@jgI+r{nUr7 zgUA8ZUk(2H7%^M9IO3j}JNhkwqsLKVgl_gs*F!mO#HMnvs#kr=W;Mknxz!C(feYf7 zJHz9QvSnCrBpE46J}tbbYV*J5>!*6T$+WhcRBOlgHuodq9Tr)A=%wlMo(_`UZB40% z!fs^j$*Ft)_j1Y2R;Ks7sV~sx;IZ4rOKuxrAPh24-)_9XYzh#0zi1>TVuG^X`psTA 
z%boGn`9+-8IcjmS)4sRwo;Ak9c~qNRBO#jgzXE)AnpC1j!2EwD*k`CSG71C+;Gy@} zrm=h)9oLf5Hcp zB*@}zh=|-w9Ivi<5MaGiYMgHUVo&?*O+2$VBY#*@s&Lt09j=VWyVdqMI88uXUC6Tz zXosoZEr@JP=u>s`Mnzf#>|mz7k0um_fAe=;&EEmYv9u|3ny!1fPY#|;T0+_6$VJ5D zbxnZV=UfP!8#mls)fq9(_PlFcLH^Cv2@%Egl;FP;Ry~Ag*J%(aa~HGu?_P)ZMMJ02UDmpqt%ndl|HtukE+d|omhXxVmVL8w(;Tnj^DdW~K=1YW z^%os0V31EkvY34MQGMOly67{FLO@78>4UF#<;hZVD(=4Sx_XI+cit+*7kwT}?Um$+ z9=dn>Mf5e$vmCr0(e?k)v%GEx02{FX<6PJ`?cTteeLgV2qXzxcu8L;*s#gfeuAgN# zqA`G~c`4f0b0L?y7^><%G{Y+vwYXe*bS9 maKMJ~|4%c6sm-vLx8=dUE4pw|R$z$p4OK-Ag)%v_PyY{0=?-cD diff --git a/doc/design/mkldnn/image/layers.png b/doc/design/mkldnn/image/layers.png index 4f87553b41b2c38caca2e54c039e9ccaa9d605e5..306f79b7a844610915eb8944128f57d2b7a3065a 100644 GIT binary patch literal 11646 zcmb_?WmH>Hn>8(kqNTXI6?b=9+${tM?ouGdDU_ncwK&D0Xs`lFaCa$MG&m&`C=`kn z_|o_N*37(XW`4{!-;b=Ddv5N!=dPTypS}09V|2Au@NuYc(9qEE)l^^UqmBz`XpaQ3 zFj3#46nLMZ4v)O_RTR-6lQajYlP8V}S_)`r4N16vtT9mM*q*8;UTA2~`u~0&rE@)_ zMnmHvR(qvj=x=fKDZtZwF&Gi@E%QS#JKAv@i_2%0TvO+f7Jc@N%l>1{!-afJVySG+ z^51rn&1vUJ{m$iTQuig!%fk8n7ep(m?IWo*=}by`Bpk7~m_+@GKcwj1U!v7_O$D5U z5j;nG93;Jies${v>GnI8xy*lia-EMj$$OjeMI)9X3aq7;^dN$_O=Z(7h)0lBkJpFJ z#@+YxJ@y#CwAFteN;aH|>--7*Lh#D2SJH!GEdEElcP00D>Q?)>|L~ToI#(m8Kn%9)RUFcUGC#&zhpDC|;B(mqn zg|w?Y6MhEwdTI8M)eeU1{1!bQiwhh-{2W+Tay;e5GBVyMztN3q%5Er#WYyjN`3&(CQ?4qHmk$iV}bi&cYl9JPHvV#hyJ@~1{>O`y?v>~A;xQ$%|@YF zbMe}Q=pRmp^?$-~;;#_zDwiVh;}bQ!nSArxYrJ7)qAakRMX2nzw9Zh5j5Xxorz47# z+eOkbk2Mp^w%_kTgeR?LA+3c+gkr9=EAtO-re4tDtY;NFT zM<)HO?1SrWUu%ZzDh)Zi49`Ri7wxPAh+GpWmVBQAhD{6Y^@CM{Lu1S2M{yeS4#{+l zbGUq1B_<#1WyV%$7?;>obBx53^2@QCiyG~ZJ@YvYdwm!?ZKU(vgCz<3P*MP5 zL8&OYMCMBk;H1g^7~9J|VrYD#I8{O8M3X#wWKB9{Z(Oe!sT|CyRXdsV+fP*+kVDeX zLk0LQ+6>Ce%;?-$b%Tb->G1LxC&M73No zL5`P7*T?I=?9cl%iWaa09zm>OIPo6VB1#haB(1L^g25`aJC63lQ;5SHNwJmurr;TB zIzM*?pD+EtDDNAT2qEZww#4pnY*+r#_~g7gN?%1tz!$8)^$AAj9(L`@r4$&Wg=no8 z)m6Q2=w&Hy2+pS&y~o)Mu)Cxv$j%X#Hgp!o?z)4N5&D76ctUH7i> z9bM5ww5W@qZ0 zk0v1wKCT?%ZZSqRNL%@C>a3b`B)#s0A3*Cu>m_#}H|_sGq;*%y1Z>x_`H7j2=jV=f 
zG$$E#a(c7`0`a>&trl*XLoFFA=Z;Y+>i>q7-6D0i_=(!88Yfv_ekZ{wO|qMPJncZ3 zLu{*VwbLRLgyn>Me8UTtnWaWaeB)yudsj6nsSCWvRy32OAYd>CdQBY(jqmyG^4DyYXy0qIe2=cg>49ct# z`i{*%zl<0>$|B61$06Q6+fcqxn56duJrm^-JnnwH?d!VwJ78;4^Vjv&odpNv{x_Z2 z?I?I<`!YAzze~FMdP`XZ;&96~_}UD#YLK%x{dmjQ$$E12QUSE!ryW0*k)3}`mh*lM zzpt@H&~m}Z!aT0E8HZ|+8(3XvUCQvUb%|xo|McwC{W^)hD)s}#;MqoeI?99X|j}>;gA^RQJm&{HqyprKNPj~&w>?XY;|$j-|Um% zN6H zTqzB2R$MTO+r>!a_4+hVBGQYs zmG)ne#6A zl|SnskA3-PHB0eg-c2a3kUjx10OzHU6uKf8P7|w`R9_?tr8Rs6w$wAD66^o(b+49S zprS;!zNKhzJuQGsC(^e(_ywI!DHU4{Nkk6u#pkF=Yq7?bmh-z_oLvW113zFfaoTS! zE)3KwGHi2ZB!&u5CTK#hsT_PQIY-A*ZPtp3GeOLlTY~zysK@t$=-Ex5mVIF`8&zB~ zt;<;-kzkgY2%fFt9!)8k2p1&!6n!F~kMZ}He$^!t==ycMR?aWRV1Ytnz1L3T;1a?n zU}cm~ua<)6$^9kQ{+h52rL%Kt`t`lF#5NMMk<`ij@G>9B?jwY>9!waBNDupYhwL9v z5CgGXVXaDtv&jXt8Og#r0GznCzl6%GHHHlxTNg;xR3E4+!?9gsrl$qCL(alW&d3Ye z=A+`dAJruO&qzipaz=P(&n_CSri0fdd>KF7S|G<|c|Ck{neajsBBL3sOE%h$l+9AZ zc3nJ>{Rg`9_wl*wMGR-z+3kNT###&=?RC5U@tgCy8(#d+HOg1;Jsm|h()r{-)C2;5 zit1vF2Yv{i8m;4p{3$T6b;m9kYFT}69|WY2D+{{+vF*D37viq=n)Y`#wOCuAt=6SP z|GInr4ZQw4qW52LzkeO-))Ejdpk7^^=6lMEr*7#FS}@uh*>5o(@*Sk4pv(;3$J{BT zLr0bGH@|3~U*6s>M?@ZYRO60(;=(omhUi#k!jPbRGnJ#rF)@{T^20g3mP_%?6q2cu zE1?#O1-hh_aWo5Ip(C=jH-T7`On>6r_6UpS;W4Em(+_Naz{zWRdv$x#=RLsXi-eVGpIQLQ0ZZC`sKG$=?gxC zvnGcEjl}pQC1xRnz8QmG4Y%j4ojEonj`U`FL9q46P$OCdRzZ2c293lU)_c_0!Y^cn zUj^J_`lQq)G{a)h%Is1mDSQ_f`gkJ`=jTMz-P!VpMJuDjF<;FTx&YHnNp0ULMZ>LJ z)uQA_T?$=!LO$`Q&OKacIGDY^3+O!;dzi)RFjgkIwTy>2$0w3g|7d0w5t4XUvx=AT z5(BCY&CA+;NwhhMQI7z}*pQ)<@A zyMOaIU-KkE72QNOqyw_+71aYp`zx@Xsztnv_$GH>ClpIrBdFh3_t7+PHQJ z=q<|@;M{cZcGy9L5ri^B8BK5(V$d-u(&KY5J#)DLPd2SyTI+V1&>YC2tgNPS?^&L3 zSA$?Exfs!Zg;QjF(GfZh;t$#1%Smo}K8H?@4sOs5xd4^?xz?E)L3FKe?L8@jeeM)i zLGixBF5ZzQ7qGu_M-Cm(k~DWKhyT^pOPQyqOP+jhMS|oHBOlSjYC$MQTzwn(OH(9p z78^ItFg0@*DefqxY*%-4{{@o1>o{_BfdNUe7CoESHzIdzgS9*R zCn~Ul1fNglwLHkkMazjlYos)mXpz)anaWi7HC9U_|5#a1@PiATFJje+1bgEhA$FL{ zoWC%VMf6)u<**Mv+vhnX>}{M!i|U{jCwHM$IgvKk20T`}*=nGH*7*J21*)*HXXh|l z&Py(LdflDMY{|+$thr8{B{)*;@P!0epagNzMM6bW{M*Uju%8~%D+~|kdrFJNmz!pz 
zcKdtRT)%)=HH+zm>4e4*aXBHnH`!nP*=2Ob^j!1kmOB+7T0KZXyS&L){y=AQ(qnQ&jANpt!JRG^seNcq znOby_p7K2+Ug@)(1JX?iHoIt9e4P_Nx&OXUTM_OYu)QkmGM%GhAd!*kGg85YX~j{W z3fu@JMMwOo_gyt89_Xvxy@9#{0h3RK9e$c~tCI6_&<}B7-88?u$n^sZ2fKbImoE*l zvV%wV62ZNz3Ul8Mmd`YPRBhRhtQEBx*DyG#_%ng4)o6NHC_IHI&Ny}Gh6__#jEs$X z+7#WDE~;ragXVbPoJ$q5XfZ(EolTi^x6(@uggFp+QR>&p+An zr$L1PS$W_ zPw;0*A_5!C=~hhLn}jKqIZ>1U53W(Sh)28CF?bKxk=AUF-ZB5@aDQp8j&xR1knku7 z;p{%?wc)=IW3!73Xj6@+o`>dn1%d$B@pl%nV2O-?ubBp*y(ogcBFk8CsBO4(w%hiA z2sxSSX5%-Zm6y5dYfo;ue+=&2N{RO8@FHC4yyFZI0OLXwaPi9cY6aw*CpHRS?z&DQT%}X z4TZ^e8O?!tTN9DKEz{LXs7nJ-fHisyhl|^T zq#q*@f2%yN9?w?73ekYaj3V3rQROmJ4#pYl{2pm~DrIMOly00Sma1Tx(pf$d16G?F zo@-Ukb*YK@_OmbMw?OOOn;MCCr!1^BX|Ju9yjntBz)mQIbM=FUCatM9xUr=iSQYEF zMpPaphvOx*d+(#vScF8zE9>_PLJ0oqnu%GNv+;XCz}X~KfRh(&E;Bnbc@#+Niajkv zlCpPPUsH>7fFL}eP3*sGQp!s z_v}Xpa~^Zx+K(09AJYj}r;q;CJxJrt25b|f6uy8ZN|YYlx#@3qf&N1hQRH%dSxgATz>HqV*n>5^boA-^ z$IT=;OVrCBSmmS)kPbn35T1$jKY4f(2#Ft2d1gjj7yKvz6djq-)e!_UPJ6-JXhLa)NQ7&NAx>sG{``ZBIR&uUfiQ%C!$_KJ@C~f2P_zMJQQ&8sKJphGJ0__-c#08VOJ`lc+_?D(+*IaQ3h#!2*_tu@ihOhiy0r_tBf zCVE%0#$TbQSp%j_)93f>a^jpJr!ce6vSieiail)~bDLGQs%~R*k1^oXzb;fBS8Z2v+tRd1KGU$_=2t9(Zeh_AR5@OnU|>-d?VeMx2_F zze)B&>`&vNVSzM?anP-MqXVd|@wXO59)|LF zlfay_*Dw6Mz}r#>x(AwC_m`s#sp@~|aGLZ89 zN5AfrUO26dkV3vUmA6qMc;Z@x~2YgnFXDmK<>nGg4eY<|HlC{!64R?YQ|_qbP{9siYew8W=ELE8nFUffNowpIgj3sO-E zX$TqQI&1-gdt<(g&V+{A%az(|!S)@RBXxFS_#5k}uf3f~&pjXVBqSm7!a=bJU5 zpi<&plU=L2BWvBelxPv14m$jk^Qwak*BFyLPdJ9DDOcml}LqM_wA|NCJFk`!2(f zHg0|kytI77e40!FoXGbyblC=M1v^=Y&j|it>l&HVOj8j}ALisqXq<24i6G)B;()o= z;*P9U$L@%=7^%9VT>cedmI;@l$KuZ(i8w{cElZLRe4+((5LC+(m}lKWQGV+_Dao$A zc-{NMreM$Dm{mQ?3NoQS+7G1n*`{OkKU@ozJvkncyFXk%a%5b3_olG#C}h(qX@%f} zS=5ew^SR2oM>m*9xIQuOt*rPnjZgkZi`p0P{h7KGWafvqFXt!eaX0N&s<@rzE*F+F zhhV&ezLl$`ljkQt?k0-gXO&R;u!Tbzo1*j0rshR1#7*yJ{ZV%VjLUyoqjZS#^YjoSm zEgM3InPN&l8LhkuF{AyVmrA0GJ5sn&14{jE^GI;wWKlG{xMUKivv%?GyWw8}I=CPA z=hoH`;tcyn6`$weCow~t@cnZdk1AFjl7)r^>&tN>+{7hMS4LOm{bjeOD6L;NC{-Q# z`1bjQJL$|oC(v5F;^K>O;?R?~MUMReRJW5CZIbM#YSgz_9}JF|-EGJx!KPDcw3%>^ 
zs;1J2bd@!=HK;|iHcI`s)u+AgJ1(=DXqhL6)}u;Azx{a=Ei*(rHNJvK!K&S2we=Np zPOESJ6$x+=2Kp>0Q;rfzf<_W@0Io1J2Sv70-pFXj@LzHvU(9j>2Y1k^V|Tfvj7PiH z^bT>7ENKsN-WaK&C%ic1{IxraoO$;n5yuf05?Enyb55hgIgM(Wk*adzL*YQ_#2)=<^@?rgHf;g~ZV??j6oZ)6p#^}t*C$c)War>=)CMzpxRg)*gx@M_IASo(lhbEwL*zNI*V&pBC z9#}r$g+}L3`2<&aY*SuBKFT!Bu?9Q;{#U`?`bm{sF{GIOkNf>Kc(B6Ej54Gq^d~U^ zx0D@I9@4WLq9FB%$E*P+XV|3XIro0bxGf#{!4Sq6*VQh_x3cYG_yx2L;V(+|@K9O( z;cfPnv=e{P=DxbK8f!*geTA9Z+*3`Cgp*)X^GKe*&QK1FM=#2u=5~=>f_%%|N651v zH8M4G>F$lg-0W~aX--T-Aygq|;)js6(VrC;+)`7G4f46OC&xNy`*gE-$}uurIWNCz z!!3leT#~!&fcJ!H>BZK8(eSZ@8!=x7FJjDF!j(WDF>re22R$H|#%5<}Vo0yHP1Kq8 zl9xxfv$4s7ZViMyriGGR>Y8Nj%1Zo%yl_IE2VO5V>__2FHOJfdeH>9(v>f936=}sf226;zQ~*ssa-pa*;WDj-`;StKj%Oi$Vrf5?a=q z71K=*V^^iL5S?C;u)#;=apMcW_#&j6-J@TW`gFc@nV<4WtP+_>(K5|!BZ8I)e`C;I zm47AURW(X_BmIKmse0|iDz{(r`b*li#zEVXQkEczWs>q^>FvCZOFclQBZTf1V zgPo#152RL3sGbgI0ed6E(tFn66_1A!2C8bfKUp~ajCY#DnhD(q2m@+v5iZOLA(6ls z23;!On`Za%BRz8E@p(>YU#yA%J0nO^qIs@b_c zBat9_t?hd=y-MpZEt`_MHfbY-O#2%p$HA^~AhvSKteG@&?E|$A2B*ogNqJ)7#opS( z8N3T*j74iwSM*I~8|PYcsMl;oZ*v%15JbT|4o|&<-!u4qZ=vpNo*L7(8{m>GZuG`QUW!hb3`oSEAqVK4%2sS+tU?Y-(Y99G+a^R9 zFUMWa&A|kRRcAZ8v&pwvlM?>-a{~;eQwjSy{+(srsaZBu2>L@sHU;UdLhyt%AG_OxwviS=pl1wqDVtCWtNE}F(5GKd_R(j= z$U1hAB&8VS$+Iw5@T15ZE$*WRg8m+nut0QrOE7w+4ybbqh>mff;Z4|J7&G)UH1`f9 zL>BgzyjmwkH2;zF=DTo5{$DWppMO+|d`kKy@i%|yus~&ddO8_!E;Ft^B@-Zl!K>{$ z2m?}rgaKxrm`jy6(K=NAL&EjR85@C%;dY<1T3{>qC#YuYN0cQF=9MZo#hKU@^=nHx z+qjph?*#g~y)YpvhLfQz?vv?c#e@+f8GAHzKVyI_j-D30@J+o zd$!`xvSHL@i}!A^+Plto=^6?4jNGz?`A`bqj*)}ommv~xLILgQ^~ZSG8!rzuu}&6j zDVPY`4PU8+$WIb>%id7TP-dKDoxj~)ZMslpkAL?0d_i?p+_88g{4iN|k~YX{mv50- z`{&n>w&7l>dsfm!Lo>!nS2Sr9nH*2Pn=77R9Sdaq{)S}_(P%e(zsZ6pZ|{PRS##Fl zFHz63*A690kSRHHpDi5oj#t6iwS#}HlUqd`E7u9avv}ptbu@NRPW$cwI<5OUZlEnc zcjz<7IZa(yR1VBZJT1c)REF)4`(uDQY_IKO>=IW!%= zA@wXf#!I$j=#vY*?{>y_v2SIHD_$*#2?Th*>YJ5vVD7`Obx}Xr*EgS(XJs7^opS~qsKNC>>gQV4Qe7@z#SquC?0G@NHn5f?M83EDuc`B)MKQPk#1R>TvD%&pjePUP0lP69;s!K*edYnAS5e zQa@k)w9pF$dGFmviL5dv+0O3yaaK-}Z4L50*A!`yKq%Yxv?uz(;?)2x7f4Qi@vHId 
z&}oy&3iYFtSU-SX*6vI$XNIszMr8d>b1Cz--ZE>in-+#k$IaZ#1}7$Es^+Aw#~{mB z3DNJ-lElhCJAJG87Qo(0!|7MJq3?pcHSH^KR>G;(K?ELbh#FOd>>fv{z*n-@UO=nVGnH_ZUH@u!44Y6{|2*!uvh5Q7F zYJV}^E5v0><^LzTaMqbn4KGxK^~&d86ZqDmlDT2@d_#3LK`OhT~CT}0xN8>q5Dh7 z>=>&HNnu*74_-8`_-zg5cuhU28;^xp75PiUoe>p%0{$PZ?`VdAV)sN{det~{7oNW; z)KK#!G1O!qlBRC()`EUH3wp8sf`$07RU;j>^FU%q)gI5dS8ogr@wy6+do}uv`d-^P zJ-F2JStmGbS7h9bx79arib7np{_VsKwQnR2Vk_-mlj1L>)YQC9HI zpbyx6uOEoy#9x;QE*MXRu!=Cw2-hflr)!NSQVe#0;d%$AZwh z^M$>VC$X3#e>j+kV?zRu$Ywli2HhNhY_ zrph@GpWD1K9@RM<*vh5D=e?}qeM2VI&=W36U4Y9kJJ(!+E1;Nj%{z0Ef%D~AjNNnQ z0?x1mwnM8oUzZr}f?78380QR~+;uWg>rLIErW=%Q@ygHjwVSy3g@8kmeb#;YDq-RZ zta)5yTj=~w_g7(c0R{sa=V8xaI{}al35F^uTpgN*dKl@UItnE|`LI1~kY#q=MHUSu z>a|C*@sg-DFb9orS8F1}7zp)}WdmirkpZ3uFWkPVJnCiLS463ET-$a68U zvMSP?XDaNTHqk$VFwt9wk0;^|oOI`GHu4{GXJauk(or$JFmZOdKVEOCOb8HsPP$(> zz>PJuyvg)8Lqz-}Y>(y(QR;8BRUKIzTk{dCo@sxiNGW#FQ)_5Tyhm)an?=6swz}t$ zbgkda4OwZ@!6#>Dme)XMr2FwZLusHsgc=XwNW$M_^Hum{k2(HxL6>mvbhiSRwWqr% z8?>l!bH((hTlg(n4J+VNpa-NgX}7zPB1Z>Xt%(=m+4+0Ng*@u5g4!f53o$wb6M|2A z+|RG>2~rkh2S;lt<8V{iScj3xcXtZ}eiwEMWr_YZKv3}5n$1)d(id>5LHsr%S$3u$ zMksSYFw^fRHlvka>civvgLOZOP9c zooC*oAvfPlmntF^idDm40z_V<`}WK){9B|C8L0OE+?v-;5^XVVn6EmyIdz65CDBhs z@Q>?FijiB;0A4(Q{?mi|+cnZi+BsgYDG+Ma?8^gAhEEW%;bIz6_IE(mA6q%`1g2swx;F2PNFUKEw@w-{+ ztU-$NQ=}T%DhL@L^sBgztr+wSg+q_{K8&bW$TJJ~5saVRqS9rp=HTKvjQ2j}k-tV> z?;2b*qy7-WjQ=k=f#mb)pmOs2ZGyV^fAkZ&I;MBXZlTLX*J{|fdUwCpeZ^>2_%n#d z@z1Vrt9QRnu%0Flqcx|KRtuS!P9UM4q%Xu9VJpk8cqu^|d zoL#eUXpcIp0w~cQ9O_DEyiFLg4y8)ET9o9SiW?w#58bJt zwIrsWZ>^8R?lx}5Vr-Xfb2v@tKB(PhzfYmm&A$5r7nc&nt-5zuf3rtR;1I2q7cPAO z9iS4sYs71)`SNTDzU7(KW*=mYkHp!Fug+!{;MMra&BFVU zkBQf7Mkh{+dBT|17=iAk$}0)E_g(zN%u5Xp$A0N2hBvl6`iG;4hs>2!INqclpwOqP z@UPKWR2(WS{UKj!l~rfWJs2ZlcVH?JZ&<~>8Y=L~6Co+K0r-uZG||ynW6iBJs%Ua% zhU#d^RO4L&TraM8&d$l+=j`+Br+iNAGi@~>1&9I<4-crJuA+yB zhmXX)q{&EdEz7EEk8yAK9(rm@c$Gu=Yr2czB-^DX*-Eai7WE)S(`DcvL_B zzVOp{sF?8Z#FsQw6b*dMw`c8>Oeeiq_rNVa!$(^@?>kzl@2iJPKMB9hXPn85(VTk4 z{r$6S;@lkGRu{OBtqX?hSaW_q|hpbDJ%R=mj28Owc@R-XG>) 
z^5L}ad_LF}J(|Vr5leD$lfq(mar~{y!sphG-`d>eP8rfrNkzpbw&z>}_eM+zq@jU! zDJZL`7z&}OX_B7>9ZRzW z(MFyNGo{U*fkrI9j5f>J*qO9XA^jyK)q%$tF*{%y9SJDTTvn9b)n4^5-cDTd?;+N1 zooLDl9L%&*{q1mq`5*uEe=(N-LbInyz|u@<;BL0uVMpm_?JJd%X@lOCL1-Cy zVMKCz-ArGO#`$;va^*9hmoT#TH$t_92F{S?vZNUZtP4CuYZ^fWt4H2AAvk|6fU zUD6U8i1M>2PGR9)*3)Zhh`?DU~H^pq+t1Gdl)yJBDM=uU~JO zoA6nTD;)8FL<-n)@3qLWwG?v25;fBT1B#{Jzf*qET^!O({Wb}eXR2MxsFS8XZ)8km z&M3(?swO}Cz0s=3k9YVu_54OTt&3hsR7g~^$kxRE}v#N5iye8x4geD zqK4%V1nQKmI9VvYn`)M+&Y!T6_&IL|Qvc-1YIng=SDqqd4~pdY8gNvXB)Pr)wx1s5 zAH$7X$g`h6MVR}k^ISksRziG&t;BL+RbQktS*N@_t9G^S^BAz`; zYr#qp-SVs~No`W-NAnj?(yvg`EeF5r$VYW|DK2CQ392)RKV z!z{W6as%u2cg#(k>mL+w%$AX{$-MQ(1SrJ z3MP2embXh1enH1Lltdyx#Gl*Hzcp+ID#LP=u@xjPwE>#8^BK1m7gDd**B1s4E)W?T zTOsZNE#wcQtGZDagljL~^=l8Cp6Zp?ay`uX#67AO(f*okbUh?Y4CTFJV!;ChJ*%$D zWno1g`!gJDQ9Zo>&&cYdc!{#Hy&OYF9I*FKBgj$kgM)KfR83)vF~>W}S{6nsRm8Tr z$pc3wCEERYxYG15kdSgZ#&p8`ZYmJuJ5y3S0asv|GUd^6Dg%d=IX)EKJg)3{$Fab+ zsrdC9O%cJ>+7qH?%TadoYQKOE0Kv`Lw)xlcDGc58AC#!LXTSn+=1TkrQxz(a1~W!Y zBw%ABySe=xyiO2J%~fA)IoocC zTvtpJ7~s4Y=NlP;yR&lper^qP>^1iDC;mBOX8dnzqTO%CJWfmQi)#8E`_>QeBI-lYr%#$LwUdFu@>c`8Td8-dBO2GR6+=z+rnR~mj^F{2B3s8P| zuY2;xP0zyBSWGMuwN#wiWepgz*sj3=A9mVUX^Mug?0KpQ{j`|W zY?FKNhv(k*ZkjjAkUqLh$1G9yk51;aADho(7ig$_AMXCRFbvu{HPqk`=VNPA1AW3S zj`s9vng3xAj*V(Yh8yygZH@8qnpo)3PxCkMa*r?8?D*-vn#vn~5MO3yNcc>XRlvwF z(T~ga@H}#WKBVn&Tn{++kid4|x1%Pq0Bk$@wfQ{DE)973@Y6t`t;u^M?}^Ltn#Zl^ z%?W|J^5BpzThvXI+{7%gb6irnQJ9gey?7 zq=NtPhk$O|HxEc+T|l@aG}}mVerQ@6sYC!cN}bBdGg^@;X0J3HsM!lLzbh!;0oD>G zYTMeF-n9|RpkZiu03WeTL9ILzOSYvI2A(~nZYz1l7e>(^x z-d}%C#cirQR&A5evM|+xF7BKA8LZTXVIJiWK4$7qELhtMzn^*4@>bKcRQ9MCdRoA6a*H=(r!U*jk*6Lw?9 zIgU{737v$KzOYecm|!SR3eO->sMQevApPD^CCu+6OcVzc##AJrPeycoNsAd@#0#TqI=gNYxHVGt2vblHcuj+tJ5E zY;6T|l4axhE>I!2l~Mv~tBvX>CyakWIuzf36Vzs*H`N)m z{_$!>nDKCQ(=O6Wx}%~#_m{aq+qFX6zq(dsC-hx|h0d2gxXy&+^k-cT=WC9Fnw+op z){|aiS#Nzdq*l55D&YBG0NI6Wmg8~a5_Y9nH8KCpm-ASni{y> z7Mp_gJ&bTPRJ38OMK~)}O>7de{f4c6!PorC?HP_sgYmH_k3yK_oUB#&?}>?+>*iOX 
zyB45yS8T!4;L2%E5rsMx!jWgUdE0JWa+|&VRrLI?zRfI+W82(p+PCE2_x8^Lb%TZ@ zgUp#V`k(ehuei=_fIB{F5OSWxS>H?uG(qlj2LWrA){n^##b31r%NqpyetI$xUh83y zhvTK93vVTHWFK)w$x790#J*Gxok=cxB{`pqw!EDzu1e5zCCpwbpJCjK|(?yEBQ6ZL~wE)!$vETQrWXpcscVwu<`t zUhNNJ4a_Uy9D1LqU(@PU*q!~w9BFY8U+mU$wJC!-oSyF8@a(J&GA~eF?LFD_grlYL zJOdJ%(^{r`QeGj%xRNMRrgZq867q`KOA#?PxKvn4PA5Yj@m(?W_>VwOWsj}q`~w<3 zWLr^5B;QZ0}iFnchtf|2P8kZk<+>goQ|oJ>G>|su#Hu^)Jn) zz@`c9tXG9vBYXtnGuwafY!)vmAN0ps9fDG{ry@Y6NwZcR9zSOv)o=J@Kd2Blwx6%gL{{MozK0ZD0C$ zbzG{%A;~v;ifvOQiXf~;ui=m+^sYI-7(%F{a+VQ$(yu`juZtaw0AF4#=DH zHriA7ye)y)1pM6bVciwX#s6p6?8h{1U&cxPiZnfAzkH=1c=!gebyxDU)m=Nq+VQB8 zq>mCe=X;@CK0No~Dx#ZTrLsIHcmWT3ulGLI>CqkZk*4=Rs*PfUrvI!oux=)O@R?zV zkI{A9QK0no^vHH{Asqzd1x~G@Owdc~vC4#|H}|1O{A3H2Gh5 zNRcgpYw<3%3*g5#8j;S_TGT(2g7OhG{BU&pp1qCqbk;TRkym~jX^_ZH>-9BO?$GaL zQ5I7po(4yyzEU3N#X})9`prUtrsrIt9|aUBRinF+PdoGTo>-6oWk=WB+_@>s3xED2 z9Ykw(@lhJ@9d092iCE29uuhTUAzkHEhq}~}UtU3Oq4b(H%bbxTfqk3d9uMQ~&=Nt< z!6gt|ptccp;L}9+Oc-Bp6ZIJSN3;SVgH70Qg~EWnVye$+B=IDlY<;oIIEfMUEC6@9 zXbcvZke&V$@rwBNDTurpk4#T`I&%Nz>mg%5xPAp=hj)`W8)pDVmYpjV0X^~n9h-1X zW-1P5y?0YT5koG#Z!-k2F#;qoLiBqXDie7_p5gJ|lbw>a=cmAC>v(0@?ven_hXx7w zi70vD1eaYn;rQJi?NsEO^yJ|}@+vY^rnF~yXTU5N8)oQHuF`;GbrnYF7mu%vD&GAav!D6!Fp-f)*^;BOn zbb29NS7ZQ-(Ra+QppT+_nry~vr(JU?T?ivPRkBSmjmw-Ohb3%Br)$wMBbhNb2Pa*h z5o>N(O#Es!lVAu4*-xRrdL}<{8aaH^5w8o*d%5bBwQ}CzZ>L&LGpm_pFv@N^?e;beDh&grgKG7~Dd)hzOY&Y}UPhC@33r3o78F z65;NY7iYt1E;?5OkGQSZ)6YkTz4~9UI*X4kP|xDzQA|HWNqmPFm;oNTnHuAw_8;cp ze`1sV?`e7yNt2^r8yU@Kzw@@tSiL5wlYw_hKiK^B=uY*whd|t%01!jWwcV&s6bhToE@ z)|`|)w~n)G(ywZrN&uwqI5~A|+T68X(qERJC7%AYyo@GLn$+_Wg|gakqR`+?7P8G9 ztMPjJIBxRSaw;>#2Ip*jhT2xeKSWtG*@`<%0bQU$SIY^HQpbB-v&{Ubtc~dv|KuEF zGPx;9HTxH3W$F~5jXvbIRaBf_2xJ3q{VZDK8*hC@;tY#zPf5)x_HKIDTIA_1kF;$k z0Akqja}BFKG_b1JcK@nn({WJBGLOp@cm7d#;BMVfV2LcmZ(gc9yb~xXN7MpurDpBB zPR2~hxWf&`yaqRtm>y9HLK|p+?Qt%#@bGk#llA~CZeaDeZPd7a?`NtPIBvR}7mS>y z#GR7ngsAnD4h8)=mn>Rp++u3~w^c(*X!HeVoCf=PKHG7i#QE#Zkf(C#-}CtQeFgi; zBXT#q(X#K8vzbK?($pmOO702a-1k;_wq3J?OiFa=RTsyo``3^eQ_?S=MY$oEyhA|R 
zH}&=h(!bc+w_b8x5J-Z_in#C)RNaQamC^L?wX|O9unp zHdP7u95MoZsv;hT6YVT|7h3!R>7<;tj*fLsj%DvFlV5-3&OMjIp#G}d=ur0{-+mk6 z?PE6RG=tb^p(%@2bx1t?jv!8~e+P7Y2w21k*VI+Gf^mVOeFbG-U*#(e;doJed$cV?SzX&JG1G*h`uOeoDz3wm?_{;HHikn--J*@Ji`f>_yB*rf z#}y*vGBtwubL~pmni&6z_HJ6n4W_5a`w;V5tE}W8% zQSo1$J68D;_*c4L^ysaMbvk<~ex#yzDckO>!Uu{m#6~&$tLYDcRlb=1?c=Xn@-VHh zul@vNhjJp5HaV6AyxNI{+%crZ)V$Jzd@>SX-b{b+LRB<{#{Oe=K*?&p?=PZ4QZdoq zsvT#a30jQ1U}657ab4DTsEYBCXUs`XF28s)fOv72jdF|We9oe)1UPNZ2oq6x(u%#X z9er^~s^jW8L5)GZRQ;dFKL55d|I?7@KkL0ul}T^Lkb%n%7T@)+W{Vo^Xq(1qJWuQ0 zIR*}anwVb`vP?~Yl3TKIHp4%-w=GYX!ph9_?wi!7zr9O5pjg*51HYEYJDw)%3B@*Q z0V=Hr>Y4u`)N??|4MT~2;yJU+f5WF7`JX)J&AtB$H-4)9-H4w)$7I>e9%%G!T6EUe$H-K(ai0}ROp9~fB`JBheArtl zI$l#E2k+0?e%g!6U~rIp5Gf0R&3;Sowy~7q%S3*!v76nxtmhn&)L$*w;X(A_vZJ2` zJyDa+mHX0;DwP}irs&Ca=BH2FUpyM#a(EXNGwf$LxQ{(5WZT~m%G+9fUkyF|)knja zrT!G>5}TMjSUqX2+-w@AyD$fTwfA#Si`x-ULOg8qPS^;oG4& zT}6`9aArR3o?+jtqvPro3lH;dHFPcmGZRMpG0X4|x!=li_0^R~_-v`^L_0XWk>rs< z_2ux7Wv|(jERws*o4*{tYxu2oX54EMcHwqaz*BhUx3-wECEr9cT{F;15wl-yF*H#K@$c&>ayI1T$5+}I{_UN~n6h2Wb zxm-SercG-4;7vKSb&y$uc$F8QxzVlywcm?@W-qHTBBQ$rOL@Vnz7}Mv5@J5*3a24@ z5urCumPd5x`#a!%Q??*_<~J@SVnht8CmMDaPpL>{iVvG%LVr)z+P3*z z%admDX*)!|&GY3^hWp= zr$wrbV<&qss+1~=IgRH#>#dg_>DBoyfEB$@W*hvqypOtMQIXc8 zE4REpPL`8~mFq;@rl|lBqL{Td;r3NLBIl;#nitJ5$*!iCTz+c9?>>B~ajpQW$CD(U z-^aVR`Nr|RpjG&!waMmkGOBKGAuabXgiVqatWQr)MVdWWu z)1!^H(|;xDTy&2dbVd`_g7ay~kT>p5Xpw1bcZ4$sb(33Q+vyL`xliJk%vCTBJ$5J4 z4y7wjv21O~`H$)`c#`5xHx`L{?HN*etE$V^sP@4T>+Lp@`$>pl zrPD}kAuDMI&g*=>>f;nKSYS*@m7~THzA9&PFVVRo-5e;=+bVjR^%0b#sOt%%63Z1cy7qn`U^(v#^)D`ctuZ37u3#iZ*Wp2+cIfe|$ z$`p_a9U{xgi6V39Y-68|&gfOaLhv=&LNERPy*UeyF#%;d##*Y~(b>sYviQC0YZDuR zCva_SpPKS&BmaioRcCa>s!w3>P9KVM17|ZSvIpKot=LW3u!51&2*ZSR1#fALF5_36 zPs|aMC%J`#6p+a3jlRMY-!1}qyLi7;4 ztC2w@a<--3*w}%vq%)UM@R^NBa`I=cios*zq&hZ7#Bh>fyM$x+S~OO!woF&f$0w!F zaQ9D4Z}bJI=jePbMs*|9G#xfZ7V3<&EguiEGO>BjPsDJPT=#?!nf62r-MJRG zOVD%aXBWbB6qJ`s!RRJ*v}rZ}a}_5sk(2lTnf=N7lt>ik5`YmRhtu9fAcHrh4E1%Y 
zT2DlCan1+hDpTfQ|FnPobfH~G8L)5*b7Y15nB~>w)*bjFppK-VxsRe-RDpM_?-rb0 zNq*$D;X-dewP0L+f!dSUNl`*-`k&0gR7W17DpyUzO^8*V=I#jNk7`GpNOl?(Q;4YC zSJ$BUKL6_3@5WRpOM9T05p}iCpMd_&ONEKBRc#yQ564(51;TLZiy``m5@jN=2$n7H zrD8kPdVsQJ$Laggo?dr${6unxvn#<*E?N&hwQ(DwjQcB^ew;n?axibf5i41Bp?!DHa@AOpfo|*r!|z7l$Y2&d-r|r zJ*E+BQ!E|e696D`haGf+qyRz`|ICqF-@v62$#UlbG62u+w6qDMRC<6FV4RWw*|v~*g%=Qauea+k*x$-h+)z+_9%-&;jF6IirWO_;dqr5 zT&VDBdC#mu0b<2l5Y?T}EfoX5%a%%Ld)A^?l?{5~?n)$K&U0-d9CnvBbm*{D~Ekf3IJnuv#J&gg4y zBAt5zQS#sR#A~#{tbo+^>a9iGvV3BJ6A=4Spt!-h2Mr5Spse_pXB&Bj4>MoAuYlqa zP!2VHX5nmNxEH!<=0-xfzOHRs0+D|^{WhF}-fqv!xQ{)q{aCEj4da^dhJv{tzS9{a zb|e6Rx8wYMWax~ojj&_Xae)T;^pc;a|F>al?u-D@r=515r@xpyit;<3K#$@UBxr79 zUM1GLA4LPYj{)YVac^i*&MtOh7u?k-k_wr);nw8Hd<5BdPu(f{O2j_WS@$F?z77Ur z24MN*uH!FSg|t8&Km^`Ja@1mEev2;)R9yvJT_y9&1<8&Nv0z{bZLhDWFg6x-bhlEu zD}0BD?lfaoi(L8hzTaFezk~VR9+I8wr2PBvyR58o!vQo4N7C&v5~k7-$1(b3Zz5NF ztNYfg<;y9FWjGYr*?th$`tI@*nFqG6*K%WdDU^RuLg|DR4jb}&60VuAoM$wf6H1zK z!Y$5aya;)wiouz3P&i02g+lxKGt1_l>At_hqo}q0pWEg9PL9E%ckXfrJ>L2zs0T=s zJ+ZC#66X&u|2~yH1>-O7Aplq5N|2_%2RXODqEGPowbgd^lUfVED%RTL4lB5SL6*La zB_n_QrytGM-JWQ(--z=$&hWf%@~3BcXFuCzTStz6R$ZPq=6<{^|JL^P`Xl|Ee0`P} z_8EV9^x|&sEGq%lqSYHklxnx@m`oZ+0K6SySi3H_uXnaJxV9i;c46*9{MT?h(s{ij zaweYGPi$A)b%#5+z@ynyc0mYPzBd{fbbD1nnb^ZxHn>M>i*$N{G4=b>2=TNP)2Z*O zv1TGM%)D^94*%Bdq=2lLjJHLt(Z0}i4%)}J3PLB+d~ zrMC%zy2OfkiYW|AxA?RlTd~Sx`*eSuXA`|}a5f<|s2p_%$?11btCEIvK0 z%Wy0ZYz$jZ`F9`YUuswXx7%;#)tk<=lbtKWFM%0;c=}BX*J2IQrUq}yu7w7`JBbmL z60t5MgpkPZUj!S~P&dBYO^ArMh9n+wvLV%K!JeLIsK;gJ7b$^YiTgKLyUFF!YltPq zy@Je+uS2{|&popFdP+$^S`TA+T3$U}J88;8EFX|YU#!@DP+gATS$a35JuJ;5r0spP zGb`t}@goVbKm$$w#z*jTSSK;#d|VLvU}2(zbm-2GuKe4wQAa}9V*d*rs?4d^ubkDm zn!Dwhchb`nZVd#>8qO5Xo9!Vpa*90-t+bNP8+MQS1v~?67&FeNS}0g9FZb0VVD&I9 zl~_Lvo<2!SQO_|hsPkCs+un|poX7L{YGrZb>Dz7^n~440@{(k8cjL0Hs#f))2fTRb zTU0zDaC5k32CLC>ijI5ud3OGfH`!KKk6wb?O9SL>d@kf79r_XCHXlfY`0CQIr4*`U%X;v<51Wla5Z4M>gFfQPA~hL?jaJ3^O27y0x{L*Jq<&)Pg* zX+>;Yr8(8H+=5wln@PP2VX?s@GbhvQU2waVJAH?Lb!G%46!Jg<|8WaY3}+0X4Fqk3 
zSxp(icO**Q31lY_O+|g>3%#DxcW1K5!t~HGrr4KHzVXgyDz!5F{o7addSyMyUaB+b z>P49(&1wJ5mmSImyt58c>}rN-#vTnMjEn`YDLseXx0Jj)qlbTbspc(Qj))0p%hrR} zC?7J`fN|rIfxmQ!IfLwVJ9Q3(-H(MmwUOP?lWD|jTRI;xF^I!LX5)-I{BUb zm6DV!NOl+x2fi`c?MTYswV|E&tDV-;x!GU_9z2z1%C<8k#pRy{bZAXsX~$22lijOH z#Ypj*EA z6%sVjazT@2cC*_X8mOP!gIavk;$v^x8kA>}6H?X#)i*jtVa;k!BIS6RqbM&gPnc#8 z_-1$7THH}ihi0?U(^_u^!L)lBG;p}oLu0Og)`nQ9;-339D^PPAyA5Gk}NniAd_l%!e=B`oBi#032&BABOf;f&u7KFqI2Fg5yp@g{T3Q;b@gQ;UW_!cp^#kEvdYd2X($ z&nZy$aMcb3pNrMo{7Q*YwRtLTOOI)*`Ku6BU;gq-wGUFlAcL5)3iQ5tBjcngMFS5N z;xsNet1HiyxB+}UG6xGXD26I@8tfGB%{@C`q967J4#r$9beC>ScS@a7A)aC?S7=_# z7hftsc^NdKDdvKlXZ&g*gaKM$aU$1tyx3r%hy={WB{YmtB1|FWpSg%<1gGa zULX0%mbjaiDzgsf)2dc0aZ3KWOdsKDyGM4vakF@%*JmQF^V!N!cOK)<&}3JAB$k&b zh_Bbsjndu*kD<3Z>HR;|sp6a7V*_CwgxSWw3oVEvsH^YC+_!8*g^>|3Fvm>mE*+B2 zv@$|Wdd=(W>7%Tz)rARe3zshjv4J}-`B@kl#&%ie`NFlh0^h^Mve)jthW9bRdkw}Y zv#VY%PtaG8>2@%WSiV|zm@#nKtXouFY5FNjU&mHu-_+qn1!w6Ly;UY@BUX5+6hb%j z12Y3o&)yz~r7|HKuYY5M!9}2+TI#W?MtM1IXCkPk*L#S2{3Y)OGyNV_h|VYP|hu6WV^ z3O=MfO3g2meT#dF3PqNb7VpStfQ~!I6(OK*cdi!bo2FO~4D>{!L^T+m?O6mn$3cUS zb6{YH%Aeme9apY_bsRi_$79uqrlLJ*>TX3E&%wp%@L+-Z7u07Y_ zeALG2%P;x35<&eA^w{eX8j%mY28g^3q8D*hqW}_Ro6mWjs^4z(GFBynm_7S<%}uPi zHAL3RNj!I!UP5fZZHFJJ=m2XGasWVC_cN!XJA9AIIu#zAG3yF@|K zMjU!j6dZO-rTws%`oTqTGJd|F6Ylo)FoE4Ake*|ex|t37O35#5_Qw)%1<@)#xR|v^ z5EDpXUgGX7hF@zK-9!qJhF&NSx40+6rK~Y@gV7@nCb>>#K{b;bJYVlW^^0D}ZjpgG zl3W&_kybdqEu_j7=J&fdl=0C@r<;|Ha@Ta;JZaB~#pvXh^#E1xSh;FL z6z?pQ*%DV@Om`2{AaW=*ba1gjGip)G5GVlW|8XqgD@*jaAZ^jqjy~f1_kRrc>XlHg8>9K1fS=`M_Ch&yg!<(W)6OCp$*onHmwX4n}7*Uy$I ze=79WU3trN{ zU7D&JQWhw)?qXUV*PqE+TA(nYAJO|cag$3Df+K+WL`uZ13(KKfS_od}L|AMWD&ve9 z64Myt<-{$%Sw}5sd|a0_j@Z?*pu03G!*67Wk`3_<>NNfG2IF~_HG_e##CT+3#Mi}9S#TQv1Bz{U93?-rdL;si529f;!@KtHaW!Q}pJSiBpe!jm z4{s%j(tD6WGYhyTelFn&FDzulg3D%ybi5bpiQ2x@cGxh%-Y`%K`VvMM304zMZs00T zu8C>>hi8udvpDJhZ#n&Q(Tc2j`?MBs2bGgt=Cqr=L8*U|a-C?>N4U&oS03}bJm0-j z!5TBH)3Y3~d%WMRtMWddKH#NE_SK2?e}wQY-rO7lxeV{-o7?7~$|_=8^73pb3^f=j z+=csT8IGj4PtlzJ0z3axOn({ua`Etdi>C)>Yo{XPV6rDdI85d?(WTw 
zO4^<}xJQ8sah38#7L&jqqcwJB@l&gd?WqROV{t`JnHUk)jT2@seESMuuZpl;|I6t= zc|N~uF3!%5$?bgxO1_lu3}{X%@|DTk_5Z2gHXh=O>pA7k=Uu=KnRuHH{8@b8Qe>Oo z=#=^XyPmx*XB{sKF7^)ZsZOo&%|<>-?GjqN{QBH>zxf;0Pc{9yW{Vd?yI=bE5|{t% zrOJH%x|c}e4w9@U3S@zru%^O$ITb_F#@2^%`h4``+$D1)(c+#a*QG}ixKYX3X;f1{FDeffs)K3HV8 Vd&Ccgdj=3sh^Gh8}tqX`x9cl+Z*}1XP;TNDYK0NJ1wR z0i{C-B}fygA%q%8Lg2*TbI!Y-v({7AdC&QSHB6Yy%sq3LYw!KJc&o3g#m>gd1^@uq zwVymT1OS*50RX1kr%y3Hi8(&)$=ERY8fvKlDh95tGImZntAbPkfU3l^hxROt{WCA0 znEL_%9N+)`Fr|oa@Bsi?q1umCje~56(}8}sm!ZoP)>e&yN4Na)eZMtMt~;FUF6=M{*5+RkvP~_QC1b&^fa|Q^@*@~^4)jt zhJE}`0tU{=Ex!+gk;<99q5)u8gIKDK#)n=_*P;7hbTZY24KVlg zR*=VB{6{C9UWcY$$ZZ!ah^ z0|3|WM`riUx2t7;udCt91o-;kg5qoCi1_vcUDEGE8^rdoVY1fWBt4mkj=jT)P$t!GBmpTK5VPXh&*g99dL|Bx5f{>a59^|0w$ zAKAtU++@Py9Hj zv0l0QEuPVtTgt)cl8T8O#UbtljbHy*TYPX<$b3p%@?RuV>m&8!iajLuTWc!vT9siA zPfAFmLj%`TAZ9k@i>=09gAOKMD5CZ&&(o`IhA5!F^@(CYU*c!cX1YU1miA<-m zAS-Rn4RXrM#m~W43tROgCo*|XbvzjDemPsO%6BC@E8F>{pKzbxYTJFfeZ+gn$vvot zIqY;Dm)W2e>b2LwHuuA>-qA8G#kyY$y&s=)x|^DEn}J+mK>{Mr7A?G0D?q{5i#?3z z?%;X$%0uU<5YEHxb}0jWyuN`80f{9DZMAMiRUW4p(Ph@BY|KA-;aU=+j1#~?A4*$I zR9_tbcrlT~qq`gX$Z%D9_A4ubMh~hXCz@AiB_?69H-3Nt6%nT0mfcU8Bmsb(8$pl9 zjP&E>#pH2XS{5cXV%UK^cX7s3m(rtn(@7S@Vu|2evoA#PMI-=qV zZ_rg5QOYNsEsD+s1IUM4EkmZ8V|%Oqihr20m_@!(G7|><;_yDG!7i0|^*DNNtJ-W9xN{bHbZFn$+*S|RStt(o+ z9|16t{-`jXj0d^(mUTt{AjaP&zs-brEFgb>FbOFJn9+99_i-_l2jyE|ET?K)iiThZ z!3g?6e8t!9TB^+YCXp3jr+hN;2ZYVZc)`Rxx#CQ(-XeajCraO;c>HaGJ*``bNQT3e zy==pZXaQEhve=jl;4qaD^n zWWBN2eEPrzn#7g-guuj_&OwTn}IeY439^X@R~u(wJz`FGw6kGs5NDS-?yZu`c`L^Cwk;}Y$L z{P15#Q&VfiMBt=L;;)(>Ud4NjzecSp@IxxJ^`*&i0hQ{XFnWUDY+Gm9j@>~*e@d>C zG5s2Dw7extAyd5KDi=$E3anW~D)N}z?6$tY;Z(j9iW{BM8up|yjfa8 z)*t-mJ-qVaeL0Wvum$_k>TXMW{W!?2SxnW<(O<4#bgXuYrN=4Y(waYGa&(6-e z3M#JjUR%jC=w!Pteg5&P!0h-u9Fd99Xq%a~6jr~qm-B7ae+BxslTr^XjxnEcQU@8h zjG4SFp)8Ciu{Jofu3huwXAA$R8Ev$b4@={{cFC(3SsevsEh!Fn$Y<>&o>)Ln?Hhy% zFDolr<)d(;6%qS_tPN`LzJCTNp_LS6H$Vc_d99F1F4(i+)JZ15J$RFr^{#@+7eB0P zrS)>#P!~S$?=&6WnWpGs-2dfNZ1HPd73oJy;+zLkL64=qk*=c)j=S}cl?-HDyX3jO 
zE7w?TkkHZVyzF0B_n>I$)Yr@hRMwD|^kx-%w~h z0*<VuzHE{AboU{oxwz{CJCu@#WXuJVFsBUl9&@!&AK4<*JHn z*pfNpXm7vu(MkiRqW2*+c)5EQm`UDxSkjK$hv9C|GWMxL2W}NN!)}SGb)`%Wh;-c< zz)SzltZr`cO!!W*v(V9^(Q^B@Go)D){v7C=l=x*u=2o26H|c-VKaa-@vBdDe!L`l1 zSEad603u2g=PsD*3_Aa+FLEbKlrkjK-%tJ@%aH#}f{8kJXQALEH- zlIQy(#(E0M=u<;h>9-wEx%c`dR?RL zU=;A9R?OQ1UOR(zESjEo)tMD4Kjsx9UT$YOjZe90!f-wSb*C?iVSk@eE%ydO>JlL|7w>H%$ z-7apTf}?*sXW*8H661M)J^ddk-V;tH57|FMci&f9) ztuSnD3H<+kZ?%`imve4#3>#kO*JIF_=!ZY6)z~h{?uDu70?vSW!rtlRE^Dk!CI%lo z1k9~n4?3Cp*=N+2ept{9tC$`2X?C-Yp|^Ep=Ib#5%w|^JvxwecNuc(MiiJ6#8>)Yp zm~xwI9e<7oJ&RFEI}K>{(b_!y_lW-HTQXAeuCirfoR#xiLJC;vYe(k<0DxZpwp(9c zol)?P_{LFS>1YZ5Pv z9nkuK5p9TmoS8qPFfHc}RL8$=xL7Luv-aI@Mu#$Tmy?qV$n*Ho5PK)Dp3%ab)4<5v zBV)3>83?ytHnpM8@E(-0XF|&5qOz8zA&xA8O=#D@J5c(eCr^WxhqF-(_cCVhJmGhg z;#7S_yG4jrS)fanR@DhWs&tf=-I?S_L|2RnQfAPNXsF7zNc8M@JjpG7som)Om;re3^-LgCZ z0RYT&8JF(;Qwf6!pG?Hi=DHYhC7@Rz>^~1)nIugo+s>Xq)1DyW6EJ&kc&zwz(VQ`? zue^{V@kMrWBk;5)=a6y_H_^BvKx>cYW>AWf|Cf%Opz==7SwntM8|vl}pX7_Liynik zKw4k9+xiiIX`T?^YxM<18%GsrbAN!qYPH1i8atyclYbIhd|QEdb7xKxGVx#Mga1E> zPXFDq(3|c(Op3#Yl?`&#o}2t`oHzM#C}!h^8;P%K-@}3%)K)0`Y!Q#oP-8@087@6N zU$Q9luxv&Q>uH4jgDtFHeu0oSNt@sew|7nES)8=a6!CCV_m*;Wd^o23yfKlCU-|AQ z#qvX$rR18WwYiJH6xab;FE>0Icbfxg*c`= z5b}jYq91IhVZ!n6*nB)dVEf>k;*-073Kst?z2-^nA}pUGe$0drqo%w(K{aWx(R1y=#Qfd{8&rLNw|>LEB|dED4(-Rgi_4rw&gWK%q{=UA6p9vQfM)UR;hOd!8$14Ry z@}pe@YXecxDPf1{yu-d|7vJb2U*#-pgU);eeVfjN|HD#v(Jenkn9Z$EgFCYdnzL|gh#h<)X!8!~+$mz)=fH?F`{uPt^Ha%zp8`f$G;|ELZkge8|HHyq1)EK=%) zL%fN3lhg+WOj}Eo8?@ZnLi+aO%I0+*_h#dcJb>rWiSoh+3&%osAlU!`OKW>X zgZ=@PCcLK#nR(4@>dEfU-^QgoXE~p8f96OKvw2eLw9k%lxf|3x%FJK)%lKgq;p$%K zB+vHaEG_>El7j3#uX&0Mzn1{ot7Q}tehjfJ{+K?+mX|4p3JRqy%Fu@SvU^6~3fT&P z^n3I1_**uc0ucV2;mmdam|s#v)(4Fo8xX(PEF@v)3tg$AN`sy7RdFp6c;_;fw6#oi z7uL;E#XeXb_LmC z?`W-(bsP^OyYaA(IOd$BEat1wAr_R7s?oR*OTifq* zWvgr1QY&rttDXfmwFJu~&U`JB`X$9ry27z@XVNz8vKJiIMRlqe1NFiYP12Ui6t^4c%PwYP?mrini!r|k7fcmxvz8Zp^{`I0ZUH&> zbj}av!_!F%zBV)7;rXuVv{KuJrt`}CaK>-M_h}apoCI7i!dW*ngdZ!@1t%0rDGLp8 z7N(O@lyo^m 
zXvHpkt*@9+hWchBbSC+DP7%{Pl;#U)O#%Yo)lwnX=RRGDK%2e&(UqbKk*t^lsm}AxKi-`4ZI*aR4xPN}t*N>oOxMDp_ zjHZSTk|kuvJ*_OA1X^v^=R0vmlu-_&x#oQ<*)0=y1b^4DWMVS6X z0&7KSjj0~zE7I58JiCBd@mK^$VI~Q)n+_UIPEn;S#U*Cfa8l%GU*O$3AQZSqfhSl~ zohPR4ez=D8N{^*1jZ(6swtVDvWZmtj?Byj|Wg8Z*9#jd7N6v@fYz-Ho#odFB**y3F zvE?_moV)yMzrf1liV$kEqjmapBdX`=+3oepoMyXH#aWd#b94YgnHVIq?Ubx~_Ux2wuK@?)aYWL} zcjME_oCnb3Mgc1aH$|_keF{#h>~JeHupF&9IE8FBE&p9nWN16{ISgOP-*c(znDF!4 zJPo_20X-O0Y3%e-tXvC5966?oao2i?a#C!@lQE%dRxUu%tpx+i46jT=9C4>b7v~ee znGiCQb*BFi|Jt^>ybZQG3$JiyOAY^u!8XJSwo7|i`+p>=sGM(@2>p?mmav1Y-j&|- zIly-7O@>V&*E^2FD*us3gdaN9hEcv{ib@Dy5=R`bJI;T$h5OQ97#^KZ@l{e!3TT_v z3z{@dhPz>WXL&6N%MqT%qltCc!IE$&iB}@9OC4tNd9zZbMm(!~t-YqA=m(5Q_JbV_ z^I7M1YFq2p==G3#)ZtC8;2&wi0^o#|uDgG3g@c1(CM~1ON-z%O`uKz7!6~Ys+Lyuy zi!c@0N-t4%=sbL8osF_CFF~qVcMeS#Rv8b59a+_gh3{DT{y@AF`rR@#rS$sBe(0Bp zn*&(BhYd_;ZwHD6H9>Ksg9MursP7uUHg5FzntkUVdU0;A(GE*Rq08 z#!B%gP050TCM__guVUmY;&G}!t1Eg`qbE^-lV?lo@Q?l@ ze{_iGNe4Vrm>k<1<<-b|8-|LqcRu7$=8(r-E4X9dPm8d<2DV#$G4_)S8)L>2)pDTa zX~oTUQAN=G`?!jLKd;FbnK|1HF+#8Nc&Qo@g1&t7 zl7-&2hwqp*t$*wpjfNIJD^q*M5|vuX{3J~Cx2m#m4R;?>|E;Saa_~D zn9}OA;7@HwQJ99;F5xPlt1xTFg*#R}f%FPHBPkHY=tRha{gNT7scbN7PUZRW z`jF-!gWL)aoDS5MWxDlAgG3EP8Ku_{P(~&gxwdsqMJ{TGjb+lEG?pWK9$Q!LQ}%x? zjjw`$KYlICK9}$2zP`F7n$+`id|)@^rNfQcyMTT!&V*q_;ah&&6oa5y=)gK(EIXGYiyd*0WfK@?M2rN%?6adh#>W5`p z)I6I$XR}N(xc?>r5v4hO8un`^%@Z%>YqcE>c zLc5kC+>Osz6Pa>v!y@ksm|(bQta;ZP=Lm*tS%(k~e+P9hu{Cz8^7#v{ul zBu-1m$hUDlS#2+cQYXvvrGLgICz=h_Epk5%p5COj6V-$MuTb-Dx)r>DvYw9enEh zc3~W;sNwg^kC+f0$A^Lzv~D$w1x~!!$-+-p2vZ!Pe*|Vvj}m2nf}JL-kB`6dT?TTx z`>7ZDK7}esT@ATc%&Z*~t$f^?Dr`$`X9C4l+Nyr2IMV^+aBIc`#dO3@373rqZz-z? 
zfYrs3(tUeRrow`#Z!stnr7AAwo2*_xmA z*;N{VF3MXZh!M>X=uK$L9bysnX*xJ{z|8@jBALWZfT+ZvdgzLht;8B zHBdnhABxKbiaa+=d#KW#oQS4?5Mw9@P(#S;o(H%;t5;U`C9QAeenJEWwV$ZBmk{Yz zrxYj_5n{K|6@uP!<$J66qD(C!LO5Hk7m*gzz=15(l7%%42-%JK>vrqO=CR>ZG%Nl3 zA@X+bS2JElFKdANikn6QL~Kk5r7b+x7_Pa}vN7rVmM?JJYJ$HkREGU|%6;{i(wQ%2 zk5a2M>1Pcj2fo454!3_E>1XCfUxQmg?u_x&wxI7>z2Yt|hMEUr@3UyF;xX+n)vQRi z8M-G~3U>@N4I_$^o`DD!l@NX7iISfdJoNhCp^_yQtl1}RzLu;V)jOo~|hfA${GLYcKd-W8J8@017HH{`L)G;XjTzSLg&0Qz&BIdk@co4A45~ zbM$W|*E}*-Ep}Xs`hx#Jak2EYF}=JI!0keyS-aSX3P2u=u~@}?YO!4WrtiB#(PMhE z-Mv7QU#BcI-@x1@b)hadxFDS@mQ}4zl*5s?pHV0Rvh1!eQ$4L}fGpYqn`e>RQ)=Eh zW94USAq&zI8U&nmOKRyMu>9yI+cj_|@AxOt$Fx$y59)h5_&mbDg$I9U+HTFP_`2L8 z#fT->vu8>Em;ANulf9A$ zaZgR>cJAsWLgz0#HwR5^kO5B};i(A67h}H!QC;J?9{Mj4?`|yG;w;vjev&%MtY`!R zXy0^qJIwDvnHDOm$AN9EW&;Oi$=!bHw@}!Wm-&PRg}j2?cNDoYehqoPWXtTfC^oI| zfvIZ4lIal7S#G-=lkr}&Xg#cQ+*tm*%1?()_da`7vVMKX?{`?SPZHh8Nn^iU$7QR; zc4`3v$?l_q`m5L=qgq<14+4|sZsFS{pj`NC>gm25gaOk9)?|6r-4Rk%-TAkT!4T^> zf7oe>=#>SMIxoXhN3ar5{$>brjQa<9p8`z!|Y^9aG3*`D2Nm0>z{ z63eqn}oWo4-)D}yMgl-AF6UFD-}oB+Oq0n;w?U>omBzYi)Kc-(rLy* zhebzT037sv>ZbRjt9m1eyXb;hf^*GV)m?`G%e{BY*@+o-TqjG9$KnL3?tY=dqfd*S z94j_mZc6-632>lhAPC)|Ou1-j2cC;($CYQ9JSr13J}ta=%Vd5uJy%S4-Jn30)2o;e<^;0`9!V-4`}nyy-)c3!b{AQ#eStO=R2`}qY~ z68CT|udLWM@Zy?2ALrK#(_1SyOVo5Js#h0W+!{E?Lu)EB9)P72tw%?ju7DM0CyaR^ zc1OM6c+^zhcNUZ0K+Y@(hf431z@Il$b*NPT@mvL75LQOTzfMpzBsYU|`b-x+-ETiH zu9ZGD3#{pqiqOn{h08kF{g~sjX1Qy9jwU1al5RHibKv7po5gRKxHeqk?Rw#1!JVs% zn2&EuHh&qi#P)>eopsh$^*n$UL+|Y>jK)X1yWM3+aU%U`_3-XI_}WhCj(OlrgTdJo z!;0Bs`F!1^jn&ii+!W5GOOO+xd{TBgQ^~IyEQ-;`AlpQ$SyzAn3Dr#wc$Fz(KXS=&S0M>0*Jpp5Q93$WsAVivSqfH#y55jK)sQ2EaSOrWAJo>%Cq-G|ZbCS4 z#sm95M5m2KDlwL<^rZ*W?j{!-uDZ-+L%bpfn=#6x)gqio(SdVOkHxgve2m3APZ$_) z0Nwf7DE4ytGltmehL)003pS*pp;#$Iai50LIHBRN)?Q?xwOHS|!dtRX?&QXAivv0b zDEvHS&aJri!g#7B!{T@?{p^i5#y9eW1HkZEQs6XO0c=CCTHHq>XNA*N|IK#xl7Vy; zb5$i~h&Q9o|Nh`%-soY${+jbH*fusw3k-Lb@o+ZW!H4_0HP1rR=~zB%GPhoSnr5)D zk!uK8KQvG;BuFZeO`fHA&9->>1>1H_`O}^3v?;~U{O1U!N#=K^C+Xv+Q|+Nh9{RD|_O! 
znVA5EnbeBv#9Cd@H46+iA&j;-Hngt1fB)9b8PHyi_vk+enCRN2Jcr6TgW0pG67fIA z_lq%7Tm5NxH6l5hp*4y!>%vnYD9i$7ek&|H_iDo^_Ein6%7lk~(LetCIHhHF(0Tgd znxx~|Y>Y+f_6R4v>5!~bZhuvNmS+uKOhK`0g<95typ`$Q<9KHhc`w3!@xK4?d3Mcu z?{ZC_rB@%bym|Nh4J9)MLJXMl=hkCk8{xgR8es*aiQ4>Rob9vzV1HOX0@gemlzZDx zZ`dDl1FvUYI%-v%dPW+tddgE7>1ceZ9^#Ag5dPyQ)?>V|P?ClOm-0dr#5M)BfhdnELspr=SM!4|jz#NBdR7PD`)+^G8;K z6?`OSEMv`Dg2|;Z#Fz-o5WE4M)WyvugKyaG&;z<6ySQ%sgPyT7;;y_4@A? z72$_mqVL3i$Wl~CtsBH^q=7#QwYHXCGd~OCK+|IMRNo@A8A2aGXs>L2lI?>NRL$F!akKNwdRn!SCaUSjhh&k5sFfg ze_LZCt?d>rL-RJUFZYudM`oMZ8eOCELR3=PcI&BD1+=GsUa47NDt8w0;Jo}5u!d&0dE27ZT%*gc!g88ABX!;-zfzA@+G@;` z%?`X~D=wet7mFA;7Ks-x=;4&-Ov`|*`{x&Qhq^qj7%&gpZ`g#`#kK6b6eP{oL#q5* zR#wWQvURv^W%92;pzMEF+k33FBThl@? ztd_ygJ}iwZ6D$nZpxRTVehp5m83h)aAgr{;xTr^5Lga{0A_U0I)nml=JY`mD?ooap zR`z92`08cuLIco;hA%6HZf}d#?#}RtS;9+(M*JVs>m|K%{L4oI2tlg`77*Oz>Nu}8 zp&f;-Rh=zg8r~M?0iXLGHe!0wI&GAzq+6$x9w8yPXuhbBZ8T3^7mVu>T@*w;Vd!Y~ zv>v&|w{?f3^@YxnYoR0cg*$qSF*BtPZd#w>j@d_^)`s55n-| zhN>^74+xbTwH1z4@<~{;*65}Pj#Jpvw0L6-=~Zaa13ijU{IY;YW%KQS9lk(` zL0{lrGIMq*Z!vj`&#hyn)h*rg^xlh;#7dxZ;l^DvUd83D9>wJgy-60QcHOm3(ArZ^RoWg==||G|1?g-(MqQ)|i> zac{vXEw%Wq=csT~Ba@Sf^5T#?)Whl0juYOd7yVBR-IY*^ung0Pa4%eXp>%zJUuA;p z$gl^c+i#DzrT+x^-(rdoQRSqKU$PKiwte1cY;Lr3g~|p^&Yo0pzO9crz1KJdb6%Kc zIge%JBaP{~;?Ci9?o}gXo<{-t$6D5bqnuKfj-Y?Buj`CD`F}(5{s%>=|1)m3D(fl_ zy`{KP5$n!CGRales9~<_3eCYzt$)jr`S2Mokonc*kBn-dAh8SO7(k$I?>@eMmcHjJ zMxN)#k#B^T|1$_Aq6Eo-cv9HQ_{F6^kBuVNx2Y7>5C&XDpLCWnSA`tOcYSYLFb$NQ z^6jSP1wJD8?8I&Rvv-oa;S=Zobpe5}-^tk<*FbuJYU983%aV0Gxs{A$6gk)>CldU1%O*0q`FPpZQ&G29 z-#)rbol0vU&S#@Us+YF5nu+A8z;VN!H)VTHy>FTWJpEi5Lt3%OeMC8s(9^SS2Fp=W zc9DSdqa-MAF`w?<3LlQ{4v|TjXwKBZUWydm4L>aF2iCR7NLg3@!v+jL2#%!>cd(}- z*7&cVZT1WPk)!zFh3(OLd)uuOPK-+Ae+XZ%e>&BdMdU^0dvuIRD!N+ojpQpxiw&xe z6s_IF<|U#zEUh&namhsR$DiXp#ux9SXBj}jB1zl$j6%JTUs%t< zX9|B@`DW#aUlB^^CwSi+ljuDsEOlSsH1*@ytk(`SZuUmGruj8jzNb0pDJ*qR;IATh(_!s{mveNWcqX5(aRJV!5HCpa!k4p z%WFDv#QnW31~Io|R%^0z0%##mriiWz)lwcH*B%p!v|Clp6mML|R`2(S^xcT`9VLhP 
zI{$+`-Q~ap+t2=42>PSSN|1oFHBYI|NH^5DdveyRM51cs;Gu*8{;X2IjC2^w2nQ5@ zRGNgI#;)=1Cb$X~(L=62%_7$!%6DsoF+5L8uiPBR>;xvEyRm37bNqch1xYXb#2EGW z2XZy%=V4XjQ%@+p$D9)M3fI;BBl8^0K0KI&DO!$K)=QQN4%c?2!vmj|Qm`ZmU;HGL zN3T>WuCVzaEO;hv#7Wd`tA&jzpi6r9J z!C4!A%@Yo1f==h(yQ7tOx%Jb-QF#b0vwK}n1?AnwB>6VFn<3cO1Yz*_XU{Z@BG!!= zpa7hGP=;^3qxI`aCwCwBJ_e*EDw`qz>L|1To7lk{n`bs=tS&I_ z&qi_5E1cGA!Y~k=qj0Br217=Q1O>&w`ajrbE(-k|JUcjY2v@0*QM}`!{5D!kX~pNI z`%_ir4>&({+dqQg)|4KR^*#NGs-Ldgs~3mjw|}B<(As7$Pg&VQ0&Ps)2yP|A)@;Gg z;(`iHTd445~CtI4^3sbpBDbF|aj6H)Q z4%D`Selu18lZ*72*YK8Mr^=8vV`~dLfe8zTK-$?y0TUq&<2fpL4@M=-ownvf`!2?A zV(YzSd#z(pRgekrSv_#HV<9PEvps_6sbj8;CF&Y=#|BWLjGGbG)790RVh3>tGc z=8vF$6^UY8Y3cLlEOd)K9fUUsnSCu>D&Ai^Gw;oJE&Kfv;HC7c7m?uAUrOlU&n$Jf zd(j>odq=I7-wMWXo!pCM*1|!gz&w^7&`4<K>o^(=xiYD9^4C+3c$cug}r1|ZH zX82DUx;y8sWmLZxG&dprP;>T=NCcy`56->UwIIE0Sak#We_TZP zl;=HEX}W_L$=>CD#9X&sq{ta??tP4M;-ZF7sc=81^;2Hqv+6edSJKZ->%N3C;Il`s z(KA)|AV$)>8Z1!z0#uHUnURH^|6)wQ=3zv9P#eo!55qiBeM>Cb1pLWM+L4k9Pvgtc zD25XtaY|mgQd5S0=y}%H1;LA+>MYKPti2OB6Zl@JwP9`*RE8z zRcTYfOs1c6=8Kmrfv#7-cJ;Im@qTbMvVME4aEU5{Mg7^n8FVt@4O_N1=QWXcMkeMy zTku*@MvdB)>L%$C=8)_4PlSk8_)T+YW!?4ZafAFa$yL2eNI6T@CCbKyQ)p{V#?7CX z!zo{j`rg%ZQ)kioP|F%uiLALUaoCy8n8L^UZ>dIzJUwAQ`cHFV%Vy_a%R_63uN%Qk_ zjv6IHPw&|R`T5~&(U1W@j@WMM-1d+3c}`X4N1G{dc(oEDqysKz*|)ZKleX75hPNKa zJvIVZdC~@UCuf_3W^@D3iQ)e#pYs{Vv(d%{Cb!X-Pl~eteTpj-Z(diW^!rzqROq_} z(=GM*jR^)$R<)^EpD_|DBF30$80QH_I}8p5x$wVPbs|l6WRoko?rQc=1lObjzkzLS zt~6xYtui`B>&>GT1=jW4Y=81*1;a`e;lA~rHDPuZ`}}xoJm9sn((j(RCnx^Ty--^p zUINU%!^25m{W%jXz3pwJSjOL3^;WqaF>hDPFtB`FAq&|{xjrTKm8~k{&3t`x;AO4n zUD8yv@_Mey)Ns0w!(NKJH~#1!dw6(@iAByEG({l_XnO&zGR_@x-?w>T6 zW2}b<1F&O$5+!f*{{y-LUgQ9n7dQuvFw8j~44aTl|01RO>Xr1bwm{Es3bi;(u&b&R zw~m;EYX%(Rw~yadXMU7{>*wX9@M>T&u_#VMCN z*ePZ(&sqF+_SHR^{ZYX0$&WVG+n>5wmCp~kijEdjA6EFL%vlGgb$ zzo(`Z6tAM4YbBN$P}zL*dysCd2Yz8HZ?EUaf!~Cn*na#@^o=064%S8)Cx8}NS8sO+ z9jn#qZ5rAcgNZ)ww(Z_CUVg(MSG0^GYyX0GFJc^kfFrwb1f94eM&7SPn~y81CPNhV zCTndVCY{ohsz~wHf=A1)v>C3-+zMP(Q=ALn>EdNH4Qyi 
zf#qY5R{I&*vYddBP_NkLmrW8UVMbU<2B3I>`8x($G1)>Ss1PG3>sX`nzU);7%7(N~ zdNl(lgv@C3glw`bBSlWs*B8R0ofFBg7Tuah6du?gZ(20c!!hnv0zn!1KrZJId5(|| zNBumOM7Nd=TNi7xZCX* zXu!;g^sSOwRs|%#(K+L#ojxtq!K0m3eDCXb%#9un8VpM9U1;+Z10%|ZPr8SbC;Q2K zfie~v-Z@GOku!`=(mz&Id)YnPFFJ-=+MaG@%?RH|`SnVIkKQO*OQI zPp0|00l~Bq{3Di+oh{id6TQE~2))Ev*z^3b7pC!G3L0?sXUM1q<^-VP0gI4|$}a~7 zh>2bsIdoZTAAL{hDe;rs(BlkBn_yU!xYCB-NnrTY@cj@VnMXoPW z#$2!M9;2Qg|6v*l-1mDmm{`jm3yZ5~fYh^S$c{kn+k>AcrpZdSsY`OB4_`=xVcAOf|;->QN^ z!_YSW1+Q+r9Ywywmi~Yl1x`8JGt@w(eCnFr6v9wV6k%_YzNM_E@!s+UY|&e+$0@scqn6Td*=)EX zkQkd^e&rF{c02l~<-vsFR!?!Zq3Ok?a0#xHZZxh|e@+nok8!Xq_d}r} z>L$DNjR|Al>ZtFOyPHM;U{ug?4i`YYXlTE2G)-xp2~9QWigfOt3pyRaJs}K;2dDj+ z0s%*77P0=J+!2A)CN^Dvj=QZmIRo=7XFj*Mu$Fh5)4~9y`Ps$viu;WYMlqNX*S+Qy zO|>vJQ|S2$hccqRWD9evBru|@n?;0=vFM5VCINqx6s;W*apaHL(l^Z9_w2S^2rr9s z8#_;^v4pV|uo=nS02q2?Fh9cOFmf0uSNOqD0&mDfbe=WH(IkD?y`! zTIit5ll)dqls}u_Qa)Y@#|2SB(i=yM#t(~M`?jCx_Pdf+6lUu470V_HzO{eQqD$MzArCc`KN2l!yUvR%1E@!)M}5EnuN9+3i+ELV zS>+YNrz3NI^Wc<)KYvbyprt;)7(H+mUY2 zW`TzZyY2Chf{4IoRwZOS_=`*}d3V}y6S;tJkb`G~3UgD9nLHIUf|X^< zgmz3_Pk+YMC@uC{J#o;;1w-`_O!tvnq0}dOE>v(5*$Vk_w2Ho@Q%?SVlyDDe(JgZZ zH&anz)pW|BZ8-r~2y1yCo5{*gJGj=tuo9dGjNfCq5x({9TW;6ZPGBYacyE4>oXZDM z>&K>S()ML1;o$+j8kMmXm1MiU>`Y}orMj!Tt*a5NmCL~+^mE+5#2=$dk@iN{%Z9O# z1PoU`FjV-85pf}usURbyc9bU;7RhT~j-QHgw*qsKDFv(@>Mg%kS$^Xg2s$cMoI70t9bYCF>W8pELz;i!wULFO7AOS!J{ zosE3vYVYZC69;&AlQTl*h(P#X%*}_AHDi3ku&5Y8GdnW^;nExnye@~~Z^zLa; zi3EJ@toGic%@``R1pXzej#0_hcMGa;mfD?+vt|4K>3ap|tUez^Rr-zQRA2|Mty9}! 
zewjw^=+Pm=BdT_3LkB^{hFzaMA+6zd{Vl{*+U}ok=1N6(UaK^;GjKovN#wmYdVKKA;j@KE3dnk>au>)6sytA}e=_z9-=U*?fwWlwQCT>rU z1R|Dgg}IbL(`QVX5N6l6_b zWuOJgg|!4e3G{1KsjcWH+%ZYv$Ocr2w9hK7P*fRhNC}!NN;?(X$MfCM@H`4F&tF_Z zvAWfZ`@>I4jB<~^H%4A5^e0TX&PuIW3>uF5;r>Cc($MVO0kdPs^I!V4BC{a^I03)= z$ot?gC5|OtMH%0D@<@LMKq>ZI=#1ao?*~m6kdG{8<69(@WDO&$Q?s@Eu@))J!}qGE z_l3?#N4i0IxpseMn#QDgJ|R$|}~>qw{cozB|{mb5)vcDk--6*IfWMKh!w-O6`R3UE`sA ze}*O4u)l|4v+pN|jiLz`#4TVN-ohu9Gfbu7TY9(w*G4Hw?R2yZ&MB1=&`l>9*4LYm zA`w8L{5{CmcVwM@v~0+|GCT0CN>snacoDeMzUAT2NC^GC+g<5{$BCCl9^g(gs(@*^ z#9vdTcGaIp8*= zZcw)ot)U0uvA4)YBfDfuUDr%p7+!J4EtM1*pQr~*7B3X1rRKMZLT1Lc(Far%qcXL} zjjM9mCTVrw8?4otEgb_d_KxirIa)0fM#(~;3Q{F(yv5`1a#~5p?$=h_2|Ik8c-j!W zq5j&c~I=YBfzZ96K`$?bndPrVZi*tY%dR0!IH5lPPd zEAV!F7T%yYsB1i{0jRt9U;IOl|Hq+;kCWA|0$@&<;mCv7hOQUT+oIhVf`<_nn&mLa zHKGJu-Yfj@hvt-p?X~|ElRa)*$e&ell$*Dv+S=8RT+PbNbU5BvAb9BTw=JOlCXcG0 zH2jY}`#9zA=i95+EL5_e#iipP>pc0}+VJWXm3xi}07py20zdpv2d?6-|NW5dc0jOz zFYC1B&-QWt?)TqzQ(W$5LbSBkqLmTvBn*H(W3=;47N1vUNSU~^R_^w%ZUwF87Ts zx;5>d;qKH}tGB>yGqb-brL4TnSADDY^OC(cFF!kyx!gAQN_*bMEpO*-tGu|Sd7fFl z+1hQD)0gX>@7j_&&+Pfz%=6W^KJ{#QZkv1gQ08)*+&lLJGGCwilv{drxBu_9<*{2m zZgJ21Cg#7oR`2`gkQbjTazR5YI!5LdZ{+80;l7((wlDq6tX14=4eYP8xv8x^9}he| zz)9g6Xdm=}U7x-%%;vUR&-K3R5pZLx&;IXQCeOQeeQ!pTzSAq4g6L0MZl&=DP0PJ? 
zQS-I*yd6un_|6M!d>*u1SNqK7Qn83!c>BP--fz@ArFMs>kd`fh#rxqyc_5ycxJ$cCZ+gv_6Kl;^= z+t;ph0S8G=C;|8Vf4h43>_f}Ws$+5+<}CeGv}Tve{kqTV+CD_c$4G<5ak^i0*_c_% z?q!~4Z)~G!5uj2_Qvef;Rg7pK)exYQvwK|MG$Q7(8A5T-G@y GGywooK%VIU literal 22085 zcmce;cT`i+*ESeHKtTixB2uj&T>+&?7g6aYl+Z;w0Tn_IAP6W(QR%(+P!l={ML=nx zx6mxq5IPAE0(1Gj-`< zFP`avKs4t-AS$8rbijY$sUERFL*=fgrVJ_{;9LdXoU>QbR04sjVi=FDX@U0_-o7w# z2Z1jAIQylF7rev)0)eMrK2tLAwjj*XCz-FBFCPXjrrnEw?^^7xwZWP8(DnNlCY2lG z_vL7M<;54QGrQ}fyy$5dd;7#{7OdOCXlaO(v(f1_jiy|qdZqF1Wohnnr<*OyMdh#J z^S_m@`703cdg(Hoa=$ZnGf1ZesSp2L1SX<*+xEEv74YIN3)NW@YjqB|5|Co>Wfq_T zU3ji|*4(}E-yWPd_`>eEakDa|Hh|;iQZ?1mD_6q;1V0v49lY*v0R(an4+4qr9Dgt* zcctz>{&Px|#6c@di&sk+DW9n z))aktIg-9MK~0_Nzs|c%d#-{DS|@sZaO7|yNc|=I&N7dz6SF1Qn%NccLiilW>e9oa z8^E+*h*47m4R71|^FR}H?hY@|fUbNAI&1zvJt*>>G~hAaOK(chG512g@$bIrw?5VZ zB@#aq9bO6$80jG{n^WaywrI^)QUsaZm(}u^)b~qu(eD2j>PveOOK__-ib1NX_svO9 zS!*{>ylK*cJy#deQc=Tjl@V&?xUC%g;2|ADmLj9Wl2&AYIof-CDZYqOH~ypEnV}O$ zO!O1tnIT)vZt;SytVl9HK3VG=A=+zaR8};5bUERUpx$rp9^s4HvM=L)xBWb&?7bv^ z=hK?3^Bl#kCZJa;^f`!i>m;k(k&sVRfua<7u4xR$=dg^@n~EB`0s@H{oj)J+v1g#~ zd_}fWh1YVu6ylLhd zGlP_Xob~f+JXhrCiCF#}Cb{h1pERbux%U`T8-F$X7xk}AlPUGS(B)rRvc?l!7E$Rm z317cj=hIFNO&3#PHZwM=Nm?=G6S7h+K2ymFnc_DsOEDB2pUk=F21>x|Oi=k*~O*h?9qQ(#0U(^O1MBfDd!M}{ZaM`mi2V|b0C z+Fe^w(A!83hpq~RBpj|BQiN;YIevf|>#+z^fvh$Lm4lw?hob24CkwOB{k9cLTLO^% zj8*B@I-B49Qe3(}69@ZMLtR({=C_qkQoEjAB&E4uJUbM6o2Uwe^mRoO=G}Jb&Zm=8&yQ!GJZ}&Xj|8{EH zoz?E`(*4VmJTO4#xBu1|`6do>zT8rula6)b=%h$YbXuEr5Pu9W>3vV=lDuc_qtO;r zn@usQ%!UezdE05%P~Lj!-?->Mj!g{imSLvzm}F?YCc){rQftQ?``rJI_M!C2KslYC ztECfWewP{zx#}^HF^A(E$Q^c~OhhBYu&**d^T&E0>O?|i{ZF>3K|#sDs+oBw`osQk zc579(gy7C&V$bY?I3N0+^}VEQEaZyCPLNXQ67EUdRN(PXUvdj`MUAr&Td0*)Ak;}< z$T~(0y=|DzRf4a=jVFHG4F@zBv_)IL(Ngu z{G&T=y9+js@5Zxi0Zc*Bku|WV zx2$3WKEg1z3&}d&((y`+vB3NY*oa>A#DxtT+!jqs@M{C(nZ2oG#;3&{SXHNL%Y?>4 z?~RO{j0*QE8)ce_T6G(6R4BEq$>;H^nA>j57+h!Q9m~J!70AAQdTE06ap8{)^9%0D zN01xkd#!i6X+UFL-N^aVwvfck{a~`U!A}dLcsDtUO7>U|E^uR~Rq=z?2l^{(dJ+kZ z7iX7olr(zkFB$>2R>QNRBUPit=yn1u4?LgM|u)NQU4H?O 
ziTCBrZujyfr65-et!X=%d9W{D zC&H+tq(S2?otxpOZ%#ucwESY@P^JaZD80 zDc;1kE^HwSxwy1*H%ErTMunc=)X!3M+MF8Lit73=dx8odNJ%ygqHD~vgHG1bqkT$? zfRn#73E&@fZmqN^$v5=MEJs_HI&U(bYSooV@LfC6t|NwFRw?L1LwFJ^e~TsM+c!=; zXE56*b^kyv;fE;9{MG!xh+pcuXr`QPOv?z1 zFni{lzPXpgiK5NzjfNcmoee4U1zHUT3#2(MSIWHyE#`NTZ{FB_9A`lZI0)z>GUxIg zWjvmWayeC*7s@I-cZs6iRBwFGURdzf*S=B71OnLsN!#0!DI=xRP0Lbg5~2~@Debx3 zRqaj_1y(dfoLu!wS>KJ}LaAZgM~~>93yIo$lm&I2J3i(TY$rPkXd+0emfFJ9H~Gs_ zD3uwa-UlVGnjAG>uI!#Nqi01AJiRj3&(nlVJe#K%0;=fpI%368ZB(X}|X0Dn00jWq`+ca_3cdl}un6Ns=@g!|$^cWaE;4POCsq#q4e z4!P|K|H58!KX6XpSAWj9y*52O_)um<+ecTch}B5i#PTOfU^=>h=;E-+ym@Oara~F* z`YP(O4z%9EA~4E)P$Jt|Zjy!eN($=>#O2jJZZNvVTP}aBam|YdlUZBZKpDHuB6<~d zsmSj;b*84pg~YWZ(%{8&js>pryJR0#AH9CX(s1aIr+eWUrLHVrjZ(CcXimGybXc? zM!%c=AB^#T11kPo#PPp!kp0eovc^KvW1P*V#Ppb8WX?U12nsG~PW)w>I0ZND$-M^x z^#Nh5@25UDi@%qB$ZU0i7=(I3&yQ>nLqXD?BV7ehkO<9x&hSei#H$Dso0vr-6F~KG zakox~^kKv{=hF2y1Z#09(9^B1{|++&kVO2Er_MY3%~;g_MacX@WjyPf!i0S$ldw~v zo?Zr;ySPSrVTC6nJ2m3ax88TlDquC@;$>b|QrnLC2YPw?uwC~n3F9LVC>q1B@RP`Z zxn71q4@VR_;d6A9QYR>=_cMU99zEurgq4ica=D^v5OB|lg3)WAo3y+>whu`PMnZpz zRzSlc(7(gfrsiMWglB1pM?;T+kk=6Z&mOm3knwBe$j{B}(JrzhgBTzAqKWV}6>9t~ z&|)YE1i;^D}PpL1KbHKm@+bV)eX>?p(=b!YWTSl_>l5r93Ak zKl*T$C?5+?JN}uKP3e94V9 zF)s;3QJA^%w2*cxYHP#^E_^jzAHa&Oi_ZYJ|Lu1^s6kZ`1gf9W8#4ra7<2_NE%5U~ ztns4tbLGVBkbWYLdC=D3_b0b)10GmkU?9<~1->$UkJvgsy6^_TELW(g?EV2sH?hBt zql`Daeet-^)bB?a9i6gJY<&7(%m{xyi^Ix3pPRXzm#a`z0syeQ&Yu^%10zs8n~Nvt z!s}g@WM`-Rut?m-cj1BLB?fIM-}D=Cy`Fuzq+=BA%03W>-Pu}k| zg3g9`E=F9zRX6{b_6q}CjvvVhobdxl5mbvxy$0uGrq`%iv!G^%*DNR@D#YpE*8#{D zF%I<43v`lSj#k3GYwl!tUsF~lhUescfQkG<98r7_Rxh#hL? z^Y2|=zAQ(W=vj5IHI=!&m4Yw=+d>e?*hAu;i#;|=5_V~FoVBwyPYPS2zT1ug7Dn0Z z>0_~HIzVrFXT24m3g@*Rdxd}PG(refm3%W36#iK?#!7pJ{^*@$5K|#%RhC=8kKniI5?1(N5(PH097p}Hm8^vn4qNG^HFDmPAx;?m6iSlF? 
z!vPz>`Lp@u71t;<@3@PrI4cGWtO%E14@L;SI>W&LU~T^&7yIne7;JiV$wIIndto+I z;*(T*0D`{<^*w6VGp}6mKZ6353gc%^5MPAb+n?jY!Vw8(4*IaKOqZBUZRmkI2dGuZ z=7;JjU1ckLKmTeiWJsA1iqa-u)2l2Bw=D`7dGpoYKuJg;#cngQW8vC z+t4Tlp#B%Pse@IQ-kKM}4`u10j*c0&_O{Q|zrIOMXuNd(@OBIl!f#f;%D5rF_0V}ukD&>85V$9ZE>*DNT7-_ zG6guN@x))BuD8f@yR(914xu|8>l+C5+;mI^al$mX(=a8IRhVo3BURh2z{=ApeLm>RchR=?hgWGBgXIT)w*~a}4eZm) zW#W_v6{KGGzH(ppn#0_ORExEEXN{o`JzKy&86QHey&c4k`}%q=DO8E=?J?st@vqkP z>x3f{gfuMYE$rPmH!G`ye=!Px?9}$%)8Q&aQ)WbUl)eG4#toMB3PP;{{7)Wy^(u#p zt&aurT5CRWZ_WMl1IS6z3)xOtY(!0H7Ore&1NzlRS^X zZ<5+STo1&&G(2bbImn8W9^XiS2ENF2Sc^*dvgu-)N!ZK!Jv)*d70oL$ZLlx9j{09tb?Q{;sZYMT|G^#gRDo7Sw33Ljy|`R{?0&;$JjoAv5XJ zM48#UbWtrZnPZh)h`m&ww&x%}H+ppgduOD6lbKA}CVuwR9`O?)aCA?L9HMZ;ayNDg z_Kt6(`Ihx;x`Uvkze7X=CB6AM9?GI}dWKueIyui@V>z#DceG)AMv^61j9>}WevsGd zkps2FaJ+`?$FCO{w))KjVvgqLW#)djDP=Cx|E=IFO!b3y|GI=*#`^uC>Z7F&rp-dw zi#`1TE-eb5jHK7ESSCI>!?Z{slQSi7%D(_}Yy)WvdC7F2NU9j0^zxE~0A*69XnjI9 z9R7<$$h4?H1qS4fldA5W6Bo}WNv($FUk`lCtZ9Jyc7F~N2(OVpKKHGBEQD&2T^l{& zA%YpiueN9s*U@&OWqVVtgEMggS`wtmzZpjG{@sWQj4oU7xAJ#8An-N*prb+piZ}+9 z%t(|BwQ?*NIH($}-H&OC+D_XJo?!NTGCdEKbq4B?LwcYWkAZ7h*uMiMPKah)K!7mY ztpemfcQ`umB>CzA?uK_Si@sKnsE^Eg`v;JHUoMtzhj0FsLS#tQkHmqBgyrkg-zxZaZQfrJ&#g(xB7&lQRpI4jpArgO z%0-Zt-t!FPb$IDOueyjN=AMdTmcOFXFK@|hNV&*WXTe=LY`uT}ZhtQWA+W(dpYr=o zstfpRTLFC&ws{jg03|y&5wtU<94(K2(3wgzKhhj?0xE*vfe0~Kd1^{gB-86RS%A}Q zj1c_Qh+f8y&7HaAfkD2yP9CW$L8EI&GOD#dCSr_IWcX1nBv&GR@6f^XWs;W6wn1ioz|I;7xdC8MDIikSnwCdra>SN;B z-Sx=SGMt^j>Vb`#zsxVr~)xlXJ7; zThh||`OH!Kg7z@|V!%f&-o1@_rg)Nu2E{wZHzT_oT6GIk z>(RfKanc|Q+}zO*b#JjepAnCVTk@S7T_nwC&8}~fKc^;4*LjVGCrIW~>bxqx=t1fg z;tfe2E}FI$KRd=%$+<8O?8>BpW0vzN9_QQvdS<@sd)kvTbnGfLd&mB1y%9^r$#D;x z2SzXu0$?7X@X3sSkU1!>L>j!Ko58wp&VWF{poR$rHt9ujzN@w*A~!R60}c-0=b5$~A^3QC-DvvyJ$@DY zV3Fy|eEE(siv#W#EXw0Ykz-I5+saBOZK6=yH#F!hJ(yL@>2Oj&Gq z3%vObX9pLDy2^>T<#D=#{hl;@X>JH_duw?2eQV6$n%ym_QIh3Eae33v8_5knO}Rk3NYG!_mH7?QvEU#(F2juo_KCc6%{kOUHa5`!6zt#tB27b`UbmU zZ_bojc`3uPMZ3>72|QQP^MZi&&lQQ!3|vowZ*(||de23ntRUyFom=0c?~kOHIOr4i 
z4SccEG1RseJk0wT9@}_pk7X>hgrjq_Ns%p>;or8ErMcPB7MMkLgUzNngCW0h8!0y= zSwtUBM=36_c=EBfl%r-`G;A-@`F7(ikH2d<6P|m+0S*`vIfTF91PR%k)i0i+Lz)vJ zT_JL5g(}QXnBELC<$ZYN%iGmmKv?Bmfy4BXzdW>`LpBY8prRQ}8%Y+s&abF~cCN@BpwQ(?!lk>IZv5&d z1KI}g4y7JT56f`E(IdHktZ^4L9}ZYBdzqICtapC?I2+7 z;91CL7`!y-k%<^@qZ=}Y;bR=zKJ%q;D^*f$!s-nVvmU%hp8#*+h+Gl*6=}{Je!%LH z=qpyR``vzrww(RGKsot@5%b@N751ndw%KKY+!gi* zYp;r`ydm7ojWL_-@fZ)zBlE*AJ2~}iA=FOsG#>Mga#1{^jlRsVTZTdVYhTA>V?Gm= zuAZEh0$iNs#Lmc)gbkKa->@L|ZpYM27W5mH7=y zYGp}R7~Y)WG}1?OCJ=QCQNnZ z(`K0LhxVy0TS%q=wUruWg(VKd#K}U}irN^9y1S#VJ`Fg+$w6*+ME&VX%$t48Uw&G$ zVaPJ*$eKQXPp7w2ULi5%w&9cNglc(00P?8d!WQQw0V?hzGL=4RV_(gEg%>85V2;&BHbZhFh0>{fk^ zK0KK#FDLCC+urtlv39Ef{}gR`c4MOcWZx>9#&QfⅈGu*u(g~eVz=rG>{s&^mT9a zn$&8xU)227%sui;QvN^;_e{^7eVq&h2gO1^AgeusVJT+c4uEF6t6b z7Z7vZdHMzlwm5Lv|DuBr4StDwE5D9SIvkY=>QzIq`Ci92kkkVZ@R>|)!5()ND!q8u zIv;iwEWdfGKzLEGI+g7a1%cOI{9@m-P=Y@SlkEI0%|>5i)8d&Y*NiF0^I)tUe?Bot z(A7k3j8w_Aw9al+Fq^0L#CT8yV}|_lHY0=Ka;F{&CwcRK72x=4orc2qL+TmxNHxhl z0k^X4<7e2v-kez(#U+%v1yaZGx0$V`6$JRQPFx~8L@Ho<-Uw?%%Lo+nFWKIQs@o)v z0%1OJ!&x^g>sU3u>>ayC%LNaS)RviCE2R$_8r?bXKln=XE7bS)K9=E;(Sq^~@WC{k zw)O9o%gZjZIw5qGEEYd`x2PS9>ug@36&46AYFFIzd2q#;;8WqN#1GIkVl)vk3>PhS zv+j$K9t1iPA7$(;OhqTBXO{R-YId1q>drjJ$iG!XD%LX5Ty- z+vbE#ij!XSUdj);EzF!AJMg(ts>Q9Q(JyWOZjx3qB4w+b)Mu28q_cHC9gTY3Y(*QH zN_|NG_hc{N@@iYwLn&V!p1pR|M$Ejo73{nSj9s9$d5&f{qCIU8Qbdh$%3qVEYnRIl z4&r|8`=XRFk~`S9{>L77kL75Ki0kw5#{qwqY_rLIZ{SPuhCa_jmo*#J_L?l}N}Dnj zV)u0-oG~8Sl%O>-LJ6VXg2R?9}U!kf$EmljQM{W$gS45ULk9{jCs` z?wE4s549`O2rKfC+T$>4bmMkJkFo&s@ej>j<aV&UmV|XZoQGjt}chp^xwa%LUwhaC0(g5qN3byo8!REB$pl7TN zExISUucBNmUl~(MsR`*=?~0-ybPS01^;ZBlAx_=c~$p*7Naq_P}ysy@4mt!bGgFt zkNadxO2HV@lXOPiMAUes=vHKQs?HM|N2LTHo0ju0j&PEIi3cvMmBdngQ8VY0;P~mY zY;E6x0xrws5TOG54^CSryCZy9^HfXk;}_ns zKWJZ%J$|sMV;xXE?KQjUUg#qJg9W8<9_^-Ck0Y-82k@i+foAT_vYlS>kU zr}TK1kJCOg4vJ@!h4+It>k3H{p*@UTnGCXCt{PzDFKVdpz>|L~%h`!e;>!^XbnjHZ zLyX3HiW06^)y8=h*^!a-G$ZhV;=Zf}n_9_%M$bZAPJY5Ul*=)Hj)iS$&z-T^WoPb> zh4afLc@2{hpQ<9{bo46Yw0s&iywZ1zYYTRwsM+FidnP5JCbL=B%pQEwwe3-U(j@4b 
zp=&Mbx}l3`PFWk9K^yRl#uuk3slKJvZ8I4vaYF3rOhi2avrFT0smvt=^00BkPOYi6 z6~NaZpi}DOk~2+>vMVA7r;LJSi{+ain#cJnsBSjC;B)fj6m&hg#?|EX*zPJX|J5OR zho>K$6h_U}pZsE&H%i~#roJZGQF=ieeqeAX>xJdAlBD~0)^ZU;s56G%K;_mQox6cq z<$@CSw;Md@SCp`(_Z`p&`f&j513W6_Pk3Pu$kY!?AOVt#&`(4@O1=ogLpgp)sTbrdWW>`kPQ!1B zTuzLVS@&8VyuUr|uG+lzm#0q1Arp1#Ye|T)kWp~j*Ev;5ry1iH*ecqVaAHmVGN%TA zpij9&?y$N&R-`!^@qTp;FNHc5#C#vk6pf!mkk^LUpPC@cEoFE-9r2K+~|J?h?V=qk{qgCOy=$@9O5Z zuS~#VNRKmLE=ZM?qoiTf%ABd)VihB(z!|f{japcf2R{ipNCJP(wQ_!1qS>5uQ*|-w z$YoShX*O6r~JEYLv=B>)n&Gtcd>@nDwQ&Zs7bg6W)#7tr=#)b)r5Vv8=(nb5kus+V-i6d1@KxQpQv$N8R$>tO&sxoi?RqGun{7 zzIOrwH&$u_J>A@;FezvansNzvURQ`iP~DN&9chDBgG#P>#>Q}c{o(7OO8MEKjfv`l z9)o}!vXZ!Fkq#%euguvycllxCIJmcv3}&Y7f1e%vZZnA&LRA+Gp*^y0LThS@WwEQ= zPa;SUrgJ)tQVKV=y)*wNZiI`mOcP{rjxhw6^X8Erm`Ep_1fhVaPkXwDED`eQ+LfZL zZ>Z1yomK-=KmAKCXh+}I)e#eBn1)%5!gZOIX%*Lca`&&f6vbwVT-G0w_}Ie;UKpn| zG&J(=&2z;Nn`U^>Ye#u*f)fFyolo`=%f%)ih96*c7xJ;1hY{R6s|XH#UGDM^(*htz ztL+XW@JGkku};NhrOnb~w)z{5d@!C>>C)t#J)vRHQI$lNkuW0z=nvU&B(bFVVQrY7 z$O`w-5)MYxu9yf@G%--hI{saX8wjH!t=jO|tLxL_r^>EF;zR$~L|Hb*6bgiEsItcO zgjYaxR}Kf>CVjaT5V#oCQwOfqH_@?eDK2`iyQs1M==@c*HzZ0}$HeWS@ms17bgYr+ zZ>N#-(oWh3+GBaMsA8eDQ%CEC3LvR_>)1*--JZ4iP4#t-Py1^w-KG;LoqHIvsXi~q z&5(ODE=PAaheIEw@B4F2P9hxLb$2B!+IfpFhf#yZQd0KQ($8_wUtkGa1Rix(af0K)myn9lr zMt>9UHFm$Gz+gA*v}zOkcT*ya9`=;w{q1d^p+2gJ9izC#u(oekY0uqc$!Cx8n8O?z zwonQ;d6;WXik5D{wO$XQoyD@K!xa2>-rR`IFQzLj?RPUwckX|l6Z~l)TfS6#R+lSU z9mRQc9Lw3LAd#NA5i0sLK*GaXACc~qV<4NS^v*Pa?c>w~^v)l4a%7=)*oN?$CCAaE z%$SEWOLpV62gZ~mq6~SrQ0pip|IWGVtKv&5+@rkB`(WDNkjT2y?o@Q7BGu0~60Q3Q z`}eGQ{*dJY90#4vtTI2*={jE*fqi(TO^&KtQ|3-QdR=Xhk)5E&YH*GJbi!VH>ePEx z<9w0?i_x+wDR<{(e_uYkuS91?G&#_hC8q~8wxOxYJVa0NkK9m#y5Bx38%@_cOlb^s z!kq3*jabV(TjcT0*UuXIefYD^OJXuH@a+tmqNU;#yCBi>*C#TViYv1G!q|PMvagWO z2K*z3I=|1=-$?nSh}8f;YlfBG@jvbJmL!xk>tU(D37sUUQ~MeeZO*^2wY}7Ta<eVzNe*h%sWVE1B5|B^vFQ6_Y8 zQG-^ZM97fcO(OwmC7dLBUq5?4=4s8uKuLj7DArxl>*Usht1&!}oERpe3N9}^V9(Af zv6GVfG{T6D2m<5dz8y7Eyq^X`ZW`#{+fH;{k$ZO8dS=4l4>dC9xhm?nsZ1`xL1A>S 
zfL=>x?BfHOD0@C#Xp&Pq>sYjKY^fb@xw$NcN7E@2Y&`UEjSk>j4urX9nMCeYMmzUj z(=j;ekjsFosDt&(DX%Jub^0fR+_X2@csjhh??K9;j^>(b*GC$LT`sJ>zl<%)BRKeN zOZV;BHeP`7lwe@ZbCX!n6aO)WoDDNyZC1VE5N6eN-_rxLqM)qun+()rV5Mb~?oz`m z>C7#eb2jn|k-wJ07zhG)OK{;`YyA9C&XXReYiM=RBmvPC?i;UQY%`@+*I>xz4=6(yfStBmIwATDEmKrwu{h{y#dZ12Z9gf%K~$>K>Pk?H3b@ z%yRu+b>Y|+jizgRI2%cS1*He>#7(SP#B5r&RM;7g8uN1%Zm=&(9;MdiPoa_a)7mJp zHTf7pGnK?wfdgw^Trss3e;Lic!!ZU_H+Twv*nTnykP(=RCN^a2&(`bDRt)dDK-F@E z_rAmkni%{lG7R6tsu`e%M@v>!E#gse%ls|;(8P*d_Z%BL&zPmtmWbIBNpk%o%u z?P0N5{`KMRImh$!qPYp7hdS~ZeHzrkLjwPyIkD+DYaGz6Sl6If4|nBnBo)(MWhwe) zEUv$jUm*EKThl6qTYbKS?MXJ(%e*DGeV>V}-}Oz*1BI@1zD4<7p8;~R@vmwPl z+l|+RJs9uv4u)?{k&UJ*j?_n}U;D517P{}@m@6*Ognh^xw)&mf?2}A6QJViLvFT|X*A4l65>>OLXGl%LTeup4ewfj+r&hH;C4?xrtvd_$?p$NV#8 z*!)bMGBS@bbjHP?aNly$&NNDn03+n44Ba|OzIhzbgM3Peb{h^_=ml7ZTRm3*0#L-+ z2+alF{V zZ|^Th4lqq|ZkFas%rAM;$r>=thOPZNQz3CdoV@#H&lzn4xX(R4JK=_TYYH%QTJsG? zf?oeE)>>MFC{E#2|_W2{V7TiMlx%rpGjXx1gvF+a|}h& z^sx|NbnlWbyz{*PN<==UcO-{DKPEjGZtTe^9&v)0w1~hU#SNd3J-zX+5A(ykPDdbGW54u-u@#k zKWw{Z>9M#dA9!BjyHck~Oye1oXq~eX3P|DRG&^tIxoZB!59>LFSHqP}@YGIckMy^k z_%$sab;**$2v+7^N!1dfe1e>P1N!N|c^@5ro?qYe37AH2`K1wK3&FhKG7Q;z4;{nh z5Sq!J6CT4yFW1*It)~-A^7a*bPQ7#;v0uJ>T0#>S3k&!AB<{HXK}!G5NxqFxh^^(3 zRv<19uS&ddZHCeg(`25;^IPp- zpO*q;aqZxjsfPF;j+P|s8y5)eAms5b5|QzES1s1FPoC_Pp#?GL@I`CJd6L+* z@g(;&!(|g}^2?EwH^T(2$eC0NKFV3r0#;&p}h)KM8QueAp zrhKQvv|F~l=8+(UbMNSIm%45T>z>(Q&!TqIcU{h0G~fgRJJIq=Pb4y>lOHwBV7|I3v^HBRv8Zr1J%K2v?dKHcxLcU)iJL^C(L0z6BfeAb z4IN|5>#;}2e~>*i1_Q|gh@LCU`TOLZW@Lb6K_PtCxT%oqsVHK@M6OOmfG`Qq(3<|j zI3n<3(EeVJpM~_WEy+wbz=Hcx2D`Rw2}(`5#qjwjE9KY1d&(|rq)}XcK|yUdMVGgWO?Rev*4{8U6QcfKu)!by7E%dc^IgYg$T-G_ znk1-Bg8JAs?VFmPVgr^b&S!gABt3+{;WlBaO2yu#{P1a93Q1nvd1rM+4710SGTV$E zm51S~&vvtY0<4x|oErb{tG&>*j*yfMS;1zpQzpL)7(c5Y0r>1OKq2SkJO3Hrv?2HN z(DaJ)Y+`K!AQ7tmHVbeZbOOPhjtRUq%!#MR`##0qh)%GQFNG3N>D{L7G=Gm^i?xmK zcfx6xnBkX)N+O=(BsysV&?6+5m8LIru*=LxqkgA~ZO-Z0GF!Ar95I5@v@JieF0+r> z`gm@R-}CGZ+gYhFzUWX$6ZBD6a`0=Th+t1|b?}S}5E)d}#ey#E6Pc&xMhLYzPZf~Y 
z(;QbX9G<_<3~=IJu*m(O4+|$4t0Y1phHn6rGPSzIjpyLJgrEh*Y8@&Yon~V`asi0Iiotm)C$cW8CTQ?sM@xy+lXJ9wTBvXgm15&~ zIhE|}sEui2;>U=O@vO9lv5B_f&P@(rvuA*Sm-p7*%ErcS(1J!2qC}{&Cue3$FKYfn z$|d|xP|&h$P|{9FLV1Af$gs0fg#fC3W?d{g>#l;VhwSJ@-8bx!K=uQ=`(t6S5+L7y ze4tjUzKcNe@{JwJ+V(73kf2R)A|Ohjn@vJ*U&RS zLOx#NynU zU8!XJ=sOOm;8as<5!AiExZ(B7Z*0Rp1OqZWm_$Es&>MZWj!Rg^I0YfW{+chy>%r|a zLZ*Luubr2GF%3c;E&~+spmRme_!<#r1_@iM0M`QB0=e0J?994`sVVqIBmF3n_&ks* z)_{+rNhMVgW|FbEm}Y5aaH9K9sjXn`&mf{Gxk`Vf$x9hqoi;{7qS61wb>8<@ zd|R8(2chOI-s`kEls2g?C=r9aM?Z~{gP1#^pe^I0Lu|sG%JC0#h&G_s93mwhMOV8$v+gZr0Is_%w0Gx^!v!$`l~da!CVdUXH>6@u@}y=jp{+FW zxq78B3vflsExwh%h-s^pCvCrhb4$t*6Tk|cj>7lKNr3tQDj5N|{90Su`16rWXJgd2 z?)q`8e>R5^)iN8q9*r4*tcG4+{I9|PH=Yk*sX+%y(%I@JK|OM8GZH*`hl+#QvW$Lkek zW((PNT7WOnM zcZj7V1du8JkFFWcXY5WS%2z!+ORfEZTpG`Hqt>!16Ohn!WW5Hg{Qtv41AtNf^DKf` zZ%RmqXLD6Ix~8VhLSfqOj6Q1FRS3(N9(e;Ken3hA5PCgJ8lC_;iR)oyYUX=xfGnxu zZ&ZtCpCz-o$z|c>Kwr8rf)2dyj0njTlZ4>!MfK#QtyHiL`;wg+g%tQk#tm7YIi>=7 zhQa@A%R9{u{Zsmt?~vKuYdCs9gz=+?u_hOjR)O*pEAy&HcLQ)mNLH`|V4)*$F{=jZ zmIt_n!Lwxm0XX|8(#7d2K>lMk0Zp(@_=|VKFdu9(d;~m6h!5X>)NaGQw_dSYJ93h- zDw@x}_ubI&?a2vjpJ^)kI~O$*JaivPjQdE8d(9E~LbZ7`7QegrRcFlJ`T;Wu&OVsD<5KVYxhYQ%el!%qRl{Cn;1 zTsz8?>R~fey)qPW~=riSk9L* z(@p?o3{%IG(6(-%9ywh%S6(N_UBtPq+%PZ(XlLQ6#mX%Mp$x~#7Ax_<<21-40DMdB z#;LS|SE-_i!e<<1KPR^uNgu^`yE0s+VJ-fZV}RQXS;!7wAFi}sx!e6%MEHDWn>rvtPcgbw7lu{(FjBH}{0=(?P>t6wXrC@s5m{c$qO&EMNgk_G zC{0>pAMT4d2;;r9zqXt3bZV8d^15Ss-JP5pi$#3O!)+4pU3kP20Y$I1cj_~|-P;?Bvux7Oo*NFCdndrcwF zxjg;EN{sI|60U%}8--!Ax^*T>w93$f)>)kC*;>3fRN4Z|8f#Am5d}F8$0Dk$fO6tv z${JwysXI?ZPy)fIO~68L(a=Sd5D*y%QU*eqfFjMIL{X~L1PBPB6WR=@^pMbt z7!^nep(MZrfqS?!YrXGU@0a&}J|F&P{nt5r?fvZM`DqfJte|9Y>2kT?o>R|MSKIlw zTN-`6fZthFZ_)62L^zg<|FCqeERV#HyFF7#ODK@t`w2UD54%4heL~^LOua~4+}m?$ zcJ;+Ob&liyu$FFkgc;eTu|ShAs@VoqW6(D>o*$NDUJ|jJ1f2q~3ppXA4-`u4=Jf5} z?omZ_I|nel=GMCgjx9o$Vxt&sSbxM@dRT{D1f_?n^Lc9rugmIeB0S4Ef|-|T9vocu ztIHbQKvU^6W;<^KgqCPVc`JI#zjEL+9y7bf!)~@Shy%aB3&k8aUmdaCW6R#Gj#F@Q zdL`B5U!go$GhK?^^Og85PjT)M(&?v78zF)dEaYa%Bl}5RiI4OmFG3iG{bKgX2vv~C 
zS*(XW)GN`pZk`kp$87nm4f1(!*`pJBVU+-C!dSDwz+mB4#Zga`9Mkro-JTERcROJK z3QF=KNwUeG4WIGqk8G7+H~5;noAte8r90J<@vofCL2C_Rg}Gmw;Vt;4pHZqfq*-!^ zW^t6I=MR+DGjz-axO&CL`Y8+8?bMW%xwX z2c2=wkT{2V5|EnwmpBmH7{96Eje0qv1@&X$~W2;g;5gi_S@7ji0dj1ASz%} zUd~zxH%c9;{Vitx!#B#1Sp>kQK(B7eYJ@z!us8(w)tiIJ;2?z(xQExl*YnOhQ;5YQ?vAg;8Zi zAkV>50r?TE1S2QcV2_iLkjIWyzqIRf9*BE$uP$x*MJ9PWxxPLF_C z56*~lABPl{XCm50ME7JHcAm-E?YzY|Ew;=HB=GloE=S!ZPnj@+yle~+LNUA?S)n1? z#7S^I)3mVj8(8C0riA4*G!JAX6fGgGbhIJNB!G**Z6*|>{oXOwujdb;jwZtQq1GEX z=1I!f@d5VzAcd9jJ3eywZ}F0;cCPM-x&1Pv6iv3e{A>@<3gIS0B4)MyB2FpN9)=(I z)L{d8u8_m}EY{4|4yr#qmH(#F&E*7`b>J;y2uCtAy@H!LBUGw$^T7V5oOtCr`d+B_ zht!T!qJ|}KGZ#dYDQCu30nQ1+DWsz`Ym!Yo&*n^xEen`CZL9h_J1yt(rXf&~pN$Ma zpgTv8gUXexAW;?Tg51Hm#%b&WC(j;(myhgXJ@v!qly|;V^mZB`J^<*Q-gqNJ%pU_G z9k$w=%84$`gmKZ7vMm*!(H7u>hLWa|MiBy5!eP8!N5hI1#h%E{*mxDZl;xy&EB5hO z^Rlv)3-uKmFp{aOi_>xI!Y7`5-XX_{{mp1{@LZ z`hx1q?cBV6x%3Hber07>a`)DL}yI=F21f@WR3C4DniyU6Q`Fhcn~M z@#!+UiJA~t%)V+2{>4J`pd7C2uUWWU=}A4lC)MS5KD@RN3H3veKN3|oj13Vk zIGSvMyVr)TxcR!>g8S((E8$b%-|er=hQ8AZifv2}Qib+F?1Et5;o;tMak0i zZ|=OI(N)Dh=lHxueP(&KTm)aNc)L*;m~UW*kEV!zPOEoxA|JY(=7RVS>)pg;T^3o{1v__^0S|wYghZ+7u zt;QOwYs_&XWv+@==?2?v{QP{Q##pVcTh!aRtGNVf+`%Vg;5_8ghAAqa8Ws}Phj=$; zh};3iX2};6pof=nC^1rT0d2O zaz72#&+qE3^_2o-O$m$DB6ns&(cX7(d1VUxd<6av-;(p$M=&OKx|-P< z@*~lcm4OW7zYBhq;^m;tc-e3pzglpHaw4Is0f;DwtRb{tkJM020N>9S87~t}5z*L!BueTyq>hT|GrJVTb&~6eD{#Oi<0xwM%>TAPJh*l% z+{1O}SlcWXB8xuC4Y6NDNF5`4mj=6e3UX(e_X-3WMglSpnPn;1lNXy? zSBV1SvS-6D+UL>hFSh{^4zW51m8jxN*-ydSVDq4F>A~*;rJY!Yec!xr`Hn6O{mYT} zF7aFFvYMZN9DLSopI4xG-G$UG3c^4ZYy8&-x!}10GZGP;JKTNlFUBaB6+svFs|)I5 zek{~b9pSYM2z7V1n1?@b+h|0~HcTHt%|tt^<7Ta&BJ! 
z&MkyD%eEMdNj2r0Ul$(aIsE0}M%#}7C?~1#$P0NlIA;6(^-T6jS7Nn1MLf->H$LW= zM6q_w;3%`;hlf(KDIzt>O4jvd*vJ}nZbiyNVJbcpg?$aRU9i-ke`IYlHcE1ps|b3@ zdzD!j9(#_ZPbV+r^nYa>nM^{J$APDby&FcL9}k734PSJtdG}riAr;s0{+6x``MWpL zfwwT$j+Rd=Htj#m){!D?c-rKZfUFlB#uAe&$^utoc6&eciK(gZN59aZLJR%o+Vad5 z-ZVuLpcF_oMbET0_Em1WThvIR;!cz6KzKP|)O9zsPHKAK`Bx{yY*{1cs*I1F!^=tO z{`CT*y;eke{M`^BCpTF&J4g9dM@+ zjs<9h#Dt7SDtxI2qso{35^YF#-kQKhtk4>Fzw5VQr}x^%(8e7IuAQAXoWDK1y44oB zX1*=yl|48Y8NzO`3rl{OEj4sr59_(4wO!51>?>tu#^__nR53GPG`#9D_bVCh9h_^Q zdL_)a=XIF+m7K2%zoFXbUaSM_26hQ(*tE6sD_ye+Y6~=bDtQJA2hcP32Fghzfy@VT zg6j2&KawK^K$>P;7XjJ4zkp6YG1EMkqPmO`7u;Kz;sxWS_DC*wPc$9=ljd+^Cd&Q{ zDRP_;ieURq4?4R%N%%#}7*h2h&&K0X3y&smB7B4Xlt^k8jEw5LKeTlI3>v^VL(L{l`BAgmEsJj?|AH^x zrd}6_M+(&D5@1#uT|JzVjmps<V;QIyPci)Py{u>O(?4{cxjzUx&#$+%Xz)BVg2sTlnM1sQ(2hpF?p83`;IoxK;O ztI)u(Mw+fUjTAH&Kln*4Vc5JsDZJp;nYKoN4&bidf*3&)RYciZTR$Cje{fI<1DOZ3 z4Q|fQJGPmCWdha1QKY)3TadCO%sx2B(`$jt?rB%iU-% zAcqeDN;Nyg&j0YGOIp|bINp=aRz$&_Ka1E^XLxux`v)tzWL?Yf;BP44Y0qV<@?rE% ztLCv0J~jZEy)&(9#S-pK)nPmLpJl&h6aq`pU8z?(Zys8qZ6;e!`R#l`HFpRP zAm`$%7ksF-k!rY&6GgCPlvejmA_J!HS6|}yT9Yj*Rp>`i^cy&ik^rfzeSj?8x3VDh zKwbs2_`e_AdhNlyn+~?{bUGXtZ{}-2%70;fh7gmy&m{fWy9qLki|Z2K_{CXsQKeL~ zLZ8*4U5_%m=Er0y(*86q#8 zLMQHSQu@6%%ce3-ZbvPhIymEkMRsT%S)(se(0v7{(Fp}7oLr$$i3^(Y|9t8H1z@zg bZx6dwj(Ge|H_ZZgUC>Pv%j=a!?lJ!c<|gv( diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkldnn/image/overview.png index 1d81b5a4b5db687c06b92f88648f9895711fdef4..8fb7bbb9dd654bf363d701d0c8cd4a557043d188 100644 GIT binary patch literal 10766 zcmb_?XIN8FmoCjn3B83P1VI4>2?Pxw9i&JH6$AuA6i|ATfb>wMOBd-#QK`~<4N{~C z5~Q~v5L)P+8@_L5zWFor%$<9mC+A5{a(2$%Ywfe%^}cI`>uRe~Q?gSM5fM?Vt10Uf z5fO(F4jOU*;hQK#nJ(dt*i~Ouk*H*VbCqyHX04#5Ktxm?Lv;qfMmVQ%QhV-7L`2hh zbr2`srQskV63kUsR(OIm+eoL4d}Zh}yS3iGPIzrXO!S@33vLU@2fi@0cbcq~XVmCQ zxiGk+A9=?x;7+vsohON(>mQTmJ$*s(^nPNP;uFT-Um(pZHk$#a6AA ze(od&7`zE-cnE#?5ZaJF_~-P-<>sH$nTD;cM%dPAT=b^TVfwo*$-m8~rx~+bQyzo; zi9VEwNYVTyqdIXk2*JPtAe=D^0STY)0PP}qxX>LeXppN!e}8|1?eBeu6%)|-i_kFK 
z(BJGRQM5Q8E16F;PHR;I#7CAaVlJyTG1d(y{IKV*?C-WdUXB_qme>;lid}@Y&CHmU zH#DR%;Pk+eqG({I0QvXt-@B%!Odo&qxJ%LT3*5pKju{ZBU-U7b8BgMbi`MHX-f z8f4JcZb5KaU~n4OYF(R0|P6m#)p$KY%$; z*0%&nE)JthIqp*<G(!Zzm|ReOifP-0Omc8^{{j@S-MhK4WyT}L0Bg2{*Rf`PITG;8L& zd)PLP5+&^I7)eecAR`G(e37~}DqOTxfE>sSd!VB4^xb7wmG{6;?)1-Bv^b8q;+%G~ zj=z{!nKQz>$z}RcvSM#!qwc7klmo)HQqxI?&#R4jx`rMopX?jxfw5MhS z;Q3G>rsTWVh2%QS76eW|It|}h&lmwc|KHKX}#y7(DGHvhTj*)65P?< z4&fPc9r-D;n8?8`O~%uqHqNs=zgwm!lDO098t<|*?)I$%o|ADz+jKgOP5Np^H|e5K zLb}Le&|p?EbcAAnCo8mVsqz%0xNHIOv7+58yqlviFYfhtjw92TJgi@})$Ckodm_20 zI>=h1+M`krTU{4(nZg&TMDL3_WN{%yI`P3NFZpnmBkNF_!k-lJ%DfYI?#+m}FDe9C z11MC?yQ2j|lF2#Lh3LpVKdtvkprqA2S#t(b?(Y3zLL1Y~aG&>E_X;nUl|eL#-V^s_ zeW@8MvA%_AR#&^WFn3cR-xR~R9@I{r-Yyoz50Pm4gcvWBap9E`g0yQj#hHOCM>} zq;0d#HPMM0V-cI3=Q!=>C*^yQ$JSDcj5?aOIAVSfZ^KJus?$kastA6$G}zbc7=NA? zG<5S5txovtBiVb1s|e8lqnpe>})rG06E-3dhdqL6KW~%Uw0XLq?tnL^S+@%Y`e@jZNo-8ED1(YDE!A*eH>6v^;rxOw**hy0SNK+Mq2`b6c+}2NQFeS5&O)V+0K&cd z=(4^mi2VNDz|O&e1&70RP56o=bV^M+^qOiwGluROd@=`2nr^5oT}Tv9_C-Pmw;~RH z{n9V5sL=nmNd*meul_kW$a)2;{l3?;84o~_MFB%JP0QY#*Y&bI?DQb2Yy_;hPdCBy ziN3>$uNrF>B&teH)sfI4K~ipg=wEuxIc$+CPa*`-uh;cILl{lAv)r1@)IYaxG)>fx zd}TQbg&r?7L>zr;IbAKA_Zn+Dy%Q>Z0)vT9izu3izkUckHUj07JJ?+OoP7BqrGsoS zzQAVmHU))V23pJFW2@hbBMeMr9IDDe)xp%32kr2#+6B*ZqfZRjwr(j7)o;v4GvZ%) zH)h2qQ)w?uD&03%y7)rpT1nWk^sg^aEF+Jip?gm}ad#XKq$%OR3w!OshXc9Nf-73z zj?bu;ZQ$?}21p@jtX|TCS7DRs^FywrFrK zwl`d^#@!ul(=<=-Bwp-kI7cAK$Wz@ON@_5QuQrmA?9dNwuQ4P2*2s9! 
z4}V&Q-fYYZTW_)sW?T-jd$7cKs8naTp8&6WltqH*$I<6DzdpUS5PD4-UFuSnrzI6m z8Q3nFilwnRJ&P(%fU^5&ZLym#7{chIt;`}8A}!n%Em`{7L?x02gx?Nk*z<{7xs*dR zDcpBO5L2~nyW1a@kL{(*_aI@=sG{pro9>u9-)*owSocJWWzwDWT-`{4ZP%Sy6&+b_RmHNU^vXcV0$g;=^|m_P51dUj4NHjN^5{lQ z!7JJG+w&;xq1-4`!J4z|?i7pN(M?RH5ds=0&X7i31<{Fp8_`G*4;vkj4J`sgcKAc+ z0g~w9v>=U;05t2)sMf8fi7z9JFC;thi^snV9zPdQN8#|{ z{1XpC<+6B4I7w@hv=mXZ^_Lfw-1bb__oR~KkpkZ(>j}#t+h5-0f|kEiR3H9Nr(yfdn9gfm%JRpl5?&m{dU43 zwvlFs*HQ{#nUtA{Q~g5^#4fsCBrw}|G{4~}xy5rw`P+>!nGL$@=j!BIK*8SA6>$m2 z+i9tHPZMxa@Z_eZCQEDUg7K*-p)m|5qUziXOo(&S3HzEq=an2|XUwJ%(bMv!%%s6Q z;b3xd(vYIxN6KOkPE};MH6O$i&yePz#*vZmGvq~_>U#nguzdCEv(a?Rp!&D(_P)N& zDe39LRu!k!H1B{0Wb@=+@6Hcan0tGafRx>p4A;nlGs+*qUD|lp#JMbQa}#1xcCCcS z2rg9GG`n-|)#fARZLtOJp8l5#pzzqqK3X|b2T@h#&<~NK%VBf69|&oA_Wx_X{;vd| zloc>Dg<0D0xuOYuS?gQ!K?=d)Jy(Q-EkStG1mw1Z=#Ku}dD^T_xI!gjy>#8VbYbW_ zWF?4?IV2n$%`ImpBqWqjcST_+2^7XCekV4k*x|7`7fKoK;Bb$5r8AuIb$%8WixmsK zB1HikT>wNxdC$02xDsqnx{Cj`hX+h+tFEf*shyo&o}kS07PnRXk0-l3*gh;5;ie(s zipNM`L`1~(bhj})m@Yg&LrJHd2;{@i?oVghh%6tUnbD?bn|nwRM)?1E9#o8kKwz#w zw4w&PhmGQ3d#eM;PuV{R3}*P&XHq6`wX|4>Kw}I@;FIz~!|fT8Jz-!Nk)=N-i;h8k zmV-bfWW~T%;O}08iinJArT={V<8`rpu&DlmNZ}jRyx2fyp98 zAS>TKx2yk?CR%J`g~231321ZHXD;)n&Dwjnfb1snzI$*(Om7V!w177p&L zEkjE+PVq5$xY|FxB2sO{<74MmvxgN`3&@ek(&f7v^5_7@8f;UVHdND{#rQ!WNckW1 zFb))(W51cy98A2>^G+~`V6!$ld*{*vB)B~m;S52i3_}GbA2MB0LTKH8L;C);iT;Ny zG4dAoSr5m1zw;WuZ%3w*lizA{qy{{{j>%7CIZ@X>ovJR7Z&v((!SLBvFlbO$(X8mZL zsa@W#!@(`4%N02pLx&=l$teDa(^+Q&<4Zq|hxWME@{1ok-sPUJOONVQo8ziqUpjhQ ze|Mqj-9cmSF<v9+sS2G)FcgPlBmlzgWT)O`cip zcy_m63M3^Zy-!R`v~+jBe{1Z|135YK{(%7&C=~i?b8H*|Q6lYlg28PiatsRb@!obM z?m$}u-54P!pcAR_dwPY?QTnwGNpsUl?VKSI^0C>o!Hcxn8w)rJMH24relZig%SMstz{RX7c_TspQq zceo)GMm-^02Lwc& zR^x7vhV1n3m>PHcGqTUua#}k#?+zjn??l+t8l$Bgj9v(sNxXY1J%Mk_Y?Mv+8=q%g z)9@5D959%bO-^$3Tm1T~f^m8oMwqP^02NJaaD>{XoaY6r3_j51Vlx2NFC0*2(&0Aw zX!e%rS);WYKb=EHby~{wmThPOAIxfd1@`KGO9anNtzY)cdKOKIneshxm+_fk5doKK z+^zrVn9o6lT4e*uQjq%%#XJ1wjTv&i#l%RBFSzXMt5SKWxDpbuyz-h(`wjcwf^P|U 
z>e3fNHKy2Vj$~Rztz!8-$Tv=2?#>%5BBkcSBkp%G)kPC&6|iiAE9=W4DH+Ghht<7R zUx#q!(lf{W!2w;ZsxR%B@4dNZc~b<%2Y*A^ zG?GYSq#1kPs&dWo&pk^xBvg^pJgmszQ;YewCL}tjkxsBNG#|l4PKvdXuT3+9@su@Y zXz3c2q}%x5Ipoy(B(`UxueL^KyG4k~3mw)DO~p_sAwAz`|J+f!gDDu)>J}iH z*Dp1h8HCJ@lgBC=70Zi*xL==`|HsG>G;#GwH+a(Ij@MXn1*JQ zd?*95%Ty)O4%I9S7c{3OjTTT4+&dB)+yB=u@!*U`1(>d+iaH1mjTz%*6EjSBf^ zUJunVC$u;;66bQ^Mpx-g!Jn89=Ruq;tA4je2g=-6U5u-92_`?;ZulpndE8wzUIS1Y=@UeL#5;}$?u+~@2FZwZHIOE@nm9~b`%*6tt6J*YRa zDp@xufrY$KYiMZb>WW~Eh>D6@Xb-09I^O*|azaoc-|P6{?!R)liqUgEa%a>MX&7A>5AKD*lDHD6K0N8w$ ziA3d@TpR;lUICgKC_LlsI{U2S`r15?|`-f*=nieGg(tNgfFl zYPb=$`EHwy7Vhc_y%hy5qaFrdIZqbk%fJGw#RJAR(>|`wQfB^+Mnc5TqttffWHs~X{lKCY<9=uJtBxys(PQV}#)VQo^ z+`_IL{jPj?`rz^%3qU(BuPhFbBbY$2_V7)GR+hedswUG3&C-A&ihZUJS8}se<&TSA zE|0t9IgW6mbm`ky_I;&WO#n8!)dIqwBD}114?O0sNa$4-{QpDw{MYvScM!D7Yupt% z3t#B1@hd+1RVt_JP}i#}yJtA^_H(K4Ja_UeCs%Rr$Bx6LTpBdzH3B~XqFGUI$1QZ_ zPUc&l^*3%>)BgIiAGL5idv5(I`rwCuyb%n&M|kGkv_>-A3}!B2GtZ{Zzh6pI1OU%a zEoW(~4Y!-Nugh+iECeSHvdBr)q>icnob+98wkM%&YN0ja1^ zl5X01PD$Cw{;CJ+kQ# z^edbHY`pY*y7FSIrg0vu^KxTVhs%!NopS5bY%MbO`XKS5cInYFhuI71{k@uzgJ-&9 zhUsRHxzNyAq-gP9nN=f^MBy>PvBb_gSMg~11yI~LDjBxF3^UDa9+4ex(D(gZk7r7J zuKxGsaYoCT^)IP;ShK+kx2eXaW7(wV?t0JF%e_6{Ep5DxmSq9BQiC|X=7LlqqcAg% z8r9B5_Kt0?wy0) z9Hof7!^78Rgz~~kdh_FnB5&lh*71Tm$@gPkW&4bmveK~~#4OT-%zK6KFZ+Wl*4T+~ z_+4-oryde7LDzKXyJA4dtS})Jy0@V+1Rvg4R-yvE2CDFrpIa!qY=gBkjfLcXqq3if zB#w5H-&xwN>u&y#x`|Q^pL&^Br zMX4iWzKcc>l*mT`4~4ctYrW|4kW{X#{Ka^`jG{d9&Qg}HUV7Z`IlC7Uf;q7R5Wq8> z*V&HMl4K=`^DQ9^WpIYePHckBS~-8Ryr z=0OGL>(#~djS+n(BnlSh(fIkp^DdXZr`ysej~40l+{9xC4b+{z1EulK`e;SSa`{it zIA*RHtt_r`bCkt7OMs&F6M&gZM){Aa3~A zS-x$BE)HDtcGmzY(&>OO=9}Tz#O`5QuaByW6kUvy9IgFvTOqtU5z`re8iW34aJXCgD`5W2#j(~pF@zRGlI5?!yTsWH4NcBQ4I zb)B4el~-3MG2rYP4q5#g`t1opk4bt*L++`NurTx8yLY?R*4{k+PlVOVs&}fmy!(Gy z1A@2D{ibZppDUa9k#gUfhnsk(x97Qzt|2rT2}J_*3&)nKolOq2){r*#J&iRlV%{%B zQ1!-|(26QA^El;_l6CSyILCqmj-V2@wL=uZB#lkt14d)C#a`!4{>~VU4M;}bRgl{g zt9>@&wNoK6S!gLm{uC7*W-01Q9zMBNWpSbWgJJ9IE|1yzOt)r;n&C 
z=Eifz1r9~e$-)CUA+KV>rNo3w1IoXo8Ax6iAyWFP8}&gEOer^fH^x8`9ZRz{i+QVG zaEKpiNOBq;`3vRhX_7ll>|&DKlV0oQ`@X9plyG#>eX4dOl3UcM{fW z0ZD!Jl>%gp=4Ikm)fL~kg*KX8p|~5ftl2uZUE*iCW@p{=cgjh5JXujJ!F8l=G(jqu|X0~0gh9fhwiUTn6CHsUPxUJk48=8_T0~lI+3pP4 z8Db0y*Y4pyUu${>-il4>A?y#HN->g-o9Jl77bxIdF^r5fSo9{#3)Yitj~9j+AhRzx zywcZ=-25W>-=6Nnxo;lF+X}>lZ8O93$2U!Bn|Y;GIhIMg-Dh}jI$ViOL0M^%F{s0i zx?L0O#9e{qDf^W#hT;$W!9?XY%HRVBiU2n;3Cflwc--6hRCoaWp&Vgtwer{h%P z=JVpgPE2CGt627#E5z-qpJl}=O1M$Got>SHrq;uJhT8sFVz1zuG}tI~CT~}z@8yG!;CFZZmKhIkAbmIH#ygIL zQ`-niJa`QNISt@Q;dowd!^YbV-J|=H-^q9%OeZK%N<5 zzYdIi8UDWCF!JyX%3URDVsrfWjQ`-^;C*^}x}~e@cO!Fik+HEceam$F!;1yS)8do43B;Qqg8!Z3;sh#eDOl-KNns4HB%ezAAoL?clyt) zG;H~5)ODrw1YV}ky*6S(6Dk@(ZD8T-Ylz#_jn{k|jx$gCkLIsLLQZA$Y_ zLvC%0ATH8B=B8F9P3=f5w<@4&T~kyDnhBBs+8ew+6?~ zd#a{o#Vv;%VPq*iF3>MugWQIHC*@+k$;h<9c@4?=ijX%`mcET1kyIO{KB3=Y=&rbN)0-(E3sncmpI=_TP(>ZBF}W%V zAKmi->Iv9oKVjVrw`diA#M@*hYXER)tpC$ahE6<72zH4x>YiI3g2uf1g4@7iZ$EnC zk(8k0X|#3iXY?}|o3@bSjPIw?@B;jt$fcE6IG5`qzs>T}1QU~vy4q;I^fYCkM|Px% zqGK%v9PiJ(I$h-paKB3;=~^O6_gIecyV75-|9zT)ys^?a`ZLWQ5JTvfXsz8@eiAIR zV!xA`7DN}V$U4Q@n`_1d1Hq|&j(2?+taFYi6Y(U{3FyR?dY+5nHii!SedKdAk2QvC zi#(~n_r6cre)iMGZpS6QkoRFEZwK~-i)fM1fGGi9jw1EeyXGhfx;&6PnW|u zUlpgR3oD-I%;#(fa7>DGDP)+6yGj^(Yt@vx@S+!*e;RY1B3TJpVbjn$c(AlSPDYD3GE{!Ft{>Ig@2KM{zo@J zoYByC1ncCA7*fNlY$*#FawWl&BqNPzWY8l8EF<+^o1S$=F#EW{Cj}>A`6?tGyo8)C zYB(J5c**rmcLhyaqeJn8Kk>NQ$b*E|-)tr}E=_JU6Xi4q-MNLk6$_PlanB5b7@}=e z3uY@PD%J+O0WIud^UP%%CR6`Q=pbn|^nxYQPokkA{JxOYQ* zNAlkX!vhPUOfKE;ntJLVIomtjeS3WtigcvCwWgM)U&!>$f&if^X&a@xwWBZc;liQ533YL z9*p|CpKVdJ@W>R6E_vZ!aLNtsRRp0{ggSS2Qy108_fEsQ`DxR`pOOA_2WO#x(2rwA zgpj%4DV~u961gYc^tQ`aU3x1$)cz!RhzKSfqp4C`Ue~tDtbaS0bD>ovylv#@2{pE( ztV??8Xc`#EeXaKG-sW#WCGQ3%DBu1EenBFTYcTjWCZY`P9T?RZB=Uzf;5`-tZV3vP z_L9E8JpcMj{tf>rDzT0(gLmw%VHSM!A#O!*$bLgA_&@`1yTLb(R047$lehKQ6j^`b}pXuWEa>wh_NcvtkQsZRL6_1JA) crc3TKz7~P4PLFXy4=0iOBW>jpMT@}y11`C-kN^Mx literal 16329 zcmch;Wl&t*yY>meT?35;hY*5$APqF`fg}XCMuS73Ay{ySU_lxQ!9oHAcWB(*U4y$z 
z)9^OWbI#OE)zq2!*Ss|!y1Le`-o5s{_LASau6u>QQCA?qrN%`;K_O65l+{8(K}|z` zUSeY+uiPfN;UT|JU9}WIC?!L*JIDgMm9&~P3QAcl-km81vW(-TsOyS?LfG;6K~3T$ zq(MQEG*Xh4e&=Cykb##-+v$0vqGtnPJy6zT3}TTAEAXI>!T`{sVY-kdM7BEhVze=L z7(8PGkkW&C$_ANf$<{hh$y%K3IfZA4T~Wa$jKU^owJFI_Wu5a9=ER^L5ND;HP1RV* z;r&NA&17}G)o05B2*mv!=DPb%LqpBY&F$*y=(c%xhYxz|1j1&NRX3+dD^00X!e-os z@`1Z#RS+GLp^4%L*1IV?*jO~Jb%MV#JjYi3 z@F7{h!M#|6P1svQ3@4fxG=a5AEu*Mv&Ze5O!ls_S?FyXlVWXYQ#f`4(Cr%w2QlB0j zU7ae@U7sq}-|Dnl&{UAi3r5~XM)S;jB-VsD`P(;&>AAUGHnp_Bl295h@-+W-C}0x^ zMsHWdjN=<60by%q@oqD$ZkX|s5*2nRk*7I*1#=6M(Y#>AVGR1I0r>PC%cEulv(ea8 zl=bkr3f*XsBtz=T%)S6)5b;I$jueMKUqt;7Ls5&?+_54yW5+fY0k1z!Fk5sy2u)k5 zu9x-%Vp_BKBfbg`u1IZ9+h~_6sok*TBvWa5J+pNWuMrJx>$ozr zwoVu7-vZOn_;6#!23dZM%HUeP~sK2t2T@Z@&mk@K!mo$QjKipXl9-pnQ z#}bJqzvOy#yZtkWxGFS=4Qs0DGR^DeZ$cB}p7)6y%%O{;;MxG$bqR3Ph8jKYPD&t) zuS6VnCpt6!g2mq*30W-QH1bF z3GYznR&KQ}-1H$B55W(s@To!Jo=>~n(P+5gX?*iD8K}JL^*dpiI%xv!1Q=8B@e)$l z{Qb}RmeQQF;+!#@pnLR|t=TV8B~sE-3~QUHIcG@J{jl88d0%`NugtZ#TFLLYF5Z=c zpt+{}hXjkDSo^l+BGmDALkYgW?usTOMO)wuwf`YB#IycUrM{<+!x8vUtRXRa;C?pf z&^9CvAC|haKHNyAj8d%nPEaBF;Asr=ZyDRHHC_3UM^MQb^>Cbe;f!YLuZwT6*A3iE zzA+rHp1-xgTM-`0Jh-gi;})^`(}a65rclXQtPbaTr$v;t?vu`*cWd@gkhQcXQMO)s zBts-;d#f~O9JpTFUp0LTrd*cn>x|_ve~tAG`kd{2Hn-{O(zEPn33ecyg?2GlXFfvj zZRukmrF>yb8pb|NB@+F|_>*R(wmGs_q1zeMjcpCOZr#$>it?iGEqSyzxetk97~hT2 zIG(c$P@+Im*$`D#i$=A&y0@O^Hom{ zn7+xWMwb8G{3y$iVB+`6_kz>-n~x`a_jiutEgx8>=>(znTOizJcXqRK+*^I#-dG!W z{KdH2@%j$VS#q4E;xAp@aLUWNNKU?oHx;_oy9NyG_Y6Q`$zVT0RGk7$<|>^hU%-Z% z(PC_7N?e9e{m`Z$+%n#Y)=a@t!50EdYvOaKOI$VAZgvTms6Gw|4ZoQ>LBtH0Ou;yB zh&51nK;mp5pl7C9&YeZD9X*Fe-9Y{LxPz5?4LEzXm*>^E!JP zJE-c9@59CFw*l+YufsML(6f2mxKX=fH0c!0UnqbCg6s^MUg_<^^w*z=9#4L#sr9N7 zuXO6Ot=ot`%u~t98+4MMUA&3vDaEG{Xs&&+b^xa9$WkdykpifcmfNHd=lSzR)1V1_ zCMYfj4uh(8Ig%@Nt&}iJbc}gj7}U*9Shco?A`&WL2Sk$lXPGfE0jMJTjnqyt-#iH$ zy$}~bk!(ydI(cc668;pI8B^fHN;BM>aPv@^Ci)YX(f%1rjZUF!_aQoae(*nzak(~(C#EQ5PD(6`O?quliM1SV%%Z}hxqo5UD}JQeNi}p2R~D;5 
zEF&^R0!~sAZXRsIiLyM}wvZ3dVdMPu>DAKSS@x5_XbYEY$!^x}6_})95Z3?i@&eECj{h)_AC$G=renbW#d`z7?C`z6eH}ZGD z2(o;@^Og4tPI}`)beg zwU>3YV9sQBH55)0PC)O?yP}>M>el|{`c2vWBE%=gxf)99GURW@m-g|^p%_YSqbOzV zNm=+4_p^2a?B!LG?}QH3Yd&))`xQW|j#f95dHk!?SD3CZt%(l)iVioO_O%JiB8T&P zCVcNTV3`$8t!QmUbS$mW>%EtL)BPCox^ON^JW3@A!Z!|{fy)fS5?H1ig@Nu(o+q7} zg?2N-?Lj9fmqe>;W-LGxZWfnF%Ag|OQ>830LlJl1BsD-Lzvr`btlGqP$F|zRH>I`1 zU%@(J^rBOh?KCURm(BxcxeKl>rEv`Q_zJ;aoV4h-ilU7`fT~%71j=M<=qeDV8w{m% zV6vt;2$PItQD)AjVo{>x**gw(rL1&5G5VvhDj zDx*(%GMzvlQH0dB(K`+}}AiStAK<)!;0$<^1_FA!R#lO$0vlNe}4QSkC&C-z{`XnxLd zzdJ$0XFcr8D5>j;=)0O<)* zT4M_WlezjY5ZuufmU1eyI1ldx%^<-5-*raj%qx@V7p zovmu9+!WYpdMR3eq+J5`G>Ck>aqDp&iz4O8SVM51W%)Hb;H$^CZdIY(g|qWi{7$}_ znTHD2Jg#<;b0E;$b`x`6*~{CdQB0oU@(;`j*MZ7L5}m*!(z>_1!KL&@;#=u$k%Jzw zKCh4OubQ{cz9Fuz>V9&u2nwn0Nn_nKx{fks#kzaE-T577O0bO3a^(H&W0`Dtr${kc z9LiAF6rH45yfkb4_j$(D)NG5m=ZuxmR+s%#_CwSC^7GYgi!pui$~wVZ++X1zIld)&SJrb_X_ zNk#oOUrud`kP3QX?_>XAt90-3VYvf_>_XENdrX@XDtzoeuAAH=CZ1hN>^I*(Q#6`- z-h0xMJr|Km$8x`V)(j7ZoOWL&h+^(t`fO(-2M2GWw*r+3>^gljqP%H66Y)dMbg?qo{m5Gu(U`39Qtk> z)lY2MD?HnA)f88>mgg`i0uX!OT%Wg#V#%j{|NQOCE%&t1UNml5oSXvv!QP6T@r1P5 z;#uf3>ZBo}DB2L`A=c#UVLHC-zg*u|ef4Pg%vXJx-T{<#e7otI_XW;2>xk(67XGN1 zE!8w{z)7bGE>3AK8_qf@@i0a3?4Z5{JYs}X+H-@is^cs^wJ3gB@2?IA&qjC6Y+5eo zmpaVZI2YG_Fj(zJMTj9u3Bm89K>sEaxMK)g`V%WKb}KzWi0r4Ghx>KB2`QDsKnzA~ z1&I!^0Y$6SRsox%CZFc>d?>C1=h6!$IzY076)Gd?ruOP&&w^Nm+U3n;#ZZK@*p;k4 z)GyMaHmlOPJ>scv-9-zU521dR$B$u>Z^_J*1NGBJXJ&PFtxq1r2z_u=Ze^1{`#0;% zOp5?^ie{>JRAK_98+5UXv1nJ$M5l;YXo8)*{ro21YvOFJ3s-U!&1(=G&P!U)x$9@4 z{Q!gHmUXFa|JFUfq_nC<4maqc1f=BOzThnE22oH!YbEe!Y?TU?j|lR5GM4EBC^R9j zTlaO@90)KKtVOH#Q!GW`FY zaGKDl)+@^}OP@nm49Y}^k1X)!spM_k@EHTJ>84Q36*uSWS6*SwzF;sS$^2zPX)UHb z74GM^4zz8)Qr1-dtl?ZCX;hwUxHeH4qN7_%0niS`9V|^Ka&#qb#1YX@DpOyN$)G*e zsGU?kxt#m}r)75V6A0jI|Go~5UDhNLlAzNVzz`wLp^Fsy8Xxr|8fEZ8Cn8{;!}QZT zj&m;$+O{D0Z>^V`ZPx4y@>+Mj@(j9Yd(7GNMa4GE8hkEsBNNYU4hhiO`wi{CeevV- zxTPh2x{HoJ(BSandeEtSu(04ZJ%uDEBOXDDqZaS423@j)XRxRX^Y;YUdqKxFt~`aZ 
z?I~PMD-#3>Bs%0Pluig~nmVXB5N~3j>QVT}StUvhU9tX{Yemwr^@*fPuapNMb)}ss z8$ai1zHBIOFomh-*;?e{x=&Y7gId6mjlkf3YLpYS<&*YQ#lc6jhX|?fbFxY4qH<%4 zHHajf63IOge9nqm>~eNfKb=?b#bp}L?qVbL8AGo!wa7gWM}KQGaPI(;y481d&7)zjM! zTVwziBRmTTecm)2G5{E^pDY^o(yh$##6B0nhO-6|ewvaJuQpLi(I}In zy;0EGBL-{OPnC_Fd{jL6jbpDo+ zQOrxPN#`gRBxLN88~ zaJ`c!@>#+>8J?~BmSs5_yGB3IhT06CCxqj|*00WEUkB6krz1>66zu*adcBL1=(5{o z*5e+TpWplZ`Lh9JS?M2~aAqtq*qNK3H&jpv-P_%D5l?rPf2W5}Sxa4u_xDL>@q2qf zF$0Ui!d*8Yq-&*ZE0(Jc+NqOQH}<7`;l}J;iWr-0-HH~tGLV?*tC@$VZR?Zig@t`R zeSJfHeJE7eJ`FktVICP9TT4qzGmw!9q!D*N6P0I9qYe=lca`n{GfgtwHQd*V2Zk~X z5@>%+EFYh4-t7F!4h#N??)Am>BpJ5vs;Q40Zt0Q>o(03>1_put923~+4*c@Mo89ar zSSGJ?GIFi-@#O_-XS5f*j=qq@5$mqfn*ALJ)7g4s&y-#0d~D&0I`;s1PbRO5B)kB6 zq0Yc|_tCRs6Tau|s#RuYEV6w2jxkH{%Nif)<;nj)%T1}lmIEn# zz(;M&QI)9tBZw^!jh{qAW-WN6oQQ@|5yr{VAH399D(+Xr2x5-(>`ZYr^6#Ij@s{1@E8ezNR z8KsRTgKC?6y(?u8(ma9H2%#U5VxFgEcr>)kt~V)q;q6{pEC3@;#X zI&xUJoP$Hq1k}!%aZojcjo;74|JYd}<`m!C3pkv}EA2*$zfHL$E5zP4i` zkAEQ;79MSuf+7{g7B!witYn95Cm=5>qk;i36&vb|EQ)JRr-D#82>c}ACj-B#jb(&@ zn7}Zo>`}9noU!hb4Y3(yfqHh`UjJl!Zw(lnnXt^v(yg$J*1#rYhB`I@nO|@r|1wk5 zHhIGrJ+S2%pMc0Io>_QP884XaA`6w6CcbV6FqpFP>C-3KyvOFE`TgO~rCv+m-6gRf z02Z6@Tu`{DEH1ik1hAMwqlN@rp8(9SBSA0A;PK4V@!`=pgh)a8@nQiDdQ3ZnI+_DA%o9JW>I6vi&J{+kJvehr5$l2$hX?_^idTJGx-0n z5JuA9>Brx_=suMRjIB7jgMBe|DJpF~kb7(&~QfI_Bf{~*o}tPhR@od%#{qfPa8#qjQj z22s7e1(JrEy0?0z+?d7qQeKXA9o z)j#*vQ(xSYOy;A{R_mina(;I&-Veah=BP;TFMCC_8)$sXI(R~JtH!6$=9TxZ%n(c; z6LPvL_=>(T7L4j zwP0^_meFS%8u|ViceCGidvGy(xYP+t*gwzv>#96IW*4T?SeoY`R=FC@aAci8$*E?)GX%fA3FfGk#Q| z;rPMJ==8`Gf@}`(uM+I@F1++$VYkvC=3mN7@)&`ObAm&WxNXqntoytu6(h?#?j9zue1kh&l-)Q#7-+6~iPJv%56x{(NfZKZK8 z#+O}W@cEo@Y4UO^qNHP{`%P=LJ$VS6C;SR%a5Rs@PG1{6-0v?+Tq14Ywt<@jKqy+v zzSzW+c}KY9>0+AOlEI0IA*06~h6@>7{TXPn%dRPfs#6E0S7+p zM8BBb$rXcdsg0c|zyu->E9j7ZZo>>Y$YY}d?Bo6DcWl1P5y`1)@3!rqy74{@8sy6t z#f}e`tK$>C{n|_AblWQ6!yUw!O~TrGY!Vh8yaKPGQUz!=cv*CjEYD`R0(eZQ0KF!q zg^^T>m#@pv@eTN<)q4zKw}JEEDHfP2^viX(y1t0lYMI+cskIVp4?8=;btNXj=YBvR z!TnR~O{R~jx!Zxvk*sYfrTj}1<{=hhO)@;T4~cj=uVa{V`mdjBPl%9GYmy%Jom@a$ 
zOnL@iKh5%dk8ki6z~oEc#Tmw$;tzN9Qa|(LyFPFuf~~U&3Dy4bb+ zxoMdvNY~?f!=0xiYRO5Q5Ys^Cq?w8DRS`Or@f&9MBpc@4&QhM7B%ufi+dJKL2eWhL zY?45;x!hfK@aAmF>e|EI$GYLvF1wlM`%?wqZJIRyXkm=DP8 zdHZ{CsErrud_Nn>>0`u=C@bLRF<|$$+cWh_Aptukp!i;DRr@eLvoJGFVC-8R{Q#HK z&Y?ynrH;d+&lulzmF>hY!nhj-e!Bc((~)Re&+iu{wlG6(j4yygsU_;1ZICi&caoOv zn-XWztfq!_FEGUR$$WLm_)5|^2>!=__yv^Tfr)*W5-0lSi_J9C&pgt$=s~a$R7=6s z(bpe`2c;eQ7=!_NaV-tMg&UHM^y-E6ex>sofrlT0Ou~wJ+TJ;|(3|4;4O@|gU(+X8 zqWW8OndW&P^`5TjnVlm&zF-t@3F2V^j=Zpk5CGmG6Fw)yOUJuxKNgvEG;ruBxJh~e zGms(+n7-A4k`Iy@1J325Tr%#2`@*9qO>>Kh)`ha?d5uc6#$w1ycw#WRS=7wO=IoAX zzta77TSSXhCT77)hfk=40g(*yO$lL5z?lJ+3g$>s>c2j}XL1#}EvX zQD)M#Zfj%XUGI*CJj!`Up^yC$qI!dH$Qqvs3JZU>f6gi@=VNmB)tpIY1IdXl#l^*% zNHitBkXKQO)IieXIU12*04mPsxQ-MxAK88koyq^+FhFV^_XoDqB^Ne`+Y5*G5Q2NJ zKGr*=&Din0L!J{GHwQrT1<9Iye62W-6o!V4W3%2BZDgm&#*ait|3{5)7mbW4sKe}X z6C@x)z1*oFTnK9aNctmSCMf)Z?Gd?ttk4DO?4CuP+2m4*5E|;)@fI9L4jWQ+0r_Xp1SrQqk~tk zUNz1z0R286(rMZ}wrmHQ2tq?LK-%2H$@TPaFHpg*O*2~aJPQ?2#vmGHVrL#W!dw5Z zt$r^bM)Oe5vv%R!4laXT+efRRe-mwCW;L{fiKrc<8>c7GEDjF|2BvO@EiI4rLNg2 z#yH$hAc~!`+bqY6ymU;b@ex1esCEQRh?Ayi_9zN{j*edU2HJL!XVGBy zv2Hy($V43II}F5F-F})z!&@vMTxxGR9{>GAYs1v1^l}5b=pa56HMOZQ*PM5<(=NF- zFK?zBHX0RVNv>5g${9k{-YA70gIJHPk1T0iq&tqI!(&bhyqxc)yWRTGkg;iZpdh{> zl_7q_+s>n5?eJ;Zcf%ZJ!1;G^%Laeiy>nXu#2gWj&9>3`)Xz0t|Mva`r2in+!{7UE zc6x#C>h%Gzdf4DfiH&DZp7CD~x2CbeGjZ zhsoV42Wazh|7+HcW5(=4W~azTM8DrfNnW?FN1^S*AiF#E53YZc*)azSwc_p`etFz7a@6n zVCD0y+f1zVxI6jA;+7*lOe$ladGEl>H<<#hVYP4k&K#4o-!WlL)?0!9boWf_V7T0H zhzPh0TXFm$1T}5&R>)wq>mCxB$FwJe+m~*|{v==#bVGND>pEF)95wmN>vk#&%S<13 zF5|$!+`obUaq>E-w$J8f0J_}Bb$bTtN9@7irO>9qiGUMn4~K>=B_4{RdDrRL!*BES z7t0o{=o_O2#us82ft_}1Iudx`&<4V7c6mK1n#1}XZ&>W6Hd8?}kD*i41Fg$ap+fC& zmV1z+O8W)r+eVJbdrhxC`JH5tRELyX4f6)=1#0Yyo}&2K9YFvw6L>&Ee3d{>}m znxoryVXO1|eiPCC9M*o8&b7EP(McMOBciUMTto1g=kC)7F-`!-g2k5KhW111?eaCk z#jz)XiaUyqk8Vzbrlk3V-S>4@icc=#yn@T67|yNmOIsiR?9NgO=2+nH*!|1-V)9$C zi%Nwf{YAzZ(Xw03!?n$Wq0kV$CXJ%?E^)xYz;;`e?5q0)zpK6WhCVG5LB7nR#+tEd zi!T3LUh1NP<`K2{u3qEo{Dw{F6;XFENO9a#?b>3Fx%@r^0g;F%|B 
z*umYf!Rq{Q^H*zec$m}ZKMzHPMi=_oUF$m59SX^vSK4EKVsFSm+sZ3#mLwgikS6c_8YEfv>+oC_UhenRy57a+zqEb@G zv+9bSY@>PKZ~R$acMbeJ<>m%298)SItv$Q9LfYWoyDN)6_lM=~%L=-Rzq)-J;6oMO z|6OO`^$q4E(66IM9>k2vJJWC|dc^g>tJg<~j?ResWXiccI5NciZB~Veb6OI;`>l7W zj*lnU$LQda2usH8KFm9ABuv`>caJ=yQepaapNW(dalDqd*?q~VmQ6|u%6Pu&>5U?z zZC32WWoT65nZn=5Ao+K)cO{;+jLm4OOp5$E=&7j5s@hj8v$*nDWK%pl&>-cy z@60xkpSxi`l>;&Jn+jCF6z$K*$FN%;PAnY|;5gZOar$J{q?*b(rLvZg4;RYf)Jmmr z+eqFKjh#CWOp}Teun;*y(AZ^tn7hArUtMK4Iu)xp+oxjQHHB(BibdrjpCvX(p1dq=Xy+>Ra2*o&Sq-jwd+$))3%e%dB0;%>N^{D+QtWN;BXt!p-@*)Eb$Fybz!-hWV^7vPu)MkBD0dn#}=DLw@##9P`Yuf z!AyVIh#HC-Y?hn{ojDx)x*w(!;1dn!9uLTdRsEjZr+fX7Ah^yMG^=rr3K@0BGW?|$ z^_3fTsepC~eRb_UZZ8=-Wj-$wnF_G!&PnshtU|o8YiJQPueL2!Ge|NHx4h|uf1kL~ zK5Pz~vlz$+{@8fZ)YMc`P*8BxvE!bRk>N1;>%GH9Us6d^)1up_Pqif_+>kB{SeP;% za%p3yFmO+?b#QP;Qc6j2@t_)1cu+0l)nrv=Wo>>w+w{oDN>|LHM9gT|f5KV+ljRr! zSVFHuEa@GaKSs?qh% zd(Fa6*G*J^1()tOV7L62OeJxBCU8*LXbdthLt55@Qh5xoehmLJ>q;8)2F4)_{BWIf z_=D5&>d%|%1`ng1esLA|w2CTNI}awPUtF;|6*vn!;3MpUPII3*#8=i>#-RTir36JG zt$LZBS$6u(Gr2$9C!R)+fA@c^cn@nY&`(dSj9noMSCxL8On+2;cOWW@di4ati&zf4%;@=FGe-TDRX$d=5=(IC5O51WGUEYz^8<)ZrvI^i1)^0Tfa_3(zFRT~}{B=%F=oG|vT@Tv(QXfXAW5{AAt7|CH81Hcz zRXEDNb+o1gKYBgwmpQ{u+W*1aEA|c}*>Cx(*QmjMuV%Oh6FgYIK8(2fIUzOUXeK^& zspSu8#_O+G-uDxY;P2aUL`E|MzuQ4GC|d1!GMDhluC02mtRTZQJ=VwEn1g2OgSuZ? zdgW2akX-}I9`hVoXt`;|zOlqJo2|Smq2l4}bnr(X96UeoRAWn;&ReN>l*mv_L(Jsh z)LB-+;&9~pc!v>+%GQPU*IT`TczgxqeSgo{Xj)TmvPEZE+t2fg`|S$i_6WFd=_pWT z@|?H2X7MbSQfumArxYcvgG5O!Kyt1eGw_3xiOS zb*7w>oZ&l>2fB;d=X;4@$-^_VH&CkrpMV4+|AsfrSQ<3Od6`+v? 
zHcXQ*NRc#qec6Ra$sQsx_)ImQ<&dr zeFp1uee={-tyfcWZo%)8jo~>U-QAL@v>CB$|G>Vs!BP6PRVC4-x<+Hs)9$$qN z^op*Mn@t3orC!GI&W*1gk6V(8s5Fxe3mU)^AO9k>3#w}){Gxs(irI&d4YyJty{^k~ z)^GQRWc&ix#yLa?{-nK+An<^`dwU=fc93ji8?TR>U2$+MKG)BJ5yjp8=}7WsjG~*G z1VjE!mrd5|j;%1TC?c~N>e&(;?2(EH38y~st?H#Ym*e~Cm)y=14zv#$4W;v&Q%&Vn z7fAs}tPDL2JZ<*ftp<|ZAX8?{un9kaJXX;t@#N9txqq<`F4R1%ZGi*>0c=w;QyfzO z6ny{1_hwOTk&>JM4a-bZmMsIM|{m5ZwEQ)sDKye;l zS{nU=`1q#IaMDQc0wKa(5Dw1YyUVclUjx25i*m;_`S_iBnf2v2I%rSmB?o@Moy|7 zKR(<^|)*KZlDF_O2zB#C>Nq$W#NM~ z?2L}}gnsuEOUYjL+0q5ZCQQ@_{}^}$%KwuPgUWG8A!7&FAW>Pp1OEjXa4t|VlP-9} z>3mWlh9yVySn|Jwd%=}-tUIw@MNGKFO~q_b^pYEFrrgebh&Lyg|})fsk? zd5;xSoZhL`b3^+r*kd5LS~Y8GH(7FDXUlbtJa#z+o3EVjm#3-?*6{dO*=9xS5NvgC zyzM!;EJ|V4L@P?{ku@@rlKOj~--j@1a1(~Ju4UgxK*u0i)X*yZ@*m_l^%@8f@E9WBaZNe--IUi0_BS4E;)E94o_kCe*BOqi2tze%BDg7B!D~h&Mpr`7_*XGDmRoRl)-p0WzE-m36$qldNTl!HK z|D`3P>GAP#$d`IUE2{$is*g$fbuM|fj*bvII=YhIzxy+sXL){%F>&Hw)Zwk0seTTT zn&8y$yIoMVoLVE89AM)5g90v&B9laRjHcXYnEH!>0m2S1?7{D#rW$ zqjIZ1kyCa!Jk>u`uh$CoT7#G~XN-;RD2}pp1@f;O)346o5V_59hA}q3Cgs^HKjU0l z_O?VQiXU}Ic?6^Sakw4dH^|z`l9Hy8=2D$2uap4RNt)$K=e&0UzzfV8O=}MV{YHYF zua-2&n@n>aDNUX=VYO|QKHoE(yp0@-ajjGPj*;;}-q1G1sl(LI^JM<)$~dk+a);R4 z%a~9s>t@W3N z&#Q8_3$J+|zCQ@{|*9HPMM-B%!^9^GBP|*Y7qQKc+?*~j2|)^hA#WX z2WRJA-=6>W)s6CQhFtOZG#lE^2GC^cDTR9;@V5jEb;=E$7g z5`03VRj7nLVT(}wadb_4ME1M@#>smB>#wLCbGo@SP5j!sq>c*@2#{&W7bNDOH2(|{ z0%zNQ8>3KoUztJ*f0}7~qQt6gTz1AMAwiN`agxj zd>D6A`x;{y!>`aqXkU*&kmn&5QZ`1Hod+1kjh*$_h2Yl&^JY*fnwhOy#y>123R(OS z<_{+~$$|D;9XUi!B7PT9s6Krs-zC>i0!t;s)llJ82Lr;eC`KzSrCOMj&$%Qll1x&# zg?~$GMq$GKptAlzw0v-6rD7ne#3`$2Sw=JqiSl)nlET;)jnRo!_Q~@aA)ci1vWAM* zm;~(4F#~k4eLWjEHv=m1LLeXvB_gUZt%#W4B`_1I4e1G!CVJrEZY)8GaNipmE3S_(ts6)O2Fm{U#(^ z(<0Taa`raNyo{%Px7y_dJS;4OG!wHP1Yn2A-5uvd?*-XZZ}q_(&fdB!_B3D%XI}l4 z#;m#+f~EX1LE3<}f!0@P^~qnB%E&J%O4^Uo7!2k5H8U9+Wdv0--jL|nR_96$dpaZ# zDr^xCL$mtnH@KX&{f*mrV_uh=Vmqj^h9oW9B0y^UG@DD<1|?gdNTMqcQT2I8X30CO z?C%ea&x>wVw={v^5;Dsz=j49j!pNHMU8^$;GXs6d3_zs(i#Yu|g#+ZlK=_;Dt?P%~ 
zJnkZr3!z5`*BT4!v@k#2)>5rxM5eIbLO4Q=)ZsQk36B=7FJT*kJdDwDQreu4f3R9B z{k*im=aM)}V66)HWC&}?+y-vv=Boad+jrj|@Sg8y!%mc zV#PSm5vXWM1|!k1ZA-z|Od~7OAK6iQGp;0Qxa_;sFBc)18)q+_x(6_<-^RC_LmqSu z`P@g%e)1aF!!wU~a$Q<^`Oc#7h;`KYwJojNXA)Sa{OcH676@%1!S177i)3$#?_&Zr zHweWKE}CoQ2fol)7CHTnMz2i`Yp*i_4Ggr4Dj{qzQ*m(rC1D~A_utQjJrEH3OHXQd zRk4EgEnJ*hLi{0;A<4e=OXY}gMz(!sCh5lDQJXsnIf>dXKz8UF*lA!8k@TrOenNu@ zDY?@s@h1kF{#<7FyApY*ce=81Aeegp#6i4$;mt_{s2yLoE^&MrJF-q)F$%p+{KTac zK~;Es_f9P3OXF#m7m^Llf2z=pyOh$azw$ z5u|zWCP@%9w319BPrwAMIB*L3p1g~pu_#R$&JbQzHp{2NWH^KpW&0={umwAa3$WPm zJ*}hvrLulKnA7`mMooGSn{Vej`BlDio2a@3A1RS3OYaTXLA(RK6{izv*5`QK69m>z zUmc+3FhFI&CuL?i!*%m&vETZT!r^w(V2NcJQbNG?$4hExQ%0Mc3`w;-Hjd>!rR3vC z52i2+**RZ_BhK@5Zhc{qp;C^d_12t$Fsg|58*_o8czGtJl)YoJGsuM}Z3BtUv%0@6 z3+j18s5D7XuM%3oBCku`RjV<4*w1Bq2}qQE5T&Y7cWn7Eo2dpW+NV|_RW`7{v~D?& z^qt#EYN_~Wrh+`@)Pj^xwp^pW+{6!t@m$YsVJ;MEhV1m!;)UQ=hdQ@4k=G~G_SjSC ziAL^EdEL_Mfb(^>>Djkj?{o>FhMjcm^W~@eoWoozOs3+dE!v48ss86#YNL6QVp%9P zTG}s%+MG@FcrH-7aUhAE_^|(CWvvci%J z8tGpPPUx)I?4dQ zS}xUQy); z#+aP zH@g)RSb3Q~sN$LOM&0@3=`yPVY?b}D0?%@JsvaHzdicl8iMfyHn9%;+hA4Gqu-i13 z2d|z$>0#-SD8$-IwCV2G-Vx24p}Cfxo*|OHimX-l!bmHNHusQe9W9z0ZfzOTrm$YI~`&MznL3w2WK^&p}-N9kzZ`trz8gAfRHR z5~c8iv{d=%9uv&oVUy82F^4{UXRJo1^T)S{al>nO_Dbj`g^Bv<``Bx?DHr)QgM@xB|S&1xS2lQ*CgLBo- zR`D*B;GZNpvj+=H&rT3pcF&ttWIm+NM?1U6v@>L`;|R(a(htWCj5RYurr~L)q~rdK zQogAf3MXT~{V$I^h1WFTp~jMomJC|E!qUw4<$dD|6Qo7$@aShhWN&hzorJ08iewf!*=E2r()X3xaC`xkbvLzssfd2)E C Date: Fri, 1 Dec 2017 19:26:01 +0800 Subject: [PATCH 160/275] Fix grpc compile warning (#6050) * fix grpc compile warn * update * -Wnon-virtual-dtor -> -Wno-non-virtual-dtor --- cmake/generic.cmake | 4 ++-- paddle/operators/CMakeLists.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 9cf256fb6d..66c8e3ad7e 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -505,12 +505,12 @@ function(grpc_library TARGET_NAME) set_source_files_properties( ${grpc_grpc_srcs} PROPERTIES - 
COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") set_source_files_properties( ${grpc_library_SRCS} PROPERTIES - COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") endfunction() diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 937441b318..8187af9374 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -217,13 +217,13 @@ op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_uns set_source_files_properties( send_op.cc PROPERTIES - COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) set_source_files_properties( recv_op.cc PROPERTIES - COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cross_entropy_op DEPS cross_entropy) From fac96456c241a1a34975453d1db0b8418446fe2c Mon Sep 17 00:00:00 2001 From: xzl Date: Fri, 1 Dec 2017 20:41:34 +0800 Subject: [PATCH 161/275] add prelu neon impl --- paddle/math/Matrix.cpp | 23 +++++++++++++++++++- paddle/math/NEONFunctions.cpp | 40 +++++++++++++++++++++++++++++++++++ paddle/math/NEONFunctions.h | 1 + 3 files changed, 63 
insertions(+), 1 deletion(-) diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 88e9180690..be87a4c296 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -28,6 +28,7 @@ limitations under the License. */ #include "hl_top_k.h" #include "paddle/utils/Logging.h" +#include "NEONFunctions.h" #include "paddle/function/GemmFunctor.h" #include "paddle/utils/ThreadLocal.h" @@ -4157,16 +4158,36 @@ void CpuMatrix::print(std::ostream& os) const { void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) { real* input = data.getData(); real* w = W.getData(); + real* output = data_; size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t paraSize = W.getHeight() * W.getWidth(); CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + if (paraSize == numElements) { + for (size_t n = 0; n < numSamples * numElements; ++n) { + output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements]; + } + return; + } + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + for (size_t n = 0; n < numSamples; ++n) { + for (size_t i = 0; i < paraSize; i++) { + neon::prelu( + input + i * partial_sum, w[i], output + i * partial_sum, partial_sum); + } + input = input + numElements; + output = output + numElements; + } +#else for (size_t n = 0, k = 0; n < numSamples; ++n) { for (size_t i = 0; i < numElements; ++i, ++k) { - data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum]; + output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum]; } } +#endif } void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { diff --git a/paddle/math/NEONFunctions.cpp b/paddle/math/NEONFunctions.cpp index 3bf47901f1..0f83149422 100644 --- a/paddle/math/NEONFunctions.cpp +++ b/paddle/math/NEONFunctions.cpp @@ -49,6 +49,46 @@ void relu(const float* a, float* b, int len) { } } +// b[i] = a[i] > 0.0f ? 
a[i] : a[i] * w +void prelu(const float* a, float w, float* b, int len) { + int offset = len % 16; + float32x4_t ma0, ma1, ma2, ma3; + + float32x4_t zero = vdupq_n_f32(0.f); + float32x4_t vw = vdupq_n_f32(w); + + for (int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = vld1q_f32(a); + ma1 = vld1q_f32(a + 4); + ma2 = vld1q_f32(a + 8); + ma3 = vld1q_f32(a + 12); + + uint32x4_t flag0 = vcgtq_f32(ma0, zero); + uint32x4_t flag1 = vcgtq_f32(ma1, zero); + uint32x4_t flag2 = vcgtq_f32(ma2, zero); + uint32x4_t flag3 = vcgtq_f32(ma3, zero); + + float32x4_t mul0 = vmulq_f32(ma0, vw); + float32x4_t mul1 = vmulq_f32(ma1, vw); + float32x4_t mul2 = vmulq_f32(ma2, vw); + float32x4_t mul3 = vmulq_f32(ma3, vw); + + ma0 = vbslq_f32(flag0, ma0, mul0); + ma1 = vbslq_f32(flag1, ma1, mul1); + ma2 = vbslq_f32(flag2, ma2, mul2); + ma3 = vbslq_f32(flag3, ma3, mul3); + + vst1q_f32(b, ma0); + vst1q_f32(b + 4, ma1); + vst1q_f32(b + 8, ma2); + vst1q_f32(b + 12, ma3); + } + + for (int i = 0; i < offset; i++) { + b[i] = a[i] > 0.0f ? 
a[i] : a[i] * w; + } +} + } // namespace neon } // namespace paddle diff --git a/paddle/math/NEONFunctions.h b/paddle/math/NEONFunctions.h index 69085e3335..d67b2f47a8 100644 --- a/paddle/math/NEONFunctions.h +++ b/paddle/math/NEONFunctions.h @@ -18,6 +18,7 @@ namespace paddle { namespace neon { void relu(const float* a, float* b, int len); +void prelu(const float* a, float w, float* b, int len); } // namespace neon } // namespace paddle From 362b7d8a5e8d3ac09f99e449a876a315d7b0cf90 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Sat, 2 Dec 2017 09:39:26 +0800 Subject: [PATCH 162/275] Rename gserver_test2 to gserver_test_with_python --- paddle/gserver/tests/CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 6dbf5a01cb..b578a906c2 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -31,21 +31,21 @@ gserver_test(test_MaxPoolingWithMaskOutput) set(PYTHON_PATH ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests) -function(gserver_test2 TARGET) +function(gserver_test_with_python TARGET) add_unittest_without_exec(${TARGET} ${TARGET}.cpp) add_test(NAME ${TARGET} COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET} WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) endfunction() -gserver_test2(test_PyDataProvider2) +gserver_test_with_python(test_PyDataProvider2) if(WITH_PYTHON) - gserver_test2(test_PyDataProvider) + gserver_test_with_python(test_PyDataProvider) endif() if(NOT MOBILE_INFERENCE) - gserver_test2(test_CompareTwoNets) + gserver_test_with_python(test_CompareTwoNets) # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it. 
- gserver_test2(test_RecurrentGradientMachine) + gserver_test_with_python(test_RecurrentGradientMachine) endif() ########## test_MKLDNN layers and activations ########## From 3e8c3638dce3fb274a2914fe37eb92ed10723656 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Sat, 2 Dec 2017 10:00:13 +0800 Subject: [PATCH 163/275] add WITH_DOC for print_operators_doc in docker/build.sh --- paddle/scripts/docker/build.sh | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index fbd0b6b078..0f889e6853 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -185,7 +185,14 @@ EOF ${DOCKERFILE_GPU_ENV} ADD go/cmd/pserver/pserver /usr/bin/ ADD go/cmd/master/master /usr/bin/ - ADD paddle/pybind/print_operators_doc /usr/bin/ +EOF + + if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then + cat >> /paddle/build/Dockerfile <> /paddle/build/Dockerfile < Date: Sat, 2 Dec 2017 17:44:27 +0800 Subject: [PATCH 164/275] Fix ConvTransProjection bug. 1. Make ConvTransProjection support for dilation 2. 
Fix err config in Projection.conv unitest while deConv=true --- paddle/gserver/layers/ConvTransProjection.cpp | 4 ++-- paddle/gserver/tests/test_LayerGrad.cpp | 23 +++++++++++++++---- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp index 48132a3ce4..e7f081c023 100644 --- a/paddle/gserver/layers/ConvTransProjection.cpp +++ b/paddle/gserver/layers/ConvTransProjection.cpp @@ -24,13 +24,13 @@ size_t ConvTransProjection::calOutputSize() { if (outputH_ == 0) outputH_ = configOutH_; if (outputW_ == 0) outputW_ = configOutW_; imageH_ = imageSize(outputH_, - filterH_, + (filterH_ - 1) * dilationH_ + 1, paddingH_, strideH_, /* caffeMode */ true); imageW_ = imageSize(outputW_, - filterW_, + (filterW_ - 1) * dilationW_ + 1, paddingW_, strideW_, /* caffeMode */ true); diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index c5359f272b..f8b36cb386 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -238,9 +238,24 @@ void testProjectionConv(size_t groups, bool isDeconv) { /* caffeMode */ true); conv->set_output_x(output_x); conv->set_output_y(output_y); + LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x + << "; output_y: " << output_y; if (isDeconv) { + int deconv_image_x = imageSize(output_x, + (conv->filter_size() - 1) * DILATION + 1, + conv->padding(), + conv->stride(), + /* caffeMode */ true); + int deconv_image_y = imageSize(output_y, + (conv->filter_size_y() - 1) * DILATION + 1, + conv->padding_y(), + conv->stride_y(), + /* caffeMode */ true); + + LOG(INFO) << " deconv_image_x: " << deconv_image_x + << "; deconv_image_y: " << deconv_image_y; conf.set_input_size(output_x * output_y * CHANNELS); - conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS); + conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS); } else { 
conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); conf.set_output_size(output_x * output_y * NUM_FILTERS); @@ -260,11 +275,11 @@ void testProjectionConv(size_t groups, bool isDeconv) { #ifdef PADDLE_WITH_CUDA TEST(Projection, conv) { /// test ConvProjection - testProjectionConv(1, false); - testProjectionConv(3, false); + // testProjectionConv(1, false); + // testProjectionConv(3, false); /// test ConvTransProjection testProjectionConv(1, true); - testProjectionConv(3, true); + // testProjectionConv(3, true); } #endif From ea1a643425918fd39b4b61d1e8414003b06168fe Mon Sep 17 00:00:00 2001 From: Siddharth Goyal Date: Sat, 2 Dec 2017 21:46:43 -0800 Subject: [PATCH 165/275] Add hinge loss op (#5837) * Add hinge loss op * Update hinge-loss equation for proper latex --- paddle/operators/hinge_loss_op.cc | 113 ++++++++++++++++++ paddle/operators/hinge_loss_op.cu | 23 ++++ paddle/operators/hinge_loss_op.h | 69 +++++++++++ .../v2/fluid/tests/test_hinge_loss_op.py | 28 +++++ 4 files changed, 233 insertions(+) create mode 100644 paddle/operators/hinge_loss_op.cc create mode 100644 paddle/operators/hinge_loss_op.cu create mode 100644 paddle/operators/hinge_loss_op.h create mode 100644 python/paddle/v2/fluid/tests/test_hinge_loss_op.py diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc new file mode 100644 index 0000000000..1e13897bb6 --- /dev/null +++ b/paddle/operators/hinge_loss_op.cc @@ -0,0 +1,113 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/hinge_loss_op.h" + +namespace paddle { +namespace operators { + +class HingeLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) must be initialized."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto label_dims = ctx->GetInputDim("Labels"); + + PADDLE_ENFORCE_EQ(pred_dims, label_dims); + PADDLE_ENFORCE_EQ(pred_dims.size(), 2, + "The rank of Input(Logits) must be 2 and the shape is " + "[batch_size, 1]."); + PADDLE_ENFORCE_EQ(pred_dims[1], 1, + "Each row of Input(Logits) contains a real value, " + "so the 2nd dimension of Input(Logits) must be 1."); + + ctx->SetOutputDim("Loss", {pred_dims[0], 1}); + ctx->ShareLoD("Logits", "Loss"); + } +}; + +template +class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + HingeLossOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Logits", + "The input value (Logits) of Hinge loss op." + "Logits is a 2-D tensor with shape [batch_size, 1]."); + AddInput("Labels", + "The target value (Labels) of Hinge loss op." + "Labels is a 2-D tensor with shape [batch_size, 1]."); + AddOutput("Loss", + "The output tensor with shape [batch_size, 1] " + "which represents the hinge loss."); + AddComment(R"DOC( +HingeLoss Operator. + +Let x be a logit (prediction) and y be the actual label. The logit can +take any values from (-inf, inf), but the labels should be either -1 or 1. 
+Then, the hinge loss is computed as follows: + +$$ +L_(x, y) = max(1 - y.x, 0) +$$ + +Note that the labels passed as input will have values as either 0 or 1. + +)DOC"); + } +}; + +class HingeLossGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Logits"), + "Input(Logits) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Labels"), + "Input(Labels) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")), + "Input(Logits@GRAD) should not be null."); + + auto pred_dims = ctx->GetInputDim("Logits"); + auto lab_dims = ctx->GetInputDim("Labels"); + auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss")); + + PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims); + + auto pred_grad_name = framework::GradVarName("Logits"); + ctx->SetOutputDim(pred_grad_name, pred_dims); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, + hinge_loss_grad, ops::HingeLossGradOp); +REGISTER_OP_CPU_KERNEL(hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu new file mode 100644 index 0000000000..ec20b08e30 --- /dev/null +++ b/paddle/operators/hinge_loss_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU +#include "paddle/operators/hinge_loss_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_GPU_KERNEL( + hinge_loss_grad, + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.h b/paddle/operators/hinge_loss_op.h new file mode 100644 index 0000000000..c0be496f9c --- /dev/null +++ b/paddle/operators/hinge_loss_op.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include "paddle/framework/eigen.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class HingeLossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* loss = context.Output("Loss"); + auto place = context.GetEigenDevice(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + loss->mutable_data(context.GetPlace()); + auto l = framework::EigenVector::Flatten(*loss); + l.device(place) = + (static_cast(1) - x * (static_cast(2) * y - static_cast(1))) + .cwiseMax(static_cast(0)); + } +}; + +template +class HingeLossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* pred = context.Input("Logits"); + auto* label = context.Input("Labels"); + auto* dloss = + context.Input(framework::GradVarName("Loss")); + auto* dpred = + context.Output(framework::GradVarName("Logits")); + auto place = context.GetEigenDevice(); + + auto x = framework::EigenVector::Flatten(*pred); + auto y = framework::EigenVector::Flatten(*label); + auto dl = framework::EigenVector::Flatten(*dloss); + + if (dpred) { + dpred->mutable_data(context.GetPlace()); + auto dx = framework::EigenVector::Flatten(*dpred); + auto alt_labels = static_cast(2) * y - static_cast(1); + dx.device(place) = + dl * ((x * alt_labels) < static_cast(1)).template cast() * + (-alt_labels); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/fluid/tests/test_hinge_loss_op.py b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py new file mode 100644 index 0000000000..a8757a891f --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py @@ -0,0 +1,28 @@ +import unittest +import numpy as np +from op_test import OpTest + + 
+class TestHingeLossOp(OpTest): + def setUp(self): + self.op_type = 'hinge_loss' + samples_num = 64 + logits = np.random.uniform(-10, 10, (samples_num, 1)).astype('float32') + labels = np.random.randint(0, 2, (samples_num, 1)).astype('float32') + + self.inputs = { + 'Logits': logits, + 'Labels': labels, + } + loss = np.maximum(1.0 - (2 * labels - 1) * logits, 0) + self.outputs = {'Loss': loss} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Logits'], 'Loss', max_relative_error=0.008) + + +if __name__ == '__main__': + unittest.main() From e5b51c4d102ed180aef3940bd8e885c4bf5f9d95 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Sun, 3 Dec 2017 16:50:24 +0800 Subject: [PATCH 166/275] Make lstm_op follow google code style. --- paddle/operators/lstm_op.h | 70 +-- .../operators/math/detail/lstm_cpu_kernel.h | 426 +++++++++--------- .../operators/math/detail/lstm_gpu_kernel.h | 305 ++++++------- paddle/operators/math/detail/lstm_kernel.h | 128 +++--- paddle/operators/math/lstm_compute.cc | 36 +- paddle/operators/math/lstm_compute.h | 32 +- 6 files changed, 505 insertions(+), 492 deletions(-) diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h index 721aa42c92..a78f548aaf 100644 --- a/paddle/operators/lstm_op.h +++ b/paddle/operators/lstm_op.h @@ -73,15 +73,15 @@ class LSTMKernel : public framework::OpKernel { T* bias_data = const_cast(bias->data()); // the code style in LstmMetaValue will be updated later. 
- lstm_value.checkIg = bias_data + 4 * frame_size; - lstm_value.checkFg = lstm_value.checkIg + frame_size; - lstm_value.checkOg = lstm_value.checkFg + frame_size; + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; } else { - lstm_value.checkIg = nullptr; - lstm_value.checkFg = nullptr; - lstm_value.checkOg = nullptr; + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; } - lstm_value.prevStateValue = nullptr; + lstm_value.prev_state_value = nullptr; Tensor ordered_c0; const size_t* order = batch_gate->lod()[2].data(); if (cell_t0) { @@ -90,7 +90,7 @@ class LSTMKernel : public framework::OpKernel { // to reorder. ReorderInitState(device_ctx, *cell_t0, order, &ordered_c0, true); - lstm_value.prevStateValue = ordered_c0.data(); + lstm_value.prev_state_value = ordered_c0.data(); } // Use the local variable as here. @@ -140,14 +140,14 @@ class LSTMKernel : public framework::OpKernel { static_cast(1.0)); } - lstm_value.gateValue = gate_t.data(); - lstm_value.outputValue = out_t.data(); - lstm_value.stateValue = cell_t.data(); - lstm_value.stateActiveValue = cell_pre_act_t.data(); + lstm_value.gate_value = gate_t.data(); + lstm_value.output_value = out_t.data(); + lstm_value.state_value = cell_t.data(); + lstm_value.state_active_value = cell_pre_act_t.data(); math::LstmUnitFunctor::compute(device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, cell_act, cand_act); - lstm_value.prevStateValue = lstm_value.stateValue; + lstm_value.prev_state_value = lstm_value.state_value; } math::Batch2LoDTensorFunctor to_seq; @@ -214,13 +214,13 @@ class LSTMGradKernel : public framework::OpKernel { math::LstmMetaValue lstm_value; if (bias && ctx.Attr("use_peepholes")) { T* bias_data = const_cast(bias->data()); - lstm_value.checkIg = bias_data + 4 * frame_size; - lstm_value.checkFg = lstm_value.checkIg + frame_size; - 
lstm_value.checkOg = lstm_value.checkFg + frame_size; + lstm_value.check_ig = bias_data + 4 * frame_size; + lstm_value.check_fg = lstm_value.check_ig + frame_size; + lstm_value.check_og = lstm_value.check_fg + frame_size; } else { - lstm_value.checkIg = nullptr; - lstm_value.checkFg = nullptr; - lstm_value.checkOg = nullptr; + lstm_value.check_ig = nullptr; + lstm_value.check_fg = nullptr; + lstm_value.check_og = nullptr; } math::LstmMetaGrad lstm_grad; @@ -231,13 +231,13 @@ class LSTMGradKernel : public framework::OpKernel { } if (bias && bias_g && ctx.Attr("use_peepholes")) { T* bias_g_data = bias_g->data(); - lstm_grad.checkIgGrad = bias_g_data + 4 * frame_size; - lstm_grad.checkFgGrad = lstm_grad.checkIgGrad + frame_size; - lstm_grad.checkOgGrad = lstm_grad.checkFgGrad + frame_size; + lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size; + lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size; + lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size; } else { - lstm_grad.checkIgGrad = nullptr; - lstm_grad.checkFgGrad = nullptr; - lstm_grad.checkOgGrad = nullptr; + lstm_grad.check_ig_grad = nullptr; + lstm_grad.check_fg_grad = nullptr; + lstm_grad.check_og_grad = nullptr; } math::LoDTensor2BatchFunctor to_batch; @@ -276,26 +276,26 @@ class LSTMGradKernel : public framework::OpKernel { Tensor gate = batch_gate->Slice(bstart, bend); Tensor cell = batch_cell.Slice(bstart, bend); Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend); - lstm_value.gateValue = gate.data(); - lstm_value.stateValue = cell.data(); - lstm_value.stateActiveValue = cell_pre_act.data(); + lstm_value.gate_value = gate.data(); + lstm_value.state_value = cell.data(); + lstm_value.state_active_value = cell_pre_act.data(); Tensor out_g = batch_hidden_g.Slice(bstart, bend); Tensor gate_g = batch_gate_g.Slice(bstart, bend); Tensor cell_g = batch_cell_g.Slice(bstart, bend); - lstm_grad.stateGrad = cell_g.data(); - lstm_grad.gateGrad = gate_g.data(); - lstm_grad.outputGrad 
= out_g.data(); + lstm_grad.state_grad = cell_g.data(); + lstm_grad.gate_grad = gate_g.data(); + lstm_grad.output_grad = out_g.data(); if (n > 0) { int bstart_pre = static_cast(batch_starts[n - 1]); Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart); Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart); - lstm_value.prevStateValue = cell_pre.data(); - lstm_grad.prevStateGrad = cell_pre_g.data(); + lstm_value.prev_state_value = cell_pre.data(); + lstm_grad.prev_state_grad = cell_pre_g.data(); } else { - lstm_value.prevStateValue = c0 ? ordered_c0.data() : nullptr; - lstm_grad.prevStateGrad = c0_g ? ordered_c0_g.data() : nullptr; + lstm_value.prev_state_value = c0 ? ordered_c0.data() : nullptr; + lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data() : nullptr; } int cur_batch_size = bend - bstart; diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/operators/math/detail/lstm_cpu_kernel.h index fc3ad0ce58..a734ad31ee 100644 --- a/paddle/operators/math/detail/lstm_cpu_kernel.h +++ b/paddle/operators/math/detail/lstm_cpu_kernel.h @@ -26,278 +26,284 @@ namespace detail { template void naive_lstm_forward_one_sequence(Op op, LstmMetaValue value, - int frameSize, + int frame_size, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { - T rValueIn; - T rValueIg; - T rValueFg; - T rValueOg; - T rCheckI; - T rCheckF; - T rCheckO; - T rState; - T rPrevState = 0; - T rStateAtv; - T rOut; - - T *valueIn = value.gateValue; - T *valueIg = value.gateValue + frameSize; - T *valueFg = value.gateValue + frameSize * 2; - T *valueOg = value.gateValue + frameSize * 3; - - for (int i = 0; i < frameSize; i++) { - rValueIn = valueIn[i]; - rValueIg = valueIg[i]; - rValueFg = valueFg[i]; - rValueOg = valueOg[i]; - rCheckI = value.checkIg ? value.checkIg[i] : 0; - rCheckF = value.checkFg ? value.checkFg[i] : 0; - rCheckO = value.checkOg ? 
value.checkOg[i] : 0; - - if (value.prevStateValue) { - rPrevState = value.prevStateValue[i]; + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_checkI; + T r_checkF; + T r_checkO; + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? value.check_og[i] : 0; + + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, - rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); - - valueIn[i] = rValueIn; - valueIg[i] = rValueIg; - valueFg[i] = rValueFg; - valueOg[i] = rValueOg; - value.stateValue[i] = rState; - value.stateActiveValue[i] = rStateAtv; - value.outputValue[i] = rOut; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + value.state_value[i] = r_state; + value.state_active_value[i] = r_state_atv; + value.output_value[i] = r_out; } } template void naive_lstm_backward_one_sequence(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frameSize, + LstmMetaGrad grad, int frame_size, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { - T rValueIn; - T rValueIg; - T rValueFg; - T rValueOg; - T rGradIn; - T rGradIg; - T rGradFg; - T rGradOg; - T rPrevState = 0; - T rPrevStateGrad; - T 
rState; - T rStateGrad; - T rStateAtv; - T rOutputGrad; - T rCheckI; - T rCheckF; - T rCheckO; - T rCheckIGrad; - T rCheckFGrad; - T rCheckOGrad; - - T *valueIn = value.gateValue; - T *valueIg = value.gateValue + frameSize; - T *valueFg = value.gateValue + frameSize * 2; - T *valueOg = value.gateValue + frameSize * 3; - T *gradIn = grad.gateGrad; - T *gradIg = grad.gateGrad + frameSize; - T *gradFg = grad.gateGrad + frameSize * 2; - T *gradOg = grad.gateGrad + frameSize * 3; - - for (int i = 0; i < frameSize; i++) { - rValueIn = valueIn[i]; - rValueIg = valueIg[i]; - rValueFg = valueFg[i]; - rValueOg = valueOg[i]; - rCheckI = value.checkIg ? value.checkIg[i] : 0; - rCheckF = value.checkFg ? value.checkFg[i] : 0; - rCheckO = value.checkOg ? value.checkOg[i] : 0; - rState = value.stateValue[i]; - rStateAtv = value.stateActiveValue[i]; - rOutputGrad = grad.outputGrad[i]; - rStateGrad = grad.stateGrad[i]; - if (value.prevStateValue) { - rPrevState = value.prevStateValue[i]; + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI; + T r_checkF; + T r_checkO; + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + T *value_in = value.gate_value; + T *value_ig = value.gate_value + frame_size; + T *value_fg = value.gate_value + frame_size * 2; + T *value_og = value.gate_value + frame_size * 3; + T *grad_in = grad.gate_grad; + T *grad_ig = grad.gate_grad + frame_size; + T *grad_fg = grad.gate_grad + frame_size * 2; + T *grad_og = grad.gate_grad + frame_size * 3; + + for (int i = 0; i < frame_size; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + r_checkI = value.check_ig ? value.check_ig[i] : 0; + r_checkF = value.check_fg ? value.check_fg[i] : 0; + r_checkO = value.check_og ? 
value.check_og[i] : 0; + r_state = value.state_value[i]; + r_state_atv = value.state_active_value[i]; + r_output_grad = grad.output_grad[i]; + r_state_grad = grad.state_grad[i]; + if (value.prev_state_value) { + r_prev_state = value.prev_state_value[i]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, - rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, - rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, - rCheckOGrad, active_node, active_gate, active_state); - - gradIn[i] = rGradIn; - gradIg[i] = rGradIg; - gradFg[i] = rGradFg; - gradOg[i] = rGradOg; - grad.stateGrad[i] = rStateGrad; - - if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad; - if (value.prevStateValue) { - if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad; - if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + grad.state_grad[i] = r_state_grad; + + if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad; } - if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad; + if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad; } } template -void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, int frameSize, +void avx_lstm_forward_one_sequence(Op op, LstmMetaValue value, + int frame_size, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { #ifdef __AVX__ - __m256 rValueIn; - __m256 rValueIg; 
- __m256 rValueFg; - __m256 rValueOg; - __m256 rCheckI = _mm256_set1_ps(0.0f); - __m256 rCheckF = _mm256_set1_ps(0.0f); - __m256 rCheckO = _mm256_set1_ps(0.0f); - __m256 rState; - __m256 rPrevState = _mm256_set1_ps(0.0f); - __m256 rStateAtv; - __m256 rOut; - - __m256 *valueIn = (__m256 *)value.gateValue; - __m256 *valueIg = (__m256 *)(value.gateValue + frameSize); - __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2); - __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3); - - for (int i = 0; i < frameSize / 8; i++) { - rValueIn = valueIn[i]; - rValueIg = valueIg[i]; - rValueFg = valueFg[i]; - rValueOg = valueOg[i]; - if (value.checkIg) { - rCheckI = ((__m256 *)value.checkIg)[i]; - rCheckF = ((__m256 *)value.checkFg)[i]; - rCheckO = ((__m256 *)value.checkOg)[i]; + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_state; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_state_atv; + __m256 r_out; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 *)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; } - if (value.prevStateValue) { - rPrevState = ((__m256 *)value.prevStateValue)[i]; + if (value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, - rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); - - 
valueIn[i] = rValueIn; - valueIg[i] = rValueIg; - valueFg[i] = rValueFg; - valueOg[i] = rValueOg; - ((__m256 *)value.stateValue)[i] = rState; - ((__m256 *)value.stateActiveValue)[i] = rStateAtv; - ((__m256 *)value.outputValue)[i] = rOut; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, + active_gate, active_state); + + value_in[i] = r_value_in; + value_ig[i] = r_value_ig; + value_fg[i] = r_value_fg; + value_og[i] = r_value_og; + ((__m256 *)value.state_value)[i] = r_state; + ((__m256 *)value.state_active_value)[i] = r_state_atv; + ((__m256 *)value.output_value)[i] = r_out; } #endif } template void avx_lstm_backward_one_sequence(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frameSize, + LstmMetaGrad grad, int frame_size, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { #ifdef __AVX__ - __m256 rValueIn; - __m256 rValueIg; - __m256 rValueFg; - __m256 rValueOg; - __m256 rGradIn; - __m256 rGradIg; - __m256 rGradFg; - __m256 rGradOg; - __m256 rPrevState = _mm256_set1_ps(0.0f); - __m256 rPrevStateGrad; - __m256 rStateGrad; - __m256 rState; - __m256 rStateAtv; - __m256 rOutputGrad; - __m256 rCheckI = _mm256_set1_ps(0.0f); - __m256 rCheckF = _mm256_set1_ps(0.0f); - __m256 rCheckO = _mm256_set1_ps(0.0f); - __m256 rCheckIGrad; - __m256 rCheckFGrad; - __m256 rCheckOGrad; - - __m256 *valueIn = (__m256 *)value.gateValue; - __m256 *valueIg = (__m256 *)(value.gateValue + frameSize); - __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2); - __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3); - __m256 *gradIn = (__m256 *)grad.gateGrad; - __m256 *gradIg = (__m256 *)(grad.gateGrad + frameSize); - __m256 *gradFg = (__m256 *)(grad.gateGrad + frameSize * 2); - __m256 *gradOg = (__m256 *)(grad.gateGrad + frameSize * 3); - - for (int i = 0; i < frameSize / 8; i++) { - rValueIn = valueIn[i]; - rValueIg = valueIg[i]; - rValueFg = 
valueFg[i]; - rValueOg = valueOg[i]; - if (value.checkIg) { - rCheckI = ((__m256 *)value.checkIg)[i]; - rCheckF = ((__m256 *)value.checkFg)[i]; - rCheckO = ((__m256 *)value.checkOg)[i]; + __m256 r_value_in; + __m256 r_value_ig; + __m256 r_value_fg; + __m256 r_value_og; + __m256 r_grad_in; + __m256 r_grad_ig; + __m256 r_grad_fg; + __m256 r_grad_og; + __m256 r_prev_state = _mm256_set1_ps(0.0f); + __m256 r_prev_state_grad; + __m256 r_state_grad; + __m256 r_state; + __m256 r_state_atv; + __m256 r_output_grad; + __m256 r_checkI = _mm256_set1_ps(0.0f); + __m256 r_checkF = _mm256_set1_ps(0.0f); + __m256 r_checkO = _mm256_set1_ps(0.0f); + __m256 r_checkIGrad; + __m256 r_checkFGrad; + __m256 r_checkOGrad; + + __m256 *value_in = (__m256 *)value.gate_value; + __m256 *value_ig = (__m256 *)(value.gate_value + frame_size); + __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2); + __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3); + __m256 *grad_in = (__m256 *)grad.gate_grad; + __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size); + __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2); + __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_in = value_in[i]; + r_value_ig = value_ig[i]; + r_value_fg = value_fg[i]; + r_value_og = value_og[i]; + if (value.check_ig) { + r_checkI = ((__m256 *)value.check_ig)[i]; + r_checkF = ((__m256 *)value.check_fg)[i]; + r_checkO = ((__m256 *)value.check_og)[i]; } - rState = ((__m256 *)value.stateValue)[i]; - rStateAtv = ((__m256 *)value.stateActiveValue)[i]; - rOutputGrad = ((__m256 *)grad.outputGrad)[i]; - rStateGrad = ((__m256 *)grad.stateGrad)[i]; - if (value.prevStateValue) { - rPrevState = ((__m256 *)value.prevStateValue)[i]; + r_state = ((__m256 *)value.state_value)[i]; + r_state_atv = ((__m256 *)value.state_active_value)[i]; + r_output_grad = ((__m256 *)grad.output_grad)[i]; + r_state_grad = ((__m256 *)grad.state_grad)[i]; + if 
(value.prev_state_value) { + r_prev_state = ((__m256 *)value.prev_state_value)[i]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, - rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, - rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, - rCheckOGrad, active_node, active_gate, active_state); - - gradIn[i] = rGradIn; - gradIg[i] = rGradIg; - gradFg[i] = rGradFg; - gradOg[i] = rGradOg; - ((__m256 *)grad.stateGrad)[i] = rStateGrad; - - if (grad.prevStateGrad) ((__m256 *)grad.prevStateGrad)[i] = rPrevStateGrad; - if (value.prevStateValue) { - if (grad.checkIgGrad) ((__m256 *)grad.checkIgGrad)[i] += rCheckIGrad; - if (grad.checkFgGrad) ((__m256 *)grad.checkFgGrad)[i] += rCheckFGrad; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad_in[i] = r_grad_in; + grad_ig[i] = r_grad_ig; + grad_fg[i] = r_grad_fg; + grad_og[i] = r_grad_og; + ((__m256 *)grad.state_grad)[i] = r_state_grad; + + if (grad.prev_state_grad) + ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad; + if (value.prev_state_value) { + if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad; + if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad; } - if (grad.checkOgGrad) ((__m256 *)grad.checkOgGrad)[i] += rCheckOGrad; + if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad; } #endif } template -void cpu_lstm_forward(Op op, LstmMetaValue value, int frameSize, +void cpu_lstm_forward(Op op, LstmMetaValue value, int frame_size, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { - if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same::value)) { - avx_lstm_forward_one_sequence(op, value, frameSize, 
active_node, + if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_forward_one_sequence(op, value, frame_size, active_node, active_gate, active_state); } else { - naive_lstm_forward_one_sequence(op, value, frameSize, active_node, + naive_lstm_forward_one_sequence(op, value, frame_size, active_node, active_gate, active_state); } } template void cpu_lstm_backward(Op op, LstmMetaValue value, LstmMetaGrad grad, - int frameSize, activation_mode_t active_node, + int frame_size, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { - if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same::value)) { - avx_lstm_backward_one_sequence(op, value, grad, frameSize, active_node, + if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same::value)) { + avx_lstm_backward_one_sequence(op, value, grad, frame_size, active_node, active_gate, active_state); } else { - naive_lstm_backward_one_sequence(op, value, grad, frameSize, active_node, - active_gate, active_state); + naive_lstm_backward_one_sequence(op, value, grad, frame_size, + active_node, active_gate, active_state); } } diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h index d138bbe411..91bfedea53 100644 --- a/paddle/operators/math/detail/lstm_gpu_kernel.h +++ b/paddle/operators/math/detail/lstm_gpu_kernel.h @@ -26,189 +26,192 @@ namespace math { namespace detail { /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeLstmForward(Op op, LstmMetaValue value, int frameSize, - int batchSize, activation_mode_t active_node, +template +__global__ void KeLstmForward(Op op, LstmMetaValue value, int frame_size, + int batch_size, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { - const int frameIdx = blockIdx.x * blockDim.x + 
threadIdx.x; - if (frameIdx >= frameSize) return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - value.gateValue += batchIdx * frameSize * 4; - value.outputValue += batchIdx * frameSize; - value.stateValue += batchIdx * frameSize; - value.stateActiveValue += batchIdx * frameSize; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.output_value += batch_idx * frame_size; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; } - T rState; - T rPrevState = 0; - T rStateAtv; - T rOut; - T rValueIn; - T rValueIg; - T rValueFg; - T rValueOg; - - T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0; - T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0; - T rCheckO = value.checkOg ? value.checkOg[frameIdx] : 0; - - rValueIn = value.gateValue[frameIdx]; - rValueIg = value.gateValue[frameIdx + frameSize]; - rValueFg = value.gateValue[frameIdx + frameSize * 2]; - rValueOg = value.gateValue[frameIdx + frameSize * 3]; - - if (value.prevStateValue) { - if (isBatch) value.prevStateValue += batchIdx * frameSize; - rPrevState = value.prevStateValue[frameIdx]; + T r_state; + T r_prev_state = 0; + T r_state_atv; + T r_out; + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? 
value.check_og[frame_idx] : 0; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv, - rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state); + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state, + r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate, + active_state); - value.gateValue[frameIdx] = rValueIn; - value.gateValue[frameIdx + frameSize] = rValueIg; - value.gateValue[frameIdx + frameSize * 2] = rValueFg; - value.gateValue[frameIdx + frameSize * 3] = rValueOg; + value.gate_value[frame_idx] = r_value_in; + value.gate_value[frame_idx + frame_size] = r_value_ig; + value.gate_value[frame_idx + frame_size * 2] = r_value_fg; + value.gate_value[frame_idx + frame_size * 3] = r_value_og; - value.stateValue[frameIdx] = rState; - value.stateActiveValue[frameIdx] = rStateAtv; - value.outputValue[frameIdx] = rOut; + value.state_value[frame_idx] = r_state; + value.state_active_value[frame_idx] = r_state_atv; + value.output_value[frame_idx] = r_out; } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template +template __global__ void KeLstmBackward(Op op, LstmMetaValue value, - LstmMetaGrad grad, int frameSize, - int batchSize, activation_mode_t active_node, + LstmMetaGrad grad, int frame_size, + int batch_size, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) 
return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - value.gateValue += batchIdx * frameSize * 4; - value.stateValue += batchIdx * frameSize; - value.stateActiveValue += batchIdx * frameSize; - grad.gateGrad += batchIdx * frameSize * 4; - grad.stateGrad += batchIdx * frameSize; - grad.outputGrad += batchIdx * frameSize; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + value.gate_value += batch_idx * frame_size * 4; + value.state_value += batch_idx * frame_size; + value.state_active_value += batch_idx * frame_size; + grad.gate_grad += batch_idx * frame_size * 4; + grad.state_grad += batch_idx * frame_size; + grad.output_grad += batch_idx * frame_size; } - T rValueIn; - T rValueIg; - T rValueFg; - T rValueOg; - T rGradIn; - T rGradIg; - T rGradFg; - T rGradOg; - T rPrevState = 0; - T rPrevStateGrad; - T rState; - T rStateGrad; - T rStateAtv; - T rOutputGrad; - T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0; - T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0; - T rCheckO = value.checkOg ? 
value.checkOg[frameIdx] : 0; - - T rCheckIGrad; - T rCheckFGrad; - T rCheckOGrad; - - rValueIn = value.gateValue[frameIdx]; - rValueIg = value.gateValue[frameIdx + frameSize]; - rValueFg = value.gateValue[frameIdx + frameSize * 2]; - rValueOg = value.gateValue[frameIdx + frameSize * 3]; - rState = value.stateValue[frameIdx]; - rStateAtv = value.stateActiveValue[frameIdx]; - rOutputGrad = grad.outputGrad[frameIdx]; - rStateGrad = grad.stateGrad[frameIdx]; - - if (value.prevStateValue) { - if (isBatch) value.prevStateValue += batchIdx * frameSize; - rPrevState = value.prevStateValue[frameIdx]; + T r_value_in; + T r_value_ig; + T r_value_fg; + T r_value_og; + T r_grad_in; + T r_grad_ig; + T r_grad_fg; + T r_grad_og; + T r_prev_state = 0; + T r_prev_state_grad; + T r_state; + T r_state_grad; + T r_state_atv; + T r_output_grad; + T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0; + T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0; + T r_checkO = value.check_og ? value.check_og[frame_idx] : 0; + + T r_checkIGrad; + T r_checkFGrad; + T r_checkOGrad; + + r_value_in = value.gate_value[frame_idx]; + r_value_ig = value.gate_value[frame_idx + frame_size]; + r_value_fg = value.gate_value[frame_idx + frame_size * 2]; + r_value_og = value.gate_value[frame_idx + frame_size * 3]; + r_state = value.state_value[frame_idx]; + r_state_atv = value.state_active_value[frame_idx]; + r_output_grad = grad.output_grad[frame_idx]; + r_state_grad = grad.state_grad[frame_idx]; + + if (value.prev_state_value) { + if (is_batch) value.prev_state_value += batch_idx * frame_size; + r_prev_state = value.prev_state_value[frame_idx]; } - op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg, - rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad, - rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad, - active_node, active_gate, active_state); - - grad.gateGrad[frameIdx] = rGradIn; - grad.gateGrad[frameIdx + frameSize] = rGradIg; 
- grad.gateGrad[frameIdx + frameSize * 2] = rGradFg; - grad.gateGrad[frameIdx + frameSize * 3] = rGradOg; - grad.stateGrad[frameIdx] = rStateGrad; - if (grad.prevStateGrad) { - if (isBatch) grad.prevStateGrad += batchIdx * frameSize; - grad.prevStateGrad[frameIdx] = rPrevStateGrad; + op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig, + r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state, + r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO, + r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate, + active_state); + + grad.gate_grad[frame_idx] = r_grad_in; + grad.gate_grad[frame_idx + frame_size] = r_grad_ig; + grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg; + grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og; + grad.state_grad[frame_idx] = r_state_grad; + if (grad.prev_state_grad) { + if (is_batch) grad.prev_state_grad += batch_idx * frame_size; + grad.prev_state_grad[frame_idx] = r_prev_state_grad; } - if (isBatch) { - if (value.prevStateValue) { - if (grad.checkIgGrad) - paddle::platform::CudaAtomicAdd(grad.checkIgGrad + frameIdx, - rCheckIGrad); - if (grad.checkFgGrad) - paddle::platform::CudaAtomicAdd(grad.checkFgGrad + frameIdx, - rCheckFGrad); + if (is_batch) { + if (value.prev_state_value) { + if (grad.check_ig_grad) + paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx, + r_checkIGrad); + if (grad.check_fg_grad) + paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx, + r_checkFGrad); } - if (grad.checkOgGrad) - paddle::platform::CudaAtomicAdd(grad.checkOgGrad + frameIdx, rCheckOGrad); + if (grad.check_og_grad) + paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx, + r_checkOGrad); } else { - if (value.prevStateValue) { - if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad; - if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad; + if (value.prev_state_value) { + if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += 
r_checkIGrad; + if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad; } - if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad; + if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad; } } template void gpu_lstm_forward(const platform::DeviceContext& context, Op op, - LstmMetaValue value, int frameSize, int batchSize, + LstmMetaValue value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { - /* framePerBlock = 32 batchPerBlock = 32 */ + /* frame_per_block = 32 batch_per_block = 32 */ threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } auto stream = reinterpret_cast(context).stream(); - if (batchSize == 1) { + if (batch_size == 1) { KeLstmForward<<>>( - op, value, frameSize, batchSize, active_node, active_gate, + /* is_batch= */ false><<>>( + op, value, frame_size, batch_size, active_node, active_gate, active_state); } else { KeLstmForward<<>>( - op, value, frameSize, batchSize, active_node, active_gate, + /* is_batch= */ true><<>>( + op, value, frame_size, batch_size, active_node, active_gate, active_state); } } @@ -216,34 +219,34 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op, template void gpu_lstm_backward(const platform::DeviceContext& context, Op op, LstmMetaValue value, LstmMetaGrad grad, - int frameSize, int batchSize, + int frame_size, int batch_size, 
activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { - /* framePerBlock = 32 batchPerBlock = 16 */ + /* frame_per_block = 32 batch_per_block = 16 */ threads = dim3(32, 16); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 16 - 1) / 16); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16); } auto stream = reinterpret_cast(context).stream(); - if (batchSize == 1) { + if (batch_size == 1) { KeLstmBackward<<>>( - op, value, grad, frameSize, batchSize, active_node, active_gate, + /* is_batch= */ false><<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, active_state); } else { KeLstmBackward<<>>( - op, value, grad, frameSize, batchSize, active_node, active_gate, + /* is_batch= */ true><<>>( + op, value, grad, frame_size, batch_size, active_node, active_gate, active_state); } } diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/operators/math/detail/lstm_kernel.h index 9daaf91981..78f9a249a3 100644 --- a/paddle/operators/math/detail/lstm_kernel.h +++ b/paddle/operators/math/detail/lstm_kernel.h @@ -27,19 +27,19 @@ namespace forward { template class lstm { public: - HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg, - T &prevState, T &state, T &stateAtv, T &output, + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &prev_state, T &state, T &state_atv, T &output, T &checkI, T &checkF, T &checkO, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t 
active_state) { - valueIn = activation(valueIn, active_node); - valueIg = activation(valueIg + prevState * checkI, active_gate); - valueFg = activation(valueFg + prevState * checkF, active_gate); - state = valueIn * valueIg + prevState * valueFg; - valueOg = activation(valueOg + state * checkO, active_gate); - stateAtv = activation(state, active_state); - output = valueOg * stateAtv; + value_in = activation(value_in, active_node); + value_ig = activation(value_ig + prev_state * checkI, active_gate); + value_fg = activation(value_fg + prev_state * checkF, active_gate); + state = value_in * value_ig + prev_state * value_fg; + value_og = activation(value_og + state * checkO, active_gate); + state_atv = activation(state, active_state); + output = value_og * state_atv; } #ifndef __NVCC__ #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default @@ -48,24 +48,27 @@ class lstm { // Only float support AVX optimization static const bool avx = std::is_same::value; - HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg, - __m256 &valueOg, __m256 &prevState, __m256 &state, - __m256 &stateAtv, __m256 &output, __m256 &checkI, + HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig, + __m256 &value_fg, __m256 &value_og, + __m256 &prev_state, __m256 &state, + __m256 &state_atv, __m256 &output, __m256 &checkI, __m256 &checkF, __m256 &checkO, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { - valueIn = activation(valueIn, active_node); - valueIg = activation( - _mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)), active_gate); - valueFg = activation( - _mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)), active_gate); - state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg), - _mm256_mul_ps(prevState, valueFg)); - valueOg = activation(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)), - active_gate); - stateAtv = activation(state, active_state); - output = 
_mm256_mul_ps(valueOg, stateAtv); + value_in = activation(value_in, active_node); + value_ig = + activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)), + active_gate); + value_fg = + activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)), + active_gate); + state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig), + _mm256_mul_ps(prev_state, value_fg)); + value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)), + active_gate); + state_atv = activation(state, active_state); + output = _mm256_mul_ps(value_og, state_atv); } #endif #endif @@ -78,25 +81,26 @@ namespace backward { template class lstm { public: - HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg, - T &gradIn, T &gradIg, T &gradFg, T &gradOg, - T &prevState, T &prevStateGrad, T &state, - T &stateGrad, T &stateAtv, T &outputGrad, + HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og, + T &grad_in, T &grad_ig, T &grad_fg, T &grad_og, + T &prev_state, T &prev_state_grad, T &state, + T &state_grad, T &state_atv, T &output_grad, T &checkI, T &checkF, T &checkO, T &checkIGrad, T &checkFGrad, T &checkOGrad, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { - gradOg = activation(outputGrad * stateAtv, valueOg, active_gate); - stateGrad += activation(outputGrad * valueOg, stateAtv, active_state) + - gradOg * checkO; - gradIn = activation(stateGrad * valueIg, valueIn, active_node); - gradIg = activation(stateGrad * valueIn, valueIg, active_gate); - gradFg = activation(stateGrad * prevState, valueFg, active_gate); - prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg; - checkIGrad = gradIg * prevState; - checkFGrad = gradFg * prevState; - checkOGrad = gradOg * state; + grad_og = activation(output_grad * state_atv, value_og, active_gate); + state_grad += activation(output_grad * value_og, state_atv, active_state) + + grad_og * checkO; + grad_in = 
activation(state_grad * value_ig, value_in, active_node); + grad_ig = activation(state_grad * value_in, value_ig, active_gate); + grad_fg = activation(state_grad * prev_state, value_fg, active_gate); + prev_state_grad = + grad_ig * checkI + grad_fg * checkF + state_grad * value_fg; + checkIGrad = grad_ig * prev_state; + checkFGrad = grad_fg * prev_state; + checkOGrad = grad_og * state; } #ifndef __NVCC__ #ifndef __AVX__ // If not compiled with AVX instructs. Disable AVX by default @@ -105,32 +109,32 @@ class lstm { // Only float support AVX optimization static const bool avx = std::is_same::value; HOSTDEVICE void operator()( - __m256 &valueIn, __m256 &valueIg, __m256 &valueFg, __m256 &valueOg, - __m256 &gradIn, __m256 &gradIg, __m256 &gradFg, __m256 &gradOg, - __m256 &prevState, __m256 &prevStateGrad, __m256 &state, - __m256 &stateGrad, __m256 &stateAtv, __m256 &outputGrad, __m256 &checkI, - __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, __m256 &checkFGrad, - __m256 &checkOGrad, activation_mode_t active_node, + __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og, + __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og, + __m256 &prev_state, __m256 &prev_state_grad, __m256 &state, + __m256 &state_grad, __m256 &state_atv, __m256 &output_grad, + __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, + __m256 &checkFGrad, __m256 &checkOGrad, activation_mode_t active_node, activation_mode_t active_gate, activation_mode_t active_state) { - gradOg = - activation(_mm256_mul_ps(outputGrad, stateAtv), valueOg, active_gate); - stateGrad = _mm256_add_ps( - activation(_mm256_mul_ps(outputGrad, valueOg), stateAtv, active_state), - stateGrad); - stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad); - gradIn = - activation(_mm256_mul_ps(stateGrad, valueIg), valueIn, active_node); - gradIg = - activation(_mm256_mul_ps(stateGrad, valueIn), valueIg, active_gate); - gradFg = - activation(_mm256_mul_ps(stateGrad, 
prevState), valueFg, active_gate); - prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI), - _mm256_mul_ps(gradFg, checkF)); - prevStateGrad = - _mm256_add_ps(_mm256_mul_ps(stateGrad, valueFg), prevStateGrad); - checkIGrad = _mm256_mul_ps(gradIg, prevState); - checkFGrad = _mm256_mul_ps(gradFg, prevState); - checkOGrad = _mm256_mul_ps(gradOg, state); + grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og, + active_gate); + state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og), + state_atv, active_state), + state_grad); + state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad); + grad_in = + activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node); + grad_ig = + activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate); + grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg, + active_gate); + prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI), + _mm256_mul_ps(grad_fg, checkF)); + prev_state_grad = + _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad); + checkIGrad = _mm256_mul_ps(grad_ig, prev_state); + checkFGrad = _mm256_mul_ps(grad_fg, prev_state); + checkOGrad = _mm256_mul_ps(grad_og, state); } #endif #endif diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc index 0febf8e3b7..ad3a59bcdb 100644 --- a/paddle/operators/math/lstm_compute.cc +++ b/paddle/operators/math/lstm_compute.cc @@ -30,12 +30,12 @@ struct LstmUnitFunctor { detail::cpu_lstm_forward(detail::forward::lstm(), value, frame_size, ActiveType(cand_act), ActiveType(gate_act), ActiveType(cell_act)); - value.gateValue += frame_size * 4; - value.stateValue += frame_size; - value.stateActiveValue += frame_size; - value.outputValue += frame_size; - if (value.prevStateValue) { - value.prevStateValue += frame_size; + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + 
value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; } } } @@ -53,20 +53,20 @@ struct LstmUnitGradFunctor { frame_size, ActiveType(cand_act), ActiveType(gate_act), ActiveType(cell_act)); - value.gateValue += frame_size * 4; - value.stateValue += frame_size; - value.stateActiveValue += frame_size; - value.outputValue += frame_size; - if (value.prevStateValue) { - value.prevStateValue += frame_size; + value.gate_value += frame_size * 4; + value.state_value += frame_size; + value.state_active_value += frame_size; + value.output_value += frame_size; + if (value.prev_state_value) { + value.prev_state_value += frame_size; } - grad.gateGrad += frame_size * 4; - grad.stateGrad += frame_size; - grad.stateActiveGrad += frame_size; - grad.outputGrad += frame_size; - if (grad.prevStateGrad) { - grad.prevStateGrad += frame_size; + grad.gate_grad += frame_size * 4; + grad.state_grad += frame_size; + grad.state_active_grad += frame_size; + grad.output_grad += frame_size; + if (grad.prev_state_grad) { + grad.prev_state_grad += frame_size; } } } diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h index 28d2c6fd3b..9652399d4c 100644 --- a/paddle/operators/math/lstm_compute.h +++ b/paddle/operators/math/lstm_compute.h @@ -31,26 +31,26 @@ typedef enum { template struct LstmMetaValue { - T *gateValue; - T *prevStateValue; - T *stateValue; - T *stateActiveValue; - T *outputValue; - T *checkIg; - T *checkFg; - T *checkOg; + T *gate_value; + T *prev_state_value; + T *state_value; + T *state_active_value; + T *output_value; + T *check_ig; + T *check_fg; + T *check_og; }; template struct LstmMetaGrad { - T *gateGrad; - T *prevStateGrad; - T *stateGrad; - T *stateActiveGrad; - T *outputGrad; - T *checkIgGrad; - T *checkFgGrad; - T *checkOgGrad; + T *gate_grad; + T *prev_state_grad; + T *state_grad; + T *state_active_grad; + T *output_grad; + T *check_ig_grad; + T *check_fg_grad; + T *check_og_grad; }; 
inline activation_mode_t ActiveType(const std::string &type) { From 3ca8a8ea4fc898bfd20fc5609c694f82df82fe61 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Sun, 3 Dec 2017 16:14:04 +0530 Subject: [PATCH 167/275] Changing RelWithDebInfo flags (#6193) --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ba29d6bbc..6aeef23330 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,8 @@ cmake_minimum_required(VERSION 3.0) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") include(system) From f40bdb155edbe9a1352f614ff2add76d33ab0444 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Sun, 3 Dec 2017 18:24:45 +0530 Subject: [PATCH 168/275] Polish the Evaliuator design doc (#6195) --- doc/design/evaluator.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/doc/design/evaluator.md b/doc/design/evaluator.md index a62d75ffef..11cc129d56 100644 --- a/doc/design/evaluator.md +++ b/doc/design/evaluator.md @@ -1,22 +1,22 @@ ## Evaluator Design -### The Problem +### Problem Statement -During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted. +During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. 
As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants. ### Evaluator Design -Currently, every operation is expressed in the graph. we divide the evaluator process into three steps. +Currently, every operation is expressed in the graph. We divide the evaluator process into three steps. 1. Initialize the metric state and add it into the block. -2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once. +2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once. 3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices. ### Implementation -This design is shown in python API. -Each metric operator need to caculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass. +This design is shown in the Python API. +Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. ```python From 2a3a1e9a93258a67c8491361d9d83e3181723a3a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 4 Dec 2017 10:56:53 +0800 Subject: [PATCH 169/275] Add DataFeeder (#6102) * Add DataFeeder A v2 API like data feeder for book demos. We can feed data directly from reader. 
* Fix CI * Remove batch_size_dim for feeder Also add __all__ to data_feeder.py * Follow comment --- python/paddle/v2/fluid/__init__.py | 5 +- python/paddle/v2/fluid/data_feeder.py | 98 +++++++++++++++++++ .../v2/fluid/tests/book/test_fit_a_line.py | 7 +- .../book/test_image_classification_train.py | 13 +-- .../tests/book/test_label_semantic_roles.py | 60 +++++------- .../tests/book/test_recognize_digits_conv.py | 10 +- .../tests/book/test_recognize_digits_mlp.py | 28 +----- .../book/test_understand_sentiment_conv.py | 28 ++---- .../test_understand_sentiment_dynamic_lstm.py | 28 +++--- .../v2/fluid/tests/book/test_word2vec.py | 15 +-- .../paddle/v2/fluid/tests/test_data_feeder.py | 13 +++ 11 files changed, 177 insertions(+), 128 deletions(-) create mode 100644 python/paddle/v2/fluid/data_feeder.py create mode 100644 python/paddle/v2/fluid/tests/test_data_feeder.py diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index dd25bc19ec..59986c9f0c 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -14,20 +14,21 @@ import optimizer import backward import regularizer from param_attr import ParamAttr - +from data_feeder import DataFeeder from core import LoDTensor, CPUPlace, GPUPlace Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr' + 'DataFeeder' ] def __read_gflags_from_env__(): """ Enable reading gflags from environment variables. 
- + Returns: None """ diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py new file mode 100644 index 0000000000..3dee0b5b73 --- /dev/null +++ b/python/paddle/v2/fluid/data_feeder.py @@ -0,0 +1,98 @@ +from __future__ import print_function + +import core +import numpy +import six.moves as six + +from framework import Variable + +__all__ = ['DataFeeder'] + + +class DataToLoDTensorConverter(object): + def __init__(self, place, lod_level, shape, dtype): + self.place = place + self.lod_level = lod_level + self.shape = shape + if dtype == core.DataType.FP32: + self.dtype = 'float32' + elif dtype == core.DataType.INT64: + self.dtype = 'int64' + elif dtype == core.DataType.FP64: + self.dtype = 'float64' + elif dtype == core.DataType.INT32: + self.dtype = 'int32' + else: + raise ValueError("dtype must be any of [int32, float32, int64, " + "float64]") + + self.data = [] + self.lod = [] + + for i in six.range(lod_level): + self.lod.append([0]) + + def feed(self, data): + self._feed_impl_(data, self.lod, self.lod_level) + + def _feed_impl_(self, data, lod, lod_level): + if lod_level == 0: + self.data.append(data) + else: + cur_lod_len = len(data) + lod[-1].append(lod[-1][-1] + cur_lod_len) + for each_data in data: + self._feed_impl_(each_data, lod[:-1], lod_level - 1) + + def done(self): + arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape) + t = core.LoDTensor() + t.set(arr, self.place) + if self.lod_level > 0: + t.set_lod(self.lod) + return t + + +class DataFeeder(object): + def __init__(self, feed_list, place): + self.feed_dtypes = [] + self.feed_names = [] + self.feed_shapes = [] + self.feed_lod_level = [] + for each_var in feed_list: + if not isinstance(each_var, Variable): + raise TypeError("Feed list should contain a list of variable") + self.feed_dtypes.append(each_var.dtype) + self.feed_names.append(each_var.name) + shape = each_var.shape + batch_size_dim = -1 + for i, s in enumerate(shape): + if s < 0: + 
batch_size_dim = i + break + if batch_size_dim == -1: + raise ValueError("Variable {0} must has a batch size dimension", + each_var.name) + self.feed_lod_level.append(each_var.lod_level) + self.feed_shapes.append(shape) + + self.place = place + + def feed(self, iterable): + converter = [] + for lod_level, shape, dtype in six.zip( + self.feed_lod_level, self.feed_shapes, self.feed_dtypes): + converter.append( + DataToLoDTensorConverter( + place=self.place, + lod_level=lod_level, + shape=shape, + dtype=dtype)) + + for each_sample in iterable: + for each_converter, each_slot in six.zip(converter, each_sample): + each_converter.feed(each_slot) + ret_dict = {} + for each_name, each_converter in six.zip(self.feed_names, converter): + ret_dict[each_name] = each_converter.done() + return ret_dict diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index 9f98493adb..fbf46ac6cb 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -22,6 +22,7 @@ train_reader = paddle.batch( batch_size=BATCH_SIZE) place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -31,12 +32,8 @@ for pass_id in range(PASS_NUM): fluid.io.save_persistables(exe, "./fit_a_line.model/") fluid.io.load_persistables(exe, "./fit_a_line.model/") for data in train_reader(): - x_data = np.array(map(lambda _: _[0], data)).astype("float32") - y_data = np.array(map(lambda _: _[1], data)).astype("float32") - avg_loss_value, = exe.run(fluid.default_main_program(), - feed={'x': x_data, - 'y': y_data}, + feed=feeder.feed(data), fetch_list=[avg_cost]) if avg_loss_value[0] < 10.0: diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py index 0f0cc5b540..4e71b6f345 100644 --- 
a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py @@ -113,23 +113,14 @@ train_reader = paddle.batch( place = fluid.CPUPlace() exe = fluid.Executor(place) - +feeder = fluid.DataFeeder(place=place, feed_list=[images, label]) exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): - img_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - batch_size = 1 - for i in y_data.shape: - batch_size = batch_size * i - y_data = y_data.reshape([batch_size, 1]) - loss, acc = exe.run(fluid.default_main_program(), - feed={"pixel": img_data, - "label": y_data}, + feed=feeder.feed(data), fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index bcd6f4d6bc..0494c7cdca 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -28,17 +28,9 @@ def load_parameter(file_name, h, w): return np.fromfile(f, dtype=np.float32).reshape(h, w) -def db_lstm(): +def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark, + **ignored): # 8 features - word = fluid.layers.data(name='word_data', shape=[1], dtype='int64') - predicate = fluid.layers.data(name='verb_data', shape=[1], dtype='int64') - ctx_n2 = fluid.layers.data(name='ctx_n2_data', shape=[1], dtype='int64') - ctx_n1 = fluid.layers.data(name='ctx_n1_data', shape=[1], dtype='int64') - ctx_0 = fluid.layers.data(name='ctx_0_data', shape=[1], dtype='int64') - ctx_p1 = fluid.layers.data(name='ctx_p1_data', shape=[1], dtype='int64') - ctx_p2 = 
fluid.layers.data(name='ctx_p2_data', shape=[1], dtype='int64') - mark = fluid.layers.data(name='mark_data', shape=[1], dtype='int64') - predicate_embedding = fluid.layers.embedding( input=predicate, size=[pred_len, word_dim], @@ -120,8 +112,25 @@ def to_lodtensor(data, place): def main(): # define network topology - feature_out = db_lstm() - target = fluid.layers.data(name='target', shape=[1], dtype='int64') + word = fluid.layers.data( + name='word_data', shape=[1], dtype='int64', lod_level=1) + predicate = fluid.layers.data( + name='verb_data', shape=[1], dtype='int64', lod_level=1) + ctx_n2 = fluid.layers.data( + name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1) + ctx_n1 = fluid.layers.data( + name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1) + ctx_0 = fluid.layers.data( + name='ctx_0_data', shape=[1], dtype='int64', lod_level=1) + ctx_p1 = fluid.layers.data( + name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1) + ctx_p2 = fluid.layers.data( + name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1) + mark = fluid.layers.data( + name='mark_data', shape=[1], dtype='int64', lod_level=1) + feature_out = db_lstm(**locals()) + target = fluid.layers.data( + name='target', shape=[1], dtype='int64', lod_level=1) crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, @@ -139,6 +148,11 @@ def main(): paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) place = fluid.CPUPlace() + feeder = fluid.DataFeeder( + feed_list=[ + word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target + ], + place=place) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -150,28 +164,8 @@ def main(): batch_id = 0 for pass_id in xrange(PASS_NUM): for data in train_data(): - word_data = to_lodtensor(map(lambda x: x[0], data), place) - ctx_n2_data = to_lodtensor(map(lambda x: x[1], data), place) - ctx_n1_data = to_lodtensor(map(lambda x: x[2], data), place) - ctx_0_data = to_lodtensor(map(lambda x: x[3], 
data), place) - ctx_p1_data = to_lodtensor(map(lambda x: x[4], data), place) - ctx_p2_data = to_lodtensor(map(lambda x: x[5], data), place) - verb_data = to_lodtensor(map(lambda x: x[6], data), place) - mark_data = to_lodtensor(map(lambda x: x[7], data), place) - target = to_lodtensor(map(lambda x: x[8], data), place) - outs = exe.run(fluid.default_main_program(), - feed={ - 'word_data': word_data, - 'ctx_n2_data': ctx_n2_data, - 'ctx_n1_data': ctx_n1_data, - 'ctx_0_data': ctx_0_data, - 'ctx_p1_data': ctx_p1_data, - 'ctx_p2_data': ctx_p2_data, - 'verb_data': verb_data, - 'mark_data': mark_data, - 'target': target - }, + feed=feeder.feed(data), fetch_list=[avg_cost]) avg_cost_val = np.array(outs[0]) diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py index ba686b56f8..35bf8da924 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py @@ -37,20 +37,14 @@ train_reader = paddle.batch( place = fluid.CPUPlace() exe = fluid.Executor(place) - +feeder = fluid.DataFeeder(feed_list=[images, label], place=place) exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): - img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([BATCH_SIZE, 1]) - loss, acc = exe.run(fluid.default_main_program(), - feed={"pixel": img_data, - "label": y_data}, + feed=feeder.feed(data), fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" + diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py index fa18965aac..4dc2c50e1c 100644 --- 
a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py @@ -48,40 +48,22 @@ test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128) place = fluid.CPUPlace() exe = fluid.Executor(place) - +feeder = fluid.DataFeeder(feed_list=[image, label], place=place) exe.run(fluid.default_startup_program()) PASS_NUM = 100 for pass_id in range(PASS_NUM): accuracy.reset(exe) for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = np.expand_dims(y_data, axis=1) - - tensor_x = fluid.LoDTensor() - tensor_x.set(x_data, place) - - tensor_y = fluid.LoDTensor() - tensor_y.set(y_data, place) - - outs = exe.run(fluid.default_main_program(), - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_cost] + accuracy.metrics) - out = np.array(outs[0]) - acc = np.array(outs[1]) + out, acc = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) test_accuracy.reset(exe) for data in test_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = np.expand_dims(y_data, axis=1) - out, acc = exe.run(inference_program, - feed={'x': x_data, - 'y': y_data}, + feed=feeder.feed(data), fetch_list=[avg_cost] + test_accuracy.metrics) test_pass_acc = test_accuracy.eval(exe) diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py index be875a952b..f103358edc 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py @@ -4,10 +4,8 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid -def convolution_net(input_dim, class_dim=2, emb_dim=32, 
hid_dim=32): - data = fluid.layers.data(name="words", shape=[1], dtype="int64") - label = fluid.layers.data(name="label", shape=[1], dtype="int64") - +def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32, + hid_dim=32): emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) conv_3 = fluid.nets.sequence_conv_pool( input=emb, @@ -55,8 +53,11 @@ def main(): dict_dim = len(word_dict) class_dim = 2 + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost, accuracy, acc_out = convolution_net( - input_dim=dict_dim, class_dim=class_dim) + data, label, input_dim=dict_dim, class_dim=class_dim) train_data = paddle.batch( paddle.reader.shuffle( @@ -64,25 +65,16 @@ def main(): batch_size=BATCH_SIZE) place = fluid.CPUPlace() exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) exe.run(fluid.default_startup_program()) for pass_id in xrange(PASS_NUM): accuracy.reset(exe) for data in train_data(): - tensor_words = to_lodtensor(map(lambda x: x[0], data), place) - - label = np.array(map(lambda x: x[1], data)).astype("int64") - label = label.reshape([BATCH_SIZE, 1]) - - tensor_label = fluid.LoDTensor() - tensor_label.set(label, place) - - cost_val, acc_val = exe.run( - fluid.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc_out]) + cost_val, acc_val = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py index 094a3cdcda..cd28f04b85 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py +++ 
b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py @@ -3,14 +3,14 @@ import paddle.v2 as paddle import paddle.v2.fluid as fluid -def stacked_lstm_net(input_dim, +def stacked_lstm_net(data, + label, + input_dim, class_dim=2, emb_dim=128, hid_dim=512, stacked_num=3): assert stacked_num % 2 == 1 - data = fluid.layers.data(name="words", shape=[1], dtype="int64") - label = fluid.layers.data(name="label", shape=[1], dtype="int64") emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) # add bias attr @@ -65,8 +65,11 @@ def main(): dict_dim = len(word_dict) class_dim = 2 + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") cost, accuracy, acc_out = stacked_lstm_net( - input_dim=dict_dim, class_dim=class_dim) + data, label, input_dim=dict_dim, class_dim=class_dim) train_data = paddle.batch( paddle.reader.shuffle( @@ -74,25 +77,16 @@ def main(): batch_size=BATCH_SIZE) place = fluid.CPUPlace() exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) exe.run(fluid.default_startup_program()) for pass_id in xrange(PASS_NUM): accuracy.reset(exe) for data in train_data(): - tensor_words = to_lodtensor(map(lambda x: x[0], data), place) - - label = np.array(map(lambda x: x[1], data)).astype("int64") - label = label.reshape([BATCH_SIZE, 1]) - - tensor_label = fluid.LoDTensor() - tensor_label.set(label, place) - - cost_val, acc_val = exe.run( - fluid.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc_out]) + cost_val, acc_val = exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[cost, acc_out]) pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index 
1b441e15c7..8b928ff9ee 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -57,23 +57,16 @@ train_reader = paddle.batch( place = fluid.CPUPlace() exe = fluid.Executor(place) +feeder = fluid.DataFeeder( + feed_list=[first_word, second_word, third_word, forth_word, next_word], + place=place) exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): for data in train_reader(): - input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)] - input_data = map(lambda x: np.array(x).astype("int64"), input_data) - input_data = map(lambda x: np.expand_dims(x, axis=1), input_data) - avg_cost_np = exe.run(fluid.default_main_program(), - feed={ - 'firstw': input_data[0], - 'secondw': input_data[1], - 'thirdw': input_data[2], - 'forthw': input_data[3], - 'nextw': input_data[4] - }, + feed=feeder.feed(data), fetch_list=[avg_cost]) if avg_cost_np[0] < 5.0: exit(0) # if avg cost less than 10.0, we think our code is good. diff --git a/python/paddle/v2/fluid/tests/test_data_feeder.py b/python/paddle/v2/fluid/tests/test_data_feeder.py new file mode 100644 index 0000000000..4549693203 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_data_feeder.py @@ -0,0 +1,13 @@ +import paddle.v2.fluid as fluid + + +def test_converter(): + img = fluid.layers.data(name='image', shape=[1, 28, 28]) + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + feeder = fluid.DataFeeder([img, label], fluid.CPUPlace()) + result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]]) + print(result) + + +if __name__ == '__main__': + test_converter() From 4786ad1457ba923476b04ea62a2396d3936bae24 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Mon, 4 Dec 2017 13:40:26 +0800 Subject: [PATCH 170/275] Make the new framework independent the old framework. 
(#6201) --- paddle/operators/softmax_with_cross_entropy_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index fc027d6f95..0c30228863 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/softmax_with_cross_entropy_op.h" -#include namespace paddle { namespace operators { From fbbfe8b8594960934529330dc1321e1fdc6c2a6d Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 4 Dec 2017 14:21:13 +0800 Subject: [PATCH 171/275] code refine --- paddle/operators/elementwise_add_op.h | 39 +++++++- paddle/operators/elementwise_op_function.h | 108 +++++++++++++++++++++ 2 files changed, 146 insertions(+), 1 deletion(-) diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index f04fe3ec60..686d45573d 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -19,11 +19,48 @@ namespace paddle { namespace operators { +template +struct AddFunctor { + HOSTDEVICE T operator()(T a, T b) const { return a + b; } +}; + template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + using Tensor = framework::Tensor; + + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + z->mutable_data(ctx.GetPlace()); + TransformFunctor, T, Place> functor(x, y, z, ctx, + AddFunctor()); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), + "Rank of first input must >= rank of second input."); + + if (x_dims == y_dims) { + functor.Run(); + return; + } + + int axis = ctx.Attr("axis"); + axis = (axis == -1 ? 
x_dims.size() - y_dims.size() : axis); + PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(), + "Axis should be in range [0, x_dims)"); + + int pre, n, post; + get_mid_dims(x_dims, y_dims, axis, pre, n, post); + if (post == 1) { + functor.RunRowWise(n, pre); + return; + } else { + functor.RunMidWise(n, pre, post); + return; + } } }; diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 8aa35b2c46..22b96b9312 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -16,6 +16,7 @@ #include "paddle/framework/eigen.h" #include "paddle/framework/op_registry.h" #include "paddle/framework/operator.h" +#include "paddle/platform/transform.h" #include "paddle/operators/math/math_function.h" @@ -54,6 +55,113 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } +template +struct RowwiseTransformIterator; +template +struct MidWiseTransformIterator; + +template +struct RowwiseTransformIterator { + RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} + + RowwiseTransformIterator& operator++() { + ++i_; + if (i_ == n_) { + i_ = 0; + } + return *this; + } + + bool operator==( + const RowwiseTransformIterator& rhs) const { + return &(this->operator*()) == &(*rhs); + } + + bool operator!=( + const RowwiseTransformIterator& rhs) const { + return &(this->operator*()) &= &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + const T* ptr_; + int i_; + int n_; +}; + +template +struct MidWiseTransformIterator { + MidWiseTransformIterator(const T* ptr, int n, int post) + : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} + + MidWiseTransformIterator& operator++() { + ++j_; + if (j_ == post_) { + j_ = 0; + ++i_; + if (i_ == n_) { + i_ = 0; + } + } + return *this; + } + + bool operator==( + const MidWiseTransformIterator& rhs) const { + return &(this->operator*()) == &(*rhs); + } + + bool operator!=( + const MidWiseTransformIterator& rhs) const { + return 
&(this->operator*()) &= &(*rhs); + } + + const T& operator*() { return ptr_[i_]; } + + const T* ptr_; + int i_; + int j_; + int n_; + int post_; +}; + +template +struct TransformFunctor { + TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, + framework::Tensor* z, const framework::ExecutionContext& ctx, + Functor func) + : x_(x->data()), + y_(y->data()), + z_(z->mutable_data(ctx.GetPlace())), + nx_(x->numel()), + ctx_(ctx), + func_(func) {} + + inline void Run() const { + platform::Transform trans; + trans(ctx_.device_context(), x_, x_ + nx_, y_, z_, func_); + } + + inline void RunRowWise(int n, int pre) const { + platform::Transform trans; + trans(ctx_.device_context(), x_, x_ + nx_, + RowwiseTransformIterator(y_, n), z_, func_); + } + + inline void RunMidWise(int n, int pre, int post) const { + platform::Transform trans; + trans(ctx_.device_context(), x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), z_, func_); + } + + const T* x_; + const T* y_; + T* z_; + int64_t nx_; + const framework::ExecutionContext& ctx_; + Functor func_; +}; + #define EIGEN_FUNCTOR(name, eigen_op) \ struct Eigen##name##Functor { \ template \ From 54205c99b6154375cf37d8cb8ff4523a458b5052 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 4 Dec 2017 11:46:40 +0800 Subject: [PATCH 172/275] add MKLDNNLRNLayer --- paddle/gserver/layers/MKLDNNLRNLayer.cpp | 163 +++++++++++++++++++++++ paddle/gserver/layers/MKLDNNLRNLayer.h | 78 +++++++++++ 2 files changed, 241 insertions(+) create mode 100644 paddle/gserver/layers/MKLDNNLRNLayer.cpp create mode 100644 paddle/gserver/layers/MKLDNNLRNLayer.h diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.cpp b/paddle/gserver/layers/MKLDNNLRNLayer.cpp new file mode 100644 index 0000000000..741984bb68 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp @@ -0,0 +1,163 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "MKLDNNLRNLayer.h" +#include "paddle/utils/Logging.h" + +using namespace mkldnn; // NOLINT +typedef memory::format format; + +namespace paddle { + +REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer); + +bool MKLDNNLRNLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + if (!MKLDNNLayer::init(layerMap, parameterMap)) { + return false; + } + + /* the size of inputs for norm-layer is 1 */ + CHECK_EQ(config_.inputs_size(), 1UL); + const NormConfig& conf = config_.inputs(0).norm_conf(); + localSize_ = conf.size(); + alpha_ = conf.scale(); + beta_ = conf.pow(); + + ic_ = conf.channels(); + oc_ = ic_; + iw_ = conf.img_size(); + ow_ = conf.output_x(); + ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + oh_ = conf.has_output_y() ? 
conf.output_y() : conf.output_x(); + CHECK_EQ(iw_, ow_); + CHECK_EQ(ih_, oh_); + return true; +} + +void MKLDNNLRNLayer::reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) { + CHECK_EQ(inputLayers_.size(), 1UL); + reshapeInput(bs, ih, iw); + // ic_ and oc can not be changed + CHECK_EQ((size_t)ic, + inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw) + << "Input channel can not be changed"; + oh = ih; + ow = iw; + reshapeOutput(oh, ow); + resizeOutput(bs, oc * oh * ow); +} + +void MKLDNNLRNLayer::resetFwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + resetFwdBuffers(inputs[0], out); + + resetFwdPD(fwdPD_, inputs[0], out); + + resetFwdPipeline(pipeline, fwdPD_, inputs[0], out); +} + +void MKLDNNLRNLayer::resetBwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) { + std::shared_ptr pd; + + resetBwdBuffers(inputs[0], out); + + resetBwdPD(pd, inputs[0], out); + + resetBwdPipeline(pipeline, pd, inputs[0], out); +} + +void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + resetInValue(in); + CHECK(in); + resetOutValue(out, in->getPrimitiveDesc()); +} + +void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr out) { + prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring + : prop_kind::forward_training; + auto fwdDesc = lrn_fwd::desc(pk, + algorithm::lrn_across_channels, + in->getMemoryDesc(), + localSize_, + alpha_, + beta_, + 1.0f); + pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_)); + // prepare workspace if necessary + workspace_ = + passType_ != PASS_TEST + ? std::make_shared(memory(pd->workspace_primitive_desc())) + : nullptr; +} + +void MKLDNNLRNLayer::resetFwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + fwd_ = workspace_ + ? 
std::make_shared(lrn_fwd(*pd, *in, *workspace_, *out)) + : std::make_shared(lrn_fwd(*pd, *in, *out)); + pipeline.push_back(*fwd_); +} + +void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + CHECK(inVals_[0] && outVal_); + resetOutGrad(out, outVal_->getPrimitiveDesc()); + resetInGrad(in, inVals_[0]->getPrimitiveDesc()); +} + +void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + pd = nullptr; + if (in == nullptr) { + return; + } + CHECK(out); + auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels, + in->getMemoryDesc(), + out->getMemoryDesc(), + localSize_, + alpha_, + beta_, + 1.0f); + pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_)); +} + +void MKLDNNLRNLayer::resetBwdPipeline( + std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out) { + if (pd == nullptr) { + return; + } + CHECK(inVals_[0]); + CHECK(workspace_); + bwdData_ = std::make_shared( + lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in)); + pipeline.push_back(*bwdData_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/MKLDNNLRNLayer.h b/paddle/gserver/layers/MKLDNNLRNLayer.h new file mode 100644 index 0000000000..cfe5621252 --- /dev/null +++ b/paddle/gserver/layers/MKLDNNLRNLayer.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "MKLDNNLayer.h" +#include "mkldnn.hpp" + +namespace paddle { +typedef mkldnn::lrn_forward lrn_fwd; +typedef mkldnn::lrn_backward lrn_bwd; + +/** + * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer. + * + * The config file api is mkldnn_lrn + */ +class MKLDNNLRNLayer : public MKLDNNLayer { +protected: + // save forward primitive_desc, which can be used in backward + std::shared_ptr fwdPD_; + // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/ + // test_lrn_backward.cpp, lrn need workspace for backward + std::shared_ptr workspace_; + + int localSize_; + float alpha_, beta_; // scale and pow in paddle + +public: + explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {} + + ~MKLDNNLRNLayer() {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void reshape( + int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override; + + void resetFwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) override; + + void resetBwd(std::vector& pipeline, + std::vector& inputs, + MKLDNNMatrixPtr& out) override; + +protected: + void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); + void resetFwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr in, + MKLDNNMatrixPtr out); + void resetFwdPipeline(std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out); + void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); + void resetBwdPD(std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out); + void resetBwdPipeline(std::vector& pipeline, + std::shared_ptr& pd, + MKLDNNMatrixPtr& in, + MKLDNNMatrixPtr& out); +}; + +} // namespace paddle From 343b1a962b91460637c6aeb8e48bc048c8337905 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 4 Dec 2017 13:51:24 +0800 Subject: [PATCH 173/275] add mkldnn_lrn unit test --- paddle/gserver/tests/test_MKLDNN.cpp | 45 
++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp index 56b523f220..ad1dbc3ee2 100644 --- a/paddle/gserver/tests/test_MKLDNN.cpp +++ b/paddle/gserver/tests/test_MKLDNN.cpp @@ -272,6 +272,51 @@ TEST(MKLDNNLayer, BatchNormLayer) { testBatchNormLayer({4, 16, 8, 10}); } +struct testLRNDesc { + int bs, ic, ih, iw; + float scale, pow; + int localSize; +}; + +void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) { + cfg.layerConfig.set_type("mkldnn_lrn"); + cfg.layerConfig.set_active_type("relu"); + size_t layerSize = pm.ic * pm.ih * pm.iw; + cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0}); + LayerInputConfig* input = cfg.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_channels(pm.ic); + norm->set_size(pm.localSize); + norm->set_scale(pm.scale); + norm->set_pow(pm.pow); + norm->set_blocked(0); + norm->set_img_size(pm.iw); + norm->set_img_size_y(pm.ih); + norm->set_output_x(norm->img_size()); + norm->set_output_y(norm->img_size_y()); + cfg.layerConfig.set_size(layerSize); + cfg.biasSize = 0; +} + +void testLRNLayer(const testLRNDesc& pm) { + TestConfig dnnConfig; + getMKLDNNLRNConfig(dnnConfig, pm); + // mkldnn_lrn <==> norm with cmrnorm-projection type + TestConfig refConfig = dnnConfig; + refConfig.layerConfig.set_type("norm"); + LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type("cmrnorm-projection"); + norm->set_scale(norm->scale() / norm->size()); + RUN_MKLDNN_TEST(dnnConfig, refConfig, pm) +} + +TEST(MKLDNNLayer, LRNLayer) { + testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5}); + testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5}); + testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5}); +} + struct testImageDesc { int bs, ic, ih, iw; }; From f13d725acf8b8c4d18cfc39e0367efdedd840680 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 4 
Dec 2017 14:32:27 +0800 Subject: [PATCH 174/275] add mkldnn_lrn python interface and add it to simple net --- paddle/gserver/tests/mkldnn_simple_net.conf | 2 ++ python/paddle/trainer/config_parser.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/paddle/gserver/tests/mkldnn_simple_net.conf b/paddle/gserver/tests/mkldnn_simple_net.conf index 8bbe91e56d..0e9d6b31fa 100644 --- a/paddle/gserver/tests/mkldnn_simple_net.conf +++ b/paddle/gserver/tests/mkldnn_simple_net.conf @@ -51,6 +51,8 @@ tmp = img_pool_layer(input=tmp, padding=1, pool_type=MaxPooling()) +tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75) + tmp = fc_layer(input=tmp, size=channels, bias_attr=False, diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5b173694dd..da9679277f 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2287,11 +2287,17 @@ class Conv3DLayer(Conv3DLayerBase): class NormLayer(LayerBase): def __init__(self, name, inputs, **xargs): super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs) + use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0))) + use_mkldnn = True if use_mkldnn and self.inputs[ + 0].norm.norm_type == 'cmrnorm-projection' else False + self.config.type = 'mkldnn_lrn' if use_mkldnn else self.config.type for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) norm_conf = self.config.inputs[input_index].norm_conf parse_norm(self.inputs[input_index].norm, input_layer.name, norm_conf) + norm_conf.scale = self.inputs[ + input_index].norm.scale if use_mkldnn else norm_conf.scale self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x, norm_conf.channels, False) if norm_conf.norm_type == "cross-channel-norm": From 7b827d95adb0c8ae5dff1bdad7fd51ff50065dfe Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Mon, 4 Dec 2017 15:37:03 +0800 Subject: [PATCH 175/275] use awk command to replace bc 
--- benchmark/paddle/image/run_mkldnn_infer.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkldnn_infer.sh index 03a76c0540..d795bcab1b 100755 --- a/benchmark/paddle/image/run_mkldnn_infer.sh +++ b/benchmark/paddle/image/run_mkldnn_infer.sh @@ -4,7 +4,7 @@ function clock_to_seconds() { hours=`echo $1 | awk -F ':' '{print $1}'` mins=`echo $1 | awk -F ':' '{print $2}'` secs=`echo $1 | awk -F ':' '{print $3}'` - echo `bc -l <<< "$secs + $mins * 60 + $hours * 3600"` + echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'` } function infer() { @@ -58,9 +58,9 @@ function infer() { end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs` start_sec=`clock_to_seconds $start` end_sec=`clock_to_seconds $end` - fps=`bc <<< "scale = 2; 1280 / ($end_sec - $start_sec)"` + fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'` echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log} - echo "FPS: $fps images/sec" >> ${log} + echo "FPS: $fps images/sec" 2>&1 | tee -a ${log} } if [ ! 
-f "train.list" ]; then From 1fe05c458fa1d7ff1949759c2a06ed6d19ab8048 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Mon, 4 Dec 2017 16:04:07 +0800 Subject: [PATCH 176/275] Generate docs for Fluid API (#6215) * add doc config for fluid * fix typo * follow comments --- doc/api/index_en.rst | 1 + doc/api/v2/fluid.rst | 18 ++ doc/api/v2/fluid/data_feeder.rst | 9 + doc/api/v2/fluid/evaluator.rst | 9 + doc/api/v2/fluid/executor.rst | 9 + doc/api/v2/fluid/initializer.rst | 50 +++++ doc/api/v2/fluid/layers.rst | 302 +++++++++++++++++++++++++++++++ doc/api/v2/fluid/nets.rst | 22 +++ doc/api/v2/fluid/optimizer.rst | 54 ++++++ doc/api/v2/fluid/param_attr.rst | 11 ++ doc/api/v2/fluid/profiler.rst | 10 + doc/api/v2/fluid/regularizer.rst | 25 +++ python/paddle/v2/fluid/layers.py | 6 +- 13 files changed, 522 insertions(+), 4 deletions(-) create mode 100644 doc/api/v2/fluid.rst create mode 100644 doc/api/v2/fluid/data_feeder.rst create mode 100644 doc/api/v2/fluid/evaluator.rst create mode 100644 doc/api/v2/fluid/executor.rst create mode 100644 doc/api/v2/fluid/initializer.rst create mode 100644 doc/api/v2/fluid/layers.rst create mode 100644 doc/api/v2/fluid/nets.rst create mode 100644 doc/api/v2/fluid/optimizer.rst create mode 100644 doc/api/v2/fluid/param_attr.rst create mode 100644 doc/api/v2/fluid/profiler.rst create mode 100644 doc/api/v2/fluid/regularizer.rst diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst index 25c1dd00b9..e6f632e1a5 100644 --- a/doc/api/index_en.rst +++ b/doc/api/index_en.rst @@ -7,3 +7,4 @@ API v2/model_configs.rst v2/data.rst v2/run_logic.rst + v2/fluid.rst diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst new file mode 100644 index 0000000000..43fc19dc49 --- /dev/null +++ b/doc/api/v2/fluid.rst @@ -0,0 +1,18 @@ +====================== +Fluid +====================== + +.. 
toctree:: + :maxdepth: 1 + + fluid/layers.rst + fluid/data_feeder.rst + fluid/executor.rst + fluid/initializer.rst + fluid/evaluator.rst + fluid/nets.rst + fluid/optimizer.rst + fluid/param_attr.rst + fluid/profiler.rst + fluid/regularizer.rst + diff --git a/doc/api/v2/fluid/data_feeder.rst b/doc/api/v2/fluid/data_feeder.rst new file mode 100644 index 0000000000..0fa78f7dfb --- /dev/null +++ b/doc/api/v2/fluid/data_feeder.rst @@ -0,0 +1,9 @@ +=========== +DataFeeder +=========== + +DataFeeder +----------- +.. automodule:: paddle.v2.fluid.data_feeder + :members: DataFeeder + :noindex: diff --git a/doc/api/v2/fluid/evaluator.rst b/doc/api/v2/fluid/evaluator.rst new file mode 100644 index 0000000000..a23f3301d0 --- /dev/null +++ b/doc/api/v2/fluid/evaluator.rst @@ -0,0 +1,9 @@ +=========== +Evaluator +=========== + +Evaluator +----------- +.. automodule:: paddle.v2.fluid.evaluator + :members: Evaluator + :noindex: diff --git a/doc/api/v2/fluid/executor.rst b/doc/api/v2/fluid/executor.rst new file mode 100644 index 0000000000..3a283538c1 --- /dev/null +++ b/doc/api/v2/fluid/executor.rst @@ -0,0 +1,9 @@ +=========== +Executor +=========== + +Executor +----------- +.. automodule:: paddle.v2.fluid.executor + :members: Executor + :noindex: diff --git a/doc/api/v2/fluid/initializer.rst b/doc/api/v2/fluid/initializer.rst new file mode 100644 index 0000000000..8f587837e9 --- /dev/null +++ b/doc/api/v2/fluid/initializer.rst @@ -0,0 +1,50 @@ +=========== +Initializer +=========== + + + +Initializer +----------- +.. automodule:: paddle.v2.fluid.initializer + :members: Initializer + :noindex: + + + +ConstantInitializer +------------------- +.. automodule:: paddle.v2.fluid.initializer + :members: ConstantInitializer + :noindex: + + + +UniformInitializer +------------------ +.. automodule:: paddle.v2.fluid.initializer + :members: UniformInitializer + :noindex: + + + +NormalInitializer +----------------- +.. 
automodule:: paddle.v2.fluid.initializer + :members: NormalInitializer + :noindex: + + +XavierInitializer +----------------- +.. automodule:: paddle.v2.fluid.initializer + :members: XavierInitializer + :noindex: + + +MSRAInitializer +--------------- +.. automodule:: paddle.v2.fluid.initializer + :members: MSRAInitializer + :noindex: + diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst new file mode 100644 index 0000000000..89e5fec13b --- /dev/null +++ b/doc/api/v2/fluid/layers.rst @@ -0,0 +1,302 @@ +========== +Layers +========== + + +fc +--- +.. autofunction:: paddle.v2.fluid.layers.fc + :noindex: + +embedding +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + +dynamic_lstm +------------ +.. autofunction:: paddle.v2.fluid.layers.dynamic_lstm + :noindex: + +data +--------- +.. autofunction:: paddle.v2.fluid.layers.data + :noindex: + +mean +--------- +.. autofunction:: paddle.v2.fluid.layers.mean + :noindex: + +mul +--------- +.. autofunction:: paddle.v2.fluid.layers.mul + :noindex: + +elementwise_add +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_add + :noindex: + +elementwise_div +--------------- +.. autofunction:: paddle.v2.fluid.layers.elementwise_div + :noindex: + + +dropout +--------- +.. autofunction:: paddle.v2.fluid.layers.dropout + :noindex: + + +reshape +--------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +sigmoid +--------- +.. autofunction:: paddle.v2.fluid.layers.sigmoid + :noindex: + + +scale +--------- +.. autofunction:: paddle.v2.fluid.layers.scale + :noindex: + + +reshape +--------- +.. autofunction:: paddle.v2.fluid.layers.reshape + :noindex: + + +transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.transpose + :noindex: + + +sigmoid_cross_entropy_with_logits +--------- +.. autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits + :noindex: + + +cast +--------- +.. 
autofunction:: paddle.v2.fluid.layers.cast + :noindex: + + +concat +--------- +.. autofunction:: paddle.v2.fluid.layers.concat + :noindex: + + +sums +--------- +.. autofunction:: paddle.v2.fluid.layers.sums + :noindex: + + +linear_chain_crf +--------- +.. autofunction:: paddle.v2.fluid.layers.linear_chain_crf + :noindex: + + +assign +--------- +.. autofunction:: paddle.v2.fluid.layers.embedding + :noindex: + + +split_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.split_lod_tensor + :noindex: + + +merge_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.merge_lod_tensor + :noindex: + +cos_sim +--------- +.. autofunction:: paddle.v2.fluid.layers.cos_sim + :noindex: + + +cross_entropy +--------- +.. autofunction:: paddle.v2.fluid.layers.cross_entropy + :noindex: + + + +square_error_cost +--------- +.. autofunction:: paddle.v2.fluid.layers.square_error_cost + :noindex: + + +accuracy +--------- +.. autofunction:: paddle.v2.fluid.layers.accuracy + :noindex: + + +sequence_conv +--------- +.. autofunction:: paddle.v2.fluid.layers.sequence_conv + :noindex: + + +conv2d +--------- +.. autofunction:: paddle.v2.fluid.layers.conv2d + :noindex: + + +sequence_pool +--------- +.. autofunction:: paddle.v2.fluid.layers.sequence_pool + :noindex: + + +pool2d +--------- +.. autofunction:: paddle.v2.fluid.layers.pool2d + :noindex: + + +batch_norm +--------- +.. autofunction:: paddle.v2.fluid.layers.batch_norm + :noindex: + + +beam_search_decode +--------- +.. autofunction:: paddle.v2.fluid.layers.beam_search_decode + :noindex: + + +lstm +--------- +.. autofunction:: paddle.v2.fluid.layers.lstm + :noindex: + + +lod_rank_table +--------- +.. autofunction:: paddle.v2.fluid.layers.lod_rank_table + :noindex: + + +max_sequence_len +--------- +.. autofunction:: paddle.v2.fluid.layers.max_sequence_len + :noindex: + + +topk +--------- +.. autofunction:: paddle.v2.fluid.layers.topk + :noindex: + + +lod_tensor_to_array +--------- +.. 
autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array + :noindex: + + + +array_to_lod_tensor +--------- +.. autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor + :noindex: + + + + +fill_constant +--------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant + :noindex: + + + +fill_constant_batch_size_like +--------- +.. autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like + :noindex: + + +ones +--------- +.. autofunction:: paddle.v2.fluid.layers.ones + :noindex: + + +zeros +--------- +.. autofunction:: paddle.v2.fluid.layers.zeros + :noindex: + + +increment +--------- +.. autofunction:: paddle.v2.fluid.layers.increment + :noindex: + + +array_write +--------- +.. autofunction:: paddle.v2.fluid.layers.array_write + :noindex: + + + +create_array +--------- +.. autofunction:: paddle.v2.fluid.layers.create_array + :noindex: + + +less_than +--------- +.. autofunction:: paddle.v2.fluid.layers.less_than + :noindex: + + +array_read +--------- +.. autofunction:: paddle.v2.fluid.layers.array_read + :noindex: + + +shrink_memory +--------- +.. autofunction:: paddle.v2.fluid.layers.shrink_memory + :noindex: + + +array_length +--------- +.. autofunction:: paddle.v2.fluid.layers.array_length + :noindex: + + +conv2d_transpose +--------- +.. autofunction:: paddle.v2.fluid.layers.conv2d_transpose + :noindex: + diff --git a/doc/api/v2/fluid/nets.rst b/doc/api/v2/fluid/nets.rst new file mode 100644 index 0000000000..2c3d075422 --- /dev/null +++ b/doc/api/v2/fluid/nets.rst @@ -0,0 +1,22 @@ +=========== +Nets +=========== + +simple_img_conv_pool +----------- +.. autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool + :noindex: + + +img_conv_group +----------- +.. autofunction:: paddle.v2.fluid.nets.img_conv_group + :noindex: + + +sequence_conv_pool +----------- +.. 
autofunction:: paddle.v2.fluid.nets.sequence_conv_pool + :noindex: + + diff --git a/doc/api/v2/fluid/optimizer.rst b/doc/api/v2/fluid/optimizer.rst new file mode 100644 index 0000000000..233762fcdf --- /dev/null +++ b/doc/api/v2/fluid/optimizer.rst @@ -0,0 +1,54 @@ +=========== +Optimizer +=========== + +Optimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: Optimizer + :noindex: + + +SGDOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: SGDOptimizer + :noindex: + + + +MomentumOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: MomentumOptimizer + :noindex: + + + +AdagradOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdagradOptimizer + :noindex: + + +AdamOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamOptimizer + :noindex: + + +AdamaxOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: AdamaxOptimizer + :noindex: + + +DecayedAdagradOptimizer +----------- +.. automodule:: paddle.v2.fluid.optimizer + :members: DecayedAdagradOptimizer + :noindex: + diff --git a/doc/api/v2/fluid/param_attr.rst b/doc/api/v2/fluid/param_attr.rst new file mode 100644 index 0000000000..ca0c8af9e8 --- /dev/null +++ b/doc/api/v2/fluid/param_attr.rst @@ -0,0 +1,11 @@ +=========== +ParamAttr +=========== + + + +ParamAttr +----------- +.. automodule:: paddle.v2.fluid.param_attr + :members: ParamAttr + :noindex: diff --git a/doc/api/v2/fluid/profiler.rst b/doc/api/v2/fluid/profiler.rst new file mode 100644 index 0000000000..7d4042d1f4 --- /dev/null +++ b/doc/api/v2/fluid/profiler.rst @@ -0,0 +1,10 @@ +=========== +Profiler +=========== + + + +Profiler +----------- +.. 
autofunction:: paddle.v2.fluid.profiler.cuda_profiler + :noindex: diff --git a/doc/api/v2/fluid/regularizer.rst b/doc/api/v2/fluid/regularizer.rst new file mode 100644 index 0000000000..3af2b07d2a --- /dev/null +++ b/doc/api/v2/fluid/regularizer.rst @@ -0,0 +1,25 @@ +=========== +Regularizer +=========== + +WeightDecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: WeightDecayRegularizer + :noindex: + + +L2DecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: L2DecayRegularizer + :noindex: + + + +L1DecayRegularizer +----------- +.. automodule:: paddle.v2.fluid.regularizer + :members: L1DecayRegularizer + + diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index e41bfae285..5568619fe6 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -31,11 +31,9 @@ def fc(input, size: The size of the layer num_flatten_dims: Number of columns in input param_attr: The parameters/weights to the FC Layer - param_initializer: Initializer used for the weight/parameter. - If None, XavierInitializer() is used + param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used bias_attr: The bias parameter for the FC layer - bias_initializer: Initializer used for the bias. - If None, then ConstantInitializer() is used + bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used act: Activation to be applied to the output of FC layer name: Name/alias of the function main_program: Name of the main program that calls this From d5e327945145f30e09209db04a0a4066fd5eeae7 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 4 Dec 2017 18:50:36 +0800 Subject: [PATCH 177/275] While op forward for sentimental analysis (#6140) * Add DataFeeder A v2 API like data feeder for book demos. We can feed data directly from reader. 
* Fix CI * Add an unittest for while/rnn op forward * Add unittest for raw while op backward * Fix CI --- paddle/framework/backward.cc | 41 ++++++++- paddle/framework/block_desc.cc | 3 + paddle/framework/executor.cc | 4 + paddle/framework/op_desc.cc | 7 +- paddle/framework/scope.cc | 20 +++-- paddle/framework/scope.h | 2 + paddle/framework/shape_inference.cc | 5 ++ paddle/operators/increment_op.cc | 2 + paddle/operators/lod_tensor_to_array_op.cc | 21 +++-- paddle/operators/multiplex_op.cc | 8 +- paddle/operators/recurrent_op.cc | 4 +- paddle/operators/sequence_pool_op.cc | 1 + paddle/operators/sum_op.cc | 34 ++++++-- paddle/operators/sum_op.h | 3 + .../operators/tensor_array_read_write_op.cc | 24 +++-- paddle/operators/while_op.cc | 45 +++++++--- python/paddle/v2/fluid/data_feeder.py | 1 - python/paddle/v2/fluid/layers.py | 4 +- python/paddle/v2/fluid/optimizer.py | 3 +- .../book/test_understand_sentiment_lstm.py | 4 +- python/paddle/v2/fluid/tests/test_dyn_rnn.py | 87 +++++++++++++++++++ 21 files changed, 262 insertions(+), 61 deletions(-) create mode 100644 python/paddle/v2/fluid/tests/test_dyn_rnn.py diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 8fd2906107..c8b85caaca 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -27,6 +27,18 @@ namespace paddle { namespace framework { +static std::unordered_set* g_ctrl_flow_ops_ = nullptr; +// Control Flow operators's backward is significantly different from +// computational operators. Hack Code here. +// We should design a better way to backward CtrlFlowOps. 
+static std::unordered_set& CtrlFlowOps() { + if (g_ctrl_flow_ops_ == nullptr) { + g_ctrl_flow_ops_ = + new std::unordered_set{"increment", "lod_rank_table"}; + } + return *g_ctrl_flow_ops_; +} + static inline std::unique_ptr CreateGradOp( const OperatorBase& op, const std::unordered_set& no_grad_set, std::unordered_map* grad_to_var) { @@ -288,12 +300,24 @@ static void CreateGradVarInBlock( for (size_t op_index = grad_op_start_index; op_index < ops.size(); ++op_index) { std::unordered_set new_vars; + auto& ctrl_flow_ops = CtrlFlowOps(); ForEachVarName(ops[op_index]->Outputs(), [&](const std::string& grad_var_name) { - if (block_desc->HasVar(grad_var_name)) { + if (ctrl_flow_ops.find(ops[op_index]->Type()) != + ctrl_flow_ops.end()) { + if (block_desc->HasVarRecursive(grad_var_name)) { + return false; + } + } else { + if (block_desc->HasVar(grad_var_name)) { + return false; + } + } + if (grad_var_name == framework::kEmptyVarName) { return false; } auto var = block_desc->Var(grad_var_name); + VLOG(10) << "Creating Variable " << grad_var_name; new_vars.insert(var->Name()); auto it = param_name_map.find(grad_var_name); if (it == param_name_map.end()) { @@ -333,14 +357,25 @@ std::vector> MakeOpGrad( // All input gradients of forwarding operator do not need to calculate. const std::vector& inputs = op_desc->InputArgumentNames(); if (AllGradInSet(inputs, *no_grad_vars)) { + VLOG(10) << "Drop operator " << op_desc->Type(); return grad_op_descs; // empty vector } + // All output gradients of forwarding operator do not need to calculate. const std::vector& outputs = op_desc->OutputArgumentNames(); + if (AllGradInSet(outputs, *no_grad_vars)) { - for (const std::string& name : inputs) { - no_grad_vars->insert(GradVarName(name)); + VLOG(10) << "Drop operator " << op_desc->Type(); + // FIXME: Hack code here + auto& ctrl_flow_ops = CtrlFlowOps(); + if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) { + // Only computational op need drop input's gradient. 
+ for (const std::string& name : inputs) { + no_grad_vars->insert(GradVarName(name)); + VLOG(10) << " Also drop " << GradVarName(name); + } } + return grad_op_descs; // empty vector } diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc index 11764810e1..6a7a07d5cf 100644 --- a/paddle/framework/block_desc.cc +++ b/paddle/framework/block_desc.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/block_desc.h" +#include "paddle/framework/operator.h" #include "paddle/framework/program_desc.h" namespace paddle { @@ -42,6 +43,8 @@ bool BlockDescBind::HasVar(const std::string &name) const { } VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const { + if (name == kEmptyVarName) return nullptr; + auto it = vars_.find(name); if (it == vars_.end()) { return Parent() == kNoneBlockIndex ? nullptr diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc index 2ffb5b7dbb..83aa927c29 100644 --- a/paddle/framework/executor.cc +++ b/paddle/framework/executor.cc @@ -97,6 +97,10 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id, if (create_local_scope) { local_scope = &scope->NewScope(); for (auto& var : block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + if (var->Persistable()) { auto* ptr = scope->Var(var->Name()); CreateTensor(ptr, var->GetType()); diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 02a8253243..2281d93df9 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -466,7 +466,12 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const { auto var = block_.FindVarRecursive(name); PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name); try { - return framework::make_ddim(var->Shape()); + auto shape = var->Shape(); + if (shape.empty()) { + return framework::make_ddim({0UL}); + } 
else { + return framework::make_ddim(var->Shape()); + } } catch (...) { VLOG(5) << "GetDim of variable " << name << " error"; std::rethrow_exception(std::current_exception()); diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc index 9ad6272c99..656736e238 100644 --- a/paddle/framework/scope.cc +++ b/paddle/framework/scope.cc @@ -36,12 +36,9 @@ Scope& Scope::NewScope() const { } Variable* Scope::Var(const std::string& name) { - auto iter = vars_.find(name); - if (iter != vars_.end()) { - VLOG(3) << "Get existing variable " << name; - return iter->second; - } - Variable* v = new Variable(); + auto* v = FindVarLocally(name); + if (v != nullptr) return v; + v = new Variable(); vars_[name] = v; VLOG(3) << "Create variable " << name; v->name_ = &(vars_.find(name)->first); @@ -57,8 +54,10 @@ Variable* Scope::Var(std::string* name) { } Variable* Scope::FindVar(const std::string& name) const { - auto it = vars_.find(name); - if (it != vars_.end()) return it->second; + auto var = FindVarLocally(name); + if (var != nullptr) { + return var; + } return (parent_ == nullptr) ? nullptr : parent_->FindVar(name); } @@ -116,6 +115,11 @@ std::string Scope::Rename(const std::string& origin_name) const { Rename(origin_name, var_name); return var_name; } +Variable* Scope::FindVarLocally(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) return it->second; + return nullptr; +} } // namespace framework } // namespace paddle diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index c2aafb6ad8..56e815db54 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -76,6 +76,8 @@ class Scope { std::string Rename(const std::string& origin_name) const; private: + Variable* FindVarLocally(const std::string& name) const; + // Call Scope::NewScope for a sub-scope. 
explicit Scope(Scope const* parent) : parent_(parent) {} diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc index 2298507471..7dac1cfd5e 100644 --- a/paddle/framework/shape_inference.cc +++ b/paddle/framework/shape_inference.cc @@ -12,6 +12,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/framework/shape_inference.h" +#include "grad_op_desc_maker.h" +#include "paddle/framework/operator.h" namespace paddle { namespace framework { @@ -49,6 +51,9 @@ void InferShapeContext::SetDims(const std::vector &names, size_t length = names.size(); PADDLE_ENFORCE_EQ(length, dims.size()); for (size_t i = 0; i < length; ++i) { + if (names[i] == framework::kEmptyVarName) { + continue; + } SetDim(names[i], dims[i]); } } diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc index 35efb12932..54911267e3 100644 --- a/paddle/operators/increment_op.cc +++ b/paddle/operators/increment_op.cc @@ -61,6 +61,8 @@ class IncrementOp : public framework::OperatorBase { out.Resize(x.dims()); out.mutable_data(x.place(), x.type()); float value = Attr("step"); + VLOG(10) << Output("Out") << " increase " << Input("X") << " with " + << value; framework::VisitDataType(framework::ToDataType(out.type()), IncrementFunctor(x, &out, value)); } diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index 010c79d4e1..b970bf3177 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -14,6 +14,7 @@ #include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -32,15 +33,20 @@ class LoDTensorToArrayOp : public framework::OperatorBase { : OperatorBase(type, inputs, outputs, attrs) {} void Run(const framework::Scope &scope, 
const platform::DeviceContext &dev_ctx) const override { - auto &x = scope.FindVar(Input("X"))->Get(); - auto &rank_table = - scope.FindVar(Input("RankTable"))->Get(); - auto &out = - *scope.FindVar(Output("Out"))->GetMutable(); - + auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s", + Input("X")) + .Get(); + auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable"))) + .Get(); + auto &out = *detail::Ref(scope.FindVar(Output("Out"))) + .GetMutable(); auto &items = rank_table.items(); auto max_seq_len = items[0].length; auto rank_level = rank_table.level(); + + PADDLE_ENFORCE_LT(rank_level, x.lod().size(), + "Input should be a LOD tensor, and size is at least %d", + rank_level + 1); out.resize(max_seq_len); std::vector> copy_ranges(max_seq_len); @@ -55,16 +61,13 @@ class LoDTensorToArrayOp : public framework::OperatorBase { size_t start_idx = x.lod()[rank_level][item.index] + t; auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset( x.lod(), start_idx, start_idx + 1, rank_level + 1); - auto &lod_length = lod_and_offset.first; framework::AppendLoD(&lod, lod_length); - size_t start_offset = lod_and_offset.second.first; size_t end_offset = lod_and_offset.second.second; copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset}); } } - for (size_t i = 0; i < max_seq_len; ++i) { auto &ranges = copy_ranges[i]; size_t height = std::accumulate( diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index f8527dfab3..8e7f544e0d 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -99,13 +99,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel { "Output(X@Grad) should not be null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null."); - std::vector d_ins; - auto ins = ctx->GetInputsDim("X"); - // No need to compute gradient for Input(Ids) - for (size_t i = 0; i < ins.size(); i++) { - d_ins.push_back(ins[i]); - } - 
ctx->SetOutputsDim(framework::GradVarName("X"), d_ins); + ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X")); } protected: diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index c976e22c77..8b60b9c912 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -599,7 +599,9 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase { std::vector output{kOutputs}; for (auto &s : input) { PADDLE_ENFORCE(ctx->HasInputs(s)); - PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s))); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)), + "Cannot find the gradient variable %s", + framework::GradVarName(s)); } for (auto &s : output) { PADDLE_ENFORCE(ctx->HasInputs(s)); diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index 2a000ac60b..a2f4257037 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -104,6 +104,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch."); } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); } protected: diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index ddc210c26e..744b2fe3f2 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -37,10 +37,16 @@ class SumOp : public framework::OperatorWithKernel { size_t N = x_dims.size(); PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1."); - auto in_dim = x_dims[0]; - for (size_t i = 1; i < N; i++) { - auto dim = x_dims[i]; - PADDLE_ENFORCE_EQ(in_dim, dim, "Input tensors must have same shape"); + framework::DDim in_dim({0}); + for (auto& x_dim : x_dims) { + if (framework::product(x_dim) == 0) { + continue; + } + if (framework::product(in_dim) == 0) { + in_dim = x_dim; + } else { + PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must 
have same shape"); + } } ctx->SetOutputDim("Out", in_dim); ctx->ShareLoD("X", /*->*/ "Out"); @@ -51,9 +57,23 @@ class SumOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); if (x_vars[0]->IsType()) { - return framework::OpKernelType( - framework::ToDataType(x_vars[0]->Get().type()), - ctx.device_context()); + int dtype = -1; + for (auto& x_var : x_vars) { + auto& lod_tensor = x_var->Get(); + if (lod_tensor.numel() == 0) { + continue; + } + if (dtype == -1) { + dtype = framework::ToDataType(lod_tensor.type()); + } else { + PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type())); + } + } + PADDLE_ENFORCE_NE(dtype, -1, + "Sum operator should have at least one tensor"); + + return framework::OpKernelType(static_cast(dtype), + ctx.device_context()); } else if (x_vars[0]->IsType()) { return framework::OpKernelType( framework::ToDataType( diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index a1eb3b014e..ed6c80ce60 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -53,6 +53,9 @@ class SumKernel : public framework::OpKernel { for (int i = in_place ? 
1 : 0; i < N; i++) { if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); + if (in_t.numel() == 0) { + continue; + } auto in = EigenVector::Flatten(in_t); result.device(place) = result + in; } else if (in_vars[i]->IsType()) { diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index efde850143..4eb8b60f47 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -27,7 +27,7 @@ class WriteToArrayOp : public ArrayOp { void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { auto *x = scope.FindVar(Input("X")); - PADDLE_ENFORCE(x != nullptr, "X must be set"); + if (x == nullptr) return; auto &x_tensor = x->Get(); size_t offset = GetOffset(scope, dev_ctx); auto *out = @@ -76,7 +76,9 @@ class WriteToArrayInferShape : public framework::InferShapeBase { PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index"); PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1, "The number of element of subscript index must be 1"); - PADDLE_ENFORCE(context->HasInput("X"), NotHasXError()); + if (!context->HasInput("X")) { + return; + } PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError()); context->SetOutputDim("Out", context->GetInputDim("X")); } @@ -99,9 +101,10 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name), "Cannot found %s", out_name); out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY); - auto &x = - detail::Ref(block->FindVarRecursive(x_name), "Cannot found %s", x_name); - out.SetDataType(x.GetDataType()); + auto *x = block->FindVarRecursive(x_name); + if (x != nullptr) { + out.SetDataType(x->GetDataType()); + } } }; @@ -121,10 +124,13 @@ class ReadFromArrayOp : public ArrayOp { PADDLE_ENFORCE(out != nullptr, "Out must be set"); auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, 
dev_ctx); - PADDLE_ENFORCE_LT(offset, x_array.size()); - framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx, - out_tensor); - out_tensor->set_lod(x_array[offset].lod()); + if (offset < x_array.size()) { + framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx, + out_tensor); + out_tensor->set_lod(x_array[offset].lod()); + } else { + VLOG(10) << "offset " << offset << " >= " << x_array.size(); + } } }; diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 59460f6c87..9b3f21cf94 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -98,8 +98,6 @@ class WhileGradOp : public framework::OperatorBase { void Run(const framework::Scope &scope, const platform::DeviceContext &dev_ctx) const override { - // PADDLE_ENFORCE(...) - framework::Executor executor(dev_ctx); auto *block = Attr(kStepBlock); auto *program = block->Program(); @@ -124,8 +122,12 @@ class WhileGradOp : public framework::OperatorBase { auto inside_og_name = inside_og_names[i]; VLOG(10) << "Linking outside " << outside_og_name << " --> inside " << inside_og_name; - auto &og_outside = detail::Ref(scope.FindVar(outside_og_name)); - auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name)); + auto &og_outside = + detail::Ref(scope.FindVar(outside_og_name), + "Cannot find Outside Gradient %s", outside_og_name); + auto &og_inside = + detail::Ref(cur_scope.Var(inside_og_name), + "Cannot find inside gradient %s", inside_og_name); if (og_outside.Type().hash_code() == typeid(framework::LoDTensor).hash_code()) { auto &outside_tensor = og_outside.Get(); @@ -160,7 +162,7 @@ class WhileGradOp : public framework::OperatorBase { PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { if (pg_names[param_id] == framework::kEmptyVarName) { - continue; // iterator doesn't have gradient + continue; // parameter doesn't have gradient } auto inside_grad_name = 
framework::GradVarName(p_names[param_id]); @@ -190,7 +192,6 @@ class WhileGradOp : public framework::OperatorBase { } } - // sum gradient auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, @@ -207,18 +208,35 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; protected: - virtual std::unique_ptr Apply() const { + std::unique_ptr Apply() const override { auto *grad = new framework::OpDescBind(); grad->SetType("while_grad"); grad->SetInput(kParameters, Input(kParameters)); - grad->SetOutput( - framework::GradVarName(kParameters), - InputGrad(kParameters, /*do not drop empty gradient*/ false)); + + // Not all of IGs will be generated by inner gradient operators of while op. + // Ignore IGs that is not generated by the inside block. + auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false); + std::unordered_set all_outs; + for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) { + for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) { + all_outs.insert(oname); + } + } + for (auto &each_ig : igs) { + if (all_outs.find(each_ig) == all_outs.end()) { + VLOG(10) << "Ignore " << each_ig; + each_ig = framework::kEmptyVarName; + } + } + + grad->SetOutput(framework::GradVarName(kParameters), igs); + grad->SetInput(kOutputs, Output(kOutputs)); // OG should be re-calculated by step blocks, since many outputs of while op // do not need to calculate gradients. std::unordered_set block_ins; + auto *fwd_block = this->grad_block_[0]->ParentBlock(); { for (auto &p : Input(kParameters)) { block_ins.insert(p); @@ -233,6 +251,13 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { if (block_ins.find(input_name) != block_ins.end()) { continue; } + + // If the input of Op is generated by the forward block, do not make it + // as input again. 
+ if (fwd_block->FindVar(input_name) != nullptr) { + continue; + } + extra_inputs.insert(input_name); } diff --git a/python/paddle/v2/fluid/data_feeder.py b/python/paddle/v2/fluid/data_feeder.py index 3dee0b5b73..30a542af21 100644 --- a/python/paddle/v2/fluid/data_feeder.py +++ b/python/paddle/v2/fluid/data_feeder.py @@ -1,5 +1,4 @@ from __future__ import print_function - import core import numpy import six.moves as six diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 5568619fe6..99d0ac4a1b 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -10,7 +10,7 @@ from param_attr import ParamAttr __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', 'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim', - 'batch_norm', 'accuracy', 'split_lod_tensor' + 'batch_norm', 'accuracy', 'split_lod_tensor', 'While' ] @@ -1439,7 +1439,7 @@ def increment(x, value=1.0, in_place=True, main_program=None): type='increment', inputs={'X': [x]}, outputs={'Out': [out]}, - attrs={'step': value}) + attrs={'step': float(value)}) return out diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 934e024742..719e3b2563 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -197,8 +197,7 @@ class Optimizer(object): This method combines interface `append_backward_ops()` and `create_optimization_pass()` into one. 
""" - params_grads = append_backward_ops(loss, parameter_list, no_grad_set or - set()) + params_grads = append_backward_ops(loss, parameter_list, no_grad_set) # Add regularization if any params_grads = append_regularization_ops(params_grads) optimize_ops = self.create_optimization_pass(params_grads, loss, diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py index b247932033..80f8599679 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py @@ -8,7 +8,8 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): name="words", shape=[seq_len * batch_size, 1], append_batch_size=False, - dtype="int64") + dtype="int64", + lod_level=1) label = fluid.layers.data( name="label", shape=[batch_size, 1], @@ -21,6 +22,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): c_pre_init = fluid.layers.fill_constant( dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0) + c_pre_init.stop_gradient = False layer_1_out = fluid.layers.lstm( emb, c_pre_init=c_pre_init, hidden_dim=emb_dim) layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2]) diff --git a/python/paddle/v2/fluid/tests/test_dyn_rnn.py b/python/paddle/v2/fluid/tests/test_dyn_rnn.py new file mode 100644 index 0000000000..271e39a0e0 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py @@ -0,0 +1,87 @@ +import paddle.v2.fluid as fluid +import paddle.v2 as paddle +import unittest +import numpy + + +class TestDynRNN(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + self.BATCH_SIZE = 100 + self.train_data = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), + batch_size=self.BATCH_SIZE) + + def test_plain_while_op(self): + main_program = fluid.Program() + startup_program = fluid.Program() + + with 
fluid.program_guard(main_program, startup_program): + sentence = fluid.layers.data( + name='word', shape=[1], dtype='int64', lod_level=1) + sent_emb = fluid.layers.embedding( + input=sentence, size=[len(self.word_dict), 32], dtype='float32') + + label = fluid.layers.data(name='label', shape=[1], dtype='float32') + + rank_table = fluid.layers.lod_rank_table(x=sent_emb) + + sent_emb_array = fluid.layers.lod_tensor_to_array( + x=sent_emb, table=rank_table) + + seq_len = fluid.layers.max_sequence_len(rank_table=rank_table) + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) + i.stop_gradient = False + + boot_mem = fluid.layers.fill_constant_batch_size_like( + input=fluid.layers.array_read( + array=sent_emb_array, i=i), + value=0, + shape=[-1, 100], + dtype='float32') + boot_mem.stop_gradient = False + + mem_array = fluid.layers.array_write(x=boot_mem, i=i) + + cond = fluid.layers.less_than(x=i, y=seq_len) + cond.stop_gradient = False + while_op = fluid.layers.While(cond=cond) + out = fluid.layers.create_array(dtype='float32') + + with while_op.block(): + mem = fluid.layers.array_read(array=mem_array, i=i) + ipt = fluid.layers.array_read(array=sent_emb_array, i=i) + + mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table) + + hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh') + fluid.layers.array_write(x=hidden, i=i, array=out) + fluid.layers.increment(x=i, in_place=True) + fluid.layers.array_write(x=hidden, i=i, array=mem_array) + fluid.layers.less_than(x=i, y=seq_len, cond=cond) + + all_timesteps = fluid.layers.array_to_lod_tensor( + x=out, table=rank_table) + last = fluid.layers.sequence_pool( + input=all_timesteps, pool_type='last') + logits = fluid.layers.fc(input=last, size=1, act=None) + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=logits, label=label) + loss = fluid.layers.mean(x=loss) + sgd = fluid.optimizer.SGD(1e-4) + sgd.minimize(loss=loss) + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + 
exe.run(startup_program) + feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu) + + data = next(self.train_data()) + val = exe.run(main_program, feed=feeder.feed(data), + fetch_list=[loss])[0] + self.assertEqual((1, ), val.shape) + print(val) + self.assertFalse(numpy.isnan(val)) + + +if __name__ == '__main__': + unittest.main() From 5bd1e73f5e6e7532bd1b13b1c0924ba70ae5cd1a Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 5 Dec 2017 00:25:39 +0800 Subject: [PATCH 178/275] Refine and speedup momentum operator. --- paddle/operators/momentum_op.cc | 4 +- paddle/operators/momentum_op.cu | 66 +++++++++++++++++++++++++++++++-- paddle/operators/momentum_op.h | 13 +++---- 3 files changed, 70 insertions(+), 13 deletions(-) diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc index 1995400619..fde253b0b3 100644 --- a/paddle/operators/momentum_op.cc +++ b/paddle/operators/momentum_op.cc @@ -101,5 +101,5 @@ $$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker); -REGISTER_OP_CPU_KERNEL( - momentum, ops::MomentumOpKernel); +REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel, + ops::MomentumOpKernel); diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu index efc24e795e..d856df4002 100644 --- a/paddle/operators/momentum_op.cu +++ b/paddle/operators/momentum_op.cu @@ -12,9 +12,67 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#define EIGEN_USE_GPU -#include "paddle/operators/momentum_op.h" +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +__global__ void MomentumKernel(const T* p, const T* g, const T* v, + const T* learning_rate, const T mu, + const int64_t num, bool use_nesterov, T* p_out, + T* v_out) { + T lr = learning_rate[0]; + if (use_nesterov) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T g_val = g[i]; + T v_new = v[i] * mu + g_val; + v_out[i] = v_new; + p_out[i] = p[i] - g_val * lr + v_new * mu * lr; + } + } else { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; + i += blockDim.x * gridDim.x) { + T v_new = v[i] * mu + g[i]; + v_out[i] = v_new; + p_out[i] = p[i] - lr * v_new; + } + } +} + +template +class MomentumOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto param_out = ctx.Output("ParamOut"); + auto velocity_out = ctx.Output("VelocityOut"); + auto param = ctx.Input("Param"); + auto velocity = ctx.Input("Velocity"); + auto grad = ctx.Input("Grad"); + auto learning_rate = ctx.Input("LearningRate"); + + T* p_out = param_out->mutable_data(ctx.GetPlace()); + T* v_out = velocity_out->mutable_data(ctx.GetPlace()); + + T mu = static_cast(ctx.Attr("mu")); + bool use_nesterov = ctx.Attr("use_nesterov"); + + auto* p = param->data(); + auto* v = velocity->data(); + auto* g = grad->data(); + auto* lr = learning_rate->data(); + + int block = 512; + int grid = (param->numel() + block - 1) / block; + MomentumKernel<<>>( + p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out); + } +}; + +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - momentum, ops::MomentumOpKernel); +REGISTER_OP_GPU_KERNEL(momentum, ops::MomentumOpCUDAKernel, + ops::MomentumOpCUDAKernel); diff --git a/paddle/operators/momentum_op.h 
b/paddle/operators/momentum_op.h index 8f7f5eb5c2..2d919573d2 100644 --- a/paddle/operators/momentum_op.h +++ b/paddle/operators/momentum_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class MomentumOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -33,7 +33,7 @@ class MomentumOpKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()); velocity_out->mutable_data(ctx.GetPlace()); - float mu = ctx.Attr("mu"); + T mu = static_cast(ctx.Attr("mu")); bool use_nesterov = ctx.Attr("use_nesterov"); auto p_out = framework::EigenVector::Flatten(*param_out); @@ -42,18 +42,17 @@ class MomentumOpKernel : public framework::OpKernel { auto p = framework::EigenVector::Flatten(*param); auto v = framework::EigenVector::Flatten(*velocity); auto g = framework::EigenVector::Flatten(*grad); - auto lr = framework::EigenVector::Flatten(*learning_rate); + auto* lr = learning_rate->data(); - auto place = ctx.GetEigenDevice(); + auto place = ctx.GetEigenDevice(); Eigen::DSizes grad_dsize(grad->numel()); v_out.device(place) = v * mu + g; if (use_nesterov) { - p_out.device(place) = p - g * lr.broadcast(grad_dsize) + - v_out * mu * lr.broadcast(grad_dsize); + p_out.device(place) = p - (g - v_out * mu) * lr[0]; } else { - p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out; + p_out.device(place) = p - lr[0] * v_out; } } }; From 488908e95b1f17be97ab295e0971cd7832d703c7 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 4 Dec 2017 20:55:28 +0800 Subject: [PATCH 179/275] refine cuda --- paddle/operators/elementwise_op_function.h | 55 ++++++++++++++++++++-- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 22b96b9312..09ab42b501 100644 --- a/paddle/operators/elementwise_op_function.h +++ 
b/paddle/operators/elementwise_op_function.h @@ -18,6 +18,10 @@ #include "paddle/framework/operator.h" #include "paddle/platform/transform.h" +#ifdef __NVCC__ +#include +#endif + #include "paddle/operators/math/math_function.h" namespace paddle { @@ -74,12 +78,12 @@ struct RowwiseTransformIterator { bool operator==( const RowwiseTransformIterator& rhs) const { - return &(this->operator*()) == &(*rhs); + return (ptr_ + i_) == &(*rhs); } bool operator!=( const RowwiseTransformIterator& rhs) const { - return &(this->operator*()) &= &(*rhs); + return (ptr_ + i_) &= &(*rhs); } const T& operator*() { return ptr_[i_]; } @@ -108,12 +112,12 @@ struct MidWiseTransformIterator { bool operator==( const MidWiseTransformIterator& rhs) const { - return &(this->operator*()) == &(*rhs); + return (ptr_ + i_) == &(*rhs); } bool operator!=( const MidWiseTransformIterator& rhs) const { - return &(this->operator*()) &= &(*rhs); + return (ptr_ + i_) &= &(*rhs); } const T& operator*() { return ptr_[i_]; } @@ -125,6 +129,49 @@ struct MidWiseTransformIterator { int post_; }; +#ifdef __NVCC__ +template +struct RowwiseTransformIterator + : public thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + RowwiseTransformIterator, const T*> + super_t; + __host__ __device__ RowwiseTransformIterator(const T* x, int n) + : super_t(x), begin_(x), n_(n){}; + friend class thrust::iterator_core_access; + + private: + unsigned int n_; + const T* begin_; + __host__ __device__ typename super_t::reference dereference() const { + return *(begin_ + (this->base() - begin_) % n_); + } +}; + +template +struct MidWiseTransformIterator + : public thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> { + public: + typedef thrust::iterator_adaptor< + MidWiseTransformIterator, const T*> + super_t; + __host__ __device__ MidWiseTransformIterator(const T* x, int n, int post) + : super_t(x), begin_(x), n_(n), post_(post){}; + friend class 
thrust::iterator_core_access; + + private: + unsigned int post_; + unsigned int n_; + const T* begin_; + __host__ __device__ typename super_t::reference dereference() const { + return *(begin_ + (((this->base() - begin_) / post_) % n_)); + } +}; +#endif + template struct TransformFunctor { TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, From 33fe7415c7070494f347cf9ba75c878ecffeeb52 Mon Sep 17 00:00:00 2001 From: "Wang,Jeff" Date: Mon, 4 Dec 2017 12:09:41 -0800 Subject: [PATCH 180/275] Provide more details on the CONTENT_DIR. Change the instructions to use https instead of SSH --- doc/howto/dev/write_docs_cn.rst | 43 +++++++++++++++++++-------------- doc/howto/dev/write_docs_en.rst | 31 +++++++++++++++++------- doc/howto/index_cn.rst | 1 - 3 files changed, 47 insertions(+), 28 deletions(-) diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index 3dddbbe506..b19918a5f8 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -3,7 +3,7 @@ ################## PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。 - +也可以利用PaddlePaddle 工具来编译文档,这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下 如何构建文档 ============ @@ -13,40 +13,52 @@ PaddlePaddle的文档构建有三种方式。 使用PaddlePaddle.org工具 -------------- -这个是目前推荐的使用方法。除了可以自动编役文档,也可以直接在网页预览文档。 +这个是目前推荐的使用方法。除了可以自动编译文档,也可以直接在网页预览文档。 文件工具是使用Docker,需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具 .. 
code-block:: bash - mkdir paddlepaddle + mkdir paddlepaddle # Create paddlepaddle working directory cd paddlepaddle - git clone git@github.com:PaddlePaddle/Paddle.git - git clone git@github.com:PaddlePaddle/book.git - git clone git@github.com:PaddlePaddle/models.git + + # Clone the content repositories + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git docker run -it -p 8000:8000 paddlepaddle/paddlepaddle.org:latest 之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 如果不想使用 Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 .. code-block:: bash - mkdir paddlepaddle + mkdir paddlepaddle # Create paddlepaddle working directory cd paddlepaddle - git clone git@github.com:PaddlePaddle/Paddle.git - git clone git@github.com:PaddlePaddle/book.git - git clone git@github.com:PaddlePaddle/models.git - git clone git@github.com:PaddlePaddle/PaddlePaddle.org.git - export CONTENT_DIR= + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. 
In the current setting, it should be pwd + export CONTENT_DIR= export ENV='' cd PaddlePaddle.org/portal/ pip install -r requirements.txt python manage.py runserver +工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。 之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。 -想了解更多关於 PaddlePaddle.org 工具,可以 `点击这里 `_ 。 +编译后的文件将被存储在工作目录 /.ppo_workspace/content。 + +想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 使用Docker构建 -------------- @@ -85,11 +97,6 @@ PaddlePaddle的文档构建有三种方式。 PaddlePaddle文档使用 `sphinx`_ 自动生成,用户可以参考sphinx教程进行书写。 -如何更新文档主题 -================ - -PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下,包含所有和前端网页设计相关的文件。 - 如何更新www.paddlepaddle.org ============================ diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst index 0e60e21889..ed37737517 100644 --- a/doc/howto/dev/write_docs_en.rst +++ b/doc/howto/dev/write_docs_en.rst @@ -4,6 +4,7 @@ Contribute Documentation PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories. +When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content How to Build Documentations ============ @@ -19,26 +20,36 @@ The tool uses Docker, please install it on your system. Please check Docker offi .. code-block:: bash - mkdir paddlepaddle + mkdir paddlepaddle # Create paddlepaddle working directory cd paddlepaddle - git clone git@github.com:PaddlePaddle/Paddle.git - git clone git@github.com:PaddlePaddle/book.git - git clone git@github.com:PaddlePaddle/models.git + + # Clone the content repositories. 
You may only clone the contents you need + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git docker run -it -p 8000:8000 paddlepaddle/paddlepaddle.org:latest Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +The compiled documentations will be stored in /.ppo_workspace/content + If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up .. code-block:: bash - mkdir paddlepaddle + mkdir paddlepaddle # Create paddlepaddle working directory cd paddlepaddle - git clone git@github.com:PaddlePaddle/Paddle.git - git clone git@github.com:PaddlePaddle/book.git - git clone git@github.com:PaddlePaddle/models.git - git clone git@github.com:PaddlePaddle/PaddlePaddle.org.git + + # Clone the content repositories and PaddlePaddle.org + git clone https://github.com/PaddlePaddle/Paddle.git + git clone https://github.com/PaddlePaddle/book.git + git clone https://github.com/PaddlePaddle/models.git + git clone https://github.com/PaddlePaddle/Mobile.git + git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git + + # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd export CONTENT_DIR= export ENV='' cd PaddlePaddle.org/portal/ @@ -46,6 +57,8 @@ If you don't wish to use Docker, you can also activate the tool through Django. 
python manage.py runserver Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +The compiled documentations will be stored in /.ppo_workspace/content + If you want to learn more on the PaddlePaddle.org, please `click here `_ 。 How to write Documentations diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index 8ea99ea40c..991b9e2596 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -19,7 +19,6 @@ .. toctree:: :maxdepth: 1 - dev/build_cn.rst dev/contribute_to_paddle_cn.md dev/write_docs_cn.rst From 2da4a89e1f591b2d9e430f59b516866651df1d25 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 5 Dec 2017 09:58:09 +0800 Subject: [PATCH 181/275] fix typo in pip_install_cn/en.rst --- doc/getstarted/build_and_install/pip_install_cn.rst | 4 ++-- doc/getstarted/build_and_install/pip_install_en.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst index b26bf4c95c..b270e2c2f0 100644 --- a/doc/getstarted/build_and_install/pip_install_cn.rst +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -34,7 +34,7 @@ PaddlePaddle可以使用常用的Python包管理工具 :align: center .. 
csv-table:: 各个版本最新的whl包 - :header: "版本说明", "cp27-cp27mu", "cp27-cp27mu", "C-API" + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" :widths: 1, 3, 3, 3 "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" @@ -83,4 +83,4 @@ PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 链接中找到。 - 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。 \ No newline at end of file + 如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新; 如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。 diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst index 113790e4e4..70f601a11c 100644 --- a/doc/getstarted/build_and_install/pip_install_en.rst +++ b/doc/getstarted/build_and_install/pip_install_en.rst @@ -37,7 +37,7 @@ If the links below shows up the login form, just click "Log in as guest" to star :align: center .. 
csv-table:: whl package of each version - :header: "version", "cp27-cp27mu", "cp27-cp27mu", "C-API" + :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" :widths: 1, 3, 3, 3 "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" From 54f09620cf6ee80224270cbf400cdbbcbdd19697 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 5 Dec 2017 08:43:08 +0800 Subject: [PATCH 182/275] code refine --- paddle/operators/elementwise_op_function.h | 19 +++++-------------- 1 file changed, 5 insertions(+), 14 deletions(-) diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 09ab42b501..14da42a786 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -70,9 +70,7 @@ struct RowwiseTransformIterator { RowwiseTransformIterator& operator++() { ++i_; - if (i_ == n_) { - i_ = 0; - } + i_ %= n_; return *this; } @@ -90,7 +88,7 @@ struct RowwiseTransformIterator { const T* ptr_; int i_; - int n_; + int64_t n_; }; template @@ -99,14 +97,7 @@ struct MidWiseTransformIterator { : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} MidWiseTransformIterator& operator++() { - ++j_; - if (j_ == post_) { - j_ = 0; - ++i_; - if (i_ == n_) { - i_ = 0; - } - } + i_ = ++j_ / post_ % n_; return *this; } @@ -124,8 +115,8 @@ struct MidWiseTransformIterator { const T* ptr_; int i_; - int j_; - int n_; + int64_t j_; + int64_t n_; int post_; }; From e03b574e0e54ed89341766e9d8f12d7920abdf4d Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 5 Dec 2017 10:42:33 +0800 Subject: [PATCH 183/275] Refine the Eigen usage for CPU implementation. 
--- paddle/operators/momentum_op.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h index 2d919573d2..da69532ea5 100644 --- a/paddle/operators/momentum_op.h +++ b/paddle/operators/momentum_op.h @@ -44,15 +44,11 @@ class MomentumOpKernel : public framework::OpKernel { auto g = framework::EigenVector::Flatten(*grad); auto* lr = learning_rate->data(); - auto place = ctx.GetEigenDevice(); - - Eigen::DSizes grad_dsize(grad->numel()); - - v_out.device(place) = v * mu + g; + v_out = v * mu + g; if (use_nesterov) { - p_out.device(place) = p - (g - v_out * mu) * lr[0]; + p_out = p - (g - v_out * mu) * lr[0]; } else { - p_out.device(place) = p - lr[0] * v_out; + p_out = p - lr[0] * v_out; } } }; From 576c41b3486864938509dee6a3ecac7b258f7ce5 Mon Sep 17 00:00:00 2001 From: "Wang,Jeff" Date: Mon, 4 Dec 2017 18:57:57 -0800 Subject: [PATCH 184/275] Update the incorrect docker commands. Should always specify -v flag. 
--- doc/howto/dev/write_docs_cn.rst | 4 +++- doc/howto/dev/write_docs_en.rst | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst index b19918a5f8..1bc947c260 100644 --- a/doc/howto/dev/write_docs_cn.rst +++ b/doc/howto/dev/write_docs_cn.rst @@ -28,8 +28,10 @@ PaddlePaddle的文档构建有三种方式。 git clone https://github.com/PaddlePaddle/models.git git clone https://github.com/PaddlePaddle/Mobile.git - docker run -it -p 8000:8000 paddlepaddle/paddlepaddle.org:latest + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest +注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令 之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 编译后的文件将被存储在工作目录 /.ppo_workspace/content。 diff --git a/doc/howto/dev/write_docs_en.rst b/doc/howto/dev/write_docs_en.rst index ed37737517..b3ef07eb1d 100644 --- a/doc/howto/dev/write_docs_en.rst +++ b/doc/howto/dev/write_docs_en.rst @@ -29,8 +29,10 @@ The tool uses Docker, please install it on your system. 
Please check Docker offi git clone https://github.com/PaddlePaddle/models.git git clone https://github.com/PaddlePaddle/Mobile.git - docker run -it -p 8000:8000 paddlepaddle/paddlepaddle.org:latest + # Please specify the working directory through -v + docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest +Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation The compiled documentations will be stored in /.ppo_workspace/content From ee0d365a3a21ab8881ad88b252a5d117b87bc726 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 5 Dec 2017 11:36:08 +0800 Subject: [PATCH 185/275] add inference benchmark data --- benchmark/IntelOptimizedPaddle.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 16c2390fd3..c275aeb5cb 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -23,6 +23,8 @@ On each machine, we will test and compare the performance of training on single ## Benchmark Model ### Server + +#### Training Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz Input image size - 3 * 224 * 224, Time: images/second @@ -62,6 +64,34 @@ TBD chart on batch size 128 TBD +#### Inference +Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz +- VGG-19 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|-------|-------|-------|-------| +| OpenBLAS | 0.36 | 0.48 | 0.56 | 0.50 | 0.43 | +| MKLML | 5.41 | 9.52 | 14.71 | 20.46 | 29.35 | +| MKL-DNN | 65.52 | 89.94 | 83.92 | 94.77 | 95.78 | + +- ResNet-50 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|--------|--------|--------|--------| +| OpenBLAS | 0.29 | 0.43 | 0.71 | 0.85 | 0.71 | +| MKLML | 6.26 | 11.88 | 21.37 | 39.67 | 59.01 | +| 
MKL-DNN | 90.27 | 134.03 | 136.03 | 153.66 | 211.22 | + + +- GoogLeNet + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|--------|--------|--------|--------|--------| +| OpenBLAS | 12.47 | 12.36 | 12.25 | 12.13 | 12.08 | +| MKLML | 22.50 | 43.90 | 81.22 | 132.92 | 199.69 | +| MKL-DNN | 221.69 | 341.33 | 428.09 | 528.24 | 624.18 | + + ### Laptop TBD ### Desktop From 9e244a8cbe8e0d089e0f3d402230a1d5f2ffcbb9 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 5 Dec 2017 10:59:43 +0800 Subject: [PATCH 186/275] follow comments --- paddle/operators/elementwise_add_op.h | 4 ++-- paddle/operators/elementwise_op_function.h | 28 +++++++++++----------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index 686d45573d..3a198c167e 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -34,8 +34,8 @@ class ElementwiseAddKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - TransformFunctor, T, Place> functor(x, y, z, ctx, - AddFunctor()); + TransformFunctor, T, Place> functor( + x, y, z, ctx.device_context(), AddFunctor()); auto x_dims = x->dims(); auto y_dims = y->dims(); diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 14da42a786..6d849bff49 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -81,7 +81,7 @@ struct RowwiseTransformIterator { bool operator!=( const RowwiseTransformIterator& rhs) const { - return (ptr_ + i_) &= &(*rhs); + return (ptr_ + i_) != &(*rhs); } const T& operator*() { return ptr_[i_]; } @@ -97,7 +97,7 @@ struct MidWiseTransformIterator { : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} MidWiseTransformIterator& operator++() { - i_ = ++j_ / post_ % n_; + i_ = (++j_ / post_) % n_; return *this; } @@ -108,7 +108,7 @@ struct 
MidWiseTransformIterator { bool operator!=( const MidWiseTransformIterator& rhs) const { - return (ptr_ + i_) &= &(*rhs); + return (ptr_ + i_) != &(*rhs); } const T& operator*() { return ptr_[i_]; } @@ -129,14 +129,14 @@ struct RowwiseTransformIterator typedef thrust::iterator_adaptor< RowwiseTransformIterator, const T*> super_t; - __host__ __device__ RowwiseTransformIterator(const T* x, int n) + HOSTDEVICE RowwiseTransformIterator(const T* x, int n) : super_t(x), begin_(x), n_(n){}; friend class thrust::iterator_core_access; private: unsigned int n_; const T* begin_; - __host__ __device__ typename super_t::reference dereference() const { + HOSTDEVICE typename super_t::reference dereference() const { return *(begin_ + (this->base() - begin_) % n_); } }; @@ -149,7 +149,7 @@ struct MidWiseTransformIterator typedef thrust::iterator_adaptor< MidWiseTransformIterator, const T*> super_t; - __host__ __device__ MidWiseTransformIterator(const T* x, int n, int post) + HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) : super_t(x), begin_(x), n_(n), post_(post){}; friend class thrust::iterator_core_access; @@ -157,7 +157,7 @@ struct MidWiseTransformIterator unsigned int post_; unsigned int n_; const T* begin_; - __host__ __device__ typename super_t::reference dereference() const { + HOSTDEVICE typename super_t::reference dereference() const { return *(begin_ + (((this->base() - begin_) / post_) % n_)); } }; @@ -166,7 +166,7 @@ struct MidWiseTransformIterator template struct TransformFunctor { TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z, const framework::ExecutionContext& ctx, + framework::Tensor* z, const platform::DeviceContext& ctx, Functor func) : x_(x->data()), y_(y->data()), @@ -177,26 +177,26 @@ struct TransformFunctor { inline void Run() const { platform::Transform trans; - trans(ctx_.device_context(), x_, x_ + nx_, y_, z_, func_); + trans(ctx_, x_, x_ + nx_, y_, z_, func_); } inline void RunRowWise(int 
n, int pre) const { platform::Transform trans; - trans(ctx_.device_context(), x_, x_ + nx_, - RowwiseTransformIterator(y_, n), z_, func_); + trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), z_, + func_); } inline void RunMidWise(int n, int pre, int post) const { platform::Transform trans; - trans(ctx_.device_context(), x_, x_ + nx_, - MidWiseTransformIterator(y_, n, post), z_, func_); + trans(ctx_, x_, x_ + nx_, MidWiseTransformIterator(y_, n, post), + z_, func_); } const T* x_; const T* y_; T* z_; int64_t nx_; - const framework::ExecutionContext& ctx_; + const platform::DeviceContext& ctx_; Functor func_; }; From 96a5f96cc192e0bca3a8ce6fa0c3a3511e56ea7a Mon Sep 17 00:00:00 2001 From: QI JUN Date: Tue, 5 Dec 2017 12:37:39 +0800 Subject: [PATCH 187/275] fix bug in gpu default memory allocating policy (#6268) --- paddle/platform/gpu_info.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 63a3351708..4fa2eaed31 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -18,8 +18,8 @@ limitations under the License. 
*/ #include "paddle/platform/enforce.h" -DEFINE_double(fraction_of_gpu_memory_to_use, 0.95, - "Default use 95% of GPU memory for PaddlePaddle," +DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, + "Default use 92% of GPU memory for PaddlePaddle," "reserve the rest for page tables, etc"); namespace paddle { From 37671ac0539b69e2c5bb72dbaad22a96d633118c Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 5 Dec 2017 12:39:47 +0800 Subject: [PATCH 188/275] follow comments --- paddle/operators/elementwise_op_function.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 6d849bff49..ec448a9e95 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -60,12 +60,13 @@ inline void get_mid_dims(const framework::DDim& x_dims, } template -struct RowwiseTransformIterator; +class RowwiseTransformIterator; template -struct MidWiseTransformIterator; +class MidWiseTransformIterator; template -struct RowwiseTransformIterator { +class RowwiseTransformIterator { + public: RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} RowwiseTransformIterator& operator++() { @@ -86,13 +87,15 @@ struct RowwiseTransformIterator { const T& operator*() { return ptr_[i_]; } + private: const T* ptr_; int i_; int64_t n_; }; template -struct MidWiseTransformIterator { +class MidWiseTransformIterator { + public: MidWiseTransformIterator(const T* ptr, int n, int post) : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} @@ -113,6 +116,7 @@ struct MidWiseTransformIterator { const T& operator*() { return ptr_[i_]; } + private: const T* ptr_; int i_; int64_t j_; @@ -122,7 +126,7 @@ struct MidWiseTransformIterator { #ifdef __NVCC__ template -struct RowwiseTransformIterator +class RowwiseTransformIterator : public thrust::iterator_adaptor< RowwiseTransformIterator, const T*> { public: @@ -142,7 +146,7 @@ struct 
RowwiseTransformIterator }; template -struct MidWiseTransformIterator +class MidWiseTransformIterator : public thrust::iterator_adaptor< MidWiseTransformIterator, const T*> { public: @@ -164,7 +168,8 @@ struct MidWiseTransformIterator #endif template -struct TransformFunctor { +class TransformFunctor { + public: TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, framework::Tensor* z, const platform::DeviceContext& ctx, Functor func) @@ -192,6 +197,7 @@ struct TransformFunctor { z_, func_); } + private: const T* x_; const T* y_; T* z_; From d432b10d8aa5beb0e8576b8f9811048af98519bc Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 5 Dec 2017 13:45:17 +0800 Subject: [PATCH 189/275] Update cuda kernel and doc. --- paddle/operators/momentum_op.cc | 8 ++++++-- paddle/operators/momentum_op.cu | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc index fde253b0b3..2ab48fedec 100644 --- a/paddle/operators/momentum_op.cc +++ b/paddle/operators/momentum_op.cc @@ -71,8 +71,12 @@ class MomentumOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor, default Tensor) " "Input learning rate"); - AddOutput("ParamOut", "(Tensor) Output updated parameter"); - AddOutput("VelocityOut", "(Tensor) Output updated velocity"); + AddOutput("ParamOut", + "(Tensor) This output is updated parameter. " + "It shared memory with Input(Param)."); + AddOutput("VelocityOut", + "(Tensor) This output is updated velocity. 
" + "It shared memory with Input(Velocity)."); AddAttr("mu", "(float) Momentum coefficient"); AddAttr("use_nesterov", diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu index d856df4002..be0c8ea071 100644 --- a/paddle/operators/momentum_op.cu +++ b/paddle/operators/momentum_op.cu @@ -29,7 +29,7 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v, T g_val = g[i]; T v_new = v[i] * mu + g_val; v_out[i] = v_new; - p_out[i] = p[i] - g_val * lr + v_new * mu * lr; + p_out[i] = p[i] - (g_val - v_new * mu) * lr; } } else { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num; From 4e451a34db079eb0ac443a8a132fd65c47e6fd23 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 5 Dec 2017 15:29:16 +0800 Subject: [PATCH 190/275] Remove the cuda stream synchronization between each operator. --- paddle/framework/operator.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 93467ab8ac..f1444eeee9 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -426,9 +426,6 @@ void OperatorWithKernel::Run(const Scope& scope, } kernel_iter->second->Compute(ctx); - - // throws errors if have. 
- dev_ctx.Finish(); } OpKernelType OperatorWithKernel::GetKernelType( const ExecutionContext& ctx) const { From 45c8a88a3e0ff4ca0f5440102103a5423432969e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Dec 2017 16:08:32 +0800 Subject: [PATCH 191/275] add crf_decoding layer (#6274) * add crf_decoding layer * fix some typo * fix test_crf_decoding_op --- paddle/operators/crf_decoding_op.cc | 17 +++++++++-------- paddle/operators/crf_decoding_op.h | 10 +++++----- python/paddle/v2/fluid/framework.py | 2 +- python/paddle/v2/fluid/layer_helper.py | 8 +++++++- python/paddle/v2/fluid/layers.py | 18 ++++++++++++++++++ .../tests/book/test_label_semantic_roles.py | 12 +++++++++--- .../v2/fluid/tests/test_crf_decoding_op.py | 12 ++++++------ python/paddle/v2/fluid/tests/test_layers.py | 7 ++++++- 8 files changed, 61 insertions(+), 25 deletions(-) diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index f418f489c0..291b23ed1b 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -36,17 +36,18 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker { "w. See more details in comments of the linear_chain_crf operator."); AddInput( "Label", - "(LoDTensor, LoDTensor). The ground truth with shape " + "(LoDTensor, LoDTensor). The ground truth with shape " "[N x 1]. This input is optional. See more details in the operator's " "comments.") .AsDispensable(); - AddOutput("ViterbiPath", - "(LoDTensor, LoDTensor). The decoding results. What to " - "return changes depending on whether the Input(Label) (the groud " - "truth) is given. See more details in the operator's comment."); + AddOutput( + "ViterbiPath", + "(LoDTensor, LoDTensor). The decoding results. What to " + "return changes depending on whether the Input(Label) (the ground " + "truth) is given. 
See more details in the operator's comment."); AddComment(R"DOC( The crf_decoding operator reads the emission feature weights and the transition -freature weights learned by the linear_chain_crf operator. It implements the +feature weights learned by the linear_chain_crf operator. It implements the Viterbi algorithm which is a dynamic programming algorithm for finding the most likely sequence of hidden states, called the Viterbi path, that results in a sequence of observed tags. @@ -60,14 +61,14 @@ operator. When Input(Label) is given, the crf_decoding operator returns a row vector with shape [N x 1] whose values are fixed to be 0, indicating an incorrect -prediction, or 1 indicating a tag is correctly predicted. Such an ouput is the +prediction, or 1 indicating a tag is correctly predicted. Such an output is the input to chunk_eval operator. 2. Input(Label) is not given: This is the standard decoding process. -The crf_decoding operator returns a row vecotr with shape [N x 1] whose values +The crf_decoding operator returns a row vector with shape [N x 1] whose values range from 0 to maximum tag number - 1. Each element indicates an index of a predicted tag. 
)DOC"); diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h index 526e0c5dcb..57b5e21b3a 100644 --- a/paddle/operators/crf_decoding_op.h +++ b/paddle/operators/crf_decoding_op.h @@ -43,9 +43,9 @@ class CRFDecodingOpKernel : public framework::OpKernel { const size_t level = 0; const size_t seq_num = lod[level].size() - 1; - int* path = decoded_path->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - decoded_path, 0); + int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); + math::SetConstant()(ctx.device_context(), + decoded_path, 0); for (size_t i = 0; i < seq_num; ++i) { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); @@ -57,7 +57,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { if (label) { PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL, "The Input(Label) should be a sequence."); - const int* label_value = label->data(); + const int64_t* label_value = label->data(); size_t batch_size = emission_weights->dims()[0]; for (size_t i = 0; i < batch_size; ++i) { path[i] = label_value[i] == path[i] ? 1 : 0; @@ -76,7 +76,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { const T* x = emission_weights.data(); const T* w = transition_weights.data(); - int* path = decoded_path->data(); + int64_t* path = decoded_path->data(); // alpha is a memo table. 
An element alpha(k, v) records the score of the // best sequence of tags from position 1 to position k with v being the end diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 49c6d89834..cd8bbe0836 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -237,7 +237,7 @@ class Operator(object): def find_name(var_list, name): for var_name in var_list: - if var_name == name: + if var_list[var_name] is not None and var_name == name: return True return False diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index 5b384e5cf5..cbee3fe637 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -1,7 +1,7 @@ import copy import itertools -from framework import Variable, default_main_program, default_startup_program, \ +from framework import Variable, Parameter, default_main_program, default_startup_program, \ unique_name, dtype_is_floating from paddle.v2.fluid.initializer import Constant, Xavier from param_attr import ParamAttr @@ -122,6 +122,12 @@ class LayerHelper(object): return self.main_program.global_block().create_parameter( dtype=dtype, shape=shape, **attr.to_kwargs()) + def get_parameter(self, name): + param = self.main_program.global_block().var(name) + if not isinstance(param, Parameter): + raise ValueError("no Parameter name %s found" % name) + return param + def create_tmp_variable(self, dtype): return self.main_program.current_block().create_var( name=unique_name(".".join([self.name, 'tmp'])), diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 99d0ac4a1b..fc7b687263 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -477,6 +477,24 @@ def linear_chain_crf(input, return log_likelihood +def crf_decoding(input, + param_attr, + label=None, + main_program=None, + startup_program=None): + helper = LayerHelper('crf_decoding', 
**locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={"Emission": [input], + "Transition": transition, + "Label": label}, + outputs={"ViterbiPath": [viterbi_path]}) + + return viterbi_path + + def assign(input, output, main_program=None, startup_program=None): helper = LayerHelper('assign', **locals()) helper.append_op( diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 0494c7cdca..0eb7cf600c 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -137,12 +137,19 @@ def main(): param_attr=fluid.ParamAttr( name='crfw', learning_rate=mix_hidden_lr)) avg_cost = fluid.layers.mean(x=crf_cost) + # TODO(qiao) - # 1. add crf_decode_layer and evaluator - # 2. use other optimizer and check why out will be NAN + # check other optimizers and check why out will be NAN sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) sgd_optimizer.minimize(avg_cost) + # TODO(qiao) + # add dependency track and move this config before optimizer + crf_decode = fluid.layers.crf_decoding( + input=feature_out, + label=target, + param_attr=fluid.ParamAttr(name='crfw')) + train_data = paddle.batch( paddle.reader.shuffle( paddle.dataset.conll05.test(), buf_size=8192), @@ -168,7 +175,6 @@ def main(): feed=feeder.feed(data), fetch_list=[avg_cost]) avg_cost_val = np.array(outs[0]) - if batch_id % 10 == 0: print("avg_cost=" + str(avg_cost_val)) diff --git a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py index ee2b996bf4..ab573da31d 100644 --- a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py +++ b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py @@ -20,14 +20,14 @@ class CRFDecoding(object): self.w = 
transition_weights[2:, :] self.track = np.zeros( - (seq_start_positions[-1], self.tag_num), dtype="int32") + (seq_start_positions[-1], self.tag_num), dtype="int64") self.decoded_path = np.zeros( - (seq_start_positions[-1], 1), dtype="int32") + (seq_start_positions[-1], 1), dtype="int64") def _decode_one_sequence(self, decoded_path, x): seq_len, tag_num = x.shape alpha = np.zeros((seq_len, tag_num), dtype="float64") - track = np.zeros((seq_len, tag_num), dtype="int32") + track = np.zeros((seq_len, tag_num), dtype="int64") for i in range(tag_num): alpha[0, i] = self.a[i] + x[0, i] @@ -125,10 +125,10 @@ class TestCRFDecodingOp2(OpTest): axis=0) labels = np.random.randint( - low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32") + low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64") predicted_labels = np.ones( - (lod[-1][-1], 1), dtype="int32") * (TAG_NUM - 1) - expected_output = (labels == predicted_labels).astype("int32") + (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1) + expected_output = (labels == predicted_labels).astype("int64") self.inputs = { "Emission": (emission, lod), diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index a9d9d369c7..b2c31eecc1 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -4,6 +4,7 @@ import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets from paddle.v2.fluid.framework import Program, program_guard +from paddle.v2.fluid.param_attr import ParamAttr class TestBook(unittest.TestCase): @@ -132,8 +133,12 @@ class TestBook(unittest.TestCase): images = layers.data(name='pixel', shape=[784], dtype='float32') label = layers.data(name='label', shape=[1], dtype='int32') hidden = layers.fc(input=images, size=128) - crf = layers.linear_chain_crf(input=hidden, label=label) + crf = layers.linear_chain_crf( + input=hidden, label=label, param_attr=ParamAttr(name="crfw")) + crf_decode = 
layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) self.assertNotEqual(crf, None) + self.assertNotEqual(crf_decode, None) print(str(program)) From e0ac34a62062b7b47adc52b13f6bcc89dc23bf74 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 5 Dec 2017 17:16:27 +0800 Subject: [PATCH 192/275] "fix build cares" (#6097) --- cmake/external/cares.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake index ac456933bd..aec51410b3 100644 --- a/cmake/external/cares.cmake +++ b/cmake/external/cares.cmake @@ -33,7 +33,7 @@ ExternalProject_Add( UPDATE_COMMAND "" CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR} BUILD_IN_SOURCE 1 - BUILD_COMMAND make + BUILD_COMMAND make -j8 INSTALL_COMMAND make install ) From e670453518ff1743e764dd98c0daecaa2539e862 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 17 Nov 2017 14:01:58 +0800 Subject: [PATCH 193/275] add script to check the cpu env --- paddle/scripts/check_env.sh | 141 ++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) create mode 100755 paddle/scripts/check_env.sh diff --git a/paddle/scripts/check_env.sh b/paddle/scripts/check_env.sh new file mode 100755 index 0000000000..557e3f208f --- /dev/null +++ b/paddle/scripts/check_env.sh @@ -0,0 +1,141 @@ +#!/bin/bash + +if [ "`uname -s`" != "Linux" ]; then + echo "Current scenario only support in Linux yet!" 
+ exit 0 +fi + +echo "========================= Hardware Information =========================" +sockets=`grep 'physical id' /proc/cpuinfo | sort -u | wc -l` +cores_per_socket=`grep 'core id' /proc/cpuinfo | sort -u | wc -l` +ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs` +physical_cores=$((sockets * cores_per_socket)) +virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l` +numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs` +echo "CPU Name : `lscpu |grep \"name\" |awk -F':' '{print $2}'|xargs`" +echo "CPU Family : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`" +echo "Socket Number : $sockets" +echo "Cores Per Socket : $cores_per_socket" +echo "Total Physical Cores : $physical_cores" +echo "Total Virtual Cores : $virtual_cores" +if [ $ht -eq 1 ]; then + echo "Hyper Threading : OFF" + if [ $physical_cores -ne $virtual_cores ]; then + echo "Error: HT logical error" + fi +else + echo "Hyper Threading : ON" + if [ $physical_cores -ge $virtual_cores ]; then + echo "Error: HT logical error" + fi +fi +echo "NUMA Nodes : $numa_nodes" +if [ $numa_nodes -lt $sockets ]; then + echo "Warning: NUMA node is not enough for the best performance,\ + at least $sockets" +fi + +echo "-------------------------- Memory Information --------------------------" +echo "DIMMs max slots : `dmidecode | grep "Bank Locator" | wc -l`" +# dmidecode support start from 2.11 +num_dimms_installed=`dmidecode | grep "Memory Device Mapped" | wc -l` +num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -i "Hz" |wc -l` +echo "Installed DIMM number : $num_dimms_installed" +if [ $num_dimms_installed -ne $num_clock_configed ]; then + echo "Error: installed DIMMs do ont match configured clocks: $num_clock_configed" +fi +echo "Memory Size : `free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`" +echo "Swap Memory Size : `free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`" +echo "Total Memory Size : `free -th |grep -i total |tail -n 1| awk 
-F' ' '{print $2}'|xargs`" +echo "Max Memory Capacity : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`" +# DIMMs fequency +clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs` +echo "Configed Clock Speed : $clock_speeds" +num_clock_type=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | wc -l` +if [ $num_clock_type -ne 1 ]; then + echo "Warning: Have more than 1 speed type, all DIMMs should have same fequency: $clock_speeds" +fi + +echo "-------------------------- Turbo Information --------------------------" +scaling_drive=`cat /sys/devices/system/cpu/cpu0/cpufreq/scaling_driver` +echo "Scaling Driver : $scaling_drive" +if [ $scaling_drive == "intel_pstate" ] && [ -e /sys/devices/system/cpu/intel_pstate/no_turbo ]; then + turbo=`cat /sys/devices/system/cpu/intel_pstate/no_turbo` + if [ $turbo -eq 1 ]; then + echo "Turbo Status : OFF" + else + echo "Turbo Status : ON" + fi +else + echo "Warning: Scaling driver is not intel_pstarte, maybe should enable it in BIOS" + echo "Turbo Status : Unknown" +fi +# cpu frequency +num_max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| sort -u |wc -l` +num_min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| sort -u |wc -l` +if [ $num_max_freq -ne 1 ]; then + echo "Error: the max_frequency of all CPU should be equal" +fi +if [ $num_min_freq -ne 1 ]; then + echo "Error: the min_frequency of all CPU should be equal" +fi +max_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq| uniq|xargs` # kHz +max_freq=`awk 'BEGIN{printf "%.2f",('$max_freq' / 1000000)}'` # GHz +min_freq=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq| uniq|xargs` # kHz +min_freq=`awk 'BEGIN{printf "%.2f",('$min_freq' / 1000000)}'` # GHz +echo "CPU Max Frequency : $max_freq GHz" +echo "CPU Min Frequency : $min_freq GHz" +# cpu governor +num_governor=`cat 
/sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |wc -l` +if [ $num_governor -ne 1 ]; then + echo "Error: the governor of all CPU should be the same" +fi +governor=`cat /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor| sort -u |uniq` +echo "CPU Freq Governor : $governor" + + +echo "========================= Software Information =========================" +echo "BIOS Release Date : `dmidecode | grep "Release Date"|awk -F ':' '{print $2}'|xargs`" +echo "OS Version : `cat /etc/redhat-release`" +echo "Kernel Release Version : `uname -r`" +echo "Kernel Patch Version : `uname -v`" +echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" +echo "CMake Version :`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`" +echo "------------------ Environment Variables Information -------------------" +kmp_affinity=`env | grep KMP_AFFINITY` +omp_dynamic=`env | grep OMP_DYNAMIC` +omp_nested=`env | grep OMP_NESTED` +omp_num_threads=`env | grep OMP_NUM_THREADS` +mkl_num_threads=`env | grep MKL_NUM_THREADS` +mkl_dynamic=`env | grep MKL_DYNAMIC` +if [ ! $kmp_affinity ]; then kmp_affinity="unset"; fi +if [ ! $omp_dynamic ]; then omp_dynamic="unset"; fi +if [ ! $omp_nested ]; then omp_nested="unset"; fi +if [ ! $omp_num_threads ]; then omp_num_threads="unset"; fi +if [ ! $mkl_num_threads ]; then mkl_num_threads="unset"; fi +if [ ! 
$mkl_dynamic ]; then mkl_dynamic="unset"; fi +echo "KMP_AFFINITY : $kmp_affinity" +echo "OMP_DYNAMIC : $omp_dynamic" +echo "OMP_NESTED : $omp_nested" +echo "OMP_NUM_THREADS : $omp_num_threads" +echo "MKL_NUM_THREADS : $mkl_num_threads" +echo "MKL_DYNAMIC : $mkl_dynamic" +# Check if any MKL related libraries have been installed in LD_LIBRARY_PATH +for path in `echo $LD_LIBRARY_PATH | awk -F ':' '{for(i=1;i<=NF;++i)print $i}'`; do + mkldnn_found=`find $path -name "libmkldnn.so"` + if [ "$mkldnn_found" ]; then + echo "Found MKL-DNN : $mkldnn_found" + fi + mklml_found=`find $path -name "libmklml_intel.so"` + if [ "$mklml_found" ]; then + echo "Found MKLML : $mklml_found" + fi + iomp_found=`find $path -name "libiomp5.so"` + if [ "$iomp_found" ]; then + echo "Found IOMP : $iomp_found" + fi +done + +# dump all details for fully check +lscpu > lscpu.dump +dmidecode > dmidecode.dump From 4eac85c60b50f56c75360d3e93b2c8d64b585ffb Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 5 Dec 2017 20:15:58 +0800 Subject: [PATCH 194/275] "add init seed" (#6221) * "add init seed" * "fix compile error" * "add program level seed setting" * "fixed based on comments" --- python/paddle/v2/fluid/framework.py | 11 +++++++++ python/paddle/v2/fluid/initializer.py | 10 ++++++++ .../paddle/v2/fluid/tests/test_initializer.py | 23 +++++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index cd8bbe0836..e6e3190b99 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -512,6 +512,7 @@ class Program(object): self.desc = core.ProgramDesc() self.blocks = [Block(self, 0)] self.current_block_idx = 0 + self._seed = 0 def __str__(self): return self.to_string(True) @@ -564,6 +565,16 @@ class Program(object): p.sync_with_cpp() return p + @property + def random_seed(self): + return self._seed + + @random_seed.setter + def random_seed(self, seed): + if not isinstance(seed, int): + 
raise ValueError("Seed must be a integer.") + self._seed = seed + def __repr__(self): return str(self) diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py index d3f648f846..c0839caaf2 100644 --- a/python/paddle/v2/fluid/initializer.py +++ b/python/paddle/v2/fluid/initializer.py @@ -132,6 +132,8 @@ class UniformInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended and not appended + if self._seed == 0: + self._seed = block.program.random_seed op = block.prepend_op( type="uniform_random", outputs={"Out": var}, @@ -180,6 +182,8 @@ class NormalInitializer(Initializer): assert isinstance(var, framework.Variable) assert isinstance(block, framework.Block) # Initialization Ops should be prepended and not appended + if self._seed == 0: + self._seed = block.program.random_seed op = block.prepend_op( type="gaussian_random", outputs={"Out": var}, @@ -255,6 +259,9 @@ class XavierInitializer(Initializer): fan_in = f_in if self._fan_in is None else self._fan_in fan_out = f_out if self._fan_out is None else self._fan_out + if self._seed == 0: + self._seed = block.program.random_seed + if self._uniform: limit = np.sqrt(6.0 / float(fan_in + fan_out)) op = block.prepend_op( @@ -338,6 +345,9 @@ class MSRAInitializer(Initializer): # If fan_in is passed, use it fan_in = f_in if self._fan_in is None else self._fan_in + if self._seed == 0: + self._seed = block.program.random_seed + if self._uniform: limit = np.sqrt(6.0 / float(fan_in)) op = block.prepend_op( diff --git a/python/paddle/v2/fluid/tests/test_initializer.py b/python/paddle/v2/fluid/tests/test_initializer.py index 6c20203f8e..3175010f48 100644 --- a/python/paddle/v2/fluid/tests/test_initializer.py +++ b/python/paddle/v2/fluid/tests/test_initializer.py @@ -60,6 +60,29 @@ class TestUniformInitializer(unittest.TestCase): self.assertAlmostEqual(init_op.attr('max'), 1.0, delta=DELTA) 
self.assertEqual(init_op.attr('seed'), 0) + def test_uniform_initializer_random_seed(self): + """Test the uniform initializer with manually setting seed + """ + program = framework.Program() + program.random_seed = 123 + block = program.global_block() + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer()) + block.create_parameter( + dtype="float32", + shape=[5, 10], + lod_level=0, + name="param", + initializer=initializer.UniformInitializer(seed=456)) + init_op = block.ops[1] + self.assertEqual(init_op.attr("seed"), 123) + init_op1 = block.ops[0] + self.assertEqual(init_op1.attr("seed"), 456) + def test_uniform_initializer(self): """Test uniform initializer with supplied attributes """ From 62c00e00ebbd0e5605877abcdf995045757ff1b4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 5 Dec 2017 22:14:50 +0800 Subject: [PATCH 195/275] add dimms locator info --- paddle/scripts/check_env.sh | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/paddle/scripts/check_env.sh b/paddle/scripts/check_env.sh index 557e3f208f..03fb102705 100755 --- a/paddle/scripts/check_env.sh +++ b/paddle/scripts/check_env.sh @@ -36,13 +36,39 @@ if [ $numa_nodes -lt $sockets ]; then fi echo "-------------------------- Memory Information --------------------------" -echo "DIMMs max slots : `dmidecode | grep "Bank Locator" | wc -l`" # dmidecode support start from 2.11 -num_dimms_installed=`dmidecode | grep "Memory Device Mapped" | wc -l` -num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -i "Hz" |wc -l` +max_dimms=0 +num_dimms_installed=0 +for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do + num_refered=`dmidecode |grep -c "$dimm_id"` + # the acutal dimm id should be refered only once + if [ $num_refered -eq 1 ]; then + num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1}; {if (s==1) {a[NR]=$0}}; + /Manufacturer/ 
{s=0; for (i in a) print a[i]; delete a}' |grep -ic unknown` + if [ $num_unknown -eq 0 ]; then + dimms_installed="$dimms_installed \n $dimm_id" + ((num_dimms_installed++)) + else + dimms_uninstalled="$dimms_uninstalled \n $dimm_id" + fi + ((max_dimms++)) + fi +done echo "Installed DIMM number : $num_dimms_installed" +num_dimms_mapped=`dmidecode | grep "Memory Device Mapped" | wc -l` +if [ $num_dimms_installed -ne $num_dimms_mapped ]; then + echo "Error: The installed DIMMs number does ont match the mapped memory device: $num_dimms_mapped" +fi +num_clock_configed=`dmidecode | grep -i "Configured Clock Speed" |grep -ic "Hz"` if [ $num_dimms_installed -ne $num_clock_configed ]; then - echo "Error: installed DIMMs do ont match configured clocks: $num_clock_configed" + echo "Error: The installed DIMMs number does ont match configured clocks: $num_clock_configed" +fi +echo -e "Installed DIMMs Locator: $dimms_installed" +echo -e "Not installed DIMMs : $dimms_uninstalled" +max_dimm_slots=`dmidecode | grep -c "Bank Locator"` +echo "DIMMs max slots : $max_dimm_slots" +if [ $max_dimms -ne $max_dimm_slots ]; then + echo "Error: The max dimm slots do not match the max dimms: $max_dimms" fi echo "Memory Size : `free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`" echo "Swap Memory Size : `free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`" From 161128badac906f760e097fe7fbfb0a63a6ae0ba Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Tue, 5 Dec 2017 23:54:49 +0800 Subject: [PATCH 196/275] add chunk eval layer (#6296) * add crf_decoding layer * fix some typo * init trunk_evaluator * add trunk_evaluator layer * update chunk_eval_op and test, change int32 to int64 * fix a numeric problem * change layers.trunk_evaluator to layers.trunk_eval * fix typo * add precision_val --- paddle/operators/chunk_eval_op.cc | 9 ++--- paddle/operators/chunk_eval_op.h | 14 ++++---- python/paddle/v2/fluid/layers.py | 34 +++++++++++++++++++ .../tests/book/test_label_semantic_roles.py | 19 +++++++++-- 
.../v2/fluid/tests/test_chunk_eval_op.py | 2 +- python/paddle/v2/fluid/tests/test_layers.py | 6 ++++ 6 files changed, 69 insertions(+), 15 deletions(-) diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc index 309660b01f..94127ab33e 100644 --- a/paddle/operators/chunk_eval_op.cc +++ b/paddle/operators/chunk_eval_op.cc @@ -58,9 +58,10 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Inference", - "(Tensor, default: Tensor). Predictions from the network."); + "(Tensor, default: Tensor). " + "Predictions from the network."); AddInput("Label", - "(Tensor, default: Tensor). The true tag sequences."); + "(Tensor, default: Tensor). The true tag sequences."); AddOutput("Precision", "(float). The evaluated precision (called positive predictive " "value) of chunks on the given mini-batch."); @@ -84,7 +85,7 @@ class ChunkEvalOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(std::vector{}); AddComment(R"DOC( For some basics of chunking, please refer to -‘Chunking with Support Vector Mechines ’. +‘Chunking with Support Vector Machines ’. CheckEvalOp computes the precision, recall, and F1-score of chunk detection, @@ -97,7 +98,7 @@ Here is a NER example of labeling for these tagging schemes: IOE: I-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O E-LOC IOBES: B-PER E-PER O O I-ORG I-ORG I-ORG E-ORG O S-LOC -There are three chunk types(named entity types) including PER(person), ORG(orgnazation) +There are three chunk types(named entity types) including PER(person), ORG(organization) and LOC(LOCATION), and we can see that the labels have the form -. 
Since the calculations actually use label ids rather than labels, extra attention diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h index 81aa07817b..dd88f2553b 100644 --- a/paddle/operators/chunk_eval_op.h +++ b/paddle/operators/chunk_eval_op.h @@ -35,10 +35,10 @@ class ChunkEvalKernel : public framework::OpKernel { } }; - void GetSegments(const int* label, int length, std::vector& segments, - int num_chunk_types, int num_tag_types, int other_chunk_type, - int tag_begin, int tag_inside, int tag_end, - int tag_single) const { + void GetSegments(const int64_t* label, int length, + std::vector& segments, int num_chunk_types, + int num_tag_types, int other_chunk_type, int tag_begin, + int tag_inside, int tag_end, int tag_single) const { segments.clear(); segments.reserve(length); int chunk_start = 0; @@ -152,8 +152,8 @@ class ChunkEvalKernel : public framework::OpKernel { auto* recall = context.Output("Recall"); auto* f1 = context.Output("F1-Score"); - const int* inference_data = inference->data(); - const int* label_data = label->data(); + const int64_t* inference_data = inference->data(); + const int64_t* label_data = label->data(); T* precision_data = precision->mutable_data(context.GetPlace()); T* racall_data = recall->mutable_data(context.GetPlace()); T* f1_data = f1->mutable_data(context.GetPlace()); @@ -179,7 +179,7 @@ class ChunkEvalKernel : public framework::OpKernel { ((*precision_data) + (*racall_data)); } - void EvalOneSeq(const int* output, const int* label, int length, + void EvalOneSeq(const int64_t* output, const int64_t* label, int length, std::vector& output_segments, std::vector& label_segments, int64_t& num_output_segments, int64_t& num_label_segments, diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index fc7b687263..3f7cd525b3 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -632,6 +632,40 @@ def accuracy(input, label, k=1, correct=None, 
total=None, **kwargs): return acc_out +def chunk_eval(input, + label, + chunk_scheme, + num_chunk_types, + excluded_chunk_types=None, + **kwargs): + """ + This function computes the accuracy using the input and label. + The output is the top_k inputs and their indices. + """ + helper = LayerHelper("chunk_eval", **kwargs) + + # prepare output + precision = helper.create_tmp_variable(dtype="float32") + recall = helper.create_tmp_variable(dtype="float32") + f1_score = helper.create_tmp_variable(dtype="float32") + + helper.append_op( + type="chunk_eval", + inputs={"Inference": [input], + "Label": [label]}, + outputs={ + "Precision": [precision], + "Recall": [recall], + "F1-Score": [f1_score] + }, + attrs={ + "num_chunk_types": num_chunk_types, + 'chunk_scheme': chunk_scheme, + 'excluded_chunk_types': excluded_chunk_types or [] + }) + return precision, recall, f1_score + + def sequence_conv(input, num_filters, filter_size=3, diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 0eb7cf600c..d2693b602e 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -1,3 +1,5 @@ +import math + import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 @@ -146,9 +148,13 @@ def main(): # TODO(qiao) # add dependency track and move this config before optimizer crf_decode = fluid.layers.crf_decoding( - input=feature_out, + input=feature_out, param_attr=fluid.ParamAttr(name='crfw')) + + precision, recall, f1_score = fluid.layers.chunk_eval( + input=crf_decode, label=target, - param_attr=fluid.ParamAttr(name='crfw')) + chunk_scheme="IOB", + num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0))) train_data = paddle.batch( paddle.reader.shuffle( @@ -173,10 +179,17 @@ def main(): for data in train_data(): outs = exe.run(fluid.default_main_program(), feed=feeder.feed(data), 
- fetch_list=[avg_cost]) + fetch_list=[avg_cost, precision, recall, f1_score]) avg_cost_val = np.array(outs[0]) + precision_val = np.array(outs[1]) + recall_val = np.array(outs[2]) + f1_score_val = np.array(outs[3]) + if batch_id % 10 == 0: print("avg_cost=" + str(avg_cost_val)) + print("precision_val=" + str(precision_val)) + print("recall_val:" + str(recall_val)) + print("f1_score_val:" + str(f1_score_val)) # exit early for CI exit(0) diff --git a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py index 48673296a6..819e65a653 100644 --- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py +++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py @@ -120,7 +120,7 @@ class TestChunkEvalOp(OpTest): self.num_correct_chunks, self.num_infer_chunks, self.num_label_chunks = 4, 5, 9 def set_data(self): - infer = np.zeros((self.batch_size, )).astype('int32') + infer = np.zeros((self.batch_size, )).astype('int64') infer.fill(self.num_chunk_types * self.num_tag_types) label = np.copy(infer) starts = np.random.choice( diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index b2c31eecc1..57f6a362de 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -130,6 +130,7 @@ class TestBook(unittest.TestCase): def test_linear_chain_crf(self): program = Program() with program_guard(program, startup_program=Program()): + label_dict_len = 10 images = layers.data(name='pixel', shape=[784], dtype='float32') label = layers.data(name='label', shape=[1], dtype='int32') hidden = layers.fc(input=images, size=128) @@ -137,6 +138,11 @@ class TestBook(unittest.TestCase): input=hidden, label=label, param_attr=ParamAttr(name="crfw")) crf_decode = layers.crf_decoding( input=hidden, param_attr=ParamAttr(name="crfw")) + layers.chunk_eval( + input=crf_decode, + label=label, + chunk_scheme="IOB", + num_chunk_types=(label_dict_len - 1) / 2) 
self.assertNotEqual(crf, None) self.assertNotEqual(crf_decode, None) From 1d04b19ce86ccf055f58955142447aab577d6171 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Wed, 6 Dec 2017 01:55:12 +0530 Subject: [PATCH 197/275] Fix the rendering of latex equation for adamax op (#6294) * Using latex fraction syntax in sigmoid and logsigmoid op * Fixing the rendering of the latex equations in adamax operator --- paddle/operators/activation_op.cc | 8 ++++---- paddle/operators/adamax_op.cc | 10 ++++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 154c618e8e..83262f950e 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -44,9 +44,9 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Sigmoid operator"); AddOutput("Y", "Output of Sigmoid operator"); AddComment(R"DOC( -Sigmoid Activation Operator. +Sigmoid Activation Operator -$y = 1 / (1 + e^{-x})$ +$$y = \frac{1}{1 + e^{-x}}$$ )DOC"); } @@ -60,9 +60,9 @@ class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of LogSigmoid operator"); AddOutput("Y", "Output of LogSigmoid operator"); AddComment(R"DOC( -Logsigmoid Activation Operator. +Logsigmoid Activation Operator -$y = \log(1 / (1 + e^{-x}))$ +$$y = \log \frac{1}{1 + e^{-x}}$$ )DOC"); } diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index d5bbc672e1..867ddd9790 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -107,10 +107,12 @@ Adam algorithm based on the infinity norm. 
Adamax updates: -$$momentOut = \beta_1 * moment + (1 - \beta_1) * grad \break -infNormOut = max(\beta_2 * infNorm + \epsilon, |grad|) \break -learningRate = learningRate /(1 - \beta_1_{pow}) \break -paramOut = param - learningRate * momentPut / infNormOut$$ +$$ + momentOut = \beta_{1} * moment + (1 - \beta_{1}) * grad \\ + infNormOut = max(\beta_{2} * infNorm + \epsilon, |grad|) \\ + learningRate = \frac{learningRate}{1 - \beta_{1}^{Beta1Pow}} \\ + paramOut = param - learningRate * \frac{momentOut}{infNormOut} +$$ The original paper does not have an epsilon attribute. However, it is added here for numerical stability to prevent the From 458ffbf4fe7fe6da5e2ea133c027d560815cc3cc Mon Sep 17 00:00:00 2001 From: Thuan Nguyen Date: Tue, 5 Dec 2017 14:42:17 -0800 Subject: [PATCH 199/275] Refer to https://github.com/PaddlePaddle/Paddle/issues/6305 (#6306) This pull request adds "Build using Docker" documentation to the "Getting Started > Install and Build" menu on PaddlePaddle.org --- doc/getstarted/build_and_install/index_cn.rst | 2 +- doc/getstarted/build_and_install/index_en.rst | 1 + doc/howto/dev/build_cn.md | 2 +- doc/howto/dev/build_en.md | 2 +- 4 files changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst index 88c5142dde..c9ba84c842 100644 --- a/doc/getstarted/build_and_install/index_cn.rst +++ b/doc/getstarted/build_and_install/index_cn.rst @@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式: pip_install_cn.rst docker_install_cn.rst - + ../../howto/dev/build_cn.md 编译流程 ++++++++ diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst index c8b60d0357..32d66d63dd 100644 --- a/doc/getstarted/build_and_install/index_en.rst +++ b/doc/getstarted/build_and_install/index_en.rst @@ -13,6 +13,7 @@ You can choose either pip or Docker to complete your install: pip_install_en.rst docker_install_en.rst + ../../howto/dev/build_en.md Build 
from Source diff --git a/doc/howto/dev/build_cn.md b/doc/howto/dev/build_cn.md index 0b911f7b75..4a80a52451 100644 --- a/doc/howto/dev/build_cn.md +++ b/doc/howto/dev/build_cn.md @@ -1,4 +1,4 @@ -# 编译PaddlePaddle和运行单元测试 +# 用Docker编译和测试PaddlePaddle ## 需要的软硬件 diff --git a/doc/howto/dev/build_en.md b/doc/howto/dev/build_en.md index d0048e3714..91c41ef8ce 100644 --- a/doc/howto/dev/build_en.md +++ b/doc/howto/dev/build_en.md @@ -1,4 +1,4 @@ -# Build PaddlePaddle from Source Code and Run Unit Test +# Build using Docker ## What Developers Need From 2c1270e40c972bd91b07bca52561dddf66a53625 Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Tue, 5 Dec 2017 15:16:59 -0800 Subject: [PATCH 200/275] fix maxout op latex equation (#6303) --- paddle/operators/maxout_op.cc | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc index e203a25d54..44bf402e95 100644 --- a/paddle/operators/maxout_op.cc +++ b/paddle/operators/maxout_op.cc @@ -40,23 +40,28 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { "the number of channels divided by groups.." )DOC"); AddComment(R"DOC( - Assumed the input shape is (N, Ci, H, W). - The output shape is (N, Co, H, W). Then `Co = Ci / groups`. +MaxOut Operator. - math: - y_{si+j} = \max_k x_{gsi + sk + j} - g = groups - s = input.size / num_channels - 0 \le i < num_channels / groups - 0 \le j < s - 0 \le k < groups +Assumed the input shape is (N, Ci, H, W). +The output shape is (N, Co, H, W). 
+Then $Co = Ci / groups$ and the operator formula is as follows: - Please refer to Paper: - - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf - - Multi-digit Number Recognition from Street View \ - Imagery using Deep Convolutional Neural Networks: \ - https://arxiv.org/pdf/1312.6082v4.pdf - )DOC"); +$$ +y_{si+j} = \max_k x_{gsi + sk + j} \\ +g = groups \\ +s = \frac{input.size}{num\_channels} \\ +0 \le i < \frac{num\_channels}{groups} \\ +0 \le j < s \\ +0 \le k < groups +$$ + +Please refer to Paper: + - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf + - Multi-digit Number Recognition from Street View \ + Imagery using Deep Convolutional Neural Networks: \ + https://arxiv.org/pdf/1312.6082v4.pdf + +)DOC"); } }; From 16822fb702b52bea79d40b34a6cb6e368251f8c8 Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Tue, 5 Dec 2017 15:17:44 -0800 Subject: [PATCH 201/275] fix latex equation for clip by norm op (#6302) --- paddle/operators/clip_by_norm_op.cc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc index d9fc532e39..f73d55bbe3 100644 --- a/paddle/operators/clip_by_norm_op.cc +++ b/paddle/operators/clip_by_norm_op.cc @@ -47,15 +47,19 @@ class ClipByNormOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) The output of clip_by_norm op with shape as input(X)"); AddAttr("max_norm", "(float) The maximum norm value."); AddComment(R"DOC( -ClipByNorm operator limits the L2 norm of the input 'X' within 'max_norm'. -If the L2 norm of 'X' is less than or equal to 'max_norm', 'Out' will be -the same as 'X'. If the L2 norm of 'X' is greater than 'max_norm', 'X' will -be linearly scaled to make the L2 norm of 'Out' equal to 'max_norm', as -shown in the following formula: +ClipByNorm Operator. 
-'Out' = 'max_norm' * 'X' / norm('X'), +This operator limits the L2 norm of the input $X$ within $max\_norm$. +If the L2 norm of $X$ is less than or equal to $max\_norm$, $Out$ will be +the same as $X$. If the L2 norm of $X$ is greater than $max\_norm$, $X$ will +be linearly scaled to make the L2 norm of $Out$ equal to $max\_norm$, as +shown in the following formula: -where norm('X') represents the L2 norm of 'X'. +$$ +Out = \frac{max\_norm * X}{norm(X)}, +$$ + +where $norm(X)$ represents the L2 norm of $X$. )DOC"); } }; From 002a7b4d01e09b968ab2ffb0a9620becfacce139 Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Tue, 5 Dec 2017 15:18:05 -0800 Subject: [PATCH 202/275] fix scatter op equation (#6304) --- paddle/operators/scatter_op.cc | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index ce4b794bc3..573bbcd187 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -87,10 +87,15 @@ class ScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Updates", "The updated value of updates op"); AddOutput("Out", "The output of add op"); AddComment(R"DOC( -Scatter Operator by selecting from the first axis, +Scatter Operator. 
-Out = Ref +This operator obtains output by updating the input on selected indices on the first axis: + +$$ +Out = Ref \\ Out[Index] = Ref[Index] + Updates +$$ + )DOC"); } }; From 6fb34e8d616a6277119aae443e6fa24b34051467 Mon Sep 17 00:00:00 2001 From: Xi Chen Date: Tue, 5 Dec 2017 16:17:06 -0800 Subject: [PATCH 203/275] fix typo and path issue in build from source doc --- doc/getstarted/build_and_install/build_from_source_cn.rst | 6 +++--- doc/getstarted/build_and_install/build_from_source_en.rst | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst index 3c525bdad6..c875c807b8 100644 --- a/doc/getstarted/build_and_install/build_from_source_cn.rst +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -19,7 +19,7 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译 git clone https://github.com/PaddlePaddle/Paddle.git cd Paddle # 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制 - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh # 如果不使用Docker编译环境,执行下面的命令 mkdir build cd build @@ -30,7 +30,7 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译 .. code-block:: bash - pip install python/dist/*.whl + pip install build/python/dist/*.whl .. _run_test: @@ -45,7 +45,7 @@ PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译 .. 
code-block:: bash - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh 如果不使用Docker,可以执行ctest命令即可: diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst index 76fbc43de2..f194f84ce7 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.rst +++ b/doc/getstarted/build_and_install/build_from_source_en.rst @@ -21,7 +21,7 @@ Then run: git clone https://github.com/PaddlePaddle/Paddle.git cd Paddle # run the following command to build a CPU-Only binaries if you are using docker - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh # else run these commands mkdir build cd build @@ -34,7 +34,7 @@ machine or copy it to the target machine. .. code-block:: bash - pip install python/dist/*.whl + pip install build/python/dist/*.whl .. _run_test: @@ -49,7 +49,7 @@ Set :code:`WITH_GPU=ON` Can also run tests on GPU. .. 
code-block:: bash - docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh If you don't use Docker, just run ctest will start the tests: @@ -117,7 +117,7 @@ You can add :code:`-D` argument to pass such options, like: "WITH_PYTHON", "Build with integrated Python interpreter", "ON" "WITH_STYLE_CHECK", "Check code style when building", "ON" "WITH_TESTING", "Build unit tests", "ON" - "WITH_DOC", "Build documentaions", "OFF" + "WITH_DOC", "Build documentations", "OFF" "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto" "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON" "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON" From a5167ce0b4116db4c93d7bbd5a2364f5da9b8f63 Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Tue, 5 Dec 2017 16:38:04 -0800 Subject: [PATCH 204/275] fix lod_array_lengh op equation (#6307) --- paddle/operators/lod_array_length_op.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/operators/lod_array_length_op.cc b/paddle/operators/lod_array_length_op.cc index 80445eb575..b2f4ec57fa 100644 --- a/paddle/operators/lod_array_length_op.cc +++ b/paddle/operators/lod_array_length_op.cc @@ -43,12 +43,16 @@ class LoDArrayLengthProtoMaker : public framework::OpProtoAndCheckerMaker { : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(LoDTensorArray) The input tensor array."); AddOutput("Out", "(Tensor) 1x1 CPU Tensor of length, int64_t"); - AddComment(R"DOC(Get the length of lod tensor array + AddComment(R"DOC( +LoDArrayLength Operator.
-Out = len(X) +This operator obtains the length of lod tensor array: + +$$Out = len(X)$$ NOTE: The output is a CPU Tensor since the control variable should be only in CPU and the length of LoDTensorArray should be used as control variables. + )DOC"); } }; From dbf205002d030afebe1aa17c6bcd94ec2a6a83fe Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Tue, 5 Dec 2017 16:39:41 -0800 Subject: [PATCH 205/275] fix read and write tensor array op (#6312) --- .../operators/tensor_array_read_write_op.cc | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index 4eb8b60f47..2835b84f75 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -60,12 +60,16 @@ class WriteToArrayOpProtoMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) the subscript index in tensor array. The number of element " "should be 1"); AddOutput("Out", "(TensorArray) the tensor array will be written"); - AddComment(R"DOC(Write a LoDTensor to a LoDTensor array. + AddComment(R"DOC( +WriteToArray Operator. -Assume T is LoDTensor, i is the subscript of the array, and A is the array. The +This operator writes a LoDTensor to a LoDTensor array. + +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The equation is -A[i] = T +$$A[i] = T$$ + )DOC"); } }; @@ -144,12 +148,16 @@ class ReadFromArrayProtoMaker : public framework::OpProtoAndCheckerMaker { "(Tensor) the subscript index in tensor array. The number of " "element should be 1"); AddOutput("Out", "(LoDTensor) the tensor will be read from."); - AddComment(R"DOC(Read a LoDTensor from a LoDTensor Array + AddComment(R"DOC( +ReadFromArray Operator. -Assume T is LoDTensor, i is th e subscript of the array, and A is the array. The +Read a LoDTensor from a LoDTensor Array. 
+ +Assume $T$ is LoDTensor, $i$ is the subscript of the array, and $A$ is the array. The equation is -T = A[i] +$$T = A[i]$$ + )DOC"); } }; From 94a36b8cc7173d8c59ad8ba78a1af0867c073a2d Mon Sep 17 00:00:00 2001 From: kexinzhao <19hskevin87@gmail.com> Date: Tue, 5 Dec 2017 16:40:21 -0800 Subject: [PATCH 206/275] fix clip op doc operation (#6314) --- paddle/operators/clip_op.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc index 3e9066ceb2..4ddf24dea3 100644 --- a/paddle/operators/clip_op.cc +++ b/paddle/operators/clip_op.cc @@ -52,7 +52,11 @@ class ClipOpMaker : public framework::OpProtoAndCheckerMaker { Clip Operator. The clip operator limits the value of given input within an interval. The interval is -specified with arguments 'min' and 'max'. +specified with arguments 'min' and 'max': + +$$ +Out = \min(\max(X, min), max) +$$ )DOC"); } From 229c2e78833dc4574083de0935ad321ea7a72317 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 6 Dec 2017 10:31:06 +0800 Subject: [PATCH 207/275] Feature/while op sentiment analysis (#6282) * Add DataFeeder A v2 API like data feeder for book demos. We can feed data directly from reader. * Fix CI * Add an unittest for while/rnn op forward * Add unittest for raw while op backward * Fix CI * Complete Dynamic RNN --- paddle/framework/backward.cc | 4 +- python/paddle/v2/fluid/layer_helper.py | 7 + python/paddle/v2/fluid/layers.py | 221 ++++++++++++++++++- python/paddle/v2/fluid/tests/test_dyn_rnn.py | 45 +++- 4 files changed, 269 insertions(+), 8 deletions(-) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index c8b85caaca..7294ba1a9c 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -33,8 +33,8 @@ static std::unordered_set* g_ctrl_flow_ops_ = nullptr; // We should design a better way to backward CtrlFlowOps. 
static std::unordered_set& CtrlFlowOps() { if (g_ctrl_flow_ops_ == nullptr) { - g_ctrl_flow_ops_ = - new std::unordered_set{"increment", "lod_rank_table"}; + g_ctrl_flow_ops_ = new std::unordered_set{ + "increment", "lod_rank_table", "less_than"}; } return *g_ctrl_flow_ops_; } diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index cbee3fe637..3963e13222 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -151,6 +151,13 @@ class LayerHelper(object): persistable=True, initializer=initializer) + @property + def to_kwargs(self): + return { + 'main_program': self.main_program, + 'startup_program': self.startup_program + } + def append_bias_op(self, input_var, dim_start=1, dim_end=None): """ Append bias operator and return its output. If the user does not set diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 3f7cd525b3..98a04ea9c2 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -6,6 +6,7 @@ from paddle.v2.fluid.layer_helper import LayerHelper, unique_name import re import cStringIO from param_attr import ParamAttr +import contextlib __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', @@ -1395,7 +1396,7 @@ def lod_tensor_to_array(x, table, main_program=None): return array -def array_to_lod_tensor(x, table, main_program=None): +def array_to_lod_tensor(x, table, main_program=None, startup_program=None): """ This function creates an operator to convert an array to a LOD_Tensor. 
@@ -1476,7 +1477,11 @@ def zeros(shape, dtype, main_program=None): return fill_constant(value=0.0, **locals()) -def increment(x, value=1.0, in_place=True, main_program=None): +def increment(x, + value=1.0, + in_place=True, + main_program=None, + startup_program=None): """ This function creates an operator to increment each value in the input `x` by an amount: `value` as mentioned in the input parameter. This @@ -1495,7 +1500,7 @@ def increment(x, value=1.0, in_place=True, main_program=None): return out -def array_write(x, i, array=None, main_program=None): +def array_write(x, i, array=None, main_program=None, startup_program=None): """ This function creates an operator to write the data out as a LOD_TENSOR_ARRAY. @@ -1534,7 +1539,7 @@ def less_than(x, y, cond=None, main_program=None, **ignored): return cond -def array_read(array, i, main_program=None): +def array_read(array, i, main_program=None, startup_program=None): """ This function creates an operator to read the data in as a LOD_TENSOR_ARRAY. @@ -1553,7 +1558,7 @@ def array_read(array, i, main_program=None): return out -def shrink_memory(x, i, table, main_program=None): +def shrink_memory(x, i, table, main_program=None, startup_program=None): """ This function creates an operator to shrink_rnn_memory using the RankTable as mentioned in the input parameter. 
@@ -1890,3 +1895,209 @@ class IfElse(object): main_program=self.helper.main_program, startup_program=self.helper.startup_program)) return rlist + + +class DynamicRNN(object): + BEFORE_RNN = 0 + IN_RNN = 1 + AFTER_RNN = 2 + + def __init__(self, name=None, main_program=None, startup_program=None): + self.helper = LayerHelper( + 'dynamic_rnn', + name=name, + main_program=main_program, + startup_program=startup_program) + self.status = DynamicRNN.BEFORE_RNN + self.lod_rank_table = None + self.max_seq_len = None + self.step_idx = None + self.zero_idx = fill_constant(shape=[1], value=0, dtype='int64') + self.mem_dict = dict() + self.output_array = [] + self.outputs = [] + self.cond = self.helper.create_tmp_variable(dtype='bool') + self.cond.stop_gradient = False + self.while_op = While(self.cond) + self.input_array = [] + self.mem_link = [] + + def step_input(self, x): + self._assert_in_rnn_block_("step_input") + if not isinstance(x, Variable): + raise TypeError( + "step_input() can only take a Variable as its input") + parent_block = self._parent_block_() + if self.lod_rank_table is None: + self.lod_rank_table = parent_block.create_var( + name=unique_name('lod_rank_table'), + type=core.VarDesc.VarType.LOD_RANK_TABLE) + self.lod_rank_table.stop_gradient = True + parent_block.append_op( + type='lod_rank_table', + inputs={"X": x}, + outputs={"Out": self.lod_rank_table}) + self.max_seq_len = parent_block.create_var( + name=unique_name('dynamic_rnn_max_seq_len'), dtype='int64') + self.max_seq_len.stop_gradient = False + parent_block.append_op( + type='max_sequence_len', + inputs={'RankTable': self.lod_rank_table}, + outputs={"Out": self.max_seq_len}) + self.cond.stop_gradient = True + parent_block.append_op( + type='less_than', + inputs={'X': self.step_idx, + 'Y': self.max_seq_len}, + outputs={'Out': self.cond}) + + input_array = parent_block.create_var( + name=unique_name('dynamic_rnn_input_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=x.dtype) + 
self.input_array.append((input_array, x.dtype)) + parent_block.append_op( + type='lod_tensor_to_array', + inputs={'X': x, + 'RankTable': self.lod_rank_table}, + outputs={'Out': input_array}) + return array_read( + array=input_array, i=self.step_idx, **self.helper.to_kwargs) + + @contextlib.contextmanager + def block(self): + if self.status != DynamicRNN.BEFORE_RNN: + raise ValueError("rnn.block() can only be invoke once") + self.step_idx = fill_constant(shape=[1], dtype='int64', value=0) + self.step_idx.stop_gradient = False + self.status = DynamicRNN.IN_RNN + with self.while_op.block(): + yield + increment( + x=self.step_idx, + value=1.0, + in_place=True, + **self.helper.to_kwargs) + + for new_mem, mem_array in self.mem_link: + array_write( + x=new_mem, + i=self.step_idx, + array=mem_array, + **self.helper.to_kwargs) + + less_than( + x=self.step_idx, + y=self.max_seq_len, + cond=self.cond, + **self.helper.to_kwargs) + + self.status = DynamicRNN.AFTER_RNN + for each_array in self.output_array: + self.outputs.append( + array_to_lod_tensor( + x=each_array, + table=self.lod_rank_table, + **self.helper.to_kwargs)) + + def __call__(self, *args, **kwargs): + if self.status != DynamicRNN.AFTER_RNN: + raise ValueError( + "Dynamic RNN outputs can only be retrieved after rnn block") + if len(self.outputs) == 1: + return self.outputs[0] + else: + return self.outputs + + def memory(self, init=None, shape=None, value=0.0, dtype='float32'): + self._assert_in_rnn_block_('memory') + if init is not None: + if not isinstance(init, Variable): + raise TypeError( + "The input arg `init` of memory() must be a Variable") + parent_block = self._parent_block_() + mem_array = parent_block.create_var( + name=unique_name('dynamic_rnn_mem_array'), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=init.dtype) + parent_block.append_op( + type='write_to_array', + inputs={'X': init, + 'I': self.zero_idx}, + outputs={'Out': mem_array}) + retv = array_read( + array=mem_array, i=self.step_idx, 
**self.helper.to_kwargs) + retv = shrink_memory( + x=retv, + i=self.step_idx, + table=self.lod_rank_table, + **self.helper.to_kwargs) + self.mem_dict[retv.name] = mem_array + return retv + else: + if len(self.input_array) == 0: + raise ValueError( + "step_input should be invoked before memory(shape=..., value=...)" + ) + parent_block = self._parent_block_() + init = parent_block.create_var( + name=unique_name('mem_init'), dtype=dtype) + arr, dtype = self.input_array[0] + in0 = parent_block.create_var(name=unique_name('in0'), dtype=dtype) + parent_block.append_op( + type='read_from_array', + inputs={'X': [arr], + 'I': [self.zero_idx]}, + outputs={'Out': [in0]}) + parent_block.append_op( + type='fill_constant_batch_size_like', + inputs={'Input': [in0]}, + outputs={'Out': [init]}, + attrs={ + 'shape': [-1] + shape, + 'value': float(value), + 'dtype': init.dtype + }) + return self.memory(init=init) + + def update_memory(self, ex_mem, new_mem): + self._assert_in_rnn_block_('update_memory') + if not isinstance(ex_mem, Variable): + raise TypeError("The input arg `ex_mem` of update_memory() must " + "be a Variable") + if not isinstance(new_mem, Variable): + raise TypeError("The input arg `new_mem` of update_memory() must " + "be a Variable") + + mem_array = self.mem_dict.get(ex_mem.name, None) + if mem_array is None: + raise ValueError("Please invoke memory before update_memory") + if self.lod_rank_table is None: + raise ValueError("Please invoke step_input before update_memory") + + self.mem_link.append((new_mem, mem_array)) + + def output(self, *outputs): + self._assert_in_rnn_block_('output') + parent_block = self._parent_block_() + for each in outputs: + outside_array = parent_block.create_var( + name=unique_name("_".join( + [self.helper.name, "output_array", each.name])), + type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, + dtype=each.dtype) + array_write(x=each, i=self.step_idx, array=outside_array) + self.output_array.append(outside_array) + + def _parent_block_(self): + 
prog = self.helper.main_program + parent_idx = prog.current_block().parent_idx + assert parent_idx >= 0 + parent_block = prog.block(parent_idx) + + return parent_block + + def _assert_in_rnn_block_(self, method): + if self.status != DynamicRNN.IN_RNN: + raise ValueError("{0} can only be invoked inside rnn block.".format( + method)) diff --git a/python/paddle/v2/fluid/tests/test_dyn_rnn.py b/python/paddle/v2/fluid/tests/test_dyn_rnn.py index 271e39a0e0..034266c26f 100644 --- a/python/paddle/v2/fluid/tests/test_dyn_rnn.py +++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py @@ -7,7 +7,7 @@ import numpy class TestDynRNN(unittest.TestCase): def setUp(self): self.word_dict = paddle.dataset.imdb.word_dict() - self.BATCH_SIZE = 100 + self.BATCH_SIZE = 2 self.train_data = paddle.batch( paddle.dataset.imdb.train(self.word_dict), batch_size=self.BATCH_SIZE) @@ -55,6 +55,7 @@ class TestDynRNN(unittest.TestCase): mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table) hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh') + fluid.layers.array_write(x=hidden, i=i, array=out) fluid.layers.increment(x=i, in_place=True) fluid.layers.array_write(x=hidden, i=i, array=mem_array) @@ -82,6 +83,48 @@ class TestDynRNN(unittest.TestCase): print(val) self.assertFalse(numpy.isnan(val)) + def test_train_dyn_rnn(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + sentence = fluid.layers.data( + name='word', shape=[1], dtype='int64', lod_level=1) + sent_emb = fluid.layers.embedding( + input=sentence, size=[len(self.word_dict), 32], dtype='float32') + + rnn = fluid.layers.DynamicRNN() + + with rnn.block(): + in_ = rnn.step_input(sent_emb) + mem = rnn.memory(shape=[100], dtype='float32') + out_ = fluid.layers.fc(input=[in_, mem], size=100, act='tanh') + rnn.update_memory(mem, out_) + rnn.output(out_) + + last = fluid.layers.sequence_pool(input=rnn(), pool_type='last') + logits = 
fluid.layers.fc(input=last, size=1, act=None) + label = fluid.layers.data(name='label', shape=[1], dtype='float32') + loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=logits, label=label) + loss = fluid.layers.mean(x=loss) + sgd = fluid.optimizer.Adam(1e-3) + sgd.minimize(loss=loss) + + cpu = fluid.CPUPlace() + exe = fluid.Executor(cpu) + exe.run(startup_program) + feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu) + data = next(self.train_data()) + loss_0 = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss])[0] + for _ in xrange(100): + val = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss])[0] + # loss should be small after 100 mini-batch + self.assertLess(val[0], loss_0[0]) + if __name__ == '__main__': unittest.main() From d303f7ae4fecbfa684b598421403ddcaf0286b85 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 6 Dec 2017 11:41:48 +0800 Subject: [PATCH 208/275] fix int overflow --- paddle/operators/conv_cudnn_op.cu.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 3f97dc7ee0..bc265dcc4f 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -28,7 +28,8 @@ using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor; using DataLayout = platform::DataLayout; -static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024; +static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = + static_cast(1024) * 1024 * 1024; template class CudnnConvOpKernel : public framework::OpKernel { @@ -44,7 +45,8 @@ class CudnnConvOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); + int64_t 
user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); const T* input_data = input->data(); const T* filter_data = filter->data(); @@ -163,7 +165,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel { std::vector paddings = ctx.Attr>("paddings"); std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); - int user_workspace_size = ctx.Attr("workspace_size_MB"); + int64_t user_workspace_size = + static_cast(ctx.Attr("workspace_size_MB")); // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor input_desc; From 8711a9a22aaf2ed4d4711089d0136da133c1826d Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 6 Dec 2017 11:19:16 +0800 Subject: [PATCH 209/275] refine code --- paddle/operators/elementwise_add_op.h | 2 +- paddle/operators/elementwise_op_function.h | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index 3a198c167e..921dc5f6a6 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -21,7 +21,7 @@ namespace operators { template struct AddFunctor { - HOSTDEVICE T operator()(T a, T b) const { return a + b; } + inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } }; template diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index ec448a9e95..ca3542e783 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -71,7 +71,9 @@ class RowwiseTransformIterator { RowwiseTransformIterator& operator++() { ++i_; - i_ %= n_; + if (UNLIKELY(i_ == n_)) { + i_ = 0; + } return *this; } @@ -100,7 +102,12 @@ class MidWiseTransformIterator { : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} MidWiseTransformIterator& operator++() { - i_ = (++j_ / post_) % n_; + ++j_; + i_ = j_ / post_; + if (UNLIKELY(i_ == n_)) { + j_ = 0; + i_ = 0; + } return *this; } 
From 83537c7ada62153d9bd323de6144d67902cdcd39 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 6 Dec 2017 13:10:04 +0800 Subject: [PATCH 210/275] Fix warning about comparison of integers of different signs --- paddle/operators/nce_op.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index ea92a797fe..0a8a95de5f 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -49,7 +49,7 @@ void PrepareSamples(const framework::ExecutionContext& context) { int num_label = label_dims.size() == 2 ? label_dims[1] : 1; int index = 0; - for (size_t i = 0; i < label_dims[0]; ++i) { + for (int64_t i = 0; i < label_dims[0]; ++i) { int j = 0; for (; j < num_label; ++j) { sample_labels_data[index++] = label_data[i * num_label + j]; @@ -86,7 +86,7 @@ class NCEKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); int num_neg_samples = context.Attr("num_neg_samples"); int num_total_classes = context.Attr("num_total_classes"); - int num_true_class = 1; + int64_t num_true_class = 1; if (label != nullptr) { num_true_class = label->dims()[1]; } @@ -95,18 +95,18 @@ class NCEKernel : public framework::OpKernel { auto bias = context.Input("Bias"); if (bias != nullptr) { const T* bias_data = bias->data(); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { sample_out_data[i] = bias_data[sample_labels_data[i]]; } } else { - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { sample_out_data[i] = 0; } } // forward mul auto input_mat = EigenMatrix::From(*(context.Input("Input"))); auto weight_mat = EigenMatrix::From(*(context.Input("Weight"))); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { Eigen::Tensor result = (input_mat.chip((int)(i / 
sample_labels->dims()[1]), 0) * weight_mat.chip(sample_labels_data[i], 0)) @@ -115,8 +115,8 @@ class NCEKernel : public framework::OpKernel { sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i]))); } // forward cost - for (size_t i = 0; i < sample_labels->dims()[0]; ++i) { - size_t j = 0; + for (int64_t i = 0; i < sample_labels->dims()[0]; ++i) { + int64_t j = 0; out_data[i] = 0; T w = sample_weight == nullptr ? 1. : sample_weight_data[i]; // for true classes @@ -162,7 +162,7 @@ class NCEGradKernel : public framework::OpKernel { T* sample_grad_data = sample_grad.mutable_data(sample_labels->dims(), context.GetPlace()); // backward cost - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { T o = sample_out_data[i]; T w = sample_weight == nullptr ? 1 @@ -177,7 +177,7 @@ class NCEGradKernel : public framework::OpKernel { if (d_bias != nullptr) { T* d_bias_data = d_bias->mutable_data(context.GetPlace()); std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { d_bias_data[sample_labels_data[i]] += sample_grad_data[i]; } } @@ -188,7 +188,7 @@ class NCEGradKernel : public framework::OpKernel { std::fill(d_w_data, d_w_data + d_w->numel(), 0.0); auto d_w_matrix = EigenMatrix::From(*d_w); auto x_matrix = EigenMatrix::From(*(context.Input("Input"))); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); ++i) { d_w_matrix.chip(sample_labels_data[i], 0) += x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) * sample_grad_data[i]; @@ -200,7 +200,7 @@ class NCEGradKernel : public framework::OpKernel { d_x->mutable_data(context.GetPlace()); auto d_x_matrix = EigenMatrix::From(*d_x); auto w_matrix = EigenMatrix::From(*(context.Input("Weight"))); - for (size_t i = 0; i < sample_labels->numel(); ++i) { + for (int64_t i = 0; i < sample_labels->numel(); 
++i) { d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) += w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i]; } From 6173f91cce21e1b730c873dd765ddc8c1bcb419b Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 6 Dec 2017 13:26:38 +0800 Subject: [PATCH 211/275] uncomment code --- paddle/gserver/tests/test_LayerGrad.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index f8b36cb386..71ba3d176b 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -275,11 +275,11 @@ void testProjectionConv(size_t groups, bool isDeconv) { #ifdef PADDLE_WITH_CUDA TEST(Projection, conv) { /// test ConvProjection - // testProjectionConv(1, false); - // testProjectionConv(3, false); + testProjectionConv(1, false); + testProjectionConv(3, false); /// test ConvTransProjection testProjectionConv(1, true); - // testProjectionConv(3, true); + testProjectionConv(3, true); } #endif From 45aca4e9bb64d6836b778159a912adac89f33df3 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Wed, 6 Dec 2017 14:26:11 +0800 Subject: [PATCH 212/275] Change the type of conv2d in Python API. 
--- python/paddle/v2/fluid/layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 98a04ea9c2..7b31cabddf 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -764,7 +764,7 @@ def conv2d(input, pre_bias = helper.create_tmp_variable(dtype) helper.append_op( - type='conv2d', + type='conv2d_cudnn', inputs={ 'Input': input, 'Filter': filter, From 06a3a8871326bef41b17c456fbe68a9f31f80004 Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Tue, 5 Dec 2017 23:32:56 -0800 Subject: [PATCH 213/275] feature/nmt add encoder (#6323) * init nmt * encoder ready * only generation implementation waiting for dynamic rnn ready to train * init python * remove decoder temporary * clean * clean --- .../tests/book/test_machine_translation.py | 103 ++++++++++++++++++ 1 file changed, 103 insertions(+) create mode 100644 python/paddle/v2/fluid/tests/book/test_machine_translation.py diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py new file mode 100644 index 0000000000..5bc7e1b59d --- /dev/null +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -0,0 +1,103 @@ +import numpy as np +import paddle.v2 as paddle +import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.fluid.core as core +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +from paddle.v2.fluid.executor import Executor, g_scope +from paddle.v2.fluid.optimizer import SGDOptimizer +import paddle.v2.fluid as fluid +import paddle.v2.fluid.layers as pd + +dict_size = 30000 +source_dict_dim = target_dict_dim = dict_size +src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) +hidden_dim = 512 +word_dim = 512 +IS_SPARSE = True +batch_size = 50 +max_length = 50 +topk_size = 50 +trg_dic_size = 10000 + +src_word_id = layers.data(name="src_word_id", shape=[1], 
dtype='int64') +src_embedding = layers.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + +def encoder(): + + lstm_hidden0, lstm_0 = layers.dynamic_lstm( + input=src_embedding, + size=hidden_dim, + candidate_activation='sigmoid', + cell_activation='sigmoid') + + lstm_hidden1, lstm_1 = layers.dynamic_lstm( + input=src_embedding, + size=hidden_dim, + candidate_activation='sigmoid', + cell_activation='sigmoid', + is_reverse=True) + + bidirect_lstm_out = layers.concat([lstm_hidden0, lstm_hidden1], axis=0) + + return bidirect_lstm_out + + +def decoder_trainer(context): + ''' + decoder with trainer + ''' + pass + + +def to_lodtensor(data, place): + seq_lens = [len(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + res = core.LoDTensor() + res.set(flattened_data, place) + res.set_lod([lod]) + return res + + +def main(): + encoder_out = encoder() + # TODO(jacquesqiao) call here + decoder_trainer(encoder_out) + + train_data = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(8000), buf_size=1000), + batch_size=batch_size) + + place = core.CPUPlace() + exe = Executor(place) + + exe.run(framework.default_startup_program()) + + batch_id = 0 + for pass_id in xrange(2): + print 'pass_id', pass_id + for data in train_data(): + print 'batch', batch_id + batch_id += 1 + if batch_id > 10: break + word_data = to_lodtensor(map(lambda x: x[0], data), place) + outs = exe.run(framework.default_main_program(), + feed={'src_word_id': word_data, }, + fetch_list=[encoder_out]) + + +if __name__ == '__main__': + main() From c7e739f5425b894d513dfbd853cb30ef01797fb1 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Wed, 6 Dec 2017 16:38:12 +0800 Subject: [PATCH 214/275] Add LRN efficient 
GPU implement. (#5894) Add LRN efficient GPU implement --- paddle/operators/lrn_op.cc | 104 ++++++++++++- paddle/operators/lrn_op.cu | 160 +++++++++++++++++++- paddle/operators/lrn_op.h | 115 ++++---------- python/paddle/v2/fluid/tests/test_lrn_op.py | 3 +- 4 files changed, 289 insertions(+), 93 deletions(-) diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc index 00392b7967..e20340e77b 100644 --- a/paddle/operators/lrn_op.cc +++ b/paddle/operators/lrn_op.cc @@ -19,6 +19,103 @@ namespace operators { using framework::Tensor; +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + auto x_v = framework::EigenVector::Flatten(input); + + const int start = -(n - 1) / 2; + const int end = start + n; + + auto e_mid = framework::EigenTensor::From(*mid); + e_mid = e_mid.constant(k); + + auto e_x = framework::EigenTensor::From(input); + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch >= 0 && ch < C) { + auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + s += alpha * r.square(); + } + } + } + } + + auto out_e = framework::EigenVector::Flatten(*out); + out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + } +}; +template struct LRNFunctor; +template struct LRNFunctor; + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + T ratio = -2 * alpha * beta; + auto x_g_e = framework::EigenVector::Flatten(*x_g); + x_g_e = 
x_g_e.constant(0.0); + + auto e_x = framework::EigenTensor::From(x); + auto e_x_g = framework::EigenTensor::From(*x_g); + auto e_out = framework::EigenTensor::From(out); + auto e_out_g = framework::EigenTensor::From(out_g); + auto e_mid = framework::EigenTensor::From(mid); + + const int start = -(n - 1) / 2; + const int end = start + n; + for (int m = 0; m < N; m++) { + for (int i = 0; i < C; i++) { + auto i_x = e_x.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_x_g = e_x_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_out_g = e_out_g.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto i_mid = e_mid.slice(Eigen::array({{m, i, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g = i_mid.pow(-beta) * i_out_g; + for (int c = start; c <= end; c++) { + int ch = i + c; + if (ch < 0 || ch >= C) { + continue; + } + + auto c_out = e_out.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_mid = e_mid.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + auto c_out_g = e_out_g.slice(Eigen::array({{m, ch, 0, 0}}), + Eigen::array({{1, 1, H, W}})); + + i_x_g += ratio * c_out_g * c_out * i_x / c_mid; + } + } + } + } +}; +template struct LRNGradFunctor; +template struct LRNGradFunctor; + class LRNOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -83,8 +180,8 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Local Response Normalization Operator. -This operator comes from the paper -"ImageNet Classification with Deep Convolutional Neural Networks". +This operator comes from the paper: +<>. 
The original formula is: @@ -119,8 +216,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("MidOut")), - "Input(MidOut@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasInput("MidOut"), "Input(MidOut) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) should not be null"); diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu index 607dc6d86a..e9a8671233 100644 --- a/paddle/operators/lrn_op.cu +++ b/paddle/operators/lrn_op.cu @@ -12,11 +12,167 @@ See the License for the specific language governing permissions and limitations under the License. */ -#define EIGEN_USE_GPU #include "paddle/operators/lrn_op.h" -namespace ops = paddle::operators; +namespace paddle { +namespace operators { + +template +__global__ void KeCMRNormFillScale(int img_size, const T* in, T* mid, int C, + int H, int W, int size, T k, T alpha) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + + in += offset; + mid += offset; + const int step = H * W; + const int pre_pad = (size - 1) / 2; + const int post_pad = size - pre_pad - 1; + + T accum = 0; + int index = 0; + while (index < C + post_pad) { + if (index < C) { + T val = in[index * step]; + accum += val * val; + } + if (index >= size) { + T val = in[(index - size) * step]; + accum -= val * val; + } + if (index >= post_pad) { + mid[(index - post_pad) * step] = k + accum * alpha; + } + ++index; + } + } +} + +template +__global__ void KeCMRNormOutput(int input_size, const T* in, const T* mid, + T negative_beta, T* out) { + const int index = threadIdx.x + blockIdx.x * blockDim.x; + if (index < input_size) { + 
out[index] = in[index] * pow(mid[index], negative_beta); + } +} + +template +void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, + T* outputs, T* mid, int N, int C, int H, int W, int n, T k, + T alpha, T beta) { + int img_size = N * H * W; + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + KeCMRNormFillScale< + T><<>>( + img_size, inputs, mid, C, H, W, n, k, alpha); + + int input_size = N * H * W * C; + grid_size = (input_size + block_size - 1) / block_size; + KeCMRNormOutput< + T><<>>( + input_size, inputs, mid, -beta, outputs); +} + +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta) { + CrossMapNormal( + ctx, input.data(), out->mutable_data(ctx.GetPlace()), + mid->mutable_data(ctx.GetPlace()), N, C, H, W, n, k, alpha, beta); + } +}; + +template struct LRNFunctor; +template struct LRNFunctor; +template +__global__ void KeCMRNormDiff(int img_size, const T* x, const T* out, + const T* mid, T* x_g, const T* out_g, int C, + int H, int W, int size, T negative_beta, + T ratio) { + const int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx < img_size) { + const int w = idx % W; + const int h = (idx / W) % H; + const int n = idx / W / H; + const int offset = (n * C * H + h) * W + w; + x += offset; + out += offset; + mid += offset; + out_g += offset; + x_g += offset; + + const int step = H * W; + const int pre_pad = size - (size + 1) / 2; + const int post_pad = size - pre_pad - 1; + + int index = 0; + T accum = 0; + // TODO(gongwb): optimize this with thread shared array. 
+ while (index < C + post_pad) { + if (index < C) { + x_g[index * step] = 0.0; + accum += out_g[index * step] * out[index * step] / mid[index * step]; + } + if (index >= size) { + accum -= out_g[(index - size) * step] * out[(index - size) * step] / + mid[(index - size) * step]; + } + if (index >= post_pad) { + x_g[(index - post_pad) * step] += + out_g[(index - post_pad) * step] * + pow(mid[(index - post_pad) * step], negative_beta) - + ratio * x[(index - post_pad) * step] * accum; + } + ++index; + } + } +} + +template +void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, + const T* out, const T* mid, T* x_g, const T* out_g, + int N, int C, int H, int W, int n, T alpha, T beta) { + int img_size = N * H * W; + + const int block_size = 1024; + int grid_size = (img_size + block_size - 1) / block_size; + + KeCMRNormDiff< + T><<>>( + img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, + 2.0f * alpha * beta); +} + +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta) { + CrossMapNormalGrad(ctx, x.data(), out.data(), mid.data(), + x_g->mutable_data(ctx.GetPlace()), out_g.data(), + N, C, H, W, n, alpha, beta); + } +}; + +template struct LRNGradFunctor; +template struct LRNGradFunctor; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel); REGISTER_OP_GPU_KERNEL(lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h index 606c657443..aa7539db4a 100644 --- a/paddle/operators/lrn_op.h +++ b/paddle/operators/lrn_op.h @@ -21,6 +21,14 @@ namespace paddle { namespace operators { +template +struct LRNFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& input, 
framework::Tensor* out, + framework::Tensor* mid, int N, int C, int H, int W, int n, + T k, T alpha, T beta); +}; + template class LRNKernel : public framework::OpKernel { public: @@ -31,8 +39,8 @@ class LRNKernel : public framework::OpKernel { // f(x) represents outputs void Compute(const framework::ExecutionContext& ctx) const override { // input - const Tensor* x = ctx.Input("X"); - auto x_dims = x->dims(); + const Tensor& x = *ctx.Input("X"); + auto x_dims = x.dims(); // NCHW int N = x_dims[0]; @@ -57,38 +65,20 @@ class LRNKernel : public framework::OpKernel { PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); - auto x_v = framework::EigenVector::Flatten(*x); - - const int start = -(n - 1) / 2; - const int end = start + n; - - auto e_mid = framework::EigenTensor::From(*mid); - e_mid.device(ctx.GetEigenDevice()) = e_mid.constant(k); - - auto e_x = framework::EigenTensor::From(*x); - for (int m = 0; m < N; m++) { - for (int i = 0; i < C; i++) { - for (int c = start; c <= end; c++) { - int ch = i + c; - if (ch >= 0 && ch < C) { - auto s = e_mid.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto r = e_x.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - s.device(ctx.GetEigenDevice()) += alpha * r.square(); - } - } - } - } - - auto out_e = framework::EigenVector::Flatten(*out); - out_e.device(ctx.GetEigenDevice()) = - x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); + LRNFunctor f; + f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta); } }; +template +struct LRNGradFunctor { + void operator()(const framework::ExecutionContext& ctx, + const framework::Tensor& x, const framework::Tensor& out, + const framework::Tensor& mid, framework::Tensor* x_g, + const framework::Tensor& out_g, int N, int C, int H, int W, + int n, T alpha, T beta); +}; + /** * \brief Backward calculation for normalization with across maps. 
* @@ -97,7 +87,7 @@ class LRNKernel : public framework::OpKernel { * The implementation of this Function is derived from the * CrossMapNormalFunc implementation. * - * InputGrad = OutputGrad * denoms ^ (-beta) + * InputGrad = OutputGrad * MidOut ^ (-beta) * -- upper * + > (OutputGrad * OutputValue * (-2 * alpha * beta) / MidOut) * InputValue * -- lower @@ -113,18 +103,15 @@ class LRNGradKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; void Compute(const framework::ExecutionContext& ctx) const override { - const Tensor* x = ctx.Input("X"); - const Tensor* out = ctx.Input("Out"); - const Tensor* out_g = ctx.Input(framework::GradVarName("Out")); - const Tensor* mid = ctx.Input("MidOut"); + const Tensor& x = *ctx.Input("X"); + const Tensor& out = *ctx.Input("Out"); + const Tensor& out_g = *ctx.Input(framework::GradVarName("Out")); + const Tensor& mid = *ctx.Input("MidOut"); auto x_g = ctx.Output(framework::GradVarName("X")); x_g->mutable_data(ctx.GetPlace()); - auto x_g_e = framework::EigenVector::Flatten(*x_g); - x_g_e.device(ctx.GetEigenDevice()) = x_g_e.constant(0.0); - - auto x_dims = x->dims(); + auto x_dims = x.dims(); int N = x_dims[0]; int C = x_dims[1]; int H = x_dims[2]; @@ -133,51 +120,9 @@ class LRNGradKernel : public framework::OpKernel { int n = ctx.Attr("n"); T alpha = ctx.Attr("alpha"); T beta = ctx.Attr("beta"); - T ratio = -2 * alpha * beta; - - auto e_x = framework::EigenTensor::From(*x); - auto e_x_g = framework::EigenTensor::From(*x_g); - auto e_out = framework::EigenTensor::From(*out); - auto e_out_g = framework::EigenTensor::From(*out_g); - auto e_mid = framework::EigenTensor::From(*mid); - - const int start = -(n - 1) / 2; - const int end = start + n; - for (int m = 0; m < N; m++) { - for (int i = 0; i < C; i++) { - auto i_x = e_x.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto i_x_g = e_x_g.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto i_out_g = 
e_out_g.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto i_mid = e_mid.slice(Eigen::array({{m, i, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - i_x_g.device(ctx.GetEigenDevice()) = i_mid.pow(-beta) * i_out_g; - for (int c = start; c <= end; c++) { - int ch = i + c; - if (ch < 0 || ch >= C) { - continue; - } - - auto c_out = e_out.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto c_mid = e_mid.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - auto c_out_g = e_out_g.slice(Eigen::array({{m, ch, 0, 0}}), - Eigen::array({{1, 1, H, W}})); - - i_x_g.device(ctx.GetEigenDevice()) += - ratio * c_out_g * c_out * i_x / c_mid; - } - } - } + + LRNGradFunctor f; + f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); } }; diff --git a/python/paddle/v2/fluid/tests/test_lrn_op.py b/python/paddle/v2/fluid/tests/test_lrn_op.py index 7e34b3c91c..9abb09e53a 100644 --- a/python/paddle/v2/fluid/tests/test_lrn_op.py +++ b/python/paddle/v2/fluid/tests/test_lrn_op.py @@ -23,7 +23,7 @@ class TestLRNOp(OpTest): start = -(self.n - 1) / 2 end = start + self.n - mid = np.empty((self.N, self.C, self.H, self.W), dtype=float) + mid = np.empty((self.N, self.C, self.H, self.W)).astype("float32") mid.fill(self.k) for m in range(0, self.N): for i in range(0, self.C): @@ -74,5 +74,4 @@ class TestLRNOp(OpTest): if __name__ == "__main__": - exit(0) # LRN grad implement wrong unittest.main() From 7910d96ab8bfe83427eb95518d0ec4f63e142f38 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Wed, 6 Dec 2017 17:05:55 +0800 Subject: [PATCH 215/275] build paddle_python before build paddle_docs (#6337) --- paddle/scripts/travis/build_doc.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 7d54f0254c..278485f788 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -8,6 +8,7 @@ cd $TRAVIS_BUILD_DIR/build # Compile 
Documentation only. cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON make -j `nproc` gen_proto_py +make -j `nproc` paddle_python make -j `nproc` paddle_docs paddle_docs_cn # check websites for broken links From c53a9c8d44524b62c2bbececd83bdfabdb55479c Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 6 Dec 2017 17:06:31 +0800 Subject: [PATCH 216/275] compile cblas library as static --- cmake/external/openblas.cmake | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 4c4f59656d..97857a686b 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -114,11 +114,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c) FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") -IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") - ADD_LIBRARY(cblas SHARED ${dummyfile}) -ELSE() - ADD_LIBRARY(cblas STATIC ${dummyfile}) -ENDIF() +ADD_LIBRARY(cblas STATIC ${dummyfile}) TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES}) IF(NOT ${CBLAS_FOUND}) From 0d4f050955f1a616a4468815dda7bba6dd5b0bb2 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Wed, 6 Dec 2017 16:26:17 +0530 Subject: [PATCH 217/275] Fix equation in logical or op (#6315) --- paddle/operators/logical_op.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc index a37582c1d8..c818d5e9c1 100644 --- a/paddle/operators/logical_op.cc +++ b/paddle/operators/logical_op.cc @@ -139,15 +139,16 @@ class LogicalOp : public framework::OperatorWithKernel { ::paddle::operators::UnaryLogicalOpInferShape<_##op_type##Comment>, \ ::paddle::framework::EmptyGradOpMaker); -REGISTER_BINARY_LOGICAL_OP(logical_and, "Out = X && Y"); +REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); 
REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_OP(logical_or, "Out = X && Y"); +REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_OP(logical_not, "Out = !X"); +REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_OP(logical_xor, "Out = (X || Y) && !(X && Y)"); +REGISTER_BINARY_LOGICAL_OP(logical_xor, + "$$Out = (X || Y) \\, \\&\\& \\, !(X \\&\\& Y)$$"); REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, paddle::operators::LogicalXorFunctor); From c4599d3e9a99ec18b0c4e6bd1100b4541a1e0a5c Mon Sep 17 00:00:00 2001 From: Yancey Date: Wed, 6 Dec 2017 19:15:43 +0800 Subject: [PATCH 218/275] Add version api (#2985) * write versino.py * add version py * clean init py * add istaged, major and etc... fields * update * update * update --- .gitignore | 1 + doc/design/releasing_process.md | 17 +++++----- python/paddle/__init__.py | 8 +++++ python/setup.py.in | 55 ++++++++++++++++++++++++++++++++- 4 files changed, 72 insertions(+), 9 deletions(-) diff --git a/.gitignore b/.gitignore index 020d3f0c30..ac56a3320e 100644 --- a/.gitignore +++ b/.gitignore @@ -28,3 +28,4 @@ cmake_install.cmake paddle/.timestamp python/paddlepaddle.egg-info/ paddle/pybind/pybind.h +python/paddle/version.py diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md index 62ff8f3229..14c081ea84 100644 --- a/doc/design/releasing_process.md +++ b/doc/design/releasing_process.md @@ -5,8 +5,9 @@ PaddlePaddle使用git-flow branching model做分支管理,使用[Semantic Vers PaddlePaddle每次发新的版本,遵循以下流程: 1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0` -2. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 -3. 对这个版本的提交,做如下几个操作: +1. 
将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。 +1. 对这个版本的提交,做如下几个操作: + * 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。 * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步 * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。 * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性 @@ -20,9 +21,9 @@ PaddlePaddle每次发新的版本,遵循以下流程: pip install twine twine upload dist/[package to upload] ``` -4. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 -5. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 -6. 协同完成Release Note的书写 +1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。 +1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面 +1. 协同完成Release Note的书写 需要注意的是: @@ -30,7 +31,7 @@ PaddlePaddle每次发新的版本,遵循以下流程: * `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。 * 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。 -# PaddlePaddle 分支规范 +## PaddlePaddle 分支规范 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。 @@ -47,11 +48,11 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git- * BugFix分支也是在开发者自己的fork版本库维护,与功能分支不同的是,BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支,同时提起`Pull Request`。 -# PaddlePaddle回归测试列表 +## PaddlePaddle回归测试列表 本列表说明PaddlePaddle发版之前需要测试的功能点。 -## PaddlePaddle Book中所有章节 +### PaddlePaddle Book中所有章节 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。 diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index f662d68263..1030c94e16 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -11,3 +11,11 @@ # WITHOUT WARRANTIES OR CONDITIONS 
OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +try: + from version import full_version as __version__ + from version import commit as __git_commit__ +except ImportError: + import sys + sys.stderr.write('''Warning with import paddle: you should not + import paddle from the source directory; please install paddlepaddle*.whl firstly.''' + ) diff --git a/python/setup.py.in b/python/setup.py.in index fe91df10da..d59a6a4780 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -1,8 +1,61 @@ from setuptools import setup, Distribution, Extension +import subprocess class BinaryDistribution(Distribution): def has_ext_modules(foo): return True +MAJOR = 0 +MINOR = 10 +PATCH = 0 +RC = 0 +ISTAGED = False + + + +def git_commit(): + try: + cmd = ['git', 'rev-parse', 'HEAD'] + git_commit = subprocess.Popen(cmd, stdout = subprocess.PIPE).communicate()[0].strip() + except: + git_commit = 'Unknown' + return git_commit + +def write_version_py(filename='paddle/version.py'): + cnt = ''' +# THIS FILE IS GENERATED FROM PADDLEPADDLE SETUP.PY +# +full_version = '%(major)d.%(minor)d.%(patch)d' +major = '%(major)d' +minor = '%(minor)d' +patch = '%(patch)d' +rc = '%(rc)d' +istaged = %(istaged)s +commit = '%(commit)s' + +def show(): + if istaged: + print 'full_version:', full_version + print 'major:', major + print 'minor:', minor + print 'patch:', patch + print 'rc:', rc + else: + print 'commit:', commit +''' + commit = git_commit() + with open(filename, 'w') as f: + f.write(cnt % { + 'major': MAJOR, + 'minor': MINOR, + 'patch': PATCH, + 'rc': RC, + 'version': '${PADDLE_VERSION}', + 'commit': commit, + 'istaged': ISTAGED}) + +write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py') + + packages=['paddle', 'paddle.proto', 'paddle.trainer', @@ -21,7 +74,7 @@ with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: setup_requires = f.read().splitlines() if 
'${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: - setup_requires+=["opencv-python"] + setup_requires+=['opencv-python'] # the prefix is sys.prefix which should always be usr paddle_bin_dir = 'opt/paddle/bin' From 45b015be22375356d0af8fbbc5428fdf6ac5766d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 6 Dec 2017 21:37:34 +0800 Subject: [PATCH 219/275] Fix #6335 (#6343) --- python/paddle/v2/fluid/regularizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py index c2c18e1951..bb1ac8911e 100644 --- a/python/paddle/v2/fluid/regularizer.py +++ b/python/paddle/v2/fluid/regularizer.py @@ -145,7 +145,7 @@ class L1DecayRegularizer(WeightDecayRegularizer): # import paddle.fluid as fluid # # hidden = fluid.layers.fc(..., -# param_attr=ParamAttr(fluid.regularizer.Xavier())) +# param_attr=fluid.regularizer.Xavier()) # # It is no need to add a `Regularizer` as the class suffix L1Decay = L1DecayRegularizer From e557611f390bd4d537eedd337e94ab87fb6db0af Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 7 Dec 2017 02:07:23 +0530 Subject: [PATCH 220/275] Fix equations in sequence_pool op (#6355) --- paddle/operators/sequence_pool_op.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index a2f4257037..bfda8649cd 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -58,12 +58,12 @@ Sequence Pool Operator. The SequencePoolOp pools features of all time-steps of each instance. It supports six pooling types: -1. AVERAGE: Out[i] = $$avg(X_i)$$ -2. SUM: Out[i] = $$\sum_jX_{ij}$$ -3. SQRT: Out[i] = $$\frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ +1. AVERAGE: $$Out[i] = \frac{\sum_i X_i}{N}$$ +2. SUM: $$Out[i] = \sum_jX_{ij}$$ +3. SQRT: $$Out[i] = \frac{\sum_jX_{ij}}{\sqrt{len(X_i)}}$$ 4. 
LAST: Out[i] = last instance in i-th sequence X[i] 5. FIRST: Out[i] = first instance in i-th sequence X[i] -6. MAX: Out[i] = $$max(X_i)$$ +6. MAX: $$Out[i] = max(X_i)$$ The following example explains how this works: For a mini-batch of 3 variable-length sentences, From 6f08a2191ed5b3790a55047379973dda1f1ce28c Mon Sep 17 00:00:00 2001 From: Yan Chunwei Date: Wed, 6 Dec 2017 21:41:38 -0800 Subject: [PATCH 221/275] add gru unit layer wrapper (#6325) --- python/paddle/v2/fluid/layers.py | 71 ++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 7b31cabddf..fb444f2d86 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -180,6 +180,77 @@ def dynamic_lstm(input, return hidden, cell +def gru_unit(input, + hidden, + size, + weight=None, + bias=None, + activation='tanh', + gate_activation='sigmoid', + main_program=None, + startup_program=None): + """ + GRUUnit Operator implements partial calculations of the GRU unit as following: + + $$ + update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\ + reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\ + output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\ + output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t) + $$ + + which is same as one time step of GRU Operator. + + @note To implement the complete GRU unit, fully-connected operator must be + used before to feed xu, xr and xc as the Input of GRUUnit operator. 
+ + TODO(ChunweiYan) add more document here + """ + activation_dict = dict( + identity=0, + sigmoid=1, + tanh=2, + relu=3, ) + activation = activation_dict[activation] + gate_activation = activation_dict[gate_activation] + + helper = LayerHelper('gru_unit', **locals()) + dtype = helper.input_dtype() + size = size / 3 + + # create weight + if weight is None: + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype) + + # create bias + if bias is None: + bias_size = [1, 3 * size] + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + + gate = helper.create_tmp_variable(dtype) + reset_hidden_pre = helper.create_tmp_variable(dtype) + updated_hidden = helper.create_tmp_variable(dtype) + + helper.append_op( + type='gru_unit', + inputs={'Input': input, + 'HiddenPrev': hidden, + 'Weight': weight}, + outputs={ + 'Gate': gate, + 'ResetHiddenPrev': reset_hidden_pre, + 'Hidden': updated_hidden, + }, + attrs={ + 'activation': 0, + 'gate_activation': 1, + }) + + return updated_hidden, reset_hidden_pre, gate + + def data(name, shape, append_batch_size=True, From 584c9cfc82bedf7fd1e293b9e295d2fd5a196409 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 7 Dec 2017 13:54:11 +0800 Subject: [PATCH 222/275] Add comments of unique_name, Variable, Operator (#6342) --- python/paddle/v2/fluid/framework.py | 231 ++++++++++++++++++++++++++-- 1 file changed, 216 insertions(+), 15 deletions(-) diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index e6e3190b99..bf0cd275b6 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -3,6 +3,7 @@ import collections import numpy as np from . 
import core import proto.framework_pb2 as framework_pb2 +import google.protobuf.message import contextlib __all__ = [ @@ -13,11 +14,28 @@ __all__ = [ def unique_name(prefix): + """ + Generate unique names with prefix + + Args: + prefix(str): The prefix of return string + + Returns(str): A unique string with the prefix + + """ uid = core.unique_integer(prefix) # unique during whole process. return "_".join([prefix, str(uid)]) def convert_np_dtype_to_dtype_(np_dtype): + """ + Convert the data type in numpy to the data type in Paddle + Args: + np_dtype(np.dtype): the data type in numpy + + Returns(core.DataType): the data type in Paddle + + """ dtype = np.dtype(np_dtype) if dtype == np.float32: return core.DataType.FP32 @@ -38,17 +56,33 @@ def convert_np_dtype_to_dtype_(np_dtype): def dtype_is_floating(dtype): + """ + Check the data type is floating or not. + Args: + dtype(np.dtype|core.DataType): data type. + Could be numpy format or Paddle format + + Returns(bool): True if data type is a float value + + """ if not isinstance(dtype, core.DataType): dtype = convert_np_dtype_to_dtype_(dtype) - if (dtype == core.DataType.FP16 or dtype == core.DataType.FP32 or - dtype == core.DataType.FP64): - return True - else: - return False + return dtype in [core.DataType.FP16, core.DataType.FP32, core.DataType.FP64] def _debug_string_(proto, throw_on_error=True): + """ + Get the debug string of a protobuf message. The message could be not + initialized. + Args: + proto(google.protobuf.message.Message): The protobuf message + throw_on_error(bool): True if raise an error when the protobuf message + is not initialized. + + Returns(str): The debug string of the protobuf message + + """ error_fields = list() if not proto.IsInitialized(error_fields) and throw_on_error: raise ValueError("{0} are not initialized\nThe message is {1}".format( @@ -57,6 +91,38 @@ def _debug_string_(proto, throw_on_error=True): class Variable(object): + """ + Python variable. 
Every input and output of an operator is a variable. Every + variable belongs to a block. The variable has a name and two variables in + different blocks could have the same name. + + There are many kinds of variables. Please reference the framework.proto for + details. + + Notes: The constructor of Variable should not be invoked directly. Please + use `Block.create_var` to create a variable. + + >>> cur_program = Program() + >>> cur_block = cur_program.current_block() + >>> new_variable = cur_block.create_var( + >>> name="X", shape=[-1, 23, 48], dtype='float32') + + Args: + block(Block): The associated block. It will be passed by + `Block.create_var` automatically. + type(core.VarDesc.VarType): Variable type. Please reference the + framework.proto for details. + shape(tuple|list|None): The shape of variable. -1 means the batch size. + Some kinds of variable do not contain shape, just set it to None. + dtype(np.dtype|core.DataType|str): The data type of variable. + lod_level(int): The level of lod tensor. 0 means there is not a time + series data. + persistable(bool): True if the variable should be saved as check point. + Defaults to False. + stop_gradient(bool): True if the variable will stop to calculate + gradients when backward. Defaults to False. + """ + def __init__(self, block, type=core.VarDesc.VarType.LOD_TENSOR, @@ -140,6 +206,16 @@ class Variable(object): return self.to_string(True) def to_string(self, throw_on_error): + """ + Get debug string. + + Args: + throw_on_error(bool): True if raise an exception when self is not + intialized. + + Returns(str): The debug string. + + """ protostr = self.desc.serialize_to_string() proto = framework_pb2.VarDesc.FromString(str(protostr)) return _debug_string_(proto, throw_on_error) @@ -185,7 +261,9 @@ class Variable(object): def get_all_op_protos(): """ Get all registered op proto from PaddlePaddle C++ end. - :return: A list of registered OpProto. 
+ + Returns(list): list of OpProto + """ protostrs = core.get_all_op_protos() ret_values = [] @@ -196,6 +274,10 @@ def get_all_op_protos(): class OpProtoHolder(object): + """ + A global variable to hold all OpProtos from C++ as a map + """ + @classmethod def instance(cls): if not hasattr(cls, '_instance'): @@ -212,12 +294,26 @@ class OpProtoHolder(object): self.op_proto_map[proto.type] = proto def get_op_proto(self, type): + """ + Get OpProto by a type string. + Args: + type(str): The type that operator registered in C++ side. + + Returns(framework_pb2.OpProto): The OpProto + + """ if type not in self.op_proto_map: raise ValueError("Operator \"%s\" has not been registered." % type) return self.op_proto_map[type] class Operator(object): + """ + Python Operator class. The operator represents the build in instructs in a + Block. Users can use the build in instructs to describe their neural + network. + """ + def __init__(self, block, desc, @@ -225,6 +321,30 @@ class Operator(object): inputs=None, outputs=None, attrs=None): + """ + Constructor. + + Notes: The constructor of operator should not be invoked directly. Use + Block.append_op or Block.prepend_op instead. + + >>> cur_program = Program() + >>> cur_block = cur_program.current_block() + >>> # var1 += var2 + var3 + >>> cur_block.append_op(type="sum", + >>> inputs={"X": [var1, var2, var3]}, + >>> outputs={"Out": [var1]}) + + Args: + block(Block): The block has the current operator + desc(core.OpDesc): The protobuf description + type(str): The type of operator. + inputs(dict): The input dictionary. Key is the input parameter name. + Value is a list of variables. + outputs(dict): The output dictionary. Has same format with inputs + attrs(dict): The attributes dictionary. Key is attribute name. Value + is the attribute value. 
The attribute type should be as same as + the type registered in C++ + """ self.block = block self.desc = desc if len(self.desc.type()) != 0: @@ -311,6 +431,15 @@ class Operator(object): self.desc.infer_shape(self.block.desc) def to_string(self, throw_on_error): + """ + To debug string. + Args: + throw_on_error(bool): raise exception when self is not initialized + when throw_on_error is True + + Returns(str): The debug string. + + """ protostr = self.desc.serialize_to_string() proto = framework_pb2.OpDesc.FromString(str(protostr)) return _debug_string_(proto, throw_on_error) @@ -325,21 +454,55 @@ class Operator(object): return self.desc.type() def input(self, name): + """ + Get input arguments by the input parameter name + Args: + name(str): The input parameter name + + Returns(list): return the list of argument names associated with the + specific parameter name. + + """ return self.desc.input(name) @property def input_names(self): + """ + Get all input parameter names + Returns(list): return a list of input parameter names + + """ return self.desc.input_names() def output(self, name): + """ + Get output arguments by the output parameter name + Args: + name(str): The output parameter name + + Returns(list): return the list of argument names associated with the + specific parameter name. + + """ return self.desc.output(name) @property def output_names(self): + """ + Get all output parameter names + Returns(list): return a list of output parameter names + + """ return self.desc.output_names() @property def idx(self): + """ + Return the array index of current operator. + Returns(int): The array index in block.ops array + Raises: + ValueError: when the operator is not found. + """ for i, op in enumerate(self.block.ops): if op == self: return i @@ -347,19 +510,57 @@ class Operator(object): "Can't find op itself in it's block. It could be a bug of Paddle.") def has_attr(self, name): + """ + operator has the attribute with name or not. 
+ Args: + name(str): the attribute name + + Returns(bool): True if has this attribute. + + """ return self.desc.has_attr(name) def attr_type(self, name): + """ + Get the type of attribute by attribute name + Args: + name(str): the attribute name + + Returns(core.AttrType): the attribute type + + """ return self.desc.attr_type(name) @property def attr_names(self): + """ + Get all attribute names + Returns(list): The list of attribute name + + """ return self.desc.attr_names() def attr(self, name): + """ + Get attribute by name + Args: + name(str): the attribute name + + Returns(bool|int|str|float|list): The attribute value. The return value + can be any valid attribute type. + + """ return self.desc.attr(name) def block_attr(self, name): + """ + Get the block attribute by name + Args: + name(str): the attribute name + + Returns(int): the block index + + """ return self.desc.block_attr(name) @@ -479,7 +680,7 @@ class Block(object): """ Copy the information of parameters from other block Args: - other(Block): other block + other(Block): other block Returns: None @@ -623,7 +824,7 @@ class Program(object): def copy_param_info_from(self, other): """ - Copy the information of parameters from other program. + Copy the information of parameters from other program. Args: other(Program): Other program @@ -675,7 +876,7 @@ def default_startup_program(): """ Get default startup program. In startup program, Paddle will initialize parameters, initialize nccl handle, etc. - + Returns: Program: startup program """ @@ -685,7 +886,7 @@ def default_startup_program(): def default_main_program(): """ Get default main program. The main program is used for training or testing. - + Returns: Program: main program """ @@ -695,7 +896,7 @@ def default_main_program(): def switch_main_program(program): """ Switch the main program to a new program. 
- + Args: program(Program): The new main program @@ -710,7 +911,7 @@ def switch_main_program(program): def switch_startup_program(program): """ - Switch the startup program to a new program + Switch the startup program to a new program Args: program(Program): The new startup program @@ -727,15 +928,15 @@ def switch_startup_program(program): def program_guard(main_program, startup_program=None): """ Switch program with `with` statement - + Examples: >>> with program_guard(Program()): >>> data = fluid.layers.data(...) >>> hidden = fluid.layers.fc(...) - + Args: main_program(Program): New main program inside `with` statement - startup_program(Program): New startup program inside `with` statement. + startup_program(Program): New startup program inside `with` statement. None means do not change startup program. Returns: From 8dacb4050b49be79e830acb570f562790bdd8538 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 7 Dec 2017 14:19:52 +0800 Subject: [PATCH 223/275] install dmidecode in product docker image --- paddle/scripts/docker/build.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 0f889e6853..3c6ec6faba 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -175,7 +175,7 @@ EOF # run paddle version to install python packages first RUN apt-get update &&\ ${NCCL_DEPS}\ - apt-get install -y wget python-pip && pip install -U pip && \ + apt-get install -y wget python-pip dmidecode && pip install -U pip && \ pip install /*.whl; apt-get install -f -y && \ apt-get clean -y && \ rm -f /*.whl && \ From f291abfc53055c0233aefbb62d4e6e5fca69e2da Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 7 Dec 2017 15:35:03 +0800 Subject: [PATCH 224/275] Add HasCUDNN to detect if CUDNN is installed or not (#6349) * Add HasCUDNN to detect if CUDNN is installed or not * Fix CI --- paddle/platform/dynload/cudnn.cc | 18 +++++++++- paddle/platform/dynload/cudnn.h | 3 ++ 
paddle/platform/dynload/dynamic_loader.cc | 41 +++++++++++++---------- 3 files changed, 43 insertions(+), 19 deletions(-) diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index 761d9edd87..76ec82e108 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -12,7 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#include "paddle/platform/dynload/cudnn.h" +#include "paddle/platform/enforce.h" namespace paddle { namespace platform { @@ -41,6 +42,21 @@ CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); #endif +#ifdef PADDLE_USE_DSO +bool HasCUDNN() { + std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, &cudnn_dso_handle); + return cudnn_dso_handle != nullptr; +} + +void EnforceCUDNNLoaded(const char* fn_name) { + PADDLE_ENFORCE(cudnn_dso_handle != nullptr, + "Cannot load cudnn shared library. 
Cannot invoke method %s", + fn_name); +} +#else +bool HasCUDNN() { return true; } +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index 61caac5450..8c937b37d7 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -25,9 +25,11 @@ namespace dynload { extern std::once_flag cudnn_dso_flag; extern void* cudnn_dso_handle; +extern bool HasCUDNN(); #ifdef PADDLE_USE_DSO +extern void EnforceCUDNNLoaded(const char* fn_name); #define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ @@ -36,6 +38,7 @@ extern void* cudnn_dso_handle; std::call_once(cudnn_dso_flag, \ paddle::platform::dynload::GetCudnnDsoHandle, \ &cudnn_dso_handle); \ + EnforceCUDNNLoaded(#__name); \ void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc index 6feba42c0d..7a82d06a0a 100644 --- a/paddle/platform/dynload/dynamic_loader.cc +++ b/paddle/platform/dynload/dynamic_loader.cc @@ -78,12 +78,11 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, *dso_handle = dlopen(dso_path.c_str(), dynload_flags); if (nullptr == *dso_handle) { if (dso_path == "libcudnn.dylib") { - PADDLE_ENFORCE(true, - "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " - "For instance, sudo tar -xzf " - "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " - "chmod a+r /usr/local/cuda/include/cudnn.h " - "/usr/local/cuda/lib/libcudnn*"); + LOG(WARNING) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n " + "For instance, sudo tar -xzf " + "cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local \n sudo " + "chmod a+r /usr/local/cuda/include/cudnn.h " + "/usr/local/cuda/lib/libcudnn*"; } } } @@ -92,7 +91,8 @@ static inline void GetDsoHandleFromDefaultPath(std::string& dso_path, static inline void 
GetDsoHandleFromSearchPath(const std::string& search_root, const std::string& dso_name, - void** dso_handle) { + void** dso_handle, + bool throw_on_error = true) { int dynload_flags = RTLD_LAZY | RTLD_LOCAL; *dso_handle = nullptr; @@ -111,15 +111,19 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); } } - PADDLE_ENFORCE(nullptr != *dso_handle, - "Failed to find dynamic library: %s ( %s ) \n Please specify " - "its path correctly using following ways: \n Method. set " - "environment variable LD_LIBRARY_PATH on Linux or " - "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " - "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, " - "using the DYLD_LIBRARY_PATH is impossible unless System " - "Integrity Protection (SIP) is disabled.", - dlPath, dlerror()); + auto error_msg = + "Failed to find dynamic library: %s ( %s ) \n Please specify " + "its path correctly using following ways: \n Method. set " + "environment variable LD_LIBRARY_PATH on Linux or " + "DYLD_LIBRARY_PATH on Mac OS. \n For instance, issue command: " + "export LD_LIBRARY_PATH=... 
\n Note: After Mac OS 10.11, " + "using the DYLD_LIBRARY_PATH is impossible unless System " + "Integrity Protection (SIP) is disabled."; + if (throw_on_error) { + PADDLE_ENFORCE(nullptr != *dso_handle, error_msg, dlPath, dlerror()); + } else if (nullptr == *dso_handle) { + LOG(WARNING) << string::Sprintf(error_msg, dlPath, dlerror()); + } } void GetCublasDsoHandle(void** dso_handle) { @@ -132,9 +136,10 @@ void GetCublasDsoHandle(void** dso_handle) { void GetCudnnDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle, + false); #else - GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle, false); #endif } From 6b9567e0ac6bef93676dabc18a7b9b4463a95d40 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Thu, 7 Dec 2017 15:44:22 +0800 Subject: [PATCH 225/275] Remove DeviceContext::Finish --- paddle/platform/device_context.cc | 4 ---- paddle/platform/device_context.h | 5 ----- 2 files changed, 9 deletions(-) diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 7afcdfce93..ae4f0bf896 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -122,10 +122,6 @@ Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); -} - -void CUDADeviceContext::Finish() const { - Wait(); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 526d089e35..ef5f19214d 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -46,8 +46,6 @@ class DeviceContext { DeviceType* GetEigenDevice() const; virtual void Wait() const {} - - virtual void Finish() const {} }; class CPUDeviceContext : 
public DeviceContext { @@ -79,9 +77,6 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; - /*! \brief Check potential errors for the cuda kernel calls. */ - void Finish() const override; - /*! \brief Return place in the device context. */ Place GetPlace() const override; From 2f07e6cc2467b1d44b769804f2aff8854032bd51 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 7 Dec 2017 16:17:19 +0800 Subject: [PATCH 226/275] add graph in IntelOptimizedReadme --- benchmark/IntelOptimizedPaddle.md | 33 +++++++++---------------- benchmark/figs/googlenet-cpu-train.png | Bin 0 -> 18254 bytes benchmark/figs/resnet-cpu-train.png | Bin 0 -> 20243 bytes benchmark/figs/vgg-cpu-train.png | Bin 0 -> 18336 bytes 4 files changed, 11 insertions(+), 22 deletions(-) create mode 100644 benchmark/figs/googlenet-cpu-train.png create mode 100644 benchmark/figs/resnet-cpu-train.png create mode 100644 benchmark/figs/vgg-cpu-train.png diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 16c2390fd3..26930a7637 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -2,21 +2,17 @@ Machine: -- Server - - Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket -- Laptop - - DELL XPS15-9560-R1745: i7-7700HQ 8G 256GSSD - - i5 MacBook Pro (Retina, 13-inch, Early 2015) -- Desktop - - i7-6700k +- Server: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz, 2 Sockets, 20 Cores per socket +- Laptop: TBD System: CentOS release 6.3 (Final), Docker 1.12.1. 
-PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS) -- MKL-DNN tag v0.11 -- MKLML 2018.0.1.20171007 -- OpenBLAS v0.2.20 -(TODO: will rerun after 0.11.0) +PaddlePaddle: (TODO: will rerun after 0.11.0) +- paddlepaddle/paddle:latest (for MKLML and MKL-DNN) + - MKL-DNN tag v0.11 + - MKLML 2018.0.1.20171007 +- paddlepaddle/paddle:latest-openblas (for OpenBLAS) + - OpenBLAS v0.2.20 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively. @@ -35,9 +31,7 @@ Input image size - 3 * 224 * 224, Time: images/second | MKLML | 12.12 | 13.70 | 16.18 | | MKL-DNN | 28.46 | 29.83 | 30.44 | - -chart on batch size 128 -TBD + - ResNet-50 @@ -47,9 +41,7 @@ TBD | MKLML | 32.52 | 31.89 | 33.12 | | MKL-DNN | 81.69 | 82.35 | 84.08 | - -chart on batch size 128 -TBD + - GoogLeNet @@ -59,10 +51,7 @@ TBD | MKLML | 128.46| 137.89| 158.63 | | MKL-DNN     | 250.46| 264.83| 269.50 | -chart on batch size 128 -TBD + ### Laptop TBD -### Desktop -TBD diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..c3f67faf096fe9b45dd815f294b41679dc7c9e54 GIT binary patch literal 18254 zcmeHv2T)Vrw=NV+TARqyxBShx{b)#L2_Q3i$7sm!Y~cN%@EKbHFcTx3BA5Cn2ebrP{r90{ETc zj)sL73CUS+;=g0#4xDfj5;eT$jqAp6=u*Z)lqlFId^0!+z0Y?1;=?Cgd|Ewp9ya9A z3-hMN+D~88+36=fpv&W6VW2-tr^-TzNO=@W_D19a=Sqv1$4#$FK9`~L`SSAe`TDi~ zAC;R}E31^-d&N7$CWCtlb=vixKY!L4tF%o!(G>Xn>#3$dvJLj={V%n#uN(7vCnTEq zcYbUHNQO)>vHg7G!5I>$o~QV8iTlN};~MeX%TUmkPP#OIW?)Vj=k5mG$UVvJM-vL9 zJ27g(%;-Q$nYmu?=Yb*N|N4#%F^mr0Tj&%EB2Xz@RyyuTLX&+=^1;a-r3*CJ-IdNW z?u)&wV&Yb=v=2zgxJZtXE0K^+lbZ*J5d>S4p8tM8`{1<~AVCBv>Cso`Z;;@9XdRz~ z{At82Ci$y1@sraR9}r~7C0l^bezgcu$AQOdb2u}dZ=65sLn&MM7{P2F123)+w)A}T zr>9u*%YzYI{|tQ{82VI50W(pw!?q6y%E$k7@`a0(8U0Afi26u++Sl-$!)Y9Ta)MEHPufX zWNu>lUp+57)zNN98w*kAo#beNcOMADwH#UP0HaFJ~R#KX<{OMM$&NqjDHmqhW 
zW5v`F*z;a?&xctr@lCD+V3}@M4qVR5m;Yf^5>T=?@e?Qg9dpmitG}4iDp6>O>9l8N zb-JUbB<~oEEt~}B^4wh2MKI&L$No1DQ?c*zSy(M*G4afew~Km)r273gt*w`zVAD|N z@Y5HJdd#F})8Qv-@D!`Y+5EG9B7x0Fvx7|#VfI=p2UJ>$1kg1%e zi?E7V@s#~`2}F0WZhhdR9r7sU;`992pLB2Eq$io|rH&`{o>Khzru6=`BAR0eGq&$7 zS|-Q2-gR-K*#w-h%@G}u8oFaa-pBAJ+WtCMSi9iuCk)*Y z{m!eVomZV~fKJ1m8fqJs+)_M(R@2|C0ajiX<=p=fo@b<5`Aw^a7mxHAnBx<$)V&d8 z^(akH910jbc6ot6&syqp8X?{nyI$rII8YP3QyU!Mm|mLxr0mL=MD9z;NnU=YFm!e(8M@;zUx6ug%7?_r4ls zZ5w;jL_4bVYi&z6%;~&o{DD5$MsIGl`-?UqR|XB-dYhMn4|aB3h4muwwM~0VHj2C7 z?==Nj>n*9Ab(w3;!CXfLno`E&n^uz0a*i4+DW1wRXL>_tjTda5g$tI=NO@f1(M|T# z_CQs{Rjy+T7Hn;2uHVA%WF~ra(~U8LVk!reHWvI;S7MyB;957UIf4mH!FQXogEW`x z)p|s+dh!Pe<-d%qsx=s8{e4nPT($ypWu;-c&9gFv3j7}4*52$MKh9d+bJqpuf{F>c z&of)Xe!;a095Vyw|W`6+6%Eb1xeq-;BOliksjJAhPP!&#bzGr%T9e_7@HcTGrVoB*pRcb zMBt9JgmDM*`9iC3R%7&i^+PR+_jB(?i^sEZHEk)N{TBQrtKCiB*W~ZKWop!)s#oS) zr?A1!Unw_zIIcSLez~(J`JNz5D57&-2es-4M|;d#qcOLno#w#()0W=w4^jZs)tOn;k>+1BwT$ii(Ld(J2v@ z9hEpq-|MEyp78@Unp=kD)m`Zy(_F9v=QZlyf~D&ElegQ#)W1KpCsWBXOd4q^Fik_?16+C~@?6h7OU z`;3UzH+Nj*cXRKumzlUlmh-3)q0G55%E|^R@YJQ#>EP3z-f7xjXbNgG1HnED%~hB0 z<66fZ+x%2lO%=|ncS1QEUu-AXE02Ser`UtuKRf8ocRO22DP&FsA#1Jc0o=mwWH9C&OS z0O0n%GLoCNg37bBJU;xBrWVIWu3sp@pU}dT0AeWVjwevIK*eyr)z0nAKjUanFQ9fy z>@Phxjoj+aZ?cLhUPR`d^z7|-k_$t-I1V7%HzAE1XV@-vb;g|UXp3xq`z@!{b$X{a zHfT*jj8l<4S|<=@2J%>_Ms{mm8_#!1ua1$C=jdyWe>q*CS3Rw+GYA)1k1ne)=n`wl zHtEtqlzCRo2$qF2%S@aeOa#62U$yh#OX{6b*sR-06_ZCT6%Hooaq!4J2g2{2scFiA zsO0BUi;P#G8?Ze77y9Zo4k~&)G8fNXadr+}{Or5PBkJ;YiU(HvlmJOjTVcw2O6YrK z-N2^9LJ_IcX>MBg`1(g|H4eNPCfetTWj=0k>TDVZ;W4Zo9tPtS3~&>CFC=rWGnHVN zFuBk$1m0j3>z2xAFMBjUqWg93o_LvgmDrYWi?`4OvSuL>HFFR6X=?0f#Op0&!m!GL zzNjSyk5`LIYK1{730XH(O8L~?({}l0SlF#nkTf|MJ4;xCv+3YfoOiKIjWlD z2ilfx?P-yjCUYK6Y7xzp3FBLj%*Zzsy*P2OzsX5kMlh%HEGIxLrH4xheTCaMN^(%R z5gH|gEft=jUPE*aSH5)(U>tX6m=6Xggz0uxgg7@b^H{wTxI3V_Vv-# zGwW8IH>_rA{c2k5?u5eoTZPfK>-{6bEZ=*!E*JD8*;=1k^n0auiFqgF`@DviWf-)M z#S&PBpxuL}kGE}AdH`x1zFQUzpEEMtKHGz#6O-NM<8Mpscactrk1Y-E zTTRubTZn%wQqk*$8k0o1y^&9)UI*7nn-E=d3 
zK&YbKS?^gn$>(D1*2}CWTrPV0#hR7;d={-->+uExJ0n4}y$b-=d>ktqFW6hw2bkz? zp9KeB?_X!gk^P}R3Yq9xIA%DXCOgUS>OpJ4qW?4J0hDu{{&wrUS5QYWhR|nV9QgB= z$hW$v1;Fdr8O}`8OI)(tE*)Rn0^M~H-n|qPs1u3sjC8P{67g9zdShvuXOse7(%-{D zzZDBF=}em9kqeE=+=0YM&(`q9;cj5D;kM>x8@yeHWN(Ch&p0IwP6~@(qfiu=?i*c7 zU$BQCT*=ycpLgP0e0X_aALf!jA>ow>D#E5=!JboNYGF&44FCq{ae`^t7%mn!x2DZ#e{7f_1@{4lqy0Uh%~ z_c|O{?+47AcVpW)h4mv;9qZl9wb58>?7pAN&PHQp*_l#>>r6(X*J|5E=^h9p_7zw6VI^oSv@f9IvNK?L^jeI`L(M;tGS{Yr+KRFUF0WueHgsx z`lEs|59Vdpz8CaGE$kWFE(Tf`v$!bNu35|-ht<8B=wxNzS`fhB6zuJjFEcx`OUUasSrWWcq< z=s$<`oDsQKTTvLX`!QoZdZ^;oy+E(3m>08_{VGf!FfD`)iZ;&OFQ+#7fAmBNVq#<< zNROlqw?3z&k*KRnRGibFuLkwk0!BD79V*OE=y6QerWmPhJeXBi)QWkf&7u{coStd7 z^wut2w}8R^R^^Ygj4OO@sI1+kiA+v^uLfwMvPb<8O54(Rv1{J&pr`&UC|v5aOM%rq zhd)S#x+o^QWqgP|z$*~36z@CmGD2;SFVCu8N_~uLPkr)hR!#qKMW}X*X(QMzB~|17 zQh|rcu!7Kzy|-#%p<5}SF9Gf~Fk@eEv-e1xd+_*+`oa47qN)Lrnl0;!o>db^V|kF2 zVY2A1nOgjExWZb_%;aq6uy1IxU+84PKD$A)9Eu~g~{8(iY$rux7{*n z@q=Or~s`?L}bAb?f!hDXX^XwK)4@Yhs^c2EOM##{eEBoDwrv%33!_s~T_*)Ei~55(=*d*?72B0HrD*he8FL+w6+W{yi=YMyHZ@&H@dxN1G) zRf_N%a%uuQcAlk#B;Ex+yTE54n~HS{eig5dZyjmr#9d}=knpu?4o8Qp&Pq#qEo%eN zZCAZs^YW;DNQC1PH(_9tT`{TZcH#FPpE(uS`F;6f;B&Mo#{oNjAUq5(9&PwTfOvQE zG>1v}@6EQ&|HECp=EnN~ONQRq&*FD>ie&y|B4!^y_$jmw(Q_@UVMV8>r*{qx4!)b& z*w`@O=2d)kFxOvrW+!FCN9D4f*7)F~i`m4;N&Eq5nt>BJ)PKN4J0%6>wKce){)bcwHX zUu2!}9WoDE5qM%uo$*fwC7X@31(9NfpH{EM(pBMH{$m%Zgi4>@USX?#1~$UkM>W-A zwSnE4@~<@B{d|3tP9k9)DLF=9K-LK2z=DMZTKnjQI#=Z29g!uWL5-Dv(nRvKmta^s zJ8+hMqT1Y~KRaP17|a?Nf7i+S%!8uizvTY%o(y`Q(9&4~T`vyrrV_*cI3IK7{+Hvi zeOMqxa>i+~!UO(}E%(wibGqq%Q)19S>N#2TvbZPcWEAu8|O>jT+MWwr^6?m4eTRA zrj3h>iRD$Jz1z7?-H*aZ8lB~zm99S1Fct`HMla9lYvLh2(JHgAkRWR?H}pbv?m|z| zkG5MtzUjRD{AoZ6o!8}Iy%-VyR&>9B_@iovK5u{Zn3RL8z|Y#l%Bk6&IdLLRrfnJ` zENEol1?`$MIu8hq6wGLgi<+`rBq8PGKjR+s^xJGuh&9c^yWps=hR0_LZ5^!n`%PZ( zqqUz@Y(_1q{f6MciIARGnB5)Bu8gQl<}SI%?8)msY=R|Jzv%_A;!;CetvyYPg*`qo^bsbNsXxZbOL?40>< zduTzx&$=Xv8>;lgw%!^f_`zHHAIy=+EQrO3~&M6sT~>Tx0gYuH9StRsdc&6e*kGFs+(7 zqGf}HgoS#BDa}qlt&qMO4wbB1d*!o1^A%WFb8opU8(h@-MFJt`Nwq=W^zo_1WXfXU 
z)hEWXG9scMi?E2f3;>C_&Zq|JDnSy7LnaS#0(iZEW+{~h9Qdv#1B!@-hXo0}XDfS3 zSczI1E)Z$u(J4EO64@t!+#1zI*o9p)!`6U$%#ho|KAswk%g{i1c(>=9s8?p8C^ms$eW-$iWp{@ z#+wQ46o7?Z-z9r#!`_+}KMzPTJz^>jm4IL*O^R?@`wacSNdc3-=R*>*?TfEZl%V1I z>2jpAS7R+_izabGcKOODz%t^uE9lVjM$yEeVU5MzzIjqawk|Q+7dNQuQQ?A`9yiy| zy1O$t>Lc8kJ778hk9?*D(v}fqgmg1V*dkfA6>Mo8N86FEQsjvjdki~~V@e9NfFP3k z@8OYgPZ~}t7o1xf$kPSQf7Y7?HpKK#r_*$G<>`So;!t@ZTWmdMU9ul>goThMf>^~A5*5=`SePv0jG9hf zDp;%|2j)w#Yc4qt_7v1OR0jh!kA?VT*vSH}o|WfvCzRB%4pjL2zvNw=L)k|`oa^jg z2?U4|=PJhn98&!|RUwGpN?t9;uXpSz9eVWbRqFuaT!!!aJCEijL)=H*J2In4WJH0MH77&I;5lx2Z^=6Jx zf|jaAJ?pOzqa!k1OKL=xKUTGAy(okSAW>)@eN)ix= z=ox8{?Q##W$fnNYvFIBl>?u*E40v8{3z_IkCJg-ySD@gX4p72#HSR;_RWkxlth=G6 z8{73?#S!J{6?Dfcs|iXKBTtIhb?gpICG#l;Y%E@+!Cw)5l$>$1LroU%Tv$7T8SwNF zF6w@{H@_&uXl>vuEl1=4ZYX{ZIm22z;Msk%=%YE& z&EbF;u$Q}SNv1|jA9o5_r@0*cE-&Ab#)2AP%k9A8#xK z?|Ve2>e&uIAgajNlQr^}iZliGhB1JO3RP&-FA!B!96=B?WA}XcTIdi4OoGCi09wE` zkFXzfh$>OU{fozg8X`#)$+YMk{d7eKA46B(z}L@G%o8FG`&UNrQUEN~?)!AOj!2M% zR(O)FrtS%f>cuH$^urJyAvJ8z>vKeUEF^Jc4FSj|H=+8FY;-Q>>ZbJ?c4ly8v(ghI z#zR1||7pg6^8c8O5`zMp5i=`=A0cPtw07T8iG2$8P+ZAy7i(DS%L^p#M@*@z{6^Ox zr#^dhRy@nan>4pmFcY-YRNr#{l9ZcUKG;G8Bd7TJuH~g~>eMMguHfcf;b#Kkny^&w zek@G#zJI#z##{lPHX>u9=kB$#TmQ;YM$wKoaZ2Tc%5Rqv!x;tnKExJ@m}nrFaV6!B z;>4A103WdC;=6i!Aw~%-a_`h5N_llI2kfqK9w(jAqx}AO@oE`X#`GF(b|;nA>E@_^ zlmRd)vqmMXnTp1hgRpWc!f(YW)|Wjs!i@OJ^>P?uIe`UEMsKh!@}v|zZYC3IBvN6v}uit3d>*RGUgdWcPIOlSMgI70oa zSFdJUzh5#tf9*QgFBj8_qC@qO0~UtRXb#3JJppp20;s~hrXkt2jPF#_P1Pd`rkucX zhIbM@;38i!3IwU^e1%Jg&>G%=-@Wl1m@9qCK1C=ZCd;&1Pvmi?T%z0>bJGO+SXFQ* zd5Se$tN=lWuCKpUTp?=h+0E0l0t+K_P_u>u7#fhcFqiUxnYZZyROe}MtXW4u`MpC$T9%s>y~;2%y^Q2A%VE4CH>bKi_$+y8GX3cs-{KRGRiSvh>0tTK#I;T8Y2{RSI3021k`Ba0|>UJY6W@X)AuGYU= zgi+j|FHvl&-p-di!Ct7Z1vVOhe;yx)0u@7|dRYpA<=$K4!IE*tZ@*KqXAPYZ3spQX<7O+r~RVYd&^hvpcpfFr>^{9?^%0ap9<98+^_D^u- zX1!5}rgyQxSaXZwp0_G-od@isBSf!c)$Ow~dx=bbby8)T11OBSg%ur&bDQG?<2rTu z=OdDL7YZ4DIy}L3`8xbhf!NLZ>V^4!lXtinsA5^NQVp!e;0?jW@LZlu;!rejl7D>|K?m?%`gt%qC@xC{uB{2 
zMZgCKCq_LzP}5Zv9CZ!MVhA%6Ja2{ybqd0_MGd>54N?HYa^wE}x75(pwBefy+r67& zKg>H=G7WbYB?arHuwzzsT=$)uC27#{Mg4D5INl9!x@}3yNHA>sohzF*#ZPm#EH8zJ z0pV!v-q1~P=pn5#T9G}MsYb5)mZae*vHAb_mTiZeYeri zxpWM{grmZ@P~-}PBr^#i%=&xu%-~=Vy;EM3CvRadLz&T$vOmdn02{o8M`u61J3nIB ziPz6901V88i&|AOA_a`kj4sHTxOtVSboJ&7sVLj*<2?jF@0+{w(BpH@i+#0$q|Mzs zG9|2s?>+Mi(H#UuuW!~a-01fe>A_(!SSkLRxmTyxedMIvW|4i3!tG4#S?aCK=;xX5 z#Rq{@LNRw{0g0&jOr4DzT^VcUNWjS!9*akI)>DhY&ur(c!eC#TKh1r!zRJF{WI4g; zeV>Ef7F)As*Tx_X%vQ#=~u3#L1#hE%%*{^wivLA+> z+}4)wo@s8FvAqnHSD$J20WUqp^kp#b-O*_Ud@A_{n}p7|c{^d8tDQVG4c*h7*3z-I z6VF+Ewsnk7Nh|0nGU?DT z?PxQ1ZD5wc!3i8_O8y7qm5X(G!!OC*+N)~K4YIC=WcnZo&gf51dwZ96jiR?}nTvWR z{XdKCl(w(T?OjS`|Gd)b4Dm(~-oi18*2d`T-Ryx;n#FN7GP@y^<^dCNk}XZEm!x>B z5>vWWhKEaaKChM9BFtoS!Wl6i1r(aB3+R^$w)=yZf?P6T(?#xk`y)j}X>MW1GQ>^` z)dBN(r|s6h`hv?Fs_$y|+B$UTeVpVKUa{DBO)dhVCAR^a$}qvz2US{$vC zeb!RjFuEF(8k_7ft10&CvPGA%)~lhF6~CV;=D3bo3jjpt?4PQUXWIgBT_Brzm?`T| za|OfOO&_X=ip-Jr1yX!Yh4wHb0T;h%gzw!qGQ&Hu@i$~0b2GbJKPe?LFzA7jr>_~i z%imiQ;X|p7<&av`Rqo%LZ8k^MttiG`G_mj>oic>hu%JA4h?5cQJb1AfpulO1?dTxE zn2GdnB{~tPRP{W~UP?1aSrQ!yKP$x7JH-Ma@q|d3~Ae5jTp~ z1~JaQ-lr(<39wx}7adqV$|TmhUFf>Y-QjyM7ParsDKG+ycg`1~-FK>bQE*980@DHy zqaI4`EguIAm{!Af&Vw27Xn?+Z%6923Hmt-dtY5^k6d>tH>hwnoJ7CL^1f6-n^p}k+ zSA=Qz1*(21-kZ!aUJDK{xX7TrSZI_y9(nHJSXGQT)X4XdV6^g~zC+5VgBjn(kcFz9 zn0iRhwmv4Y9xnsy@lu~SlBV(pcUj}DeR);WcjIv6w<{ zolQ;>oBI`4z{FP7w5WUgG)#&K@q!;y%*Xg#+w*?It}0i!t^P9L1MiJ(zuM?MX9tj3 zfpfWi>P%S`1QNio)3R+2g1Iys2G!e#{vJs7UH8{hXMYV8QYnMIM0Cq5lP0Kuu<^cz zwdx#BnkuK3L<^Oq)%1Hg4jaE=U>bj7)Re2IDu#tnO#6e~Vt+<@f39!7%G3>Tt`1g0 zJy5R-q&TfI6#}=PbLmc>}n)QI|a#`&$f#nG@HB9 zFmppYJ$4u$;iOIj2~B_~mrU%1`gy0z-DZEv$%sl#`UD0Bdc0i2tMWYtBc8dhY<6{Z z?PU?W?byFhN_5vcE*(R5oL1>m@9*cj7yo_@^etW&`VSzayBhboj`!GK4;ByjL0 zC1Zzez7r{4m40}BHle|#Uh2}#R620lLg;e@LCz&3U`QNSB4z>u0*L&j6&G1Yya}2eKbK{lPz?vEW{s=Q`=XZef`0GzyzzIoF7A(Ah6$mr+RNe0W zK3qhwaOhcih+#}Oeq=248L~Grp?5HE(VCk8Vby78%1{_`0^>1^RNm*U@fh6qh`mW6 zEw^xH^hi;%Hc0gFFpUC8Z1xVr;yn#e&m78v7MH`7x@8=iOE@z$bReu~(AnA4mURm~ 
z(32O+d6xB=ooGUQHK3Z%j{QI~fMY;9CR*b=ZJuj!Wd6X!7BqNEQ^E+a;=5M7>qh4# zCXwi$8KN81+}u9h7Qhl8*4 z`gkK^0`z%sC7$PqW5^G34QiQLFut>ojadbJWga|#s09s#*>8zORFcv>>@BEM$WG{YmWA>^Z5IY<5 zWB$Drhlwj2)4xcW>HO3gS@IVvHYDe0b=Ce&N}`)v{U=rwhyGUfA$R;u%Kx2MJ0lLQ zt*uQdDJfCUDg-vE&IbU~=IwdDr)K-(1`a*YcB$+`Giw#!35EhNUN-C9KYtnot(Y$s z@88?5E&H@zP}BYobUqd#Ck%gkjuY#l=ZuCC;(Ntd2$dh#*Vnx!)=6Al{(*Kaz}SUE zzXcp=@Ji1i4#HW*WIsDnF(ynGGPWSL=Gr(OxRo6zN_A*f_vqEdx!*&qQy&5xpyN(y zW9p>T%b%wXc%z2_LKf|N!J!}`yA9bWSoeL~V zCPp{%Y}F_odVbO0QU0s&+gsoEk8HpO56zP4noq=XT7e+%GKb7^#~p-hs5N?;sU6PoN3!P0?Aq5Df&o{Ptf-G3>m<@}&_0^71htjcdCasO75 zHfVbjZqbjV_5Pe9`dZ9s7xJ*+&w$Bka}}6-(%>`jG&IPEyxnKcGtNqpAT=1Ni@fDrF zx!_?){F{`8sd;Hmf2Dh;V#9x;%YR1qL<<@Hjqh}flH%|EO-g}pMGk3yQ?^T5aU%Aw ztV|5##(z;ZwBHyxW%V~H4sW_1|1T#8+%_Eo5Eo&@%=N?Vyc- zW6cN@E*kvGC|dx#n+JzZaR`)Wv)u_)zr()5udr`dT=6StH~gQBH9KkC!u{WX)FBrhBKHYdaIW_6VDCeE$c1)HK!NHIK<3BKZ0!JrX!1Wstxg>-T!=^>XaCtQ zL1ZE#-_re}lCJ0!V^uJqG_6;aYYc2s!wNCx1e_K<^2CCj6cu;>r zv^-gw46z=Hc!#kkQ^^a@OWq9MDr8-nSQ0llepHvk^&(3Nm^M@YLIBpmFR6L|$J5^x zXC$Oo=w^X~4YI_Vqi#q-n1NO5*gv--PXXyjcCs|sQQ6M}5*!uHKVLrru63Ly)e-+y zT|&H0hz~pY=iAr+!%K{RDtub6`{ev{*YPswcQH>g3|kPh;}xASflK5RzWwO62H8|$ z%THuf+(k7j6|x>eo%b7K+g1YOm3+m#{DT71doJow;GnO=)ynwl{yiz9g<4RC*u+6{ z;r`+|#q!1^*R8d)Z0L{1M(DfRWhPHPxLFxX!-it|o^03#UvXJrD2`e__(WY;oU1M4 zy?b!p7D6LVx%XiuZK(5x>+_qJq3o?%x>(a@OHoDvCZ3OLz?C&7l>ma=d@iq#aX;pq zAgY^DfjvC%XL004se)vX04}GRU}$y8xK~3q{;~3luft|Oi!>%ziI>vT3Iz^gmzqQ5W)4uL#nsmU-*XTq}6GKUe z?(9Q*nsjt*qJ3T}o6({r6eR29+z{)B=0In_?< zU2IN56R*lv>x^{JiQVK@32GX;;CuZCpMY`A#&yMv;@};& z#B=!Rop28B>2S5@I}vN2FiNdWaqT`BFuuC9FH`9^AxTL7v2~{8#M8DqpW`EY>Eja_>Mw1t3se}CB}TRPg$)eT z;RnN0RSG`GW^eai1l_n`TUD*bW0$>quGX+3Pa_Af9LOV8j0szdCp=lAimsm4_Hsw@ zJ8=S41+jjmpOrM6CxZ7t+I=s+EimZT6J#Pke2}})JAbc{K9Pd}xgtL1hb2&2VUyJ; zh6-3!*iB*y{joWPqOw$T0>$+8HlxeRM&n?dOJ_vNe2d(pJG2~x~7rwePbp#m_CN!ulkM|ZM(Bl(3h-+Rd%Ug%m|vz|C;rDMdfV; zome|q4ADnoS<`g6Mf7yzq6nArLYH!$y^F(L@U#3b{uh9;K``~4MfNIr@llk$CVm+1 zC*r8sLuc{vtO#-4cS5HUP>fWRYE{`dz`^5B& 
z5Q~HRoPUP_iW@c2BSxV?k{34ChWm{3&|t@CGsm@#Z!GChEs79%DJj)Rg4P8K#jde2 zW;A_Mxt7n82et8WseKot=tj{B5EPoePoeTkaWJAq&1;bMvD*7;>(O^1>?$;5+w~7SFe^YNr75!^?hSjlZ&Z7tKh}@^5 zbNz_>o@W5px)v$3_l_Q%KTKUaQvy|Dhm`nb5h%X9ZAP+mR2}~1GLZd$r*A}iREB$- zA2@w@Me3+{^dOu_zmL-%9#QI5S>UW`ltF3ds3iR=0vPo2z&WztSGix00=f1N?8G|8{yvE6BZm5yZ2?w9M%r@&C4FeBFogZ(o7KoOr+{W^{b|NrqTZF_So&W zpQA4}ZhO&o!nSKM(K%b%)H%1h3OGg)=U%B`f}pah-vCAH!KkcLI|F9HK^EI^1?Twt z8Il>*AFYcTDprJHxRM(d$jP1C=)Gp3uYNo2M>TFk)E4jZSS$SkY-D+q!i5{M zF_{FzdK+tqhDv#Cy&Ae?R!*RNMhc1?_lDRaxgSaCy5NLrOE1E9o3uTnQ*P(@oXtS- zDdU8EKXKrB?E{k2vbu-a#NOfY z>4kI{wkCNnflw*4BD0oO#Y)+N2_;n8Z)0xCRI*AyozJN_Z8$!)y>OWn04}*fZp+8Hl|x=Y`9@RxV^GBh1?1&1!_jSpz8>uP+>Udv!f*9XSiy z{Ie0lOxJ?rh(9 z>wsb(%Gh(L*_04gjIM$63@z2Lu8+JBDIiOr@Vd|sI~OI-OzYyI`vADJIX$@moMAgx zmilP`_r*FpzS7A$pb3UorhRCE$C7(AEsRz4T5OhhA6}y~{=lv7XZZ;yG@;KQk~gh> zyrBXDx$m*C5Zfu>QnrWM0dDcxPukW3=k9zy;Kjf(eZ@$mg#=XUG*HxeBF^0#7^Rn6 zV~>f#6$DLt#wos%cfvVy&F553RBCXsfsWo2;Pxh-q1bvcQr;;c=HeDzMZDhC?;t3o z`{}g-dK1SRosClOplwNUQ%tg=Na0QyDbJTkw&csu9DW(#%Je|D;#1mm$}W2)39SH^y2XTJ>R;>%I7G0lvws?7aNn+gz^ z3$CcIU>;zZK{_KM=^WNK7f=gD7wZ6LJn5IWh}4vt0DSo2WuM^D;#r30$L~s4Jd>`k z`Mu&Jo+TlBe35Hkt6_-V(hD)He>cDKh#aYv9-wf?9^PO$BE!^2K<R`zVh(Jqc9G zvvO$|(;Y>UYd|FV7QtP1wCuJa0TlXwMHjRnXw&{WTC6hR40R6)>(O6Y(N{Yz*0j6w zNdCUW1B3U)lR&BP(B1QEl*fK98IkV({_8JCf3(H&TR35?5bYFOfM28%`PaAa;7dk5 lnuqV?{%77{~KUcujl{( literal 0 HcmV?d00001 diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..b96ecd5ff940c0d000613b1ed1f11fb16796cf47 GIT binary patch literal 20243 zcmc({XIN9+)-DX#3kdkAfYL-zq)7<~NE4}{XaJEGlq!jUQX>e6*boq@p`${Gp+%_z z0W27rfP@n1YG|Q{D*c-Q<=O6U@ArMrKIdHLhh7(1Yt1$0DEGMMm}9Qsiw0WEd%5;9 zFfcIdXkUOaFfhRx7#NYecY8mj?yb0g8a*cj* z&rNMJcLoM_5BfjGUI$(j1B1aUoeOHlDEN5tXo#%h<@wb0I{b{QOwE~BL3}B!&eaD` zkDz|OE{tKjaN%oLgyyCDIm0)KWu82FvMV~a<0VD&-1DE0cT}Hek;Xl);wP8SDJ!RBYgS+0HE-a|P)7I{;?T)2rs_-D7}^pEUE#5qSh;xA|LWbHUl_Pl 
zw_Rsoe$IH3`OXThS|aJ|Oei`|T$%MQ!*)If#@(t6O#a)mD?gwX-(XJA|Jt}Pb&r9# zPwn?#41qzruM$|88k~MT0q!C0WB$)CzZ0Pdy2i!u_EH|4-5T<)1S8FKq_E$*moiX~ zCT;!xPMR^6i*4I)X<5%cW+0C8eM#E6DJq!5Rl=98$^36z-mmp7d~G;fTux37ZM_&{ z%_%O+bWimFW2or?rRnoPGq3pgj&$Nz>hOd+Ycr{MZGCqBse>xCgNDRW{MraU_i#<$ zYEPe%ek+um&g}Sct$Rp$p;fIV)zELm2+g}rm7 z?^_$^FI?$JUG2cm)ZboT!mlg%8kAhoPUu@-?Nd1+Pi*O5{c%Vm|mNVcBE z`%q?;U?b)Am8SKTbd^O;>P(BuYK!6A&qo129}T_sr^xwJsIDaMMZ$@x)b+(w!_Knu z@^XX>uitEQlxEfajMSu*T&`BTm}-=M`&H=ok)CTV&m#AIjbxk7_UHOY+F{gk?%X+# zWb=`NPuVrM$vfSUgf+T&$Auh3OhZjgrBw=PV??Na)q(Y?1C@6JY3A+3BNbB*uwLu* z=_oVmr});QRzF#{S-$$^h5DC6RyzFJ!RNeP!wncAi@)v8uG=+_>(Q80GA#v%t;LpIgTUum-1am!t$c1e*EXP7Iio1C{xf|2XSCj$Np9bQ zUmv+@Zf(s!Lf=TuG~r5@;n~LNO9yEnViNQ3ALGHs87r!D@EEHyK#9=vi@+F5SYG8B zws)uovO~vXcOsO%uBWJxW6a7VOl>EJJeK1%P}TbFTb=Qt=&D@l%T)FVw>H~6g30oG zR9j>nHnADC`Zav@t5Lx9W*uHkiTl2j-`jkQq`h-yn(>?x<-*Y&63XuM@!Q}b&ElLA zwNnj!D-DK6q?X_KqV23d)14B&HfOy)XVMAmrBZ%}?CnR+4~|*~?y>IIZe086{N$B! ztF;7LWcl?xZ(ZgFvi~?4UiB({P+y#=h?nwSgT`b-xatz`c} zt@FaXd97Bb#+tZgbYMYHLD=eWsAXyOD1C&K>DAuJ)!qT1yYN&`{-I}nn(Zg*8R9HR zF|yApp+30q3BRbf{zKmWC$la&bs4KT9UWQAPRPAtOE|$|k&$afVKurIXU8 z5+yv3S!8m?9bYES3LKm3JnmFEXHhw)V>&OOptGR;1m5l-YJdAuT|Swmq&J=XC|b+M zz)7p4D|7DIj81vFf63SWJDu3{9t~^}k57)?a!RWy$JcPUoIe%qPj%vbHByl-OjWEi zvI$t|@lGvz+&_=1m7m*b+W65?oQVTp(utJp6r>1BLrAvd)lM zU*w(cv_C1a!nBZsN3m5HER+}9B?0aEU%;DQ|yfPC>+DM*?rSdkwRsU zd(OzhOE`N4ixMvH>`VIotmt*Bd5*5KeVp_pRF1KZJfdoI=rTL@4oL8p_~#OLzuGZs3tG3k4~s=JMr^K)DPc zC0;qv3LTFG3ake1l*GA9->zJ#uS^<`Ld?2Z2((6*&onBYJVz+-GSO`0K3ex`D5v@(UIw$euGMEN;L8n1qq_j5N|q+&Vm(hFXnOBrEm0furl zgQ4i?lSep8uC9IOUHd-s>C*fw>eZ=T0aLpq8U-a?Iw*oc)seT~9N>R-$xP}ph9d9O z#vu`G*s1Y$r8;1xx+S@E|9KnMfK_t9YDszjq{@Q3%7QU3l7$?KPDe9pEoAG&*PY-V z(0EUu2J@o%W0@$jonVcn^_=BZ<(b!A@V|jEwlkC6=PXyUz|kj~5>( zK$uVpGcGq?l7b^f29n&2o;#loK(D>bFkiCCT}`j_UoH+<9%xz{Yf4#;E+~nOY8Qd* z^HE|w{NY`KOmZ_Dvkq@ec-1TFr~SD1&Gzc@*jN$CnnD3bf}Lt)>El>Zcw>8==(AX{ zq2(gi<)R~0kD`k*IntJ-N5+ZNLhUf8I@q)}{PuHAmCU|^ib#XES(>Dysk4kc!dhKA 
z4$W=}x~`f1-JM#npk!%yX_#HT=hTUhy#;OWQ2JdS65FhOGcRQ?mf6zT7pO`?(ZnYhfn>22!>L}t+<_pZ=0fV)yZ)9GzdT*hWETMTQj z#5{F{z-=xfRG_}9?V+Xi(g~hi-$`>kbS123&?`U2!^OcdF>?gv?e{ zM@NR`9h)l5Q?GWtFtT~y#g^E_7qRZp66%d8seCz4CrgA&zfW8i-VNG>8%LP3q+7Nb zm!7~&k5U_QiG0v{_65>MRfT=#zU6#99NW*pZ41=B{{pZ*F*~-ya91lPo>$TKnfRgF zf+dpgdpDqN2mRCW(r_P|c}O)J6}!`!dzh3r=5~RlDB(gamY_?_^th8nGw+nn@Xf6F zezf9yTXyn5z5~Hc8dc&Y%!%*jOHFjHC|#uhz$+rT9_5`a^`h1IIvXQy;f5wi+=0A zn&W%s#oeAgR|#HPcRugQrs`PVzAC{5IdW>5NmtS$@5T#mtF_9a66Nx%BM$Z@wx5}& zkolfI%4Ojnx==cV_Uu=|BIC5kGFqo$1I)JZSoHk2&>IAgc&*s7aTtYkx2Q_`0}7*A zXp*IM6mP7EDfe_TxAKZI<11@@bpUofG1}#8KWgcoUHLU_xt~LyD-)8sKZhq-@cMt5 z383?ckX=xO-mN96ZwZ{_$X++OlQ~2YZ++u-4t%G`s3r9b+`W)&Q&7?&)DwRENSw!k z51mm`g2!xzJ#2Hj12bACOE}UA9x8?t$J6Q$LZy{T;t=8qn9{Elw_>q@5!>}r-t1Xp z3Q6dD_hK=ZgcTyf{zJFhw;JN8cff*o{_==hy)Q;qg!w)+p8*+U;Y1XH?8z}7H$!ww zQDWx#t#qTz96N^6Q7fbmK4lYK=zA%s}AMUk>IO6NV9Q z$5Xic>gtsJOr8XQk@Jxa)W~2Sd)%dlN)Amwlc0j+b2gO&S=lecOtUUSdc_h)eZ$Z$ z{Y43%Tp5|anficas!s#*C>-q1zEAM%; z_fQrs*xE;4kneU+W0OuS*tuuXNQs%62A47&c%;~f$QlQy1Biy%;T>oF z_Pi2buW@kmlhSECy{%+ykIt^W@hQcId#EiBsSe#3j&#d%ZGXIWnc*64@qwd{voi!vU4xa#)>appxDfTj zgz!<^G(V^aGZJaZ+68^#xO z9dp{}qcRRKT@NVQ7CB<$1U~z=AZ67>ojPoBxB`-Pa5X;c7G-fh5D|| z@}D^R$gMF3laDWiW)=E{dMqDA*%Nqpc_pUvTN&pM1WVuGe-6DwO{^78Y<&O040gkb zxMOHRHotdj9D9!9!s85R=PU5jabHM&o30i|stqgnTSfJzn^eZCx5Wl41_O|g!h za$nw|L}`gnoE^wW8AvS@^_wG_jd~CIylF)ZwqsmacMCn27M}igf1)SqS^xSHaV0;1 zGp}(t)VygnW;YeCBUzpIIV0c1*RQsC;zeiH$~W=X^;FLQ)A}1U@xY72LV)7xf34Zh zxVx40N@rq*ubCa^0G+ZTpCl-RP2}szT?;pUL2`S2$E1lTH2`oQ-%j(ZmB+7#yC>uH z+75~sxfh)#A{)H~5WDr%`HJxdm2r3KMFcZ&DE3!eUz!5eX4D@_)W5UIB`H}TebXLh zSH*0f8;}TCtj!*r_p{n_kD3)sOQ_g`+giTovx0RH30AddqcXvI_uVW=>n(EphklnU zt{w(Lo0&g3%fDH?V2w**AR?E9(zmJspU*Q;AW$gV$=?-&mVR+qiLb7%4q5rS4SC441#`RAuC{MZAftZGrJCSQ$i9yqqpM);4aBDy19n%fP`Ntj*-tmQ^uKpuMw7s8W*|+Ww;PIYx+S ziIOE%v^{}VCnji4!90X6Yr9C8oeNvrZ5fKb&rdwv^HEUR5>W&@n89yLSUMWc&c_h= zX7^Wn4n=i7&C|(ZSEQkl;#Ng(q#3*_KVLcVcu;hEb|cA57+Fx%h;mB(Vy|@#-B^(P z0Mq`l0cCauEHu0p-yELkDt1kse@=5}*-71hccrgr)OWIcp@2Fqow}&^ra|TBx1b}2 
zpVM#m-X2OmQk`xo#2_*kj zM=K*g?EZEKNAwkH=sS|A^Te$cwBWIsFylun}~JTnKOSbJ&0lZyue*rXSkT4DeLPX63I;`hgMf^ z?cnr~fnw6Edcx`WvqUrfg_`L3!x_Hj{Y3t@{05}8#;x|%Gy2S`HoNsy0a~aDBtxpY z-Reo~dSSFYO@y<&V?C_NtSiuGk^QxF7PVK@_j3ilCgJ3A)fJzE*W}3P`MZnlk)ZAw zm;<20qBa7bsOnN{PxXE01}spR&x~|cZY(h{HR7S5VIaX-u@fTRNp;fAl5$;!2AY0hQQ;X%*n>K z=QlXry$kA|=mT21zp;XWNdti7@#E}Uk1+BwaY*n!jb_>c#V0^+jSDA!LA9ksApU2o zA^fF(qc%6ilXw|p@az32nJ#5O8-0lfh@+u7y2-!ZX#k!!;9C!ne)j; z3|Ds>!kybNuI+p`2eu#Ih07Ojuq;Xckj0CmPSs7af@<>I&!4CWUf<^)6OSG}s&8m$ zm}%)%=i^m?(@poHbjoMxcBrcegTIB#Keh5 z(rjU2p^2?#(x!3NHXN-k_{nU~7hF?k9@z}9tz=8cHFEbQE5KTO$zm_>pfQ>dxYG_7 zSJ%`qG>7=2(&}U3{D2UKck&eQ5T)zx?v>paMmQ-0aBNis){=Hz`_3bBbps0H`__Yc zX#pkHfO^w9+!pGNKG=a964II#V1L{yEe#hqZL=149HOLBPKE%nN>_kc(jyX5f{mQBU5L;HZ)&i;EHo^p@ooo}tECUQO{iya zH_#I~gKU|$p(nH3%WX=qmgTn7xpqe=_6y#@d-Cfa&nsC|vxI(RHpOf4sbYCv9D4jd zL^VA{5)RMNmKP4x)=zyS*d0pd{C*K@3&soo{@Ls)KP%1=tf`#uI6DTn&T^XB5*oi| zMzsYSD47MZO`Hy-zYcI?px$yVznAq@D2-xO^3^PJ&>xZkqmtYS? z^FnnyweK2~vmNVrcw%GdXOVHrue9|I4@ek+{LE?LnWxZ<8&0rB&RA+{~Flnb13f@F24bKkCm1!PdzC=)tF)cxl-_`H>4)Wi3Tm z$hsh1pd+WdgH+Qq7G4>8#DCtH%>nLL?+!4!GDQUaMXWD_2umzE{0J4hOH;7TGPsPb z2}DR4-&bVZ(1Z|c&3%4NgwJ`)PtPRjq4+hCA=$_ELv+B2jq6WuP)FrW(=(d1Iq3nJ zoNeGSQp**3({vBrQy-jN;A6#6l|~L)2@yp~cbrx6C46~@+kJrr#(_7|af++Efbiki z{WS0@;n+1;t1NI5=k6Ma(8kcj)~c~f8VDaRuackETZF+e`7u}DU@G^@XryvW zr}s9c=fcSJbT7L;1JZp&G8c2Qy1GE2ZFzMrh6m z8=lP>rbWdc4{M#(OG4OsLf}$eF^QjG>JF(0y#PFOoX3ALyPi+AKW>*Odr*JI=rzZ?x5dKuO7dzZ(Fr%uSQ+Z#9=? z<%sp@lbAm|L>m6F&SvBtco6dT!8gf29^{nxpFMci0ievCp;rf!myV5Vi>;wI1bTJz z08>OlZ<=zmz0}5RR6L$Dsca|gp~r{BWP3VcI5Akd2y9M<<|vpYAK#T(dy#Eo`-UbG zRQV1**#_xata`r+kmmTExb6g**&?$TA{9S-}1eEV0_egdcxn0tbHh z@VAvQ+WBqRMi0?ZJzXxW zctirvqkGmTK)wiVC5gs!h$HKCH5>3xqGc3-nTN$L+!5QDIl@Vp)DA-1!#PwM<1PtD zWLH^Y>Vx;5rw3j1xyaZ9K#c3(^fuKL6-mTF(=)1Hd^-}H2o?r#n-8rwYAZP;Dm4`? 
zbUa4#3x_?)C+SIRw7nNz7~VPy_k1EsZbM-X>7d!s4utmk`#Ea2K^ASv6H*zv%G!Zz z{vt0U15?x-F>6#}-)AGQwf>}a_7oR;-}kv&lJKP}8}m2_yhI6puH(PYqOfc~UCC9e zuz0^r$<@`+n>ZtGMg>pqlduN0jZ7*pBV`c{GHIBJKEZ`pUxBT!IDKBOWpvONE_kWs z9692lfgkty9kIetx*pAqy>2K8K>Tj#hs!4>(iDU&8{IU4+ejCU%(V+|OtP&f1p+3S zGWX>4QOY5Wlo@3J4FV7Ug&{a3RAQqGA^;Q%oXun2fG<41prqeLAPx^a18_Sqtexje zBg6Kq_seuQ;H|1;7s$$!mr{qNB;n0s?-nII5EpI=ZTLg7jt-?2L_X!>RLFM9Ky_rK zEc$G{2R#xl71rNz-d@Z%ST%rEmZ$(K=N?wOff&?r^?3TFRHfOsW#NqW8Y2x zSgz*(B-i{A;odPDk^0zuD^hJho$l^Te(U3%1{!>B&px1%b#*@)3&C1Gs@~iW(>TUFisa_$M7SDvMHBf zHV1|C@_d? z7uzCo)5&YLoIIHJ1ki>iv|U=D5!>Tq>1W{BNYa@Yz-iC09^3XFCI2D)l{2tU*V_oB z-p0dWxZsX707YYCZnXaq>vaT#HR*|JKi>nILQM2SoL+jyqrh(*yNUH|LK@D15Wvu`eTbw+iem-XxQ~{#$f3s97@Ke z%jnsI^tyk0wdxu8(=)<|qX5ulUHZAAHX_6pVKe^cgq_fy5{(p&2|GOM`BUkJHNDc&)R4pUjlgR_|m)rjz+p7F0hF(KF1fFA$83JQMYJ%BIIVG%gqf0F)_5o;_(i8G* zS;Fcujj_G@IwuWPrhR?7d--!Ph2GX;_RTnK^g%UK)(z)9#4LkiFL|0t%X=|0M=;)@ zU+krX*GU@+ZuP+BO7nP2!=-#N(BSs#wPR>2pDGl$z>yLQQX`gIADcPQ*ZR12eO#s( z0&jmyrM2gsfmau-b2hbGa|V(Lpzyj; z-G%OAZIZAiw~6wvMic(4d{&;X$wS_tNO#-hc-xY>@2gnytc1Nz2V|U!F3eMH0Z@Ab z0#&JE3`H~6(W>stC2kI%vtZFtCoh(Mj3`?rg_1jkzRYH0^5SUcF+AHZw8%HK_tLru z)aeo{UA)y`@Uiqb+Vn+}8=Nm|F{3w2A0fr+BG-_PjkCuh5}rDT`85I@yK%kUn7Ssh zY;3!RuY9wd8EKyIVK@NSOJYPld;`sCBH5ft);a zsZ&Y-tZ_Os({D}3xI9D4q_$tx&%%fa%CiFi#@eqOm#;$Uye-8-9Y3OC?pq5KMHkq` zA{6qKnn*X)k!Btvv4>AMmsTTdf07eUyojHm)G@=UwreF?v(6#(H6+A({dK+tPBIdC z!^bwF3!B)|AGtQ(pdpr{txqPNH7d0VCF?QdinVvMe`{|XHaVQU)M-3yYTa|p)zJ+Z ziq#7;4Gyc?`n+u{S}D6I zo+rldx_6Y}NN&1ZFZ~?#TvYUYaB;N@r>pYbTljd3opUF~<=y2MT=Z*5#|Bfj9A0cq+3srY{nR+*n z+_VCMuDE4Oeet}|b?TY4sGm1h4DV(Aym)7MeMoD8x0U9p0~E-45zE{AJw`;1ysORR zrPpGRlC|3o;uuet&pV9o_=3oGb48<1IvIU*NZp%iHVD{cd>3r&4>nm3qf~KS@in!e zj`|dILa2ydLAiaJps*tL;9n#HQ?tWJY4bvB@5{CX1SvX=TR;O4e!g!~D{>~0I`T+YWyLY*2n`^1jA?MW zY?pBkc?6O(Sn(t?y{D;s{S&m+P*_%;XA|=Tey!d_K#jURnmX$S@PYFU#y%;3KMa&) z6ckURA?76*BIe48oMeo?V|_t@b}8u{%Ggh-@gRN42rqsP7a_;KL4PK}dNHH;VPB_* z-uf2wqO8RXfN=dg`Iu|C=8{i#@|Sq68pR+*1hSx5{X;Xw8P<*$)Yirydr!IO2eCih 
zF9U^!mtfSFPX%;p745&msQm;RN+&n?#OeuwR%d>w$RVda?I^MG1?aS>pW*eeTmgKG zhMF|RB)+Ho`JPMXbrK%PpA&v(XrSytNo^k&;65h%Ipw?rL~Xoe<{EO zVvH}3-+fH`fN~x;M7quw)jzo$?$nEUq(4jin1W4THIw0{+n^{@S(cZGT=e1Yl3Gsk z1M=2CrovvviIx?da?o=qcUaF>Vv?vPb%-Sv&6e34EoFurtFK|Pl2+uC)M7EtA8k@x z!zhU^J$t0&vVa$j7w$2Qntznz%cn*5px0D;m43ucABR{Oef*^nIZDg~STUQmKi-qh zLz$Z&@4}L;LZ_nq7ZFPV!Q(*(lu>I^CQ7lyl}5$MbKK<0oEVIgGuPrW*P|L1v7ub2 zv>F4egS<|}VPq#u8d3oyMweNISU9p%fUXw}M5$1P0O|4M)Y4l{ybK@2q!f#EB94ke z!{A|WwB#eW6FBio=Fw6>KDt`TNvrhds(^?G`))XSfy%g=UW`jh)A&$b{F{z%4Xmzq zkBF#-=g$^xj#;(Pi=RkkC1@2eGC!d7zjD8wgNWbxOLAn;La}Pa*MU9exJ?d9h~Va; zKtgl}pSXMIYtbo0NkMkACZhBqKmg>AM6s|C8i_NvZ~;Y~PRo#Q$vUCwGF!tEM&mN$ zoFU4VW(d%EZc^#fRuASkTiSzIe8LO%(r+t09K&tV)C}3M_AI9H8+J|CL)UtBSYo#)NKh|M??BewrkQg^MIBa z`Q>IX7;)Q%UK?U}QXm!~5aQ8}7n1O>FpcTcOhEherR;#W zKcEWd!xId^#sXb?L9~RNPh1q|@%z106cA$q3$4gp^#Hqcg84JmyH^WzFN zRmu`JD)M-zL~2(}17j-Y1SNM9U((Gn{K6=TCWIuPP$h-RrSHQn;tsT6?8x%*+( zmaC;9T!e^iCi-{Cu{-q_oPe8gNQ53CB3DFYiCOt!9O+9th35}=?7jJD?0TtT3kRP2 zLWh|hnE82E2sDL?{1LK=U++<#MOW+|DSyn{Ps}p9t7xZ1zt`@NIsM+#6He*o^_js3 zC?@JSWoBYliixZ}-QC)pW#MJlJ!IE~^V6I{$sPB(aa69@9EVzIKm9u#crM*Jr@wc@ z=QQA^b##aLfEsV^`T`K@fR#opCs41CFBh&o5Aow8s_kN?;cvXow;_!1X?k4;)5$Wx zIOD(*S^AO`EZxtNFX#8iR|qMGnlfXSWJDqD`8GJ(>Kvo6x8$q$fX2M=uuP~?8`&>+ zFv4dLN4%p#hv}v{8CZ^eTh*3DM{c~SVexyYraRSwQ`8tv;?|d@42a^Ue$X9s@z*PI zn<3F7Eg(q2vSf3sj9xrH)qjCXD=RBoIJ~x#`cky1It)GxtD0 zvbMw|NG@8D8rFixz5s)fc~*Dn+Y;u#nApXGh{btlnN90iALJ!w(Y*s~2^nJZEzI9? 
zl>~@(jTBo{cgguDajNTN@%);!1pJ(87vBvtAKpfze3JQ0{*N+f*qsy!UQ~B;0rSblzt=T9>tn-p;`PGI4`A5$cdfvcDi{l|mdH z?-p7oUycl!)aD7$y0to!aw0%iTo``&a6*@Ud11&B`JHu0KC0LF>)b zz~90A1e->qC#G{+vT9RS1j}n~NjSR*V}&Yyp@u0%Fhj&K!;QXTxFL&TcQt%8TVr8U zgc0x=6#n#xmdWpNgX;ijD!Y`KgHDW^RUgsxgF+Yb}A@`r^Q(n}kNa+^NZ3H_|tl;@--pOWL z*WQHABE%QEs{@+_yB5g+t;yA4C#jL^Sk2EF@*lHK#L-Sy)>dSwf59P7aicT5pHmyG zGjT)D^D)`5I}tt(#1KyWWqtJQ=Z0I%FX#hgjYkjZ_9)iO4oWoNpkob`){H~$=1$Bo zjVfP+{*w@Gc7|*JECdI>@$e}18P9F#cVDkBV)ilguwqq^4SOW+TX(9>D}+U9Ci+JR zedT0B%rY4jRB{i?Oa3SNz4BT>9|K8>e#~9Y&hCqFyd% zG5lY^DK8BFq$v4}HCPyRU1s(HUL&!Y>cJ2lHs#5Aask43npjHda|9r8zB+^2X6L#F zb1okKB5f#KQKR;RQ^J8@UI_nfiZgI$0B=5(gcr)=KOrcAU6~0#bH()Ww?-PQfAlE4 zMqwpB9ii{`0f8q4!5$`JcOm9Ist`cQHG;w+Y5#dRr8Tkg!Yj|KGz04=0H=<7tk8fz z_2Y3l#hMm%S$~MA#7&~lF_z`6RwwLExBfq>=Rk5%v>Dx7Z0NQ*EC%gNF#Me;vhqLq z?Bob1V5Te-UwAM5o*nIry&)&2(+62lNB~v^sdpn8(mZg&Z9q^dR_ohHjP zB(M`(00s_S^GoZHgnRP&Zs_mSFa3Fp(7oeN{c+&gUr@pqhTq7!>N{INh5^j~j`~wZ zS*_(kdcX^R<|Kbn^b=NA@ML~r+cJv)-uh3j4MwZA{wLQawxN{HowIZSMP+YqZ~WPs z;QqL^e$wZihCjwPg6;D!SiA;&lRJ^9j#jddMwDqhO1Z5|(_n#pMoCiIGUp;wN;Ju= z2q?vYK<|Xuk6!zFrs`&}WJ~ype?UOr;qkk;4Gjr`rN}=>i5kwshQgf7A)V%MVUYBK zztF0EXTK&EXB6dd&4Ntvi_3r|0_n*Rz%_A62e8eHUhTRub)vV|pd+!e>T`aOi*Ob_ zeCZ+W0|CDocZn&?em2_lc9A}Xjfs;&jxDr0BF%nr)DF@ykVEL%!zwn3IWDXitM1b; zMni6T(;u@P&1r6*-k2>&Y3O1wRkY8YWtw^@nIsw~D%ZG1-2m60h5r$_KH|WGCh{C; zm@6n;`Me+9t+IhM!GVsudSI7@mF0OlaL6BJMVrp>qx_fZ!9JDMlS!`d1w`J-0A^Q1 zzoJ-^f6Plw*N7s;)ua0jW#>J#!wG`?zo0+vj~oIpq+W#XS#MfvU}^s_HS|VpJZ`O* zXNg080~`eR{->skvYPjUm&U_elD?a;lfG_DQRn6#@di?2?+T{VrTK+p$_Cs> z!#|@65#{NQfU4Z@Bmt^Y)RJ=2bCRzGKw=)9gGzkT!sddMxS3+x+AY#@2IzSqW;b^L z!8A%~5sB!^RBM{uviVhlf<^X*rT;0?cuL2F)Bf-qdIrk@84NJlixz;%9>tq(`tTM? 
z{x3#(sHtPfoL3!~*bqloC1m}|HJE#A0yBjwkkA%n5O3b#2h@!uEy^a163e~Y<_OV+zR--<$Xb@~UZ!RMM>HISpYzE{XIo6uKu=#+x zJsOYnz*QIXR-+1~eIwEw@6u{4_E!F*MSqi=O$U8MJI$uBh6?BLooR0^G70);g71n2 zIJvj)zd?e3Y0+Krbc6hX+rP5>wI~GIx42 zs{Zmvo}2|lBOI*kw$cxsXaJD-WdktUH2GiIVAF5@w81aqTX+n|WB&ri{}g&e8lGb= zK?U}w^TJ1etUlB&(t|QKiR$bBjHo{1kdQR9DVm!Yt0WEyGly9j_-1|(Q91MLB)`nvo%8K?!4jm2p9o4U_2Mc% zCQsb9Iyc`v-p(RT?^tlWop-D1p`g(3{&J$g&Brv7sm}Geaigb{Yy8K5U*P*@_>Gk! zc&%P@X%;*N3jYD_7mxpLpLZ0x8Anr)zA*BuB@?uFB0vK@;#9=nC-BdLPE>i-^xaz{ zUQ`FIvY<%tcgJZU=s0COuCdiV8ps8D4!CsF|86j40S%yF7mI1L3mo*gwt`XHE*@w8 zEeY#QzPmKHyV`79qdtEK8e1QdbV%B^^~JlU0^enC?;j-sCgRAP`j&s{$=y3yA*u7m=s)(zdUM@6ukT3N$wO_g$I(ufB}e(PRy+O69j$T;Z*(2*<J90E@~<7kI8n}j`kHr@0h7mjU5DG@IEpct=*qp_&+irtBEDz^hlp1X3%I&|sCe|0l zjU%Uhxzs00&s@$*RpjzzhL@@paap#QC{UarSw3H;$HGt(3J83VlSX<%J={4;^JOy#}5%;}Qi^9~9p2jH_i{tt; zFfEGm+$d6rmdm>s_m&qy8Wv~Zmv@zPq z=rG}VriK%5*=Lc^hOlz61R!Hu5?>_t&{fp_^U(%UXu6eBO~c@ooS4|K@wpq$T@OYs zn4Zt(EFd>yjxJvY&5l7up3dcHm&fMIB*JMMBJyidy=UF5Ye_kumnt+&e>Bw8$xBUQ ztsONWu7?~F7lRBc6r~81zs0P(mZSEI+^gMIUud-?mvv~8lYW$b^`s-gZ71GCAhN|~ zf@o8V5)tqQ*tEeE?b!#fj(^DN)+5?UDkRXJ!5g%7G`@z z>iNfz50a`5pqmDtmgZY`Lzn3Z-} zC#=R?fe9g(7MzuI>)Ea(f+NQDna+u$2a5cR#&a4_)X%ZRxRkCz3#pO$ZZWZnc3!u7 zoRbRr=MAM2vo-4_4*cOm6VHkq!qHYQbv`9d;G2nh3)|ASU2?)T6JHL!S#--ceIiuc zhxI~q3|eueUqN)MJ?7RJ>^#@@sx5}Q6;_Vmz!9^1!2$U&jfPMN+)GDNtYtp6TWYzp zAt?D!%lmv83x{z0xbVnz@x05dHC7b2<(Tw_3yNOtCUf?#X9~h{)M^{{4=kNW47ZG4 zbvdR}U*Ka%G5`3cp!2!hAeEK`Z<(#&6fuww{Jl>3PAYni|PCSP7G=wF)#+7 z*n;qPK_D>%9k>5GG3*72VXyXyzY_xwNDMrG32)mP)fkLwqI!JKR$`a}iDCM=Hs{u; z&%p<5vY6O)wzA^gUEm_<%-&RNc9@ZUC3J{cm2req!p^`7oM$zwhK&iL0xt zb*t)AXeg4mR)Bh=8=QrZ7-@S<6f2P`@@zTS{}ehVe|Xw^vHxj(UwPxe<;92h3b@go zir(EL>RpzG7HO$6Ic;fS_Bu%FZ3Ic?D!=gdheCI0T&1^nyBIR9Pe14@&0g(q%n35? zyyfvtJf0h0`Ze&<&Ky9yhJHY|VXMTQPJ|~!w%)zo)8ALno$^yPGEF%hu(TzLPV_NfO-*UeqQ81i2 z-6dqb=jlf%$DK%d?m@3jKi+`AxsXqj!&C;YHoWSW0893vX`6yJlv7zpND0@dAf+%G zUyZ`-gqK{uU1W&_8;EDb-k=J%4XA=9Uk*II$@lek^DuIP9y{zAKh2q77)lS&n&{w_5zZxUR;raD;#y8h;ib@z;tb9<~by? 
zByvKM!1p9P24x+;fJuTTSO=&-dAdDT21!EYWr*&+!2S-UK^p3QvAu``FK86W;-T=h zKUTm~^IdF|_;N?NseFsWLR z(GW_eB997Zm53saSBwLo(6Ew$uO1IihnW;UOAHfhNWl2$ErQkyj7&K@x;c-6xd1nkE`Iv)&0VZe*-u8%#q2p z<7`_HI}FaejyP)X*#fiYPqvu4@AFI{h%~8bDL=#L(H9J>SUW(y393oVrUV>fz;) lkO1}p1*qU{{u{0P1OJZ literal 0 HcmV?d00001 diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png new file mode 100644 index 0000000000000000000000000000000000000000..f830ca6a87d10b72a5113636dd5686ab25a2e864 GIT binary patch literal 18336 zcmd_Sc{tSH|2K|Q3fU?ld&r&`vMX69OUTa1KGv*-v86~t*0D!+lI$Tv8)5AGUSVv5 zGM2{rJEQb@zklEFb^pG<>$>m1?mvucW?tu<=XoxV=W?EB!nHM3C{Ht{{{0{#(u=&0Nvdfj<(0lXo#yRLDah^Q=@eE05g@cz_2RU;1~ zB3e(vAMvCu>jNSp6^Po+>v|8&m(xum1l6jlhZ;v~_9RZfqpqZp;WYk!DxqYVSSJ1@ z8^;X^RiB&i&(8OuKFXhusfR-=oo|kl3DIOV{Im_2y3A0Ot*m4lg?@&np~2ru@+@up z@`Og~UCdA0o7SBtdF1ZUyhWj2^JNS^44$exAHupjNMm?keA}d@6jK2ot1rQ2Vw$Ba zoya`Ria$~XERnsEX4-r2${ZZfmFH@KD$~+2L5a-yaZ>P`_0n!acrx_%(PJ)D%T}1t$F=?=qPJ;Vl>#guij@?+g|yBuS;r zcXiMH9p8k=0yF=4_6=vS9|mfU$b*?f%B-N;e;*5DW83H3FnQ%_g_^lBj_i{!sK_qQPl*? z>|iS-0(U02Q1na}b61s#a8*o;b*E#=Gd(wpi;H`jgoK4%du8|5`?h-b@Lhp&{@>|d zTx6**xy?X>^i|c zC$rJ7D}ir4?lRB0fLco^O*6;08B8V$iX6=F$R`Wn$YH-eEK4$Obq7MtRU3asx{k;5 zzZ*hK0^%lTa6tPkG{hsi&T;FGufKhh*kvcbGm`3o+MN{^7OGhf58Up1;bOTBrt;xg zvPb1E8C{iMPSqVsi?ta{8d8zp?qOP%LHzn5E>p|h00zl7Of#<*h*(>W1`%N#j}-`7 z<2zgfa2OIc`iEs*csyQ1iovNj+FZ5eM6c;&;Lg~TrYm^mIr+7getzm;e=A?&-8ae_ z{Fwo$kh|5aRzd{cIDf?)ja)oHHtup^+Iho=8dkg{PFS@@CR9A(j-5pt_{RNQf*sU_ z`%=$H0d@4+O_AMUJ^0lUc{|1?E-NBSh;F!EB%QJ^If>rvrdTl&Aw=X#cHuD(Hi}`)x(`BfpQ?3qoE0}r7+|ML z7Ph%0+JM;R3BoTnMxJ#1h1zOx^<22Ek$8b)uYf&{e1R(P*GHpuF?oi!)+b)|s|eXz zu74VD)!QF_mf`e^`RShQs!!KTe|oS%!I2bvZ1O2MPRoB$cZn|`v*K~Wl{lt~?7&bo z@?K1;{H;Bavk&o7%F=d@_!3ut$NqvZ&yc294%U#3;~Bz!rZcV{tDT-&A!&|Che7e{ zQXUdED9=o-EfafuUCw20w@$k+h*J_1f!xQc(etg1@;`#9#%+an#wItbYbMa+t!2mt ztWCZaP>wBIPJ)WqZk4xw-WGKAY45V2{Z+QMshs_C z1pm+ZR|TI^MB6@?UQ>Hln`<{wI{WRK4a(Y9cN&VT@G=OcJs^lLHK#mU%kc78R`q~? 
zP+j2dwOGWXu)TR~p}@pgbl7Stkiysq3@*w$wsz4$UX4ezxm@LTEfl>#sl%u{no*se z6SbgXn<2jz6Fu&rI;#Jj&Uqx-9_qz`&qF&KFYid|vb*tn^t@hs<0FGZi`Fb(o{e1Y zK@TV{as}2n_4?=QpmwE)Fz$VM#iWHRT$=nagxw3WzG1Anc&pZls`YQg%+|TL>J7AncNZqbJ4_NE^<9Io784iNW8y^3 zC9}nvJ=zcp&k7-$##`&b(f!YJ+Na@Zu-)(UtB*vhsk9%=uT^H`cy+<;pSngN$Gc}X z+0{ojNlK{NX4YtAb6LZshcXJ~#2RumSOe5>^kz8{*Xm?!t-dVWTthOrt2s{ho#b_r zvvlc;mSavTu9(Q?eHXeI;}Y^57drf9YWd^UqMoe1m)^=+uS+D&)S1^;Ynht5{8v0A zUWHOjq%G0Hm$-xQMfLvT(Ju3aDz%&m5emNS{mE%uecz=_uH0{S5(?tQ<%VQaTwbfp zLs(nHyq{}4*lAR;BMGxiz6x_UMaPWah8p>v6cVtu z&Op-v+Zr^Qw8AJ%_g}wm8|fLO9vLbm+s^iqV)i1wCP!;}vvP_Nm+OU0rooe0&GNKV zraz82u(v9K|Kb$=BVGne8Pc5ENoHb|C|VPXlsWrSlpnS;I<$gj zR5wwpEq8S<*vn~b!)n)c>115g=S*1}tWNZW?)5c94w5aPqxy~P>K;8dXYEafnBC}4 zmTxur)gG5o8{K?YA{Hv$Y5=*kkyw{Ibg=;xR%rT5>l_CJDqT_*f+SXS1m*r zA$N!K!}UHj|AG0|!6)n4&ab(3F0>l7$1xPIXG`cWcqwnII!wA_psbHlq<2;p>~abt zh%m*kH-}|YvT{|-+;5%i_2h0jky0NK&vjvsqQjg*c8|N=2`ZhT2D(P=g$-*L9;+o9%ebR1`pM~n-Oj@!c4)_uXtXc+Q zi*oH()3lsw+CB7COH5D~jrB`SQ1~hbkV}j;XcB zd*71(VD@$-*-kCn59)z&T)^&FD?ZO8p!s5o^;FQzcLyZP#zV_hO#kpOdF0JBQP@IL z*0BoHD}7zh@#-f_O((tKH*jfJVX>OFEY_}J82l0r+TZqtvosm zqE@^2S)JuafNvytDA!HH*~Fo|Te&HPdU0)z&lFSS8iHezi^f@aBNZbxU9?ghpo{D+ z9)9h;4Eb5V^jyt97Gc~}^NXg}F_uoH_L@{{HRE17Z=WsW85ievqE5$GpCm>PvBdKW ze?0TWis|XVo0&w@W{T%2ud@8veO+8Ob1LzS4rfZX(mKA!4(Z?^kof@z)H+<=D5b1D z>cai@z4=+#ys+Ki<`vlBTdHNNb_OCd+BjJk?-7yvR$|QxiS517 z7li!pDRjmi4250n_1yM!b2*{f+v=V>`GCx>{@bM0vyI2^t@5NJ;pkn}x^c>ui#SO2 zjUaqV9plBhYnY{L&a#HOel@u}^6t8O+hU!=8IQ)XaVP^PhkNFAo)FvJcN(uk2FCM6 zkSW4uBysio>qA6|0@9;WE|NGdo7`irQVPDv*4_u49R@Dk_Ugh=Br_6?$i+bIa&i$q z>f-6VZ6fhvk~m9Nj^|u9YBkm`Vv<}Zc(~e&79dvhS?L$2fK=ydgw-@9nwQG4dZ}OB zR%6qfr6ITkp{gzsSWzKF6lT@f{q64clYN@IA_EXHiUUFW! 
zbAlGz-3zN5@wb1W5|8iYDaR_CP=N>oYO534+2^lZ)rnZ3$igo4@zhvv-WYXh9c$<8 zKvtHnbp2duH>vv;P=fCrnKqRh%u$tA9s7ck@F`HFv|8C<*j|5$RWjc0L8Z%a6;LgI zLFshyOps(PY}mTa8^>Xa{nbZ3ZiN-WcKWnID4D;UdM3&Pjhq3-?%k}l#yW=k9O@UX zv#lC}U(9AU7`ibvl}le=MxJMGKl@PV*uCwU1R0U_Tw#CSo&HP;7b(N`zQ9e&E%%>k zzCV}rl=3$bttjebn(kz7|00c(JYC$DOdn}+x%XRiUUyb5^qEVU+%CrVCWq+AhtN8s z)?3AVXhr6ItJXIqnzaFnUioq_x<5qeyyLI5iK3 zOBouMdHZ~9M|7~_Wd~9qA-9p-!OPk%*vngdtGi|VBx9cV_?Rrr!QJUuQF6S8OJLHv zY%&)eEvd#G{w)`oDv&xrTF54LJg zL8Ui4Ah5#s(CPEdbyPxImD!VstDBqh=b=pb0ZZ47ZAiNtiFz3qkB`?+eKi2F<^l-t z57K!yE>YpfhP$53o7DA5)VCj3Ke5X50^z>CP-q%~7J+RK!Y5{mU8@rB8w>7zIS$#< zOTqRqNO9YoIw?}H-JgOgG~N1|llm@f@?PocV0qaRIxLcZ>%?Z(J1#9q@q!|mF4bw( z^akhyozPF0=P3L`C-2GY-8aix zrC!^lv)(M(Fs3g)Rv000H&4R3ck{z2s>-}`pk|w&Q;lB~CSJ0RUl-gnj!yF)Ai_+_ zlxjtcy`AO0)B($INNj|^U~F@2^*f6+ufHZ7MbsYUhUH}-(V*?#5*@LTFq$-~cN#xU zZKLh@ZUL3oe|CXfJ1LFJv;eq${&_g%D|oX4D+@rXfz>&gq?<#wIg`JL)3G&r)wfYMqm zlP>WY6{nGcHuDhkax!?8RQ$fSj)7!PweWzdz-Emv!tTSBrIE z|9MsjO)krqVUFg}wSJz?7V|HrCg@KUQ6O1jmhTL0`bIX-a?!sbc!j+;ay%n{$v23b zTkjS+LKtOe<7K@XHxb9XlQNU_q^zdbz$m#h#CLPV&XvWYOn!dCr;5r<7Cn@e`l3zo zDF2oKj%D%_FV&x%KUk3t?79wLD07sr`+*OZHPY6O68>vqV8s6|Q@pi9y)G@)+O9g8 zuWDv)p3&La>C$LmU~pekEqdWVYqr(xZzZFfj`*iR;gs)leU~VFY~|eCTupv44HQTB z(T37U>q;T{5kskyINpquXIl%{lH(u)q!+`EY8k4e;-WBmdU_tbF>q9}ZP*K}e`mjU ziA1Txpl*)e>>-*sVOAxPJ^B)4@${dnM-cU82b4|@iwBpe71XYgMb_7oP6l=%llZ8} zW8$cQP$}%P#Xw2oM@mIJAYSkExLW^H=;^~#JuE}XWs+KYgCvpH zj^rNHq(kxYh1zg!`#>^sOfTW3yU8%XR3uYa949QjEZ%VB~DCB=I5h` zycy1gS;UUdO6{CWln>7RoZ)Q_w1HCO^~=8(4<03ZLQE@~!z!YrC5N9$4~%B+8c^v( zBErx}Mq#vpvjZE;5m&u=*g{o1kV!()blfc|7TyYVhE1S|49t;9O3ZMr z_r2=AyxGH;16JvbW{Z;|>F2uN!ySY*!?&NGBo!d;dE?+~k8!jg=}9SBpAr{F(~pJ_ zco?ii`YJf~J$JgKD$>Y_xOPu$b9X(>Q_}ACn>145lfim@wR>iYd1_ryyPov}$cu3i zyooF8CSqgoWj|1af~P^uY1lYRZ>o-Afsk|vv`=$Z_uScs@TWoPpVjR=mYrVd2{c^S z{+QsG{wR-Gsu^vddzIagz{(Bk3n>fgTX0HQm(nWLttTTXl*^wB!-!%kHJdsOjYMsu z^LJ=PVE92hO!0>K^<7XZt0J|2Ud}Y*{KiUDe>3Tf^+LH#3apMk-}o}=31YGzV^Q`5 zO=&t$+MGz%x7Iuv9`6?`uqDyqx=3>%xTux^zC(~~*&Z3^oLv_KLHtoFO)E=`9M4l* 
z)iqN6b#&I$a4cgH9x{vl7f6<=sRI+XOHRV)lS;iAV^!ISf~ClYx7>5JJk%a-GFy0` z>gh}xx&JV?v@mHjLiZ%iOE94T@g^cG{2qy}e4E3`3#+dFlnqVpi^r3`DUkhE z;;F0tBDu2nu{Md!mO1K&+N;eA$^^NT9y4#SelecO+DS9wY$p2KV-|x>3*o zq|GsS-TQ|cJe&x8f(dFyF0QHDYHLBRB&&uUdV7H>#skwXr0AP?#ZEtX=8sj}2IfVo zLm8ZP|7@=4U55g9;n!)#wrB?F&7eDdyQCyQ`MzEueVS6*-rn9fGBUEfwYa!=MwM4U zHo(oz?LxPnXIfLYmNvzny`xL*y?L5Ih^wSW82(1`;rwqTf-N!7t5WjvjghcOLt}Ok z^FsExK&X^l3wLH7uL}Ep0e`CA;aWw}yW8fu<5VA5>bVMJ53vDz9UxEn`$YldzS%7U zht8Q*%XiqR%>pr-ekHaEq8Iy(%J8Z7gihZM5m@XCf4kvs*O>vOC-wxR{6zb!6d8a$ zyAfG}7teLIMOmwfnCYL*wm~v{e%^|u{|ldsT+}x2a>iNY4>7d8)80sw`^r#KsY*1< z#4g{p{~+bC{;?QLaboqksoRUX{fOM_g=M|6=8B>)0qH~s^|j#NVCAI9){Ah~%^o(D z5QXUois;v6$<;LS+kLTg)0QHb;sz#6{G7 zGq*!w8;?Kn-!9Wv{dAlF9%q8T1fRx1@^e*}xTp)^uFnK>`E602d5&0FSl!t|_=Ap{ zP#t!CGKD56n|tO5gpWzMrw7?h2Vc#!k{m&RzJ+3LH-JhG$12PyJjRcf+eD|{&&l)4 ziNY_I-6;gsdusK=fse)gYaXjO)TtCCm9m&5NL?o)$pX~!p$+W&?wdRSLmoZ--#w)2 zy;M&blatsDmP^!yc4+z!;|~>b@@fjSJTtb?LmL|#YK@x~o^Rr(go&lR8=d_=r6?^j z;Jj{rc81NVYPiDc_n?1lmn^Atk7=fWtG6tAbj^QITBj?iR1R{^tXNe;Wyn8b&B%NF zA)iktw?a!sU7ezNW|gQn^J zMY5ZK$~);7>&Pza^qCQfB{U&6hKnc%!=olFvT3fiN7)0DdaI0dc0d~#Wz1ARCt~(v zzouCn3nos+7wOP;oIF>!fca*_2QZ}U4$~_-=KA-_oYuCe?xwqB_acSH`jbk_Wm=ut ziKRAg+)LBv3RaaB#ofuhltRU))7SCQ3RB#|jm_5d#LA7v81IH15?pYPoFpb?@}*pl z&sE&H36gkLIl~@O>dP93;!kkVSc^bq&6gd&0+X$F2=RJ)@fG7Ir7sC|1XdKgyNNNH z=j1Udi8G3W$@QJzfXPoVMLs`yim=(oS8CXRHVZD&rSo9;>yBT3VWT3}&3vf41L-wz5hw>@rZBb zp|4a5`r&ebHwfFb+rof1R;a}RrS=#lhdLXV)Kp=Upd=9ylfru7qc3M&^?GTA{H7}8 zMSZx@GnDJ+m4LEa@ow;n<7~ctp+F`HO8*VJ$bbe&{Wlsgf=0B`&Lq|9cvO2bto3B8 zNOZ=yvQltJi!&}NDnhEPax!wUDTdTg);)fmzAAfFLe#~LS^EC{`=zPE=$yNoj!%0< zqEx)GR@8Zkegyp!h51#lyR|*-_l%B@NEH^*TWH=9vDdAzk|oL4^da$ditnL2+?%hC zj;vRBzo+ckHYJ^YM0he|%Fo!@&z^~(JmO3=TcoThNRw$*({KDqN{NkN7=?r1e$fJ5 zFsF50o*Ng14Gj(P>~)GLJLF;O-{}Ci+|kVLrrn?sdvL%30BqE8WvJQGW(I;ebX#N)vkW@NZ4^Bls{?00*ycL1KAI*B>` zPvA)o086GcNzAcxm||(kJGZk`H4KO|?@=d}eo$z2VF31U)g|o?&u@F^zqxc9*n^jM z?Jt031cQHRB{6LO2wl#}c#VSib&pLdBkoQjPp#l3g%)$NsJ{WMQ#;7f4LA+r=T6VgW686V;%o2I`CXchrE`(TF>|;g}Gh%(x#( 
zKr*Lx3IM1@d*_@#0B#UKKi_|#AB|9|A({=N(ibBpRi}G$vET|U*CcNDrR!y1ClCf5 z;xxy};uCH(?z5yE8_)1XrWYea=?fQ>V_v5UI4?iA*!*P-t-}ta{@0^w5HIOFsVOBIv`V z$TR_ijuEj!`e=|VwSc4Kx;liY$y1%#xOfnlHhO6=%`ANW`_qgSd>((vG zivIQmdEU?ZE8@-Q$SoJL1A!3FQ--fz-r#jXeR?oUamOXlp!0b`(FJ^^5?i@djnY$l z69DQ!=EmBpegGZnWR3f1=piu6x=?f0ET$jJeNmfUXZzUWUaA0_U@_Tv>-7})ilw)8 zW91m&Y0AxS`_PNR>U`wjMHZXweV~ZCR zwmJ)x9DIvj3$nBzs)ikyn{TzOytz$fntMpwxyk4Fa#5INQr@NmWwJ2D%;CHHPP;oP zkiAR98I{9e?I;eIxqj#atN76(xP-yBijz)Gr*tKj92patw}vP%j)9o?FTjfD)4*X5 z?!;t)uhmXvFGEo-hW~yBk8q(&PM*XE<`h8;|5^FR{6Mjd4;4EZ?_i*2RdHtltGE5= zdpBKd?YniabG+8X_=%XuDG+G8yb{c5{96t(DWDdqRtt)nu&p1U zu1eweZ;SyNSFpbP6>hABwQ7u(@D3n^j~R(v9j3qsXD~UEFMNjOf;ujv7^e6!HcRhX1C z=9K@!E949_099nZmzpRNMAUQ5qc?of2IM84K^GQ=PukBg7bS_pyt@pxkA1dc_X)h0 zlw8Wpn8@+r6d|}I{a0}59oE#3MqWx3%gDW{Wv=*(lOT@N^zU?92b*U%HmD=|D|t|Z zymEYqCMwYoq0M%tj=DoIAvI77_pP6RLvN4uVLd_rvx)M_(;smcDVN<4uZFpltI z6lSjP3!SJn2B$kr*uLmp6FXFBMnF8`Q;^^uA*`&b9I)*Ai12s1S8DCl?xgGMh66#N zmz1=$w3(^tc!c4;OXHYcgsBN9wWMKH^V0{bv~T?C)+bE2Z)PW5-b*!eM<)wwG=?5> z&CytB#S~4CS0s^e+)*WpGnxSY0Rx(3^o|gPWPj_a*@E;I@;xJzhjN^+= zshYD5J2i6)sqZ)5&cWZr`l!hOQ!tIn32Qog2xEE1`JK~A!swwq48NK8yLaziRnU8$ zPY}wUnRL4p2z{{|dr;$Nt#gqJr;dxf_)5yrrAU|fa3mjf(?jYju%yxsv_X-|1k;}g zlE>)$?2_G&eqF5ztUrSlMvZgMuh*y7UH}lvxaLTvFQ*^ln^+F#fk_X1;_XHFT)XKo zN;ky<|8zfo`|Q*`;b2NZ>CsFs<_cJDin|nR*H}YC^e_?~Vbiiu&8cKUNbYP=qrFd+ zncygyPaKft@X{)=EDTh9-U_uD_pIfL&t%*s8YG$=SM}n=WjO0$Wa0QiT*&M+227%n z$ZX>y0x_<`UfT+9>wPG%6QCQaw(09L<*auV@F7O;!LvF`R8&tV7dT%$WaZU3o2lsp z)M`7+#|H`6x;~#shP?I9aeb1JU5Jhja@;JtW=bL%P$h>FDa+4@yQuT2rBat}omC_a zMGp;+y$dtHR(q+mEHV7zD{YG3sM{WuYvS=@7qAa3Dd+E8ql0Ba_1*WRWbT_)aYPNy zfSk9amG^QI5r+RQF{1zr?lGVRGVEqTCF%agA)Wc>kji0vf?v;+7fkRg&FaK1MJ^7T z8}8urtya}8Mo=7%z;6o`o~C_r@zrVlkXLN?euuJfn#J}UkJ7>0qPdOhBJ0yj#!vlk zAX+mFTFDOwX>6J4oaF|(dG7V;k!1ytXvLQ2%Ms8+qpI_Oxhq*1Z?av?8f(1oe2|z@ zy3RAw+ejS7c+Q+&HYW$V`8^opjOm;QbfMhDj#UJh?aM< z8+Nv@j!K_mxcitoOIu9J=BF0OsAB7j6S!GBr8de_-DMJw#_kS+pimUHN5*GYZp^6s6*8kU@VOVlo#TJ2$a(U_eMmJ-1YM2~__g^oCIzITwmvQ!R#{7gnoY9H+VE=3_3?@Cc*; 
zvOj;1dXK;x#kebi0EP0Kcaj&O2C}HFeNK;8r!QmC<8b2tpO37>SM4PyaA_fY6e9_- z>7+i##6^JQo1cDflEC=^r?xMe%M1mi0?hSBJ`Rg*5?5XWm*}! z%)c_Os^RCr3}k>P$$gKg8nh(;9*G6X{c*~{ue`M_OCIulIzJ#mA97lve2EXQAmU5b6ziV-p-i{R*)L{IT)#@ zLf3x&SCW|1Ub<3G74cO$`nD43uBDZ{ja&5w5%Y549X0>Xj&?ejBhXW(8p9PkfxLi? z$$jkSQEERRwahh%glXxR)_K?gha~ot1!rGn#xt z)x&O6=I z=bsTHtZrsIp9~3-Dv2fMp`tD@>zll8Q|w7LCl`Eqnrqfqjv%6%1btb&FA%N9ZCq{w zfCmvX%TCaAA-q#FNYoj5-)RuK*1N^k8tAdAnPQTW6HU++F2QC=T~ddkN;`#-s~rR) zTM&ER_XK{---R}4&K{zAJ($YaJNY*9vPnp_*yOlLlZOwyPDj2=XGejHB2Q{6u?1Pp zGxwPW&}FhX5O?3Sz2D?2h8%Y%PU^lWOtIo_`TB$S=(!%hk?DkeHn(v4JA-lnAfkGY zwGC8wYxq(0Cc@idR(N=TgDvKUtlFYNZBdCr(sI7*^OxS&tobh%=4)P!gPdO#?UM0` zQ{FFLa1Ts1nVS!8qgzcp)}uF>@k>?L09I_)r<*diea*L_bVwC93y&tK`j~Nr6(0QK zjQka)^TG@?e{>Dq&Ug^d@XAve%A_3_tF*rdvH~WYjl_8S)F6UG8sdiK5ebovj{4T!$en$}0(P*)^osIDva`8&_rbHGY=oikFo&v_XW1uRNxfxy;j-N=d3Zj20fR0FUhKa3v7;40yzZ zK@g`EwRFjMmQN#WJJLmJI1@EjiO)+K`h1r*PWiGYlJUZdx1^^Qud%>>NS*;4BvpC7 zySvj#rFe9TwACR|#{}HptbMv)DV9|Ft)n$V_A0^K-9=%{Kgtf({0p~M$D<6T8hES$ zHN64AszF-SU#+DE?4bY6y4QWpfQ(;nmHjT``z%nquTnH5^XXh-29HHWEoW82QoaN~ zwP~?g-BjP{4p4{Z=YW|nJG(7!U>3ro9V^3P zSg?9iZy7mRnZn4RHhZ&-X-82 zGnN`MTw*%uV8285P+8b>a?0!T}cZ=Tn_GRKjGzfHy0nVE#VF-Sup+OlQ;hFL43hWPe_|Zds)5m(dmEMOW z9SOK7jOnSWYz6j9`Lt;Wrna0A-5$dKapPyf4if^z?NP^X8eSt8YtTIOc7$E=A+C?J zNuv#j3n=uKPJjS`nC5&ZiAHH3NUh$mkz3P)#eaEF>b9Z*o3V0GP;ML9qY-@LVz zILd44aSdpBmHz~h2B_`7;2Tl`$30pzf2jY>y82Y%K>OvKX9P(n zGS|%W+B4#+9>3i^fMe=YF66cl>d zWWC4<74Ss#6l`|wkO9CmnD^w<{Zl*dtBFD+g#YY6Wg?N;S+#r$`pl5(1tH!5hSU@G zFCCa;%!^%+{lnT1Xyl{Igzn9uY?Wx2&XR9TBZawG-V^r2X2{e5=To~~6*QP2bD!ZQtjZk~p7oJ z2CS>P`;K)60h#>5;e1|EJaT9xz?cpfOgn0>n0}h8fS?p0e_37Gj|U={HGnel0*htW zYf}|$ZJ}euw1bcwEHO(}*Z^>V5vc0u%a%;|Kgbal(a7!m`6=wsy%0iD%Dh9d0v$dz z)lTwHBI@H~jANt!5J3RC)40#B(&x3IOhz})=*f<2gV)_^D^#?S!)V_UGAU1I?!ouJ zdHU%}5QnVX&5Y&Fw4$fBd=C^}6!sU1F5BcJ&P@LUM4u*u1D4YTxsd{Y5Q303CgD>Y zn`8`f{{p%~got)YDwhSOIEakgE;w%pei*V6py;vi6DI30JRea4uv!4Ad~6>+7-cSa z0o^%w-Z~fy?C@0J>$`l(lSk(82L!#Spumfea0sLkQ1D!$-UDwS0zFhta!M-lAG>$B 
za;V8ai^Bg4Vs4p!)Vtd7k7og=LU%|U;G7X(f(ZZv{D>?JFLxeTR;e&N4}+Qgtw+Yt z5IkSEohBthEyqqAv-9Hcm*zh}c z4n7}&?Uxnku|S=E^5))opn7)?9|ADcG9*7b5xtRO3>MwJ`v;RSg z|FRo2J<|&Lzl6StnB6f1=GO3s|ES-x?~uiL`TP6#mdXwr&u%`IICKzY*tbZXz1d8- zEov9=+6D}Jg!!kre>yC9`SM}fPhKU2S%%*0g`Vbo08Uf#leTq%e^$)@mzdk50`vu> z%dv(Eg3i`7lG+>h6v1m5a1NZ9;{7E7OtAvfZw(wN`=2uoP5VD#96urL75JBE4ZW>L zXmEE9yu1x?UOqlPpdwYjXKrRTp!J1@{n$XA5s5A8BFC8IsTA{kVC0<-z0LyHqw*mC zUjj^-KqD^*ikNTr1aAGf36yx{ZXyxx&XmnDVC{>#vvp4hiu3NDi~x{xcxDK?*ynb4 zcPFQIvxedhS@>Y!(|K9qf5L>{Z=?DPK3vM}LZVOE<6M{iB68rr5E+_MSnK2S{UgE$ zL|q^}wr(xU!Gi$4NA2OK%8D=JDF&|mPU1!ax!4Q-v%Ypp5Vz(+^!Hi=V` zgq8)umBDUN!gO71g-m}N#(sddNuSFxC;kj4COgX!gy$oTb^6<=+Sq%xrCj72@?Pu~Wh@G&SmoesXlbkqXI_LV&dKrsj9-*qqF1hOH! zG;+9|!+sAca7A%xdwV+vHmR_BeOR9nu8U6~mZP>$2-+1J!Q*|3ztKsMuFyP(1(r4X zQ{i*S0yD?0P`xjLd#`H-MtD$Z+}fi^*w?&F z&tH0@S}T8B6(W=GKO<9GErWr27MIz>${U62xy}u|Sw_N5Yg5^Z{aqU;%Y{@r#;TRz zk$I;1Xe2YI9{tOFNvqGX)s;5+v$GeBI-=4WFJ+i_-W@4>n5gHL7s<{%A^jjh3|7mk zu+@BaQfHX( zA{eTJzN^I{?rMzfh=rU7T_PVmHq8|t!aYUPpmgb}BhZerc zJ;WD%-4^LDtg2`YT-I~lc9q82;Noyq+4-U>g~5I0SaA0$!4fKM_1ekIyjA0N{(VoW zUpe&8$20kD`U?4}SDxU@rM`4*=Y&M3{$tIds5KZm8VL)(N8gq3rQz+P*kP%BT)u+1EqLvQwRtqtogS=B4Z* zY5XSJS}P~y8s2wRSr<9_Y1DY>%jOSxu68_N<@4-<7Qk6|JD7n7UcM-3BJt(IvR-5{{Oz^^B)7-N!-4sOwrY_u)>5+GiiS}z@ zArkjJDy%mh?QJd%Af!YrT-h)J6PvHE+M@XEzl}d0e(H%m^U=Gqtnh1EI3v^iCD(?J zeGIS8CNo=v+SbV2m#&Q}@hTIsMDbCtd;xu=nxn}N*Ut=_=k!I~UJ12uI^l@^x-@aW z6EV9z*^M0>&usz6UebxFZYGx7i#p2nJro_cM(;cte-$+_+D}{FkP+$pUORpM=@ZDK zO(`i)5BVX)EJc>HnaOHa>dRXHLbG!q8uS{Qm9pKg8K0zFL7PyUW`<@LTj;spz_Dy+R#k2}_<)<{4p zdF)_1-L`t}3?(Qdo_>`2=(!_KFI(Th+XA{#G?K?y;mZno^v^@YfSJyI7SkFFbce23 zNl<%x-#Xb{Xe8=~aEiWdc6weoWgH^p9IeOsl39AEXuDj3VYGQLXU6u4yFWW*W5Mf+ zL{yHCaYMdaWYuJMQAkSR`UA~UcEHTP5hT@|=IJ_m{$M@}#^kg)1l662Bc)usWf{-=YQm~34OG{*!o=_($vix0To zq5=`J<41GugPm-!XD$kgl{~+8A3qb-J_SvfEi(Leb2LhE zIbGUiBg^@9md|=zUX^cU83kNWuq7_e)JgPRTlmI5`crc9qnY3HV^ zoLuGos9V(OeJ?(k5KW2i6u#HBM2fjbF~;>jvY(h~grgk4AAIU-l#TRvo^*&0UF2|6 zYtXH*=}5Q;U6QaysmE7^MrQ25g!hKw*i?SSUxsH=G{=mh@<(qa7kT%+%U>5w)W<<< 
zsXiwqPOIrY6!B5WJOQ}NXdHau0~s0ulS-PsmB>8orO%swPL;pNXTeR{Ft;V&w%Xk$ zn&G;dFh;32$7k-ucb&+JegyoLq__Bfd2 z6^0uKrM&1EN+})JK&Kh1R~^GESU7?uOHRcF4v=C}Ky$xk^g{YOlIOwq?7h_^6TMaa zUhj^jRP>vUFk}=G^D{s5IQG3;XDX%H)Bz>H6nIVA>a< zOB^;c0H(iUZf(a0G~1$HeTxlr46tfX%@E@^T(v7r->h^%z6o;6q&0`tfwh!pK-vfuZpumLeb#ZhaTTgh&^>3 zsoPh8B!y31kzk5Xg_ntvE(m-aZuV#1?d>#j0EDYnYBIv4RL_9pDaxur1RT3b99)*q zygVsfN)RC}_hwoZ23i#-dQ}m_pZ{4hv?Xs~w|Vn&9#tGga;a$w?X%WnBoM;2r0&z7 zsFf|k@x2UZr9-Uw*<1Q%mFnfc$SvF3aRSlwXOx;Eysc2pX`S~p1{~p5;L@*yK<{;b zCk%8tetSOTOzo^W-lQ`gz5|!D+G)p@|I|_D0uIw6y1{{Cz+<~9%;02gvYzyt8T5ER z_#KAqxYPu*wj>;z_Ay|ylqCujcS#ddgteNy7rH$(yWl{7GmIz(oWt=_96XB@!M~g4 zuYWHEt?tF6dF)pRaFYeFChHgPJ3FBE879a&0(u&1g6kw{%yvhZMie{(o~np(BRN82 z@W_9#)YP2fKaUcVk^Q;4WBDGzPJ0A;Y}$$7`WWa1{7+7DfeZY$3EL4kEBt@{wh2(! zs5iW)|E4e&sP5k#AzX-o5<7JoK=(X!0ms;I0CY4m$*8fyt2tu~|wI-^muA+zV z!!+|-PTMCi#SUJa-CiBX Date: Thu, 7 Dec 2017 16:33:18 +0800 Subject: [PATCH 227/275] follow comments and add limited version of dmidecode --- paddle/scripts/check_env.sh | 105 +++++++++++++++++++++++++++++++++--- 1 file changed, 97 insertions(+), 8 deletions(-) diff --git a/paddle/scripts/check_env.sh b/paddle/scripts/check_env.sh index 03fb102705..564c1c09be 100755 --- a/paddle/scripts/check_env.sh +++ b/paddle/scripts/check_env.sh @@ -12,7 +12,7 @@ ht=`lscpu |grep "per core" |awk -F':' '{print $2}'|xargs` physical_cores=$((sockets * cores_per_socket)) virtual_cores=`grep 'processor' /proc/cpuinfo | sort -u | wc -l` numa_nodes=`lscpu |grep "NUMA node(s)"|awk -F':' '{print $2}'|xargs` -echo "CPU Name : `lscpu |grep \"name\" |awk -F':' '{print $2}'|xargs`" +echo "CPU Name : `cat /proc/cpuinfo |grep -i "model name" |uniq |awk -F ':' '{print $2}'|xargs`" echo "CPU Family : `lscpu |grep \"CPU family\" |awk -F':' '{print $2}'|xargs`" echo "Socket Number : $sockets" echo "Cores Per Socket : $cores_per_socket" @@ -37,14 +37,24 @@ fi echo "-------------------------- Memory Information --------------------------" # dmidecode support start from 2.11 
+dmi_ver=`dmidecode --version|awk -F '.' '{print $1}'|xargs` +if [ $dmi_ver -lt 2 ]; then + echo "Error: dmidecode unknown or version is too old" + exit 0 +fi +if [ `dmidecode | grep -ic "Permission denied"` -ne 0 ]; then + echo "Error: need root to run dmidecode" + exit 0 +fi max_dimms=0 num_dimms_installed=0 for dimm_id in `dmidecode |grep Locator|sort -u | awk -F ':' '{print $2}'`; do - num_refered=`dmidecode |grep -c "$dimm_id"` - # the acutal dimm id should be refered only once + num_refered=`dmidecode |grep -wc "$dimm_id"` + # the actual dimm id should be refered only once if [ $num_refered -eq 1 ]; then - num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1}; {if (s==1) {a[NR]=$0}}; - /Manufacturer/ {s=0; for (i in a) print a[i]; delete a}' |grep -ic unknown` + num_unknown=`dmidecode | awk '/'$dimm_id'/ {s=1; f=0}; + /Unknown/ {f=1}; + /Manufacturer/ {if (s==1) {print f; exit 0;}};'` if [ $num_unknown -eq 0 ]; then dimms_installed="$dimms_installed \n $dimm_id" ((num_dimms_installed++)) @@ -70,9 +80,23 @@ echo "DIMMs max slots : $max_dimm_slots" if [ $max_dimms -ne $max_dimm_slots ]; then echo "Error: The max dimm slots do not match the max dimms: $max_dimms" fi -echo "Memory Size : `free -h |grep -i mem |awk -F' ' '{print $2}'|xargs`" -echo "Swap Memory Size : `free -h |grep -i swap |awk -F' ' '{print $2}'|xargs`" -echo "Total Memory Size : `free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs`" +free_ver_main=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' '{print $1}'` +free_ver_sub=`free -V|awk -F ' ' '{print $NF}'|awk -F '.' 
'{print $2}'` +if [ $free_ver_main -lt 3 ] || [ $free_ver_sub -lt 3 ]; then + mem_sz=`free |grep -i mem |awk -F' ' '{print $2}'|xargs` + swap_sz=`free |grep -i swap |awk -F' ' '{print $2}'|xargs` + total_sz=`free -t |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs` + mem_sz="`awk 'BEGIN{printf "%.1f\n",('$mem_sz'/1024/1024)}'` GB" + swap_sz="`awk 'BEGIN{printf "%.1f\n",('$swap_sz'/1024/1024)}'` GB" + total_sz="`awk 'BEGIN{printf "%.1f\n",('$total_sz'/1024/1024)}'` GB" +else + mem_sz=`free -h |grep -i mem |awk -F' ' '{print $2}'|xargs` + swap_sz=`free -h |grep -i swap |awk -F' ' '{print $2}'|xargs` + total_sz=`free -th |grep -i total |tail -n 1| awk -F' ' '{print $2}'|xargs` +fi +echo "Memory Size : $mem_sz" +echo "Swap Memory Size : $swap_sz" +echo "Total Memory Size : $total_sz" echo "Max Memory Capacity : `dmidecode |grep -i \"maximum capacity\"|sort -u|awk -F':' '{print $2}'|xargs`" # DIMMs fequency clock_speeds=`dmidecode | grep -i "Configured Clock Speed" | grep -i "Hz" |sort -u | awk -F':' '{print $2}'|xargs` @@ -165,3 +189,68 @@ done # dump all details for fully check lscpu > lscpu.dump dmidecode > dmidecode.dump + +# The expected result would be like: +# ========================= Hardware Information ========================= +# CPU Name : Intel(R) Xeon(R) Gold 6148M CPU @ 2.40GHz +# CPU Family : 6 +# Socket Number : 2 +# Cores Per Socket : 20 +# Total Physical Cores : 40 +# Total Virtual Cores : 40 +# Hyper Threading : OFF +# NUMA Nodes : 2 +# -------------------------- Memory Information -------------------------- +# Installed DIMM number : 12 +# Installed DIMMs Locator: +# CPU1_DIMM_A1 +# CPU1_DIMM_B1 +# CPU1_DIMM_C1 +# CPU1_DIMM_D1 +# CPU1_DIMM_E1 +# CPU1_DIMM_F1 +# CPU2_DIMM_A1 +# CPU2_DIMM_B1 +# CPU2_DIMM_C1 +# CPU2_DIMM_D1 +# CPU2_DIMM_E1 +# CPU2_DIMM_F1 +# Not installed DIMMs : +# CPU1_DIMM_A2 +# CPU1_DIMM_B2 +# CPU1_DIMM_C2 +# CPU1_DIMM_D2 +# CPU1_DIMM_E2 +# CPU1_DIMM_F2 +# CPU2_DIMM_A2 +# CPU2_DIMM_B2 +# CPU2_DIMM_C2 +# CPU2_DIMM_D2 +# 
CPU2_DIMM_E2 +# CPU2_DIMM_F2 +# DIMMs max slots : 24 +# Memory Size : 376G +# Swap Memory Size : 4.0G +# Total Memory Size : 380G +# Max Memory Capacity : 2304 GB +# Configed Clock Speed : 2666 MHz +# -------------------------- Turbo Information -------------------------- +# Scaling Driver : intel_pstate +# Turbo Status : ON +# CPU Max Frequency : 3.70 GHz +# CPU Min Frequency : 1.00 GHz +# CPU Freq Governor : performance +# ========================= Software Information ========================= +# BIOS Release Date : 03/10/2017 +# OS Version : CentOS Linux release 7.3.1611 (Core) +# Kernel Release Version : 3.10.0-514.el7.x86_64 +# Kernel Patch Version : #1 SMP Tue Nov 22 16:42:41 UTC 2016 +# GCC Version : 4.8.5 20150623 (Red Hat 4.8.5-11) +# CMake Version : 3.5.2 +# ------------------ Environment Variables Information ------------------- +# KMP_AFFINITY : unset +# OMP_DYNAMIC : unset +# OMP_NESTED : unset +# OMP_NUM_THREADS : unset +# MKL_NUM_THREADS : unset +# MKL_DYNAMIC : unset From 32cc0db1512bd1e7063ff0e3ae080602bb7849d2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 7 Dec 2017 17:31:38 +0800 Subject: [PATCH 228/275] check if cmake has been installed --- paddle/scripts/check_env.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/check_env.sh b/paddle/scripts/check_env.sh index 564c1c09be..af16b84ca8 100755 --- a/paddle/scripts/check_env.sh +++ b/paddle/scripts/check_env.sh @@ -150,7 +150,12 @@ echo "OS Version : `cat /etc/redhat-release`" echo "Kernel Release Version : `uname -r`" echo "Kernel Patch Version : `uname -v`" echo "GCC Version :`gcc --version | head -n 1|awk -F '\\\(GCC\\\)' '{print $2}'`" -echo "CMake Version :`cmake --version | head -n 1 | awk -F 'version' '{print $2}'`" +if command -v cmake >/dev/null 2>&1; then + cmake_ver=`cmake --version | head -n 1 | awk -F 'version' '{print $2}'` +else + cmake_ver=" Not installed" +fi +echo "CMake Version :$cmake_ver" echo "------------------ Environment 
Variables Information -------------------" kmp_affinity=`env | grep KMP_AFFINITY` omp_dynamic=`env | grep OMP_DYNAMIC` From 93a225563939c7b03fe296316e322363e7f2633f Mon Sep 17 00:00:00 2001 From: ying Date: Thu, 7 Dec 2017 17:52:13 +0800 Subject: [PATCH 229/275] add softsign into doc. --- doc/api/v2/config/activation.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst index eca3ce03bc..5317e66b64 100644 --- a/doc/api/v2/config/activation.rst +++ b/doc/api/v2/config/activation.rst @@ -99,3 +99,10 @@ STanh .. automodule:: paddle.v2.activation :members: STanh :noindex: + +SoftSign +======== + +.. automodule:: paddle.v2.activation + :members: SoftSign + :noindex: From cfe6d694dd6e3cf77cf7000721aa71a8396ea988 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 7 Dec 2017 18:11:25 +0800 Subject: [PATCH 230/275] message cxx compiler version --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index c6c82b61e8..fa233939eb 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,6 +22,7 @@ SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") include(system) project(paddle CXX C Go) +MESSAGE(STATUS "CXX compiler version:" ${CMAKE_C_COMPILER_VERSION}) find_package(Sphinx) if(NOT CMAKE_CROSSCOMPILING) From 3a0a4586a3129bdc1100276236bc1cc50227fa8f Mon Sep 17 00:00:00 2001 From: QI JUN Date: Thu, 7 Dec 2017 20:24:36 +0800 Subject: [PATCH 231/275] refine GPU memory allocation policy (#6373) * fix gpu memory allocation policy * refine codes * fix code style * follow comments --- paddle/memory/detail/system_allocator.cc | 2 +- paddle/memory/memory.cc | 16 +++++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 6b4e46f56a..b543b767e8 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -83,7 +83,7 @@ 
void* GPUAllocator::Alloc(size_t& index, size_t size) { paddle::platform::GpuMemoryUsage(available, capacity); // Reserve memory for page tables, etc. - size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); + size_t reserving = 0.05 * capacity + paddle::platform::GpuMinChunkSize(); size_t usable = available > reserving ? available - reserving : 0; // If remaining size no less than expected size, using general diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 95cfe2525e..9cafdfda75 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -64,19 +64,21 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { int gpu_num = platform::GetCUDADeviceCount(); as = new BuddyAllocator*[gpu_num]; for (int gpu = 0; gpu < gpu_num; gpu++) { - platform::SetDeviceId(gpu); - as[gpu] = new BuddyAllocator(new detail::GPUAllocator, - platform::GpuMinChunkSize(), - platform::GpuMaxChunkSize()); + as[gpu] = nullptr; } + } + platform::SetDeviceId(gpu_id); + if (!as[gpu_id]) { + as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE: each GPU device use " << FLAGS_fraction_of_gpu_memory_to_use * 100 << "% of GPU memory.\n" - << "You can set environment variable '" - << platform::kEnvFractionGpuMemoryToUse + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" << "' to change the fraction of GPU usage.\n\n"; } - platform::SetDeviceId(gpu_id); return as[gpu_id]; } From 34b9294052a2589aeac5f68bce2658c0fa6afcf7 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 7 Dec 2017 20:50:07 +0800 Subject: [PATCH 232/275] add version and path for both CXX and C compiler --- CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa233939eb..b309ff37e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,8 @@ SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") 
include(system) project(paddle CXX C Go) -MESSAGE(STATUS "CXX compiler version:" ${CMAKE_C_COMPILER_VERSION}) +message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION}) +message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION}) find_package(Sphinx) if(NOT CMAKE_CROSSCOMPILING) From 93563efa19248b0f68a428d5eebec1a5d3800d1a Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 7 Dec 2017 20:41:30 +0800 Subject: [PATCH 233/275] fix doc --- .../paddle/trainer_config_helpers/layers.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 8c5cc25d6c..afd8a7579a 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2722,14 +2722,14 @@ def img_pool_layer(input, .. math:: - w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) + w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) \\\\ h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) - ceil_mode=False: .. math:: - w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) + w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) \\\\ h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) The example usage is: @@ -2863,17 +2863,17 @@ def img_pool3d_layer(input, .. math:: - w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) + w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) \\\\ + h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) \\\\ d = 1 + int(ceil(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)) - ceil_mode=False: .. 
math:: - w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) - h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) - d = 1 + int(floor(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)) + w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) \\\\ + h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) \\\\ + d = 1 + int(floor(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)) \\\\ The example usage is: @@ -5429,12 +5429,12 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): https://arxiv.org/pdf/1312.6082v4.pdf`_ .. math:: - y_{si+j} = \max_k x_{gsi + sk + j} - g = groups - s = input.size / num_channels - 0 \le i < num_channels / groups - 0 \le j < s - 0 \le k < groups + y_{si+j} = \max_k x_{gsi + sk + j} \\\\ + g = groups \\\\ + s = input.size / num\_channels \\\\ + 0 \le i < num\_channels / groups \\\\ + 0 \le j < s \\\\ + 0 \le k < groups \\\\ The simple usage is: From 113c026d12e80dceda1592ec6f1940d0569f4000 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 7 Dec 2017 20:40:36 +0530 Subject: [PATCH 234/275] Swish activation operator (#6358) --- paddle/operators/activation_op.cc | 19 ++++++++++++ paddle/operators/activation_op.h | 30 +++++++++++++++++++ .../v2/fluid/tests/test_activation_op.py | 16 ++++++++++ 3 files changed, 65 insertions(+) diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 83262f950e..7f3118f176 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -506,6 +506,22 @@ It is recommended to use the defaults for this activation. 
} }; +class SwishOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SwishOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Swish operator"); + AddOutput("Y", "Output of Swish operator"); + AddAttr("beta", "Constant beta of swish operator").SetDefault(1.0f); + AddComment(R"DOC( +Swish Activation Operator. + +$$y = \frac{x}{1 + e^{- \beta x}}$$ + +)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -592,6 +608,9 @@ REGISTER_OP(thresholded_relu, ops::ActivationOp, ops::ThresholdedReluOpMaker, REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, hard_sigmoid_grad, ops::ActivationOpGrad); +REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, + ops::ActivationOpGrad); + #define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ REGISTER_OP_CPU_KERNEL( \ act_type, \ diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index 8cd3bfbbd3..ac0e0a3b01 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -700,6 +700,35 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { } }; +template +struct SwishFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Y y) const { + y.device(d) = x / (static_cast(1) + (static_cast(-beta) * x).exp()); + } +}; + +template +struct SwishGradFunctor : public BaseActivationFunctor { + float beta; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"beta", &beta}}; + } + + template + void operator()(Device d, X x, Y y, dY dy, dX dx) const { + auto temp1 = static_cast(1) / + (static_cast(1) + (static_cast(-beta) * x).exp()); + auto temp2 = temp1 * (static_cast(1) - (beta * y)); + dx.device(d) = dy * ((beta * y) + temp2); + } +}; + } // namespace operators } 
// namespace paddle @@ -730,4 +759,5 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor { __macro(elu, ELUFunctor, ELUGradFunctor); \ __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor); \ __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \ + __macro(swish, SwishFunctor, SwishGradFunctor); \ __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor); diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py index bd52bef260..b052374dc7 100644 --- a/python/paddle/v2/fluid/tests/test_activation_op.py +++ b/python/paddle/v2/fluid/tests/test_activation_op.py @@ -1,6 +1,7 @@ import unittest import numpy as np from op_test import OpTest +from scipy.special import expit class TestExp(OpTest): @@ -455,5 +456,20 @@ class TestHardSigmoid(OpTest): self.check_grad(['X'], 'Y', max_relative_error=0.002) +class TestSwish(OpTest): + def setUp(self): + self.op_type = "swish" + X = np.random.uniform(0.1, 1, [11, 17]).astype("float32") + self.inputs = {'X': X} + self.attrs = {'beta': 2.3} + self.outputs = {'Y': X * expit(self.attrs['beta'] * X)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.008) + + if __name__ == "__main__": unittest.main() From 3b1529d2e56660aaaefc0e60413dbb7bacb8ff71 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 8 Dec 2017 10:51:21 +0800 Subject: [PATCH 235/275] add print_operators_doc in travis ci --- paddle/scripts/travis/build_doc.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 278485f788..ff0bac6a07 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -10,6 +10,8 @@ cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON make -j `nproc` gen_proto_py make -j `nproc` paddle_python make -j `nproc` paddle_docs paddle_docs_cn +make -j `nproc` print_operators_doc +paddle/pybind/print_operators_doc > doc/en/html/operators.json # check websites for broken links # It will be failed now! From 3c84444795b71f77b26757c95a8162e56ea777b9 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 8 Dec 2017 11:30:09 +0800 Subject: [PATCH 236/275] follow comments --- .../paddle/trainer_config_helpers/layers.py | 34 ++++++++++--------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index afd8a7579a..cd61dc84af 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2722,15 +2722,15 @@ def img_pool_layer(input, .. math:: - w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) \\\\ - h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) + w = 1 + \frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} \\\\ + h = 1 + \frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} - ceil_mode=False: .. math:: - w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) \\\\ - h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) + w = 1 + \frac{floor(input\_width + 2 * padding - pool\_size)}{stride} \\\\ + h = 1 + \frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} The example usage is: @@ -2863,17 +2863,17 @@ def img_pool3d_layer(input, .. 
math:: - w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride)) \\\\ - h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) \\\\ - d = 1 + int(ceil(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)) + w = 1 + \frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} \\\\ + h = 1 + \frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} \\\\ + d = 1 + \frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} - ceil_mode=False: .. math:: - w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride)) \\\\ - h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)) \\\\ - d = 1 + int(floor(input\_depth + 2 * padding\_z - pool\_size\_z) / float(stride\_z)) \\\\ + w = 1 + \frac{floor(input\_width + 2 * padding - pool\_size)}{stride} \\\\ + h = 1 + \frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} \\\\ + d = 1 + \frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} \\\\ The example usage is: @@ -5428,13 +5428,15 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks https://arxiv.org/pdf/1312.6082v4.pdf`_ + .. 
math:: - y_{si+j} = \max_k x_{gsi + sk + j} \\\\ - g = groups \\\\ - s = input.size / num\_channels \\\\ - 0 \le i < num\_channels / groups \\\\ - 0 \le j < s \\\\ - 0 \le k < groups \\\\ + out = \max_k (in[n, k, o_c , s]) \\\\ + out_{i * s + j} = \max_k in_{ k * o_{c} * s + i * s + j} \\\\ + s = \frac{input.size}{ num\_channels} \\\\ + o_{c} =\frac{num\_channels}{groups} \\\\ + 0 \le i < o_{c} \\\\ + 0 \le j < s \\\\ + 0 \le k < groups \\\\ The simple usage is: From 36fcc95cabdd74a9508f88666b51de8c29dc753f Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 8 Dec 2017 13:11:11 +0800 Subject: [PATCH 237/275] Nmt decoder train (#6367) * init decoder_trainer * can run * fix lod * add sharelod to cross_entropy_grad_op * add avg_cost to fetch list * modify learning rate * can run * optimie code * add early exit * fix print * revert test_understand_sentiment_conv.py * add act to fc --- paddle/framework/op_desc.cc | 2 +- paddle/operators/concat_op.cc | 12 +- paddle/operators/cross_entropy_op.cc | 1 + python/paddle/v2/fluid/layers.py | 3 +- .../tests/book/test_machine_translation.py | 120 ++++++++++-------- 5 files changed, 80 insertions(+), 58 deletions(-) diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 2281d93df9..cde3f1ac2e 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -59,7 +59,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { auto *in_var = block_.FindVarRecursive(Inputs(in)[i]); auto *out_var = block_.FindVarRecursive(Outputs(out)[j]); if (in_var->GetType() != VarDesc::LOD_TENSOR) { - VLOG(3) << "input " << in << "is not LodTensor"; + VLOG(3) << "input " << in << " is not LodTensor"; return; } PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc index 6134ac78b1..cf522d6921 100644 --- a/paddle/operators/concat_op.cc +++ b/paddle/operators/concat_op.cc @@ -41,14 +41,18 @@ class ConcatOp : public 
framework::OperatorWithKernel { for (size_t j = 0; j < in_zero_dims_size; j++) { if (j == axis) { out_dims[axis] += ins[i][j]; - continue; + } else { + PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], + "Input tensors should have the same " + "elements except the specify axis."); } - PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j], - "Input tensors should have the same " - "elements except the specify axis."); } } + if (out_dims[axis] < 0) { + out_dims[axis] = -1; + } ctx->SetOutputDim("Out", out_dims); + ctx->ShareLoD("X", /*->*/ "Out"); } }; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 1e82742eaf..2b06012b69 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -95,6 +95,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { "Input(Label) should be 1."); } ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); } protected: diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index fb444f2d86..b4426bad14 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -430,7 +430,8 @@ def _create_op_func_(op_type): dtype = each.dtype elif dtype != each.dtype: raise ValueError( - "operator {0} must input same dtype".format(op_type)) + "operator {0} must input same dtype. 
{1} vs {2}".format( + op_type, dtype, each.dtype)) return dtype diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py index 5bc7e1b59d..80ffc5a544 100644 --- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py @@ -1,59 +1,62 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.dataset.conll05 as conll05 +import paddle.v2.fluid as fluid import paddle.v2.fluid.core as core import paddle.v2.fluid.framework as framework import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor, g_scope -from paddle.v2.fluid.optimizer import SGDOptimizer -import paddle.v2.fluid as fluid -import paddle.v2.fluid.layers as pd +from paddle.v2.fluid.executor import Executor dict_size = 30000 source_dict_dim = target_dict_dim = dict_size src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size) -hidden_dim = 512 -word_dim = 512 +hidden_dim = 32 +word_dim = 16 IS_SPARSE = True -batch_size = 50 +batch_size = 10 max_length = 50 topk_size = 50 trg_dic_size = 10000 -src_word_id = layers.data(name="src_word_id", shape=[1], dtype='int64') -src_embedding = layers.embedding( - input=src_word_id, - size=[dict_size, word_dim], - dtype='float32', - is_sparse=IS_SPARSE, - param_attr=fluid.ParamAttr(name='vemb')) - - -def encoder(): - - lstm_hidden0, lstm_0 = layers.dynamic_lstm( - input=src_embedding, - size=hidden_dim, - candidate_activation='sigmoid', - cell_activation='sigmoid') - - lstm_hidden1, lstm_1 = layers.dynamic_lstm( - input=src_embedding, - size=hidden_dim, - candidate_activation='sigmoid', - cell_activation='sigmoid', - is_reverse=True) - - bidirect_lstm_out = layers.concat([lstm_hidden0, lstm_hidden1], axis=0) - - return bidirect_lstm_out - - -def decoder_trainer(context): - ''' - decoder with trainer - ''' - pass +decoder_size = hidden_dim + + +def encoder_decoder(): + # encoder + 
src_word_id = layers.data( + name="src_word_id", shape=[1], dtype='int64', lod_level=1) + src_embedding = layers.embedding( + input=src_word_id, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh') + lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4) + encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last") + + # decoder + trg_language_word = layers.data( + name="target_language_word", shape=[1], dtype='int64', lod_level=1) + trg_embedding = layers.embedding( + input=trg_language_word, + size=[dict_size, word_dim], + dtype='float32', + is_sparse=IS_SPARSE, + param_attr=fluid.ParamAttr(name='vemb')) + + rnn = fluid.layers.DynamicRNN() + with rnn.block(): + current_word = rnn.step_input(trg_embedding) + mem = rnn.memory(init=encoder_out) + fc1 = fluid.layers.fc(input=[current_word, mem], + size=decoder_size, + act='tanh') + out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax') + rnn.update_memory(mem, fc1) + rnn.output(out) + + return rnn() def to_lodtensor(data, place): @@ -72,13 +75,18 @@ def to_lodtensor(data, place): def main(): - encoder_out = encoder() - # TODO(jacquesqiao) call here - decoder_trainer(encoder_out) + rnn_out = encoder_decoder() + label = layers.data( + name="target_language_next_word", shape=[1], dtype='int64', lod_level=1) + cost = layers.cross_entropy(input=rnn_out, label=label) + avg_cost = fluid.layers.mean(x=cost) + + optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4) + optimizer.minimize(avg_cost) train_data = paddle.batch( paddle.reader.shuffle( - paddle.dataset.wmt14.train(8000), buf_size=1000), + paddle.dataset.wmt14.train(dict_size), buf_size=1000), batch_size=batch_size) place = core.CPUPlace() @@ -88,15 +96,23 @@ def main(): batch_id = 0 for pass_id in xrange(2): - print 'pass_id', pass_id for data in train_data(): - print 'batch', 
batch_id - batch_id += 1 - if batch_id > 10: break word_data = to_lodtensor(map(lambda x: x[0], data), place) + trg_word = to_lodtensor(map(lambda x: x[1], data), place) + trg_word_next = to_lodtensor(map(lambda x: x[2], data), place) outs = exe.run(framework.default_main_program(), - feed={'src_word_id': word_data, }, - fetch_list=[encoder_out]) + feed={ + 'src_word_id': word_data, + 'target_language_word': trg_word, + 'target_language_next_word': trg_word_next + }, + fetch_list=[avg_cost]) + avg_cost_val = np.array(outs[0]) + print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) + + " avg_cost=" + str(avg_cost_val)) + if batch_id > 3: + exit(0) + batch_id += 1 if __name__ == '__main__': From 00b64f66794a7b92708147a49cc9ca53f74a7397 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Fri, 8 Dec 2017 13:11:50 +0800 Subject: [PATCH 238/275] Add a c-api interface to initialize the thread environment of Paddle (#5773) * Fix bug in MergeModel.cpp. * Add a c-api inferface to initilize the thread environment of Paddle and add a GPU example. * Add some note for paddle_init_thread and move the inplementation of paddle_error_string into a .cpp file. * Add some comments. 
--- paddle/capi/Main.cpp | 7 ++ paddle/capi/Matrix.cpp | 2 +- paddle/capi/error.cpp | 32 +++++ paddle/capi/error.h | 7 ++ .../multi_thread/CMakeLists.txt | 29 ++++- .../model_inference/multi_thread/main_gpu.c | 113 ++++++++++++++++++ paddle/capi/main.h | 7 ++ 7 files changed, 192 insertions(+), 5 deletions(-) create mode 100644 paddle/capi/error.cpp create mode 100644 paddle/capi/examples/model_inference/multi_thread/main_gpu.c diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp index bb8249a551..c038789340 100644 --- a/paddle/capi/Main.cpp +++ b/paddle/capi/Main.cpp @@ -43,4 +43,11 @@ paddle_error paddle_init(int argc, char** argv) { isInit = true; return kPD_NO_ERROR; } + +paddle_error paddle_init_thread() { + if (FLAGS_use_gpu) { + hl_init(FLAGS_gpu_id); + } + return kPD_NO_ERROR; +} } diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp index 30f3a766f0..cbacd1fb71 100644 --- a/paddle/capi/Matrix.cpp +++ b/paddle/capi/Matrix.cpp @@ -40,7 +40,7 @@ paddle_error paddle_matrix_destroy(paddle_matrix mat) { paddle_error paddle_matrix_set_row(paddle_matrix mat, uint64_t rowID, paddle_real* rowArray) { - if (mat == nullptr) return kPD_NULLPTR; + if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE; diff --git a/paddle/capi/error.cpp b/paddle/capi/error.cpp new file mode 100644 index 0000000000..169b65f921 --- /dev/null +++ b/paddle/capi/error.cpp @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "error.h" + +const char* paddle_error_string(paddle_error err) { + switch (err) { + case kPD_NULLPTR: + return "nullptr error"; + case kPD_OUT_OF_RANGE: + return "out of range error"; + case kPD_PROTOBUF_ERROR: + return "protobuf error"; + case kPD_NOT_SUPPORTED: + return "not supported error"; + case kPD_UNDEFINED_ERROR: + return "undefined error"; + default: + return ""; + } +} diff --git a/paddle/capi/error.h b/paddle/capi/error.h index 44d8c2040d..9d9d0ed63a 100644 --- a/paddle/capi/error.h +++ b/paddle/capi/error.h @@ -15,6 +15,8 @@ limitations under the License. */ #ifndef __PADDLE_CAPI_ERROR_H__ #define __PADDLE_CAPI_ERROR_H__ +#include "config.h" + /** * Error Type for Paddle API. */ @@ -27,4 +29,9 @@ typedef enum { kPD_UNDEFINED_ERROR = -1, } paddle_error; +/** + * Error string for Paddle API. + */ +PD_API const char* paddle_error_string(paddle_error err); + #endif diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt index 98e411ddc0..2fc8debdde 100644 --- a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt +++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt @@ -1,8 +1,29 @@ project(multi_thread) cmake_minimum_required(VERSION 2.8) -aux_source_directory(. 
SRC_LIST) -add_executable(${PROJECT_NAME} ${SRC_LIST}) + find_package (Threads) + +if(NOT PADDLE_ROOT) + set(PADDLE_ROOT $ENV{PADDLE_ROOT} CACHE PATH "Paddle Path") +endif() +if(PADDLE_ROOT) + include_directories(${PADDLE_ROOT}/include) + link_directories(${PADDLE_ROOT}/lib) +endif() + +set(CPU_SRCS main.c) +add_executable(${PROJECT_NAME} ${CPU_SRCS}) set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99) -target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared - ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${PROJECT_NAME} + -lpaddle_capi_shared + ${CMAKE_THREAD_LIBS_INIT}) + +find_package(CUDA QUIET) +if(CUDA_FOUND) + set(GPU_SRCS main_gpu.c) + cuda_add_executable(${PROJECT_NAME}_gpu ${GPU_SRCS}) + set_property(TARGET ${PROJECT_NAME}_gpu PROPERTY C_STANDARD 99) + target_link_libraries(${PROJECT_NAME}_gpu + -lpaddle_capi_shared + ${CMAKE_THREAD_LIBS_INIT}) +endif(CUDA_FOUND) diff --git a/paddle/capi/examples/model_inference/multi_thread/main_gpu.c b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c new file mode 100644 index 0000000000..6fd376e0d1 --- /dev/null +++ b/paddle/capi/examples/model_inference/multi_thread/main_gpu.c @@ -0,0 +1,113 @@ +#include +#include +#include +#include "../common/common.h" + +#define CONFIG_BIN "./trainer_config.bin" +#define NUM_THREAD 4 +#define NUM_ITER 1000 + +pthread_mutex_t mutex; + +/* + * @brief It is an simple inference example that runs multi-threads on a GPU. + * Each thread holds it own local gradient_machine but shares the same + * parameters. + * If you want to run on different GPUs, you need to launch + * multi-processes or set trainer_count > 1. + */ +void* thread_main(void* gm_ptr) { + // Initialize the thread environment of Paddle. + CHECK(paddle_init_thread()); + + paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr); + // Create input arguments. + paddle_arguments in_args = paddle_arguments_create_none(); + // Create input matrix. 
+ paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1, + /* size */ 784, + /* useGPU */ true); + // Create output arguments. + paddle_arguments out_args = paddle_arguments_create_none(); + // Create output matrix. + paddle_matrix prob = paddle_matrix_create_none(); + + // CPU buffer to cache the input and output. + paddle_real* cpu_input = (paddle_real*)malloc(784 * sizeof(paddle_real)); + paddle_real* cpu_output = (paddle_real*)malloc(10 * sizeof(paddle_real)); + for (int iter = 0; iter < NUM_ITER; ++iter) { + // There is only one input layer of this network. + CHECK(paddle_arguments_resize(in_args, 1)); + CHECK(paddle_arguments_set_value(in_args, 0, mat)); + + for (int i = 0; i < 784; ++i) { + cpu_input[i] = rand() / ((float)RAND_MAX); + } + CHECK(paddle_matrix_set_value(mat, cpu_input)); + + CHECK(paddle_gradient_machine_forward(machine, + in_args, + out_args, + /* isTrain */ false)); + + CHECK(paddle_arguments_get_value(out_args, 0, prob)); + CHECK(paddle_matrix_get_value(prob, cpu_output)); + + pthread_mutex_lock(&mutex); + printf("Prob: "); + for (int i = 0; i < 10; ++i) { + printf("%.2f ", cpu_output[i]); + } + printf("\n"); + pthread_mutex_unlock(&mutex); + } + + CHECK(paddle_matrix_destroy(prob)); + CHECK(paddle_arguments_destroy(out_args)); + CHECK(paddle_matrix_destroy(mat)); + CHECK(paddle_arguments_destroy(in_args)); + CHECK(paddle_gradient_machine_destroy(machine)); + + free(cpu_input); + free(cpu_output); + + return NULL; +} + +int main() { + // Initalize Paddle + char* argv[] = {"--use_gpu=True"}; + CHECK(paddle_init(1, (char**)argv)); + + // Reading config binary file. It is generated by `convert_protobin.sh` + long size; + void* buf = read_config(CONFIG_BIN, &size); + + // Create a gradient machine for inference. + paddle_gradient_machine machine; + CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size)); + CHECK(paddle_gradient_machine_randomize_param(machine)); + + // Loading parameter. 
Uncomment the following line and change the directory. + // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, + // "./some_where_to_params")); + srand(time(0)); + pthread_mutex_init(&mutex, NULL); + + pthread_t threads[NUM_THREAD]; + + for (int i = 0; i < NUM_THREAD; ++i) { + paddle_gradient_machine thread_local_machine; + CHECK(paddle_gradient_machine_create_shared_param( + machine, buf, size, &thread_local_machine)); + pthread_create(&threads[i], NULL, thread_main, thread_local_machine); + } + + for (int i = 0; i < NUM_THREAD; ++i) { + pthread_join(threads[i], NULL); + } + + pthread_mutex_destroy(&mutex); + + return 0; +} diff --git a/paddle/capi/main.h b/paddle/capi/main.h index 893ebcbd58..99c4e8428d 100644 --- a/paddle/capi/main.h +++ b/paddle/capi/main.h @@ -26,6 +26,13 @@ extern "C" { */ PD_API paddle_error paddle_init(int argc, char** argv); +/** + * Initialize the thread environment of Paddle. + * @note it is requisite for GPU runs but optional for CPU runs. + * For GPU runs, all threads will run on the same GPU devices. + */ +PD_API paddle_error paddle_init_thread(); + #ifdef __cplusplus } #endif From e1247d8015c43ad9dd6254650d8238fc0cefde8f Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 8 Dec 2017 13:36:04 +0800 Subject: [PATCH 239/275] Fix compile error in android. 
--- paddle/math/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index 215bac1271..dcd2a34583 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -34,4 +34,4 @@ add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) -cc_test(test_float16 SRCS test_float16.cpp) +add_simple_unittest(test_float16) From 01e28d112562efdb20ab7d916e23433282e538da Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 8 Dec 2017 13:47:38 +0800 Subject: [PATCH 240/275] change release version 0.11.0 --- python/setup.py.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/setup.py.in b/python/setup.py.in index d59a6a4780..8e856ba460 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -5,7 +5,7 @@ class BinaryDistribution(Distribution): return True MAJOR = 0 -MINOR = 10 +MINOR = 11 PATCH = 0 RC = 0 ISTAGED = False From 361126f26bb4372449b1ac62ee5d04fbd3521088 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 8 Dec 2017 13:51:21 +0800 Subject: [PATCH 241/275] python package name gpu --- python/CMakeLists.txt | 6 ++++++ python/setup.py.in | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c8632295a2..8e2333f976 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -33,6 +33,12 @@ if(WITH_MKLDNN) list(APPEND MKL_DEPENDS mkldnn) endif() +if(WITH_GPU) + SET(PACKAGE_NAME "paddlepaddle_gpu") +else() + SET(PACKAGE_NAME "paddlepaddle") +endif() + configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) diff --git a/python/setup.py.in b/python/setup.py.in index 8e856ba460..9ccb4dc176 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -89,7 +89,7 @@ paddle_rt_libs = ['${WARPCTC_LIBRARIES}'] if '${MKL_SHARED_LIBS}'!= 
'': paddle_rt_libs += '${MKL_SHARED_LIBS}'.split(';') -setup(name='paddlepaddle', +setup(name='${PACKAGE_NAME}', version='${PADDLE_VERSION}', description='Parallel Distributed Deep Learning', install_requires=setup_requires, From ac18580bb94edada7af6b9353eab8147d4a7e8a5 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 8 Dec 2017 13:52:58 +0800 Subject: [PATCH 242/275] update --- python/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8e2333f976..6f589e9169 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -34,7 +34,7 @@ if(WITH_MKLDNN) endif() if(WITH_GPU) - SET(PACKAGE_NAME "paddlepaddle_gpu") + SET(PACKAGE_NAME "paddlepaddle-gpu") else() SET(PACKAGE_NAME "paddlepaddle") endif() From 1d301731ac9aeb8d42d77815525e5b7b29b46f92 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 8 Dec 2017 15:22:35 +0800 Subject: [PATCH 243/275] refine the gen_docs in build.sh --- paddle/scripts/docker/build.sh | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 3c6ec6faba..e43b9c218a 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -113,7 +113,10 @@ EOF -DWITH_SWIG_PY=ON \ -DWITH_STYLE_CHECK=OFF make -j `nproc` gen_proto_py + make -j `nproc` paddle_python make -j `nproc` paddle_docs paddle_docs_cn + make -j `nproc` print_operators_doc + paddle/pybind/print_operators_doc > doc/en/html/operators.json popd fi @@ -185,14 +188,6 @@ EOF ${DOCKERFILE_GPU_ENV} ADD go/cmd/pserver/pserver /usr/bin/ ADD go/cmd/master/master /usr/bin/ -EOF - - if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then - cat >> /paddle/build/Dockerfile <> /paddle/build/Dockerfile < Date: Fri, 8 Dec 2017 16:20:09 +0800 Subject: [PATCH 244/275] recv_op use serialized program --- paddle/operators/recv_op.cc | 11 +++++++---- paddle/operators/send_recv_op_test.cc | 2 +- 2 files changed, 8 insertions(+), 5 
deletions(-) diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index c69e416e10..45222f6b76 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -72,8 +72,10 @@ class RecvOp : public framework::OperatorBase { // FIXME(typhoonzero): do not copy framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); - auto *block = Attr("OptimizeBlock"); - auto *program = block->Program(); + std::string program_str = Attr("OptimizeProgram"); + framework::Program program_desc; + program_desc.ParseFromString(program_str); + framework::ProgramDescBind program(program_desc); framework::Executor executor(dev_ctx); // Run sub graph to get optimized tensor executor.Run(*program, &recv_scope, block->ID(), @@ -108,8 +110,9 @@ This operator will recv tensor from send_op "IP address to listen on.") .SetDefault("127.0.0.1:6164") .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); - AddAttr("OptimizeBlock", "type BlockDescBind*", - "optimize network run in server"); + AddAttr( + "OptimizeProgram", "type string", + "Serialized ProgramDesc string for recv to run."); } }; diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index ac03eb3752..c35dc8fa50 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -85,7 +85,7 @@ void StartServerNet() { paddle::framework::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); - attrs.insert({"OptimizeBlock", block}); + attrs.insert({"OptimizeProgram", program.Proto()->SerializeToString()}); recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); From 986ca03ce24f6d84eb9cfaef64b59fda2298823b Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 8 Dec 2017 19:45:15 +0800 Subject: [PATCH 245/275] update --- paddle/operators/recv_op.cc | 9 ++++----- paddle/operators/send_recv_op_test.cc | 5 ++++- 
2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 45222f6b76..eed482c1b4 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -73,12 +73,12 @@ class RecvOp : public framework::OperatorBase { framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); std::string program_str = Attr("OptimizeProgram"); - framework::Program program_desc; + framework::ProgramDesc program_desc; program_desc.ParseFromString(program_str); framework::ProgramDescBind program(program_desc); framework::Executor executor(dev_ctx); // Run sub graph to get optimized tensor - executor.Run(*program, &recv_scope, block->ID(), + executor.Run(program, &recv_scope, 0, /*global_block*/ false /*create_local_scope*/); auto *out_var = recv_scope.FindVar("Out"); @@ -110,9 +110,8 @@ This operator will recv tensor from send_op "IP address to listen on.") .SetDefault("127.0.0.1:6164") .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); - AddAttr( - "OptimizeProgram", "type string", - "Serialized ProgramDesc string for recv to run."); + AddAttr("OptimizeProgram", "type string", + "Serialized ProgramDesc string for recv to run."); } }; diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index c35dc8fa50..3e2e2051af 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -85,7 +85,10 @@ void StartServerNet() { paddle::framework::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); - attrs.insert({"OptimizeProgram", program.Proto()->SerializeToString()}); + std::string program_proto; + PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto)); + + attrs.insert({"OptimizeProgram", program_proto}); recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); From 
3a222a4dcf4e3bf05c2c24bac76e9551144e4fcb Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sat, 9 Dec 2017 12:39:19 +0800 Subject: [PATCH 246/275] add release note --- RELEASE.cn.md | 36 ++++++++++++++++++++++++++ RELEASE.md | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) diff --git a/RELEASE.cn.md b/RELEASE.cn.md index 5deaf230a8..a6531061af 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,3 +1,39 @@ +# Release v0.11.0 + +## Fluid Python API + +- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将提升和优化Executor成为一个调试器,就像GDB。并可能提供一些编译器,这个编译器会读取一个上文所描述的应用然后编译成一个等价的 +源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。 + + +## 新特点 + +* 发布 `Fluid` API。 +* 增加了用于模型预测的C-API。 +* 用Fluid API实现了一个简单的GAN的例子。 +* 增加了关于性能调优的文档。 +* 为`paddle.v2.dataset`下载数据集提供了重试机制. +* C++中使用protobuf-lite替换protobuf减少了二进制的大小。 +* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment). +* 基于Bazel API利用cmake实现了一个的新的构建系统函数库。 +* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库. 
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。 + - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet + - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。 +* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign) +* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod) +* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance) +* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq) +* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score) +* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice) +* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* 增加移动端友好的网页 + # v0.10.0版本 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 diff --git a/RELEASE.md b/RELEASE.md index 146f7afa7d..d6aaa341a2 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,75 @@ +# Release v0.11.0 + +## Fluid Python API + +- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is + designed to allow users to program like PyTorch and TensorFlow Eager Execution. + In these systems, there is no longer the concept *model* and applications + do not include a symbolic description of a graph of operators nor a sequence + of layers. 
Instead, applications look exactly like a usual program that + describes a process of training or inference. The difference between + Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's + control-flow, `if-then-else` nor `for`. Instead, Fluid provides its + C++ implementations and their Python binding using the `with` statement. For an example + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- In 0.11.0, we provides a C++ class `Executor` to run a Fluid program. +Executor works like an interpreter. In future version, we will improve +`Executor` into a debugger like GDB, and we might provide some compilers, +which, for example, takes an application like the above one, and outputs +an equivalent C++ source program, which can be compiled using +[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) +to generate binaries that use CUDA, or using +[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries +that make full use of Intel CPUs. + +## New Features + +* Release `Fluid` API. +* Add C-API for model inference +* Use fluid API to create a simple GAN demo. +* Add develop guide about performance tunning. +* Add retry when download `paddle.v2.dataset`. +* Linking protobuf-lite not protobuf in C++. Reduce the binary size. +* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released. +* A new style cmake functions for Paddle. It is based on Bazel API. +* Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`. 
+* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - Complete 11 MKL-DNN layers: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN. + - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet + - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML. +* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign). +* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod). +* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance). +* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq). +* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score). +* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice). +* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* Add mobile friendly webpages. + +## Improvements + +* Build and install using a single `whl` package. +* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标). +* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices. +* Remove buggy BarrierStat. +* Clean and remove unused functions in paddle::Parameter. +* Remove ProtoDataProvider. +* Huber loss supports both regression and classification. +* Add the `stride` parameter for sequence pooling layers. 
+* Enable v2 API use cudnn batch normalization automatically. +* The BN layer's parameter can be shared by fixing the parameter name. +* Support variable-dimension input feature for 2D convolution operation. +* Refine cmake about CUDA to automatically detect GPU architecture. +* Improved website navigation. + +## Bug Fixes + +* Fix bug in ROI pooling. cc9a761 +* Fix AUC is zero when label is dense vector. #5274 +* Fix bug in WarpCTC layer. + # Release v0.10.0 We are glad to release version 0.10.0. In this version, we are happy to release the new From 5d4f9fb32de4122194e3e343022a894f0aa2ad4e Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sat, 9 Dec 2017 12:57:12 +0800 Subject: [PATCH 247/275] add some content --- RELEASE.cn.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/RELEASE.cn.md b/RELEASE.cn.md index a6531061af..df273cf7b7 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,4 +1,4 @@ -# Release v0.11.0 +# v0.11.0版本 ## Fluid Python API @@ -34,6 +34,29 @@ * 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) * 增加移动端友好的网页 +## 改进 + +* 使用一个Python`whl`包即可安装. +* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。 +* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。 +* 删除了有一些bug的BarrierStat。 +* 清理和删除了paddle::Parameter中未使用的函数。 +* 删除了ProtoDataProvider。 +* Huber loss同时支持回归和分类。 +* 为sequence pooling 层增加`stride`参数。 +* v2 API自动使用cudnn batch normalization。 +* 可以使用一个固定的参数名共享BN层的参数。 +* 2D convolution operation支持variable-dimension input特性。 +* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。 +* 优化网页导航。 + +## 错误修复 + +* 修复ROI pooling的Bug. cc9a761 +* 修复当label是dense vector时AUC变成0的问题. #5274 +* 修复WarpCTC 层的Bug. 
+ + # v0.10.0版本 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 From acab5e656a181efd2f8fe92e8c01e193dc098e78 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 10 Dec 2017 12:41:10 +0800 Subject: [PATCH 248/275] update v0.11.0 release note --- RELEASE.cn.md | 4 ++-- RELEASE.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/RELEASE.cn.md b/RELEASE.cn.md index df273cf7b7..494c59730d 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,6 +1,6 @@ # v0.11.0版本 -## Fluid Python API +## PaddlePaddle Fluid - PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: @@ -12,7 +12,7 @@ ## 新特点 -* 发布 `Fluid` API。 +* 发布 `PaddlePaddle Fluid`。 * 增加了用于模型预测的C-API。 * 用Fluid API实现了一个简单的GAN的例子。 * 增加了关于性能调优的文档。 diff --git a/RELEASE.md b/RELEASE.md index d6aaa341a2..7c3c04ef9f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,6 @@ # Release v0.11.0 -## Fluid Python API +## PaddlePaddle Fluid - Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is designed to allow users to program like PyTorch and TensorFlow Eager Execution. From 2e65df17295750bc73f02cf421272918ee97cb98 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 10 Dec 2017 12:43:03 +0800 Subject: [PATCH 249/275] change Fluid description --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index 7c3c04ef9f..5a62c95513 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -26,7 +26,7 @@ that make full use of Intel CPUs. ## New Features -* Release `Fluid` API. +* Release `PaddlePaddle Fluid`. * Add C-API for model inference * Use fluid API to create a simple GAN demo. * Add develop guide about performance tunning. 
From 578ad6d23251c0fc08cbf49aa1d6bf9daae55a88 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 11 Dec 2017 11:21:47 +0800 Subject: [PATCH 250/275] Use PADDLE_WITH_NATIVE_FP16 for float16_t. --- paddle/math/float16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index f805cad08b..76ad3a0123 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -101,7 +101,7 @@ public: half tmp = __float2half(val); x = *reinterpret_cast(&tmp); -#elif defined(PADDLE_NEON) +#elif defined(PADDLE_WITH_NATIVE_FP16) float32x4_t tmp = vld1q_dup_f32(&val); float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); x = *reinterpret_cast(&res); @@ -252,7 +252,7 @@ public: half tmp = *reinterpret_cast(this); return __half2float(tmp); -#elif defined(PADDLE_NEON) +#elif defined(PADDLE_WITH_NATIVE_FP16) float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); return vgetq_lane_f32(vcvt_f32_f16(res), 0); From 3ba75c7c8225cfeb001b2d85b6dc062d08e9b630 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 11 Dec 2017 14:28:44 +0800 Subject: [PATCH 251/275] update inference benchmark data --- benchmark/IntelOptimizedPaddle.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index c275aeb5cb..9c884044e6 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -70,26 +70,26 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz | BatchSize | 1 | 2 | 4 | 8 | 16 | |-----------|-------|-------|-------|-------|-------| -| OpenBLAS | 0.36 | 0.48 | 0.56 | 0.50 | 0.43 | -| MKLML | 5.41 | 9.52 | 14.71 | 20.46 | 29.35 | -| MKL-DNN | 65.52 | 89.94 | 83.92 | 94.77 | 95.78 | +| OpenBLAS | 1.07 | 1.08 | 1.06 | 0.88 | 0.65 | +| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 | +| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 | - ResNet-50 | BatchSize | 1 | 2 | 4 | 8 | 16 | 
|-----------|-------|--------|--------|--------|--------| -| OpenBLAS | 0.29 | 0.43 | 0.71 | 0.85 | 0.71 | -| MKLML | 6.26 | 11.88 | 21.37 | 39.67 | 59.01 | -| MKL-DNN | 90.27 | 134.03 | 136.03 | 153.66 | 211.22 | +| OpenBLAS | 3.35 | 3.19 | 3.09 | 2.55 | 1.96 | +| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 | +| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 | -- GoogLeNet +- GoogLeNet | BatchSize | 1 | 2 | 4 | 8 | 16 | |-----------|--------|--------|--------|--------|--------| -| OpenBLAS | 12.47 | 12.36 | 12.25 | 12.13 | 12.08 | -| MKLML | 22.50 | 43.90 | 81.22 | 132.92 | 199.69 | -| MKL-DNN | 221.69 | 341.33 | 428.09 | 528.24 | 624.18 | +| OpenBLAS | 12.04 | 11.31 | 10.00 | 9.07 | 4.34 | +| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 | +| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 | ### Laptop From 95924686096556d959e67b294e146153ac3b0dfb Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Mon, 11 Dec 2017 16:12:36 +0800 Subject: [PATCH 252/275] Fix gcc4.9 (#6442) * Fix compiling error of gcc4.9. * Refine the check of cxx compiler flags in api/CMakeLists.txt. 
--- paddle/api/CMakeLists.txt | 12 +++- paddle/framework/backward.cc | 18 +++--- paddle/framework/backward_test.cc | 70 ++++++++++++++---------- paddle/framework/op_desc.cc | 4 +- paddle/framework/operator_test.cc | 4 +- paddle/framework/prune_test.cc | 44 ++++++++++----- paddle/operators/conditional_block_op.cc | 6 +- paddle/operators/net_op.h | 5 +- paddle/operators/net_op_test.cc | 16 +++--- paddle/operators/recurrent_op.cc | 5 +- paddle/operators/while_op.cc | 5 +- 11 files changed, 118 insertions(+), 71 deletions(-) diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index d6b8464100..cf84568ecd 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -25,8 +25,18 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) +SET(SWIG_NEED_FLAGS + -ftls-model=global-dynamic + -Wno-parentheses-equality + -Wno-self-assign + -Wno-maybe-uninitialized + -Wno-missing-field-initializers) + FOREACH(flag ${SWIG_NEED_FLAGS}) + safe_set_cxxflag(SWIG_CXX_FLAGS ${flag}) +ENDFOREACH() + SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}") SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS paddle_parameter diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 7294ba1a9c..a17036c652 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -190,8 +190,9 @@ static std::unique_ptr BackwardRecursive( // collect all the offset for each alias, // insert a sum operator to add all aliases to output insert_position.push_back( - {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, - {{"Out", {name}}}, {})}); + {dup_op.back(), + OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}}, + AttributeMap{})}); } // make 
sure the inserted `sum` ops follow the BFS order. @@ -216,7 +217,8 @@ static std::unique_ptr BackwardRecursive( // If part of input gradient of that operator is not calculated, fill // zero variables to that input gradient. net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, - {{"Y", {grad_input}}}, {})); + {{"Y", {grad_input}}}, + AttributeMap{})); } return false; }); @@ -392,8 +394,9 @@ std::vector> MakeOpGrad( 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); std::string new_name = prefix + kZeroVarSuffix; desc->Rename(in_name, new_name); - std::unique_ptr fill_zeros_op(new OpDescBind( - "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {})); + std::unique_ptr fill_zeros_op( + new OpDescBind("fill_zeros_like", {{"X", {prefix}}}, + {{"Y", {new_name}}}, AttributeMap{})); pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); } } @@ -483,8 +486,9 @@ std::vector> MakeBlockBackward( sum_op_inputs.emplace_back(new_name); next_g_name = sum_op_inputs.back(); } - std::unique_ptr sum_op(new OpDescBind( - "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); + std::unique_ptr sum_op( + new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, + AttributeMap{})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); } } diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index 2b858f5ea0..9fe49881d5 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -106,15 +106,15 @@ class FcOp : public operators::NetOp { FcOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs) : NetOp(type, inputs, outputs, attrs) { - AppendOp(OpRegistry::CreateOp("mul", - {{"X", {Input("X")}}, {"Y", {Input("W")}}}, - {{"Out", {Output("mul_result")}}}, {})); + AppendOp(OpRegistry::CreateOp( + "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}}, + {{"Out", {Output("mul_result")}}}, AttributeMap{})); auto 
input_b = Inputs("b"); std::string before_act = "mul_result"; if (input_b.size() != 0) { AppendOp(OpRegistry::CreateOp( "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, - {{"Out", {Output("add_result")}}}, {})); + {{"Out", {Output("add_result")}}}, AttributeMap{})); before_act = "add_result"; } else { auto out_varname = Output("add_result"); @@ -124,7 +124,7 @@ class FcOp : public operators::NetOp { } AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, - {{"Out", {Output("Out")}}}, {})); + {{"Out", {Output("Out")}}}, AttributeMap{})); CompleteAddOp(false); } }; @@ -278,8 +278,9 @@ REGISTER_OPERATOR(scale, f::NoneOp); REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel); TEST(Backward, simple_op_not_need_grad) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); ASSERT_NE(fwd, nullptr); auto gop = f::Backward(*fwd, {"x"}); ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName); @@ -296,9 +297,10 @@ TEST(Backward, net_fc_backward_normal) { {{"mul_result", {"mul_res"}}, {"add_result", {"add_re"}}, {"Out", {"out"}}}, - {}); + f::AttributeMap{}); ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = f::Backward(*fwd, {}); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); ASSERT_TRUE(gop->IsNetOp()); auto net = static_cast(gop.get()); @@ -322,9 +324,10 @@ TEST(Backward, net_fc_backward_not_have_b) { {{"mul_result", {"mul_res"}}, {"add_result", {"add_res"}}, {"Out", {"tmp"}}}, - {}); + f::AttributeMap{}); ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = f::Backward(*fwd, {}); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); ASSERT_TRUE(gop->IsNetOp()); auto net = static_cast(gop.get()); @@ -346,13 +349,13 @@ TEST(Backward, net_input_of_network_not_need_grad) { {{"mul_result", {"mul_tmp_0"}}, {"add_result", {"add_tmp_0"}}, 
{"Out", {"hidden0"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, {{"mul_result", {"mul_tmp_1"}}, {"add_result", {"add_tmp_1"}}, {"Out", {"hidden1"}}}, - {})); + f::AttributeMap{})); net.CompleteAddOp(); auto bwd = Backward(net, {"x"}); // x@GRAD is not need. ASSERT_TRUE(bwd->IsNetOp()); @@ -381,12 +384,13 @@ TEST(Backward, net_input_of_network_not_need_grad) { TEST(Backward, net_shared_weight) { ops::NetOp net; net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, - {{"Out", {"out"}}}, {})); + {{"Out", {"out"}}}, f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, - {{"Out", {"FinalOut"}}}, {})); + {{"Out", {"FinalOut"}}}, + f::AttributeMap{})); net.CompleteAddOp(); - auto bwd = f::Backward(net, {}); + auto bwd = f::Backward(net, std::unordered_set{}); ASSERT_TRUE(bwd->IsNetOp()); auto bwd_net = static_cast(bwd.get()); ASSERT_EQ(3UL, bwd_net->ops_.size()); @@ -394,8 +398,9 @@ TEST(Backward, net_shared_weight) { } TEST(Backward, op_all_input_are_not_need) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"x", "b"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -403,8 +408,9 @@ TEST(Backward, op_all_input_are_not_need) { } TEST(Backward, op_all_output_are_not_need) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"out"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -412,8 +418,9 @@ TEST(Backward, 
op_all_output_are_not_need) { } TEST(Backward, op_part_of_output_are_not_need) { - auto fwd = f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, - {{"y", {"Y"}}, {"z", {"Z"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, + {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"Z"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -437,7 +444,7 @@ TEST(Backward, op_part_of_output_are_not_need) { TEST(Backward, op_part_of_input_are_not_need) { auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}}, - {{"Out", {"out"}}}, {}); + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"a"}); auto &grad_mul = *backward; ASSERT_EQ(grad_mul.Type(), "mul_grad"); @@ -458,19 +465,19 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { {{"mul_result", {"mul_out1"}}, {"add_result", {"add_out1"}}, {"Out", {"out1"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, {{"mul_result", {"mul_out2"}}, {"add_result", {"tmp_out2"}}, {"Out", {"out2"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, {{"mul_result", {"mul_out3"}}, {"add_result", {"tmp_out3"}}, {"Out", {"out3"}}}, - {})); + f::AttributeMap{})); net.CompleteAddOp(); auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); @@ -509,7 +516,8 @@ TEST(Backward, simple_single_op) { auto target = f::VarDescBind("out"); target.SetShape({1}); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); f::OpDescBind *fill_op = block->AllOps()[1]; @@ -546,7 +554,7 @@ TEST(Backward, default_attribute) { auto target = f::VarDescBind("out"); target.SetShape({1}); - AppendBackward(program, target, {}); + 
AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); @@ -585,7 +593,8 @@ TEST(Backward, simple_mult_op) { auto target = f::VarDescBind("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 6UL + 1); f::OpDescBind *fill_op = block->AllOps()[forward_len]; @@ -817,7 +826,8 @@ TEST(Backward, shared_var) { auto target = f::VarDescBind("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 8UL); f::OpDescBind *fill_op = block->AllOps()[forward_len]; diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index cde3f1ac2e..7ba1e3e4e3 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -316,8 +316,8 @@ static void InitInferShapeFuncs() { for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) { auto op_type = kern_pair.first; auto &op_info = info_map.at(op_type); - auto op = - static_cast(op_info.Creator()("", {}, {}, {})); + auto op = static_cast(op_info.Creator()( + "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); if (op_info.infer_shape_) { // infer_shape has been registered. 
continue; } diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 1e19f82b34..59ddbc7791 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -261,7 +261,9 @@ class OperatorClone : public paddle::framework::OperatorBase { }; TEST(Operator, Clone) { - OperatorClone a("ABC", {}, {}, {}); + OperatorClone a("ABC", paddle::framework::VariableNameMap{}, + paddle::framework::VariableNameMap{}, + paddle::framework::AttributeMap{}); auto b = a.Clone(); ASSERT_EQ(a.Type(), b->Type()); } diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc index 5988874809..f21df37a29 100644 --- a/paddle/framework/prune_test.cc +++ b/paddle/framework/prune_test.cc @@ -54,7 +54,8 @@ TEST(Prune, one_operator) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block); + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); f::ProgramDesc pruned; @@ -71,10 +72,14 @@ TEST(Prune, forward) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, {}, block); - AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, {}, block); + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); @@ -90,11 +95,14 @@ TEST(Prune, multi_input_op) { f::ProgramDescBind program; f::BlockDescBind *block 
= program.MutableBlock(0); - AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, {}, block); - AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, {}, block); - AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, {}, + AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{}, + block); + AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, + f::AttributeMap{}, block); f::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); @@ -108,9 +116,12 @@ TEST(Prune, multi_output_op) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block); + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); @@ -124,9 +135,12 @@ TEST(Prune, multi_target) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block); + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + 
AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc index d5b124682d..03c58a7eab 100644 --- a/paddle/operators/conditional_block_op.cc +++ b/paddle/operators/conditional_block_op.cc @@ -142,9 +142,9 @@ class ConditionalBlockGradOp : public ConditionalOp { continue; } auto new_in_grad_name = cur_scope.Rename(in_grad_name); - auto assign = - framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}}, - {{"Out", {out_grad_name}}}, {}); + auto assign = framework::OpRegistry::CreateOp( + "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, + framework::AttributeMap{}); assign->Run(cur_scope, dev_ctx); cur_scope.Rename(new_in_grad_name, in_grad_name); } diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h index ebeb262d96..8935751f15 100644 --- a/paddle/operators/net_op.h +++ b/paddle/operators/net_op.h @@ -38,7 +38,10 @@ namespace operators { class NetOp : public framework::OperatorBase { public: static const char kAll[]; - NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {} + NetOp() + : framework::OperatorBase("plain_net", framework::VariableNameMap{}, + framework::VariableNameMap{}, + framework::AttributeMap{}) {} NetOp(const std::string& type, const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc index 63bebd5b44..22fba9568d 100644 --- a/paddle/operators/net_op_test.cc +++ b/paddle/operators/net_op_test.cc @@ -38,10 +38,10 @@ TEST(OpKernel, all) { net->AppendOp(std::unique_ptr( new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, {}))); + {{"Out", {"y"}}}, 
framework::AttributeMap{}))); net->AppendOp(std::unique_ptr( new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}}, - {{"Out", {"z"}}}, {}))); + {{"Out", {"z"}}}, framework::AttributeMap{}))); net->CompleteAddOp(); AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, @@ -58,7 +58,7 @@ TEST(NetOp, insert_op) { NetOp net; auto op1 = std::unique_ptr( new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, {})); + {{"Out", {"y"}}}, framework::AttributeMap{})); net.AppendOp(*op1); net.InsertOp(0, *op1); ASSERT_EQ(2UL, net.ops_.size()); @@ -68,10 +68,12 @@ TEST(NetOp, insert_op) { TEST(NetOp, Clone) { NetOp net; - net.AppendOp( - std::unique_ptr(new framework::NOP{"empty", {}, {}, {}})); - net.AppendOp(std::unique_ptr( - new framework::NOP{"empty2", {}, {}, {}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty2", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); net.CompleteAddOp(true); auto new_net_op = net.Clone(); ASSERT_NE(new_net_op, nullptr); diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 8b60b9c912..29f9163643 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -408,7 +408,8 @@ class RecurrentGradOp : public RecurrentBase { attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); + "fill_constant", framework::VariableNameMap{}, + {{"Out", {pg_names[param_id]}}}, attrs); zero_op->Run(scope, dev_ctx); } @@ -417,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, {}); + {{"Out", {pg_names[param_id]}}}, 
framework::AttributeMap{}); sum_op->Run(cur_scope, dev_ctx); cur_scope.Rename(new_inside_name, inside_grad_name); diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 9b3f21cf94..b8e44bcc5a 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -187,7 +187,8 @@ class WhileGradOp : public framework::OperatorBase { attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); + "fill_constant", framework::VariableNameMap{}, + {{"Out", {pg_names[param_id]}}}, attrs); zero_op->Run(scope, dev_ctx); } } @@ -195,7 +196,7 @@ class WhileGradOp : public framework::OperatorBase { auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, {}); + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); sum_op->Run(cur_scope, dev_ctx); cur_scope.Rename(new_inside_name, inside_grad_name); } From 8d428bd9b89ec59dfcf16eb9e319a9106415b766 Mon Sep 17 00:00:00 2001 From: ranqiu Date: Mon, 4 Dec 2017 19:28:12 +0800 Subject: [PATCH 253/275] Update annotations of layers.py --- .../paddle/trainer_config_helpers/layers.py | 121 ++++++++++-------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 4bd94861af..48858f4c34 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1516,34 +1516,33 @@ def lstmemory(input, NOTE: This is a low level user interface. You can use network.simple_lstm to config a simple plain lstm layer. - Please refer to **Generating Sequences With Recurrent Neural Networks** for - more details about LSTM. - - Link_ goes as below. - - .. 
_Link: http://arxiv.org/abs/1308.0850 + Reference: + `Generating Sequences With Recurrent Neural Networks + `_ - :param name: The lstmemory layer name. + :param name: The name of this layer. It is optional. :type name: basestring - :param size: DEPRECATED. size of the lstm cell + :param size: DEPRECATED. The dimension of the lstm cell. :type size: int :param input: The input of this layer. :type input: LayerOutput - :param reverse: is sequence process reversed or not. + :param reverse: Whether the input sequence is processed in a reverse order. :type reverse: bool :param act: Activation type. TanhActivation is the default activation. :type act: BaseActivation - :param gate_act: gate activation type, SigmoidActivation by default. + :param gate_act: Activation type of this layer's gates. SigmoidActivation is the + default activation. :type gate_act: BaseActivation - :param state_act: state activation type, TanhActivation by default. + :param state_act: Activation type of the state. TanhActivation is the default activation. :type state_act: BaseActivation :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: Parameter Attribute. - :type param_attr: ParameterAttribute | None | False - :param layer_attr: Extra Layer attribute + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. 
:rtype: LayerOutput @@ -1632,14 +1631,14 @@ def grumemory(input, h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}} NOTE: In PaddlePaddle's implementation, the multiplication operations - :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in - gate_recurrent layer. Consequently, an additional mixed_layer with + :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not performed + in gate_recurrent layer. Consequently, an additional mixed_layer with full_matrix_projection or a fc_layer must be included before grumemory is called. - More details can be found by referring to `Empirical Evaluation of Gated - Recurrent Neural Networks on Sequence Modeling. - `_ + Reference: + `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling + `_ The simple usage is: @@ -1647,28 +1646,29 @@ def grumemory(input, gru = grumemory(input) - :param name: The gru layer name. - :type name: None | basestring + :param name: The name of this layer. It is optional. + :type name: basestring :param input: The input of this layer. :type input: LayerOutput. - :param size: DEPRECATED. size of the gru cell + :param size: DEPRECATED. The dimension of the gru cell. :type size: int - :param reverse: Whether sequence process is reversed or not. + :param reverse: Whether the input sequence is processed in a reverse order. :type reverse: bool :param act: Activation type, TanhActivation is the default. This activation affects the :math:`{\\tilde{h_t}}`. :type act: BaseActivation - :param gate_act: gate activation type, SigmoidActivation by default. - This activation affects the :math:`z_t` and :math:`r_t`. It is the - :math:`\\sigma` in the above formula. + :param gate_act: Activation type of this layer's two gates. SigmoidActivation is + the default activation. This activation affects the :math:`z_t` + and :math:`r_t`. It is the :math:`\\sigma` in the above formula. :type gate_act: BaseActivation :param bias_attr: The bias attribute. 
If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: Parameter Attribute. - :type param_attr: ParameterAttribute | None | False - :param layer_attr: Extra Layer attribute + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1712,10 +1712,10 @@ def last_seq(input, """ Get Last Timestamp Activation of a sequence. - If stride > 0, this layer slides a window whose size is determined by stride, - and return the last value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value - of stride is -1. + If stride > 0, this layer will slide a window whose size is determined by stride, + and return the last value of the sequence in the window as the output. Thus, a + long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The simple usage is: @@ -1724,14 +1724,16 @@ def last_seq(input, seq = last_seq(input=layer) :param agg_level: Aggregated level + :type agg_level: AggregateLevel :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput :param stride: The step size between successive pooling regions. - :type stride: Int - :param layer_attr: extra layer attributes. - :type layer_attr: ExtraLayerAttribute. + :type stride: int + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. 
:rtype: LayerOutput """ @@ -1768,10 +1770,10 @@ def first_seq(input, """ Get First Timestamp Activation of a sequence. - If stride > 0, this layer slides a window whose size is determined by stride, - and return the first value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value - of stride is -1. + If stride > 0, this layer will slide a window whose size is determined by stride, + and return the first value of the sequence in the window as the output. Thus, a + long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The simple usage is: @@ -1780,13 +1782,15 @@ def first_seq(input, seq = first_seq(input=layer) :param agg_level: aggregation level + :type agg_level: AggregateLevel :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput :param stride: The step size between successive pooling regions. - :type stride: Int - :param layer_attr: extra layer attributes. + :type stride: int + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -1844,8 +1848,8 @@ def expand_layer(input, expand_level=ExpandLevel.FROM_NO_SEQUENCE, layer_attr=None): """ - A layer for "Expand Dense data or (sequence data where the length of each - sequence is one) to sequence data." + A layer for expanding dense data or (sequence data where the length of each + sequence is one) to sequence data. The example usage is: @@ -1857,7 +1861,9 @@ def expand_layer(input, :param input: The input of this layer. :type input: LayerOutput - :param expand_as: Expand as this layer's sequence info. + :param expand_as: Expand the input according to this layer's sequence infomation. 
And + after the operation, the input expanded will have the same number of + elememts as this layer. :type expand_as: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring @@ -1865,9 +1871,10 @@ def expand_layer(input, whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param expand_level: whether input layer is timestep(default) or sequence. + :param expand_level: Whether the input layer is a sequence or the element of a sequence. :type expand_level: ExpandLevel - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -3294,7 +3301,7 @@ def row_l2_norm_layer(input, name=None, layer_attr=None): A layer for L2-normalization in each row. .. math:: - out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + out[i] = \\frac{in[i]} {\\sqrt{\\sum_{k=1}^N in[k]^{2}}} where the size of :math:`in` is (batchSize x dataDim) , and the size of :math:`out` is a (batchSize x dataDim) . @@ -6161,9 +6168,11 @@ def huber_regression_cost(input, Given a prediction f(x), a label y and :math:`\delta`, the loss function is defined as: - .. math: - loss = 0.5*\left ( y-f(x) \right )^2, \left | y-f(x) \right |\leq \delta - loss = \delta \left | y-f(x) \right |-0.5\delta ^2, otherwise + .. math:: + + loss = 0.5*(y-f(x))^{2}, | y-f(x) | < \delta + + loss = \delta | y-f(x) | - 0.5 \delta ^2, otherwise The example usage is: @@ -6210,12 +6219,14 @@ def huber_classification_cost(input, """ For classification purposes, a variant of the Huber loss called modified Huber is sometimes used. 
Given a prediction f(x) (a real-valued classifier score) and - a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber + a true binary class label :math:`y\in \{-1, 1 \}`, the modified Huber loss is defined as: .. math: - loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq 1 - loss = -4yf(x), \text{otherwise} + + loss = \max ( 0, 1-yf(x) )^2, yf(x) \geq -1 + + loss = -4yf(x), otherwise The example usage is: @@ -6959,7 +6970,7 @@ def clip_layer(input, min, max, name=None): .. math:: - out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + out[i] = \min (\max (in[i],p_{1} ),p_{2} ) .. code-block:: python From ddf20e589fad724f077b0613ebf3872d2311647a Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 11 Dec 2017 18:31:14 +0800 Subject: [PATCH 254/275] typo WITH_TEST to WITH_TESTING --- doc/howto/dev/contribute_to_paddle_cn.md | 2 +- paddle/scripts/docker/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md index 6993901452..3eb477eb65 100644 --- a/doc/howto/dev/contribute_to_paddle_cn.md +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -87,7 +87,7 @@ no changes added to commit (use "git add" and/or "git commit -a") 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: ```bash -➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev +➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:dev ``` 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`): diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index f3a6f1dba7..1e1fcc50dc 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -192,7 +192,7 @@ For developers who are interested in the C++ 
source code, please use -e "WOBOQ=O - The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host: ```bash -docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev +docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev ``` - You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run: From 9f44af9d7c9dd2f1dc775f59d059a87eb9e64fd6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 11 Dec 2017 18:37:18 +0800 Subject: [PATCH 255/275] Fix #6460 (#6461) --- python/paddle/v2/fluid/layers.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index b4426bad14..fd8a2ed18c 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -762,7 +762,7 @@ def sequence_conv(input, helper = LayerHelper('sequence_conv', **locals()) dtype = helper.input_dtype() filter_shape = [filter_size * input.shape[1], num_filters] - filter = helper.create_parameter( + filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype) pre_bias = helper.create_tmp_variable(dtype) @@ -770,7 +770,7 @@ def sequence_conv(input, type='sequence_conv', inputs={ 'X': [input], - 'Filter': [filter], + 'Filter': [filter_param], }, outputs={"Out": pre_bias}, attrs={ @@ -785,7 +785,7 @@ def sequence_conv(input, def conv2d(input, num_filters, filter_size, - stride=[1, 1], + stride=None, padding=None, groups=None, param_attr=None, @@ -802,6 +802,8 @@ def conv2d(input, conv-2d output, if mentioned in the input parameters. 
""" + if stride is None: + stride = [1, 1] helper = LayerHelper('conv2d', **locals()) dtype = helper.input_dtype() @@ -827,7 +829,7 @@ def conv2d(input, std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 return Normal(0.0, std, 0) - filter = helper.create_parameter( + filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, @@ -839,7 +841,7 @@ def conv2d(input, type='conv2d_cudnn', inputs={ 'Input': input, - 'Filter': filter, + 'Filter': filter_param, }, outputs={"Output": pre_bias}, attrs={'strides': stride, @@ -875,8 +877,8 @@ def sequence_pool(input, pool_type, **kwargs): def pool2d(input, pool_size, pool_type, - pool_stride=[1, 1], - pool_padding=[0, 0], + pool_stride=None, + pool_padding=None, global_pooling=False, main_program=None, startup_program=None): @@ -884,6 +886,10 @@ def pool2d(input, This function adds the operator for pooling in 2 dimensions, using the pooling configurations mentioned in input parameters. """ + if pool_padding is None: + pool_padding = [0, 0] + if pool_stride is None: + pool_stride = [1, 1] if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. 
It can only be 'max' or 'avg'.", From 69b44f2f198509e29b8ab50edab9ab34f56fd1af Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 12 Dec 2017 15:31:44 +0800 Subject: [PATCH 256/275] unify MKL macro definition --- cmake/cblas.cmake | 2 +- cmake/external/mkldnn.cmake | 2 +- paddle/gserver/activations/ActivationFunction.cpp | 4 ++-- paddle/gserver/gradientmachines/NeuralNetwork.cpp | 4 ++-- paddle/math/Allocator.h | 2 +- paddle/math/MathFunctions.cpp | 2 +- paddle/math/MathFunctions.h | 2 +- paddle/memory/detail/system_allocator.cc | 2 +- paddle/operators/math/math_function.cc | 2 +- paddle/operators/math/math_function.h | 2 +- paddle/parameter/FirstOrderOptimizer.h | 2 +- paddle/parameter/ParameterUpdateFunctions.cpp | 2 +- paddle/utils/Flags.cpp | 2 +- 13 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index b21fc43904..13294c0548 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) set(CBLAS_INC_DIR ${MKLML_INC_DIR}) set(CBLAS_LIBRARIES ${MKLML_LIB}) - add_definitions(-DPADDLE_USE_MKLML) + add_definitions(-DPADDLE_WITH_MKLML) add_definitions(-DLAPACK_FOUND) message(STATUS "Found cblas and lapack in MKLML " diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index fc52d339d7..5d24caebdc 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -67,5 +67,5 @@ ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") -add_definitions(-DPADDLE_USE_MKLDNN) +add_definitions(-DPADDLE_WITH_MKLDNN) LIST(APPEND external_project_dependencies mkldnn) diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index f5a41b66bf..57c890e488 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ 
b/paddle/gserver/activations/ActivationFunction.cpp @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/utils/ClassRegistrar.h" #include "paddle/utils/Logging.h" -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN #include "MKLDNNActivation.h" #endif @@ -490,7 +490,7 @@ Error __must_check backward(Argument& act) { END_DEFINE_ACTIVATION(log) ActivationFunction* ActivationFunction::create(const std::string& type) { -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) { return MKLDNNActivation::create(type); } diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index be112b4123..68bf37d59d 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN #include "paddle/gserver/layers/MKLDNNLayer.h" #endif @@ -307,7 +307,7 @@ void NeuralNetwork::backward(const UpdateCallback& callback) { } void NeuralNetwork::finish() { -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN FOR_EACH_R(layer, layers_) { MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(*layer); if (dnnLayer) { diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index 94ef561f06..17563bf5e1 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -48,7 +48,7 @@ public: */ virtual void* alloc(size_t size) { void* ptr; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0); diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index ba86eacbb5..28ab54b450 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -206,7 +206,7 @@ double 
dotProduct(const int n, const double* x, const double* y) { } #endif -#if defined(PADDLE_USE_MKLML) +#if defined(PADDLE_WITH_MKLML) template <> void vExp(const int n, const float* a, float* r) { diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index f6e77029bd..29fe36e3a4 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -15,7 +15,7 @@ limitations under the License. */ #ifndef MATHFUNCTIONS_H_ #define MATHFUNCTIONS_H_ -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #include #include #include diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index b543b767e8..6a815a1b57 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -43,7 +43,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { void* p; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0); diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 2e333a8cde..e099a6a439 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -132,7 +132,7 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. template <> void batched_gemm( diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index 5a42854f22..f2b025b78b 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #include #include #include diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h index f157188a4f..5b0c52a30d 100644 --- a/paddle/parameter/FirstOrderOptimizer.h +++ b/paddle/parameter/FirstOrderOptimizer.h @@ -38,7 +38,7 @@ public: real torch_learningRate = optConfig_.learning_method() == "torch_momentum" ? 1.0 - paraConfig.momentum() : 1.0; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN sgdUpdate(learningRate_ * paraConfig.learning_rate() * (firstTime_ ? 1.0 : torch_learningRate), paraConfig.momentum(), diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp index 1898598e49..d60cb36383 100644 --- a/paddle/parameter/ParameterUpdateFunctions.cpp +++ b/paddle/parameter/ParameterUpdateFunctions.cpp @@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate, const real* grad, real* momentumVec) { decayRate *= learningRate; -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif for (size_t i = 0; i < size; ++i) { diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index 8f100f02e9..9a7dc0e356 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -20,7 +20,7 @@ DEFINE_bool(use_gpu, false, "Only support CPU training"); DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); #endif -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // TODO(TJ): change to true when MKLDNN layers support multi-inputs DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training"); #else From 0ca6274451d3693f363f2b8b5d6b29ce722febaf Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 11 Dec 2017 23:15:35 +0800 Subject: [PATCH 257/275] "add global regularization" (#6443) * "add global regularization" * Polish `append_regularization_ops` --- python/paddle/v2/fluid/optimizer.py | 38 +++++++++++---------------- python/paddle/v2/fluid/regularizer.py | 15 ++++++++--- 2 files 
changed, 27 insertions(+), 26 deletions(-) diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 719e3b2563..bbdfab2df9 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -18,8 +18,9 @@ class Optimizer(object): but need to use one of it's implementation. """ - def __init__(self, global_step=None): + def __init__(self, global_step=None, regularization=None): self._global_step = global_step + self.regularization = regularization # Dictionary of accumulators. Some optimizer subclasses need to # allocate and manage extra variables associated with the parameters # to train. These variables are called accumulators. @@ -199,7 +200,8 @@ class Optimizer(object): """ params_grads = append_backward_ops(loss, parameter_list, no_grad_set) # Add regularization if any - params_grads = append_regularization_ops(params_grads) + params_grads = append_regularization_ops(params_grads, + self.regularization) optimize_ops = self.create_optimization_pass(params_grads, loss, startup_program) return optimize_ops @@ -209,9 +211,9 @@ class SGDOptimizer(Optimizer): """ Simple SGD optimizer without any state. 
""" - def __init__(self, learning_rate, global_step=None): + def __init__(self, learning_rate, **kwargs): assert learning_rate is not None - super(SGDOptimizer, self).__init__(global_step) + super(SGDOptimizer, self).__init__(**kwargs) self.type = "sgd" self._learning_rate = learning_rate @@ -236,14 +238,10 @@ class MomentumOptimizer(Optimizer): """ _velocity_acc_str = "velocity" - def __init__(self, - learning_rate, - momentum, - use_nesterov=False, - global_step=None): + def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): assert learning_rate is not None assert momentum is not None - super(MomentumOptimizer, self).__init__(global_step) + super(MomentumOptimizer, self).__init__(**kwargs) self.type = "momentum" self._learning_rate = learning_rate self._momentum = momentum @@ -284,10 +282,10 @@ class AdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, learning_rate, epsilon=1.0e-6, global_step=None): + def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert epsilon is not None - super(AdagradOptimizer, self).__init__(global_step) + super(AdagradOptimizer, self).__init__(**kwargs) self.type = "adagrad" self._learning_rate = learning_rate self._epsilon = epsilon @@ -331,12 +329,12 @@ class AdamOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamOptimizer, self).__init__(global_step) + super(AdamOptimizer, self).__init__(**kwargs) self.type = "adam" self._learning_rate = learning_rate self._beta1 = beta1 @@ -436,12 +434,12 @@ class AdamaxOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamaxOptimizer, self).__init__() + super(AdamaxOptimizer, 
self).__init__(**kwargs) self.type = "adamax" self._learning_rate = learning_rate self._beta1 = beta1 @@ -514,16 +512,12 @@ class DecayedAdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, - learning_rate, - decay=0.95, - epsilon=1.0e-6, - global_step=None): + def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert decay is not None assert epsilon is not None - super(DecayedAdagradOptimizer, self).__init__(global_step) + super(DecayedAdagradOptimizer, self).__init__(**kwargs) self.type = "decayed_adagrad" self._learning_rate = learning_rate self._decay = decay diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py index bb1ac8911e..d1955b0047 100644 --- a/python/paddle/v2/fluid/regularizer.py +++ b/python/paddle/v2/fluid/regularizer.py @@ -3,7 +3,7 @@ import framework __all__ = ['append_regularization_ops', 'L1Decay', 'L2Decay'] -def append_regularization_ops(parameters_and_grads): +def append_regularization_ops(parameters_and_grads, regularization=None): """Create and add backward regularization Operators Creates and adds backward regularization operators in the BlockDesc. @@ -14,6 +14,8 @@ def append_regularization_ops(parameters_and_grads): Args: parameters_and_grads: A list of (parameters, gradients) pairs that need to be regularized. + regularization: A global regularizer. If the parameter is not + set. It will be applied with regularizer. 
Returns: list of (parameters, gradients) pair with the regularized gradient @@ -23,14 +25,19 @@ def append_regularization_ops(parameters_and_grads): """ params_and_grads = [] for param, grad in parameters_and_grads: + regularization_term = None + if param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad.block) + # If no gradient or no regularization specified, # then we don't need to do anything - if grad is None or param.regularizer is None: + if grad is None or regularization_term is None: params_and_grads.append((param, grad)) continue - # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad.block) assert grad.shape == regularization_term.shape grad.block.append_op( From 4ff6bc175a18d03f1159e78aef0c305703adbe37 Mon Sep 17 00:00:00 2001 From: Siddharth Goyal Date: Mon, 11 Dec 2017 11:33:54 -0800 Subject: [PATCH 258/275] Add row conv operator (#6013) * Fix documentation * Address review comments --- paddle/operators/row_conv_op.cc | 257 +++++++++++ paddle/operators/row_conv_op.cu | 408 ++++++++++++++++++ paddle/operators/row_conv_op.h | 33 ++ .../paddle/v2/fluid/tests/test_row_conv_op.py | 95 ++++ 4 files changed, 793 insertions(+) create mode 100644 paddle/operators/row_conv_op.cc create mode 100644 paddle/operators/row_conv_op.cu create mode 100644 paddle/operators/row_conv_op.h create mode 100644 python/paddle/v2/fluid/tests/test_row_conv_op.py diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc new file mode 100644 index 0000000000..ea0bb99f8d --- /dev/null +++ b/paddle/operators/row_conv_op.cc @@ -0,0 +1,257 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/row_conv_op.h" +#include "paddle/framework/eigen.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +class RowConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of RowConvOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2."); + PADDLE_ENFORCE_EQ( + x_dims[1], filter_dims[1], + "The 2nd dimension of Input(X) and Input(Filter) should be same."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class RowConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of 
output(Out) should not be null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto filter_grad_name = framework::GradVarName("Filter"); + if (ctx->HasOutput(filter_grad_name)) { + auto filter_dims = ctx->GetInputDim("Filter"); + ctx->SetOutputDim(filter_grad_name, filter_dims); + } + } +}; + +class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RowConvOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input(X) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LoDTensor is a matrix with shape (T x N), where T " + "is the total time steps in this mini-batch and N is the input " + "data dimension."); + AddInput("Filter", + "(Tensor), the input(Filter) is a learnable parameter. It " + "is a 2-D tensor with shape (future_context x N), where, " + "future_context is the future context length and N is the data " + "dimension."); + AddOutput("Out", + "(LoDTensor), the output(Out) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LodTensor is a matrix with shape T x N, i.e., the " + "same shape as X."); + AddComment(R"DOC( +Row-convolution Operator. + +The row convolution is called lookahead convolution. This operator was +introduced in the following paper for DeepSpeech2: +http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf + +The main motivation is that a bidirectional RNN, useful in DeepSpeech +like speech models, learns representation for a sequence by performing a +forward and a backward pass through the entire sequence. However, unlike +unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online +and low-latency setting. 
The lookahead convolution incorporates information +from future subsequences in a computationally efficient manner to improve +unidirectional recurrent neural networks. The row convolution operator is +different from the 1D sequence convolution, and is computed as follows: + +Given an input sequence $in$ of length $t$ and input dimension $d$, +and a filter ($W$) of size $context \times d$, +the output sequence is convolved as: + +$$ +out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :} +$$ + +)DOC"); + } +}; + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *out = context.Output("Out"); + + out->mutable_data(context.GetPlace()); + + auto batch_indices = x->lod()[0]; + auto input_dim = x->dims()[1]; // 'in' is of size T x N + size_t num_sequence = batch_indices.size() - 1; + + auto future_context = filter->dims()[0]; + auto weights = EigenMatrix::From(*filter); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + Tensor cur_input_sequence = + x->Slice(start, end); // Current input sequence + Tensor cur_output_sequence = + out->Slice(start, end); // Current output sequence + auto cip_seq = EigenMatrix::From(cur_input_sequence); + auto cot_seq = EigenMatrix::From(cur_output_sequence); + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + for (int d = 0; d < input_dim; d++) { + if (w == 0) { + cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d); + } else { + cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d); + } + } + } + } + } + } +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void 
Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *d_out = context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *d_filter = context.Output(framework::GradVarName("Filter")); + + auto input_dim = x->dims()[1]; // 'x' is of size T x N + auto batch_indices = x->lod()[0]; + size_t num_sequence = batch_indices.size() - 1; + auto future_context = filter->dims()[0]; + + if (d_filter) { + d_filter->mutable_data(context.GetPlace()); + auto dweights = + EigenMatrix::From(*d_filter); // Gradient of weight matrix + dweights.setZero(); + + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_input = x->Slice(start, end); // Current input sequence + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + + auto cur_ip = EigenMatrix::From(cur_input); + auto cur_dout = EigenMatrix::From(cur_doutput); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dweights (Updating the gradient of weight matrix) + for (int d = 0; d < input_dim; d++) { + dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d); + } + } + } + } + } + + if (dx) { + dx->mutable_data(context.GetPlace()); + auto weights = EigenMatrix::From(*filter); + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + Tensor cur_dinput = + dx->Slice(start, end); // Current input grad sequence + + auto cur_dout = EigenMatrix::From(cur_doutput); + auto cur_dip = 
EigenMatrix::From(cur_dinput); + cur_dip.setZero(); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dinput (Updating the gradient wrt input) + for (int d = 0; d < input_dim; d++) { + cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d); + } + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, + ops::RowConvGradOp); +REGISTER_OP_CPU_KERNEL(row_conv, + ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu new file mode 100644 index 0000000000..e0d7ebda7e --- /dev/null +++ b/paddle/operators/row_conv_op.cu @@ -0,0 +1,408 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/row_conv_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Forward prop (shared memory version, for small future_context) +template +__global__ void RowConvForwardSharedMemory(const T *in, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *out) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (d < input_dim) + ? 
sw[w * blx + thx] * in[(start + k + w) * input_dim + d] + : static_cast(0); + } + if (d < input_dim) { + out[(start + k) * input_dim + d] = sum; + } + } + } +} + +// Forward prop (naive version) +template +__global__ void RowConvForward(const T *in, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *out) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]); + } + out[(start + k) * input_dim + d] = sum; + } + } +} + +// Compute input gradient (shared memory version, for small future_context) +template +__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *din) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (d < input_dim) + ? 
(sw[w * blx + thx] * dout[(k + start - w) * input_dim + d]) + : static_cast(0); + } + if (d < input_dim) { + din[(k + start) * input_dim + d] = sum; + } + } + } +} + +// Compute input gradient (Naive version) +template +__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *din) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]); + } + din[(k + start) * input_dim + d] = sum; + } + } +} + +// Compute W gradient (small future_context version) +template +__global__ void RowConvGradFilterImproved(const T *in, const T *dout, + int num_sequence, int input_dim, + int future_context, int block_x, + int block_y, + const size_t *batch_indices, + T *dfilter) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + + extern __shared__ T mem[]; + + int xdim_sh_in = block_y; + int xdim_sh_dout = block_y; + // int xdim_sh_dfilter = future_context; + int ydim_sh_in = block_x; + int ydim_sh_dout = block_x + future_context - 1; + int ydim_sh_dfilter = block_y; + + T *sh_in = mem; + T *sh_dout = &mem[xdim_sh_in * ydim_sh_in]; + T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout]; + + if (thy < future_context) { + sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = 
static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * ydim_sh_in + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0); + sh_dout[thx * ydim_sh_dout + thy + future_context - 1] = + (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0); + __syncthreads(); + + if (thy < future_context - 1) { + int pos_offset = pos - future_context + 1; + sh_dout[thx * ydim_sh_dout + thy] = + (d < input_dim && pos_offset >= start) + ? dout[pos_offset * input_dim + d] + : T(0); + } + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + T val = sh_in[thy * ydim_sh_in + thx] * + sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. 
+ val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0) { + sh_dfilter[w * ydim_sh_dfilter + thy] += val; + } + __syncthreads(); + } + } + } + for (int w = thy; (w < future_context) && (d < input_dim); w += bly) { + dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx]; + } +} + +// Compute weight(filter) gradient +template +__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, + int input_dim, int future_context, + int block_x, int block_y, + const size_t *batch_indices, T *dfilter) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + extern __shared__ T mem[]; + T *sh_in = mem; + T *sh_dout = &mem[block_x * block_y]; + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * block_y + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0; + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + sh_dout[thx * block_y + thy] = + (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps) + ? dout[(pos - w) * input_dim + d] + : 0.0; + __syncthreads(); + + T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. 
+ val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0 && (gx + thy) < input_dim) { + dfilter[w * input_dim + gx + thy] += val; + } + } + } + } +} + +} // namespace + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *Out = context.Output("Out"); + + const T *in = X->data(); + const T *weight = Filter->data(); + T *out = Out->mutable_data(context.GetPlace()); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.data(); + auto stream = context.cuda_device_context().stream(); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvForwardSharedMemory< + T><<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvForward<<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } + } +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *dOut = context.Input(framework::GradVarName("Out")); + const T *in = X->data(); + const T *weights = Filter->data(); + const T *dout = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dFilter = context.Output(framework::GradVarName("Filter")); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t 
*idx = batch_indices.data(); + + auto &device_ctx = context.cuda_device_context(); + math::SetConstant zero; + + if (dFilter) { + T *dfilter = dFilter->mutable_data(context.GetPlace()); + zero(device_ctx, dFilter, static_cast(0.0)); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_y * block_x + block_y * (block_x + future_context - 1) + + future_context * block_y) * + sizeof(T); + RowConvGradFilterImproved< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 + RowConvGradFilter< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } + } + + if (dX) { + T *din = dX->mutable_data(context.GetPlace()); + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvGradInputSharedMemory< + T><<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvGradInput<<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(row_conv, + ops::RowConvKernel); +REGISTER_OP_GPU_KERNEL( + row_conv_grad, ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h new file mode 100644 index 0000000000..525e83908d --- /dev/null +++ 
b/paddle/operators/row_conv_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/fluid/tests/test_row_conv_op.py b/python/paddle/v2/fluid/tests/test_row_conv_op.py new file mode 100644 index 0000000000..1ed86e23ac --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_row_conv_op.py @@ -0,0 +1,95 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def row_conv_forward(x, lod, wt): + out = np.zeros_like(x) + seq_info = lod[0] + num_sequences = len(seq_info) - 1 + context_length = wt.shape[0] + + for i in range(num_sequences): # loop over number of sequences + start = seq_info[i] + end = seq_info[i + 1] + curinput = x[start:end, :] + curoutput = out[start:end, :] + + cur_timesteps = end - start + for j in range(cur_timesteps): # loop over different timesteps + for k in range(context_length): + + if j + k >= cur_timesteps: + continue + curoutput[j, :] += curinput[j + k, :] * wt[k, :] + + return 
out + + +class TestRowConvOp1(OpTest): + def setUp(self): + + self.op_type = "row_conv" + lod = [[0, 2, 5, 7]] + T = lod[0][-1] + D = 16 + context_length = 2 + + x = np.random.random((T, D)).astype("float32") + wt = np.random.random((context_length, D)).astype("float32") + self.inputs = {'X': (x, lod), 'Filter': wt} + + out = row_conv_forward(x, lod, wt) + self.outputs = {'Out': (out, lod)} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.05) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Filter'], 'Out', max_relative_error=0.05, no_grad_set=set('X')) + + def test_check_grad_ignore_wt(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Filter')) + + +class TestRowConvOp2(OpTest): + def setUp(self): + + self.op_type = "row_conv" + lod = [[0, 20, 50, 100]] + T = lod[0][-1] + D = 35 + context_length = 35 + + x = np.random.random((T, D)).astype("float32") + wt = np.random.random((context_length, D)).astype("float32") + self.inputs = {'X': (x, lod), 'Filter': wt} + + out = row_conv_forward(x, lod, wt) + self.outputs = {'Out': (out, lod)} + + def test_check_output(self): + self.check_output() + + #max_relative_error is increased from 0.05 to 0.06 as for higher + #dimensional input, the dX on CPU for some values has max_rel_error + #slightly more than 0.05 + def test_check_grad_normal(self): + self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.06) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Filter'], 'Out', max_relative_error=0.06, no_grad_set=set('X')) + + def test_check_grad_ignore_wt(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Filter')) + + +if __name__ == '__main__': + unittest.main() From 35420cdf63dd1369972c26f70cac2d4d75b1492a Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Mon, 11 Dec 2017 15:33:28 -0800 Subject: [PATCH 259/275] Updating the 
Latex equation for Adagrad (#6009) * Updating the Latex equation for Adagrad * Fixing Latex euqations for adadelta, adam and adamax --- paddle/operators/adadelta_op.cc | 12 ++++++------ paddle/operators/adagrad_op.cc | 4 ++-- paddle/operators/adam_op.cc | 12 +++++++----- paddle/operators/adamax_op.cc | 8 ++++---- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc index 16a7794d5b..29434a0ee2 100644 --- a/paddle/operators/adadelta_op.cc +++ b/paddle/operators/adadelta_op.cc @@ -92,12 +92,12 @@ for gradient descent. Adadelta updates are as follows: -$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break -paramUpdate = - $\sqrt{((avgSquaredUpdate + \epsilon) / - (avgSquaredGrad_out + \epsilon))}$ * grad \break -avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) * - {(paramUpdate)}^2 \break -paramOut = param + paramUpdate$$ +$$ +avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\ +param\_update = - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\ +avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\ +param\_out = param + param\_update +$$ )DOC"); } diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index d6686e3ef3..d19602244b 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -80,8 +80,8 @@ Adaptive Gradient Algorithm (Adagrad). 
The update is done as follows: -$$momentOut = moment + grad * grad \break -paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break +$$moment\_out = moment + grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} $$ The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc index 03faa2a7c5..a268d05484 100644 --- a/paddle/operators/adam_op.cc +++ b/paddle/operators/adam_op.cc @@ -112,11 +112,13 @@ adaptive estimates of lower-order moments. Adam updates: -$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break -moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break -learningRate = learningRate * - $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break -paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$ +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ )DOC"); } diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index 867ddd9790..9e7576c961 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -108,10 +108,10 @@ Adam algorithm based on the infinity norm. 
Adamax updates: $$ - momentOut = \beta_{1} * moment + (1 - \beta_{1}) * grad \\ - infNormOut = max(\beta_{2} * infNorm + \epsilon, |grad|) \\ - learningRate = \frac{learningRate}{1 - \beta_{1}^{Beta1Pow}} \\ - paramOut = param - learningRate * \frac{momentOut}{infNormOut} +moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\ +inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\ +learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out} $$ The original paper does not have an epsilon attribute. From f4f17e539b70a6cdb4245267ae4027cd5202b0fa Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 12 Dec 2017 16:35:13 +0800 Subject: [PATCH 260/275] skip mkl setting in v1 with Mac --- paddle/scripts/submit_local.sh.in | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index d71cb84df3..43d2d1b410 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -140,7 +140,11 @@ else: sys.exit(0) EOF -cpu_config +if [ "`uname -s`" == "Linux" ]; then + # only support on linux yet, with mac can use v2 + cpu_config +fi + # echo $KMP_AFFINITY $OMP_DYNAMIC case "$1" in From c175eeb387ee90d8ba8a549a2e29a1680c9f1f35 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 12 Dec 2017 11:08:23 +0800 Subject: [PATCH 261/275] Fix wrong index in dataset downloading exception --- python/paddle/v2/dataset/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index e31e501ce9..191d9ecfb1 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -71,7 +71,7 @@ def download(url, module_name, md5sum): if retry < retry_limit: retry += 1 else: - raise RuntimeError("Cannot download {0} within retry limit {2}". 
+ raise RuntimeError("Cannot download {0} within retry limit {1}". format(url, retry_limit)) print "Cache file %s not found, downloading %s" % (filename, url) r = requests.get(url, stream=True) From f3acdd3af94cb3018134b13893cffebdf09c827c Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 12 Dec 2017 11:17:33 +0800 Subject: [PATCH 262/275] fix warning in row_conv_op.cu --- paddle/operators/row_conv_op.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu index e0d7ebda7e..79b7086b24 100644 --- a/paddle/operators/row_conv_op.cu +++ b/paddle/operators/row_conv_op.cu @@ -243,7 +243,6 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, int block_x, int block_y, const size_t *batch_indices, T *dfilter) { int blx = blockDim.x; - int bly = blockDim.y; int thx = threadIdx.x; int thy = threadIdx.y; int gx = blockIdx.x * blx; From 59c74fb14adcd23fc244275ef9e74b8430cc43fc Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 12 Dec 2017 11:35:38 +0800 Subject: [PATCH 263/275] change paddle:dev to paddle:latest-dev --- doc/howto/dev/contribute_to_paddle_cn.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md index 3eb477eb65..3e0bf7b397 100644 --- a/doc/howto/dev/contribute_to_paddle_cn.md +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a") ## 构建和测试 -编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:dev`来代替。 +编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。 如要build这个开发镜像,在源码目录树的根目录中运行: ```bash -➜ docker build -t 
paddle:dev . +➜ docker build -t paddle:latest-dev . ``` 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: ```bash -➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:dev +➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev ``` 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`): @@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a") 如果要运行所有的单元测试,可以用如下命令: ```bash -➜ docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" +➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest" ``` 关于构建和测试的更多信息,请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。 From 936f0546e3aaa772514bba721167e897769ec2a9 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 12 Dec 2017 10:24:50 +0800 Subject: [PATCH 264/275] fix img_pool maxout doc --- .../paddle/trainer_config_helpers/layers.py | 48 ++++++++++++------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index d0b14cf63c..3a82e858f7 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2729,15 +2729,17 @@ def img_pool_layer(input, .. math:: - w = 1 + \frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} \\\\ - h = 1 + \frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} - ceil_mode=False: .. 
math:: - w = 1 + \frac{floor(input\_width + 2 * padding - pool\_size)}{stride} \\\\ - h = 1 + \frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} The example usage is: @@ -2870,17 +2872,21 @@ def img_pool3d_layer(input, .. math:: - w = 1 + \frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} \\\\ - h = 1 + \frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} \\\\ - d = 1 + \frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} + w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + + d & = 1 + \\frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} - ceil_mode=False: .. math:: - w = 1 + \frac{floor(input\_width + 2 * padding - pool\_size)}{stride} \\\\ - h = 1 + \frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} \\\\ - d = 1 + \frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} \\\\ + w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + + d & = 1 + \\frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} The example usage is: @@ -5437,13 +5443,21 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): .. 
math:: - out = \max_k (in[n, k, o_c , s]) \\\\ - out_{i * s + j} = \max_k in_{ k * o_{c} * s + i * s + j} \\\\ - s = \frac{input.size}{ num\_channels} \\\\ - o_{c} =\frac{num\_channels}{groups} \\\\ - 0 \le i < o_{c} \\\\ - 0 \le j < s \\\\ - 0 \le k < groups \\\\ + + out & = \max_k (in[n, k, o_c , s]) + + out_{i * s + j} & = \max_k in_{ k * o_{c} * s + i * s + j} + + s & = \\frac{input.size}{ num\_channels} + + o_{c} & = \\frac{num\_channels}{groups} + + 0 \le i & < o_{c} + + 0 \le j & < s + + 0 \le k & < groups + The simple usage is: From 7d8802938149df6b0dcf8934c0eb7fe0e4e5145c Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 12 Dec 2017 12:55:58 +0800 Subject: [PATCH 265/275] equation align --- python/paddle/trainer_config_helpers/layers.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 3a82e858f7..7e118b24a4 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -5444,19 +5444,19 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): .. 
math:: - out & = \max_k (in[n, k, o_c , s]) + & out = \max_k (in[n, k, o_c , s]) - out_{i * s + j} & = \max_k in_{ k * o_{c} * s + i * s + j} + & out_{i * s + j} = \max_k in_{ k * o_{c} * s + i * s + j} - s & = \\frac{input.size}{ num\_channels} + & s = \\frac{input.size}{ num\_channels} - o_{c} & = \\frac{num\_channels}{groups} + & o_{c} = \\frac{num\_channels}{groups} - 0 \le i & < o_{c} + & 0 \le i < o_{c} - 0 \le j & < s + & 0 \le j < s - 0 \le k & < groups + & 0 \le k < groups The simple usage is: From 61ec0b951656fb402e61c4dc519e80ba3fbc61d0 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Tue, 12 Dec 2017 14:00:28 +0800 Subject: [PATCH 266/275] Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` --- paddle/framework/op_registry.h | 12 +- paddle/framework/operator.cc | 16 +- paddle/framework/operator.h | 26 +-- paddle/framework/operator_test.cc | 2 +- paddle/operators/CMakeLists.txt | 2 +- paddle/operators/accuracy_op.cc | 2 +- paddle/operators/accuracy_op.cu | 5 +- paddle/operators/accuracy_op.h | 2 +- paddle/operators/activation_op.cc | 21 +- paddle/operators/activation_op.cu | 23 +- paddle/operators/activation_op.h | 14 +- paddle/operators/adadelta_op.cc | 4 +- paddle/operators/adadelta_op.cu | 6 +- paddle/operators/adadelta_op.h | 4 +- paddle/operators/adagrad_op.cc | 18 +- paddle/operators/adagrad_op.cu | 20 +- paddle/operators/adagrad_op.h | 17 +- paddle/operators/adam_op.cc | 6 +- paddle/operators/adam_op.cu | 6 +- paddle/operators/adam_op.h | 10 +- paddle/operators/adamax_op.cc | 6 +- paddle/operators/adamax_op.cu | 6 +- 
paddle/operators/adamax_op.h | 10 +- paddle/operators/auc_op.h | 2 +- paddle/operators/batch_norm_op.cc | 14 +- paddle/operators/batch_norm_op.cu.cc | 32 +-- paddle/operators/batch_norm_op.h | 4 +- .../operators/bilinear_tensor_product_op.cc | 11 +- .../operators/bilinear_tensor_product_op.cu | 16 +- paddle/operators/bilinear_tensor_product_op.h | 41 ++-- paddle/operators/cast_op.cc | 2 +- paddle/operators/cast_op.cu | 6 +- paddle/operators/cast_op.h | 13 +- paddle/operators/chunk_eval_op.h | 2 +- paddle/operators/clip_by_norm_op.cc | 3 +- paddle/operators/clip_by_norm_op.cu | 5 +- paddle/operators/clip_by_norm_op.h | 5 +- paddle/operators/clip_op.cc | 8 +- paddle/operators/clip_op.cu | 8 +- paddle/operators/clip_op.h | 16 +- paddle/operators/compare_op.cu | 10 +- paddle/operators/compare_op.h | 31 ++- paddle/operators/concat_op.cu.cc | 9 +- paddle/operators/concat_op.h | 4 +- paddle/operators/conv_cudnn_op.cc | 22 +- paddle/operators/conv_cudnn_op.cu.cc | 32 +-- paddle/operators/conv_op.cc | 22 +- paddle/operators/conv_op.cu.cc | 26 ++- paddle/operators/conv_op.h | 55 +++-- paddle/operators/conv_shift_op.cu | 22 +- paddle/operators/conv_shift_op.h | 4 +- paddle/operators/conv_transpose_cudnn_op.cc | 18 +- .../operators/conv_transpose_cudnn_op.cu.cc | 32 +-- paddle/operators/conv_transpose_op.cc | 18 +- paddle/operators/conv_transpose_op.cu.cc | 28 ++- paddle/operators/conv_transpose_op.h | 58 ++--- paddle/operators/cos_sim_op.cc | 7 +- paddle/operators/cos_sim_op.cu | 9 +- paddle/operators/cos_sim_op.h | 10 +- paddle/operators/crf_decoding_op.cc | 5 +- paddle/operators/crf_decoding_op.h | 6 +- paddle/operators/crop_op.cc | 4 +- paddle/operators/crop_op.cu | 6 +- paddle/operators/crop_op.h | 19 +- paddle/operators/cross_entropy_op.cu | 23 +- paddle/operators/cross_entropy_op.h | 12 +- paddle/operators/decayed_adagrad_op.cc | 2 +- paddle/operators/decayed_adagrad_op.cu | 4 +- paddle/operators/decayed_adagrad_op.h | 4 +- paddle/operators/dropout_op.cc | 6 +- 
paddle/operators/dropout_op.cu | 12 +- paddle/operators/dropout_op.h | 10 +- paddle/operators/elementwise_add_op.cc | 16 +- paddle/operators/elementwise_add_op.cu | 21 +- paddle/operators/elementwise_add_op.h | 10 +- paddle/operators/elementwise_div_op.cc | 16 +- paddle/operators/elementwise_div_op.cu | 21 +- paddle/operators/elementwise_div_op.h | 8 +- paddle/operators/elementwise_mul_op.cc | 16 +- paddle/operators/elementwise_mul_op.cu | 21 +- paddle/operators/elementwise_mul_op.h | 8 +- paddle/operators/elementwise_op_function.h | 94 ++++---- paddle/operators/elementwise_sub_op.cc | 16 +- paddle/operators/elementwise_sub_op.cu | 21 +- paddle/operators/elementwise_sub_op.h | 8 +- paddle/operators/expand_op.cc | 7 +- paddle/operators/expand_op.cu | 9 +- paddle/operators/expand_op.h | 10 +- .../fill_constant_batch_size_like_op.cc | 11 +- .../fill_constant_batch_size_like_op.cu.cc | 13 +- .../fill_constant_batch_size_like_op.h | 7 +- paddle/operators/fill_zeros_like_op.cc | 11 +- paddle/operators/fill_zeros_like_op.cu.cc | 13 +- paddle/operators/fill_zeros_like_op.h | 7 +- paddle/operators/ftrl_op.cc | 4 +- paddle/operators/ftrl_op.cu | 4 +- paddle/operators/ftrl_op.h | 4 +- paddle/operators/gather.cu.h | 2 +- paddle/operators/gather_op.cu | 7 +- paddle/operators/gather_op.h | 3 +- paddle/operators/gaussian_random_op.cu | 4 +- paddle/operators/gru_op.cc | 11 +- paddle/operators/gru_op.cu.cc | 11 +- paddle/operators/gru_op.h | 48 ++-- paddle/operators/gru_unit_op.cc | 11 +- paddle/operators/gru_unit_op.cu | 13 +- paddle/operators/gru_unit_op.h | 64 +++--- paddle/operators/hinge_loss_op.cc | 7 +- paddle/operators/hinge_loss_op.cu | 9 +- paddle/operators/hinge_loss_op.h | 10 +- paddle/operators/huber_loss_op.cc | 7 +- paddle/operators/huber_loss_op.cu | 9 +- paddle/operators/huber_loss_op.h | 10 +- paddle/operators/l1_norm_op.cc | 7 +- paddle/operators/l1_norm_op.cu | 9 +- paddle/operators/l1_norm_op.h | 10 +- paddle/operators/linear_chain_crf_op.cc | 9 +- 
paddle/operators/linear_chain_crf_op.cu | 13 +- paddle/operators/linear_chain_crf_op.h | 24 +- paddle/operators/lod_reset_op.cu | 13 +- paddle/operators/lod_reset_op.h | 4 +- paddle/operators/log_loss_op.cc | 7 +- paddle/operators/log_loss_op.cu | 9 +- paddle/operators/log_loss_op.h | 8 +- paddle/operators/logical_op.cu | 8 +- paddle/operators/logical_op.h | 21 +- paddle/operators/lookup_table_op.cu | 20 +- paddle/operators/lrn_op.cc | 21 +- paddle/operators/lrn_op.cu | 30 +-- paddle/operators/lrn_op.h | 10 +- paddle/operators/lstm_op.cc | 11 +- paddle/operators/lstm_op.cu.cc | 11 +- paddle/operators/lstm_op.h | 94 ++++---- paddle/operators/lstm_unit_op.cu | 8 +- paddle/operators/lstm_unit_op.h | 4 +- paddle/operators/margin_rank_loss_op.cc | 4 +- paddle/operators/margin_rank_loss_op.cu | 8 +- paddle/operators/margin_rank_loss_op.h | 8 +- paddle/operators/math/context_project.cc | 4 +- paddle/operators/math/context_project.cu | 4 +- paddle/operators/math/context_project.h | 20 +- paddle/operators/math/cross_entropy.cc | 10 +- paddle/operators/math/cross_entropy.cu | 14 +- paddle/operators/math/cross_entropy.h | 6 +- paddle/operators/math/gru_compute.cc | 28 +-- paddle/operators/math/gru_compute.cu | 34 ++- paddle/operators/math/gru_compute.h | 13 +- paddle/operators/math/im2col.cc | 32 +-- paddle/operators/math/im2col.cu | 48 ++-- paddle/operators/math/im2col.h | 11 +- paddle/operators/math/im2col_test.cc | 27 +-- paddle/operators/math/lstm_compute.cc | 16 +- paddle/operators/math/lstm_compute.cu | 16 +- paddle/operators/math/lstm_compute.h | 13 +- paddle/operators/math/math_function.cc | 144 ++++++------ paddle/operators/math/math_function.cu | 207 ++++++++---------- paddle/operators/math/math_function.h | 81 ++++--- paddle/operators/math/math_function_impl.h | 39 ++-- paddle/operators/math/math_function_test.cc | 9 +- paddle/operators/math/math_function_test.cu | 10 +- paddle/operators/math/matmul.h | 19 +- paddle/operators/math/maxouting.cc | 16 +- 
paddle/operators/math/maxouting.cu | 33 ++- paddle/operators/math/maxouting.h | 14 +- paddle/operators/math/pooling.cc | 128 ++++++----- paddle/operators/math/pooling.cu | 182 +++++++-------- paddle/operators/math/pooling.h | 68 +++--- .../operators/math/selected_rows_functor.cc | 45 ++-- .../operators/math/selected_rows_functor.cu | 77 +++---- paddle/operators/math/selected_rows_functor.h | 16 +- .../math/selected_rows_functor_test.cc | 12 +- .../math/selected_rows_functor_test.cu | 12 +- paddle/operators/math/sequence2batch.cc | 16 +- paddle/operators/math/sequence2batch.cu | 19 +- paddle/operators/math/sequence2batch.h | 22 +- paddle/operators/math/sequence_pooling.cc | 18 +- paddle/operators/math/sequence_pooling.cu | 24 +- paddle/operators/math/sequence_pooling.h | 8 +- paddle/operators/math/softmax.cc | 8 +- paddle/operators/math/softmax.cu | 8 +- paddle/operators/math/softmax.h | 13 +- paddle/operators/math/softmax_impl.h | 32 ++- paddle/operators/math/unpooling.cc | 16 +- paddle/operators/math/unpooling.cu | 36 ++- paddle/operators/math/unpooling.h | 10 +- paddle/operators/math/vol2col.cc | 16 +- paddle/operators/math/vol2col.cu | 24 +- paddle/operators/math/vol2col.h | 10 +- paddle/operators/math/vol2col_test.cc | 24 +- paddle/operators/matmul_op.cc | 7 +- paddle/operators/matmul_op.cu.cc | 9 +- paddle/operators/matmul_op.h | 45 ++-- paddle/operators/maxout_op.cc | 7 +- paddle/operators/maxout_op.cu.cc | 13 +- paddle/operators/maxout_op.h | 18 +- paddle/operators/mean_op.cc | 11 +- paddle/operators/mean_op.cu | 11 +- paddle/operators/mean_op.h | 10 +- paddle/operators/minus_op.cc | 4 +- paddle/operators/minus_op.cu | 5 +- paddle/operators/minus_op.h | 5 +- paddle/operators/modified_huber_loss_op.cc | 2 +- paddle/operators/modified_huber_loss_op.cu | 8 +- paddle/operators/modified_huber_loss_op.h | 5 +- paddle/operators/momentum_op.cu | 4 +- paddle/operators/mul_op.cc | 7 +- paddle/operators/mul_op.cu.cc | 7 +- paddle/operators/mul_op.h | 18 +- 
paddle/operators/multiplex_op.cc | 5 +- paddle/operators/multiplex_op.cu | 16 +- paddle/operators/multiplex_op.h | 11 +- paddle/operators/nccl_op.cu.cc | 6 +- paddle/operators/nccl_op_test.cu.cc | 6 +- paddle/operators/nce_op.cc | 4 +- paddle/operators/nce_op.h | 8 +- paddle/operators/pad_op.cc | 7 +- paddle/operators/pad_op.cu | 7 +- paddle/operators/pad_op.h | 38 ++-- paddle/operators/pool_cudnn_op.cc | 26 ++- paddle/operators/pool_cudnn_op.cu.cc | 18 +- paddle/operators/pool_op.cc | 24 +- paddle/operators/pool_op.cu.cc | 26 ++- paddle/operators/pool_op.h | 59 ++--- paddle/operators/pool_with_index_op.cc | 22 +- paddle/operators/pool_with_index_op.cu.cc | 32 ++- paddle/operators/pool_with_index_op.h | 26 ++- paddle/operators/positive_negative_pair_op.h | 2 +- paddle/operators/precision_recall_op.h | 2 +- paddle/operators/prelu_op.cc | 9 +- paddle/operators/prelu_op.cu | 11 +- paddle/operators/prelu_op.h | 16 +- paddle/operators/proximal_adagrad_op.cc | 2 +- paddle/operators/proximal_adagrad_op.cu | 4 +- paddle/operators/proximal_adagrad_op.h | 10 +- paddle/operators/proximal_gd_op.cc | 3 +- paddle/operators/proximal_gd_op.cu | 5 +- paddle/operators/proximal_gd_op.h | 4 +- paddle/operators/rank_loss_op.cc | 7 +- paddle/operators/rank_loss_op.cu | 12 +- paddle/operators/rank_loss_op.h | 8 +- paddle/operators/reduce_op.cc | 15 +- paddle/operators/reduce_op.cu | 15 +- paddle/operators/reduce_op.h | 44 ++-- paddle/operators/reshape_op.cu | 4 +- paddle/operators/reshape_op.h | 4 +- paddle/operators/rmsprop_op.cc | 4 +- paddle/operators/rmsprop_op.cu | 4 +- paddle/operators/rmsprop_op.h | 4 +- paddle/operators/roi_pool_op.cc | 9 +- paddle/operators/roi_pool_op.cu | 15 +- paddle/operators/roi_pool_op.h | 9 +- paddle/operators/row_conv_op.cc | 13 +- paddle/operators/row_conv_op.cu | 17 +- paddle/operators/row_conv_op.h | 4 +- paddle/operators/scale_op.cc | 10 +- paddle/operators/scale_op.cu | 12 +- paddle/operators/scale_op.h | 5 +- paddle/operators/scatter_op.cu | 4 +- 
paddle/operators/seq_expand_op.cc | 7 +- paddle/operators/seq_expand_op.cu | 9 +- paddle/operators/seq_expand_op.h | 14 +- paddle/operators/sequence_concat_op.cc | 4 +- paddle/operators/sequence_concat_op.cu.cc | 10 +- paddle/operators/sequence_concat_op.h | 4 +- paddle/operators/sequence_conv_op.cc | 9 +- paddle/operators/sequence_conv_op.cu.cc | 13 +- paddle/operators/sequence_conv_op.h | 65 +++--- paddle/operators/sequence_pool_op.cc | 5 +- paddle/operators/sequence_pool_op.cu | 9 +- paddle/operators/sequence_pool_op.h | 26 ++- paddle/operators/sequence_slice_op.cc | 4 +- paddle/operators/sequence_slice_op.cu | 8 +- paddle/operators/sequence_slice_op.h | 9 +- paddle/operators/sequence_softmax_op.cc | 4 +- paddle/operators/sequence_softmax_op.cu.cc | 8 +- paddle/operators/sequence_softmax_op.h | 12 +- paddle/operators/sgd_op.cc | 13 +- paddle/operators/sgd_op.cu | 22 +- paddle/operators/sgd_op.h | 14 +- .../sigmoid_cross_entropy_with_logits_op.cc | 4 +- .../sigmoid_cross_entropy_with_logits_op.cu | 12 +- .../sigmoid_cross_entropy_with_logits_op.h | 9 +- paddle/operators/sign_op.cc | 4 +- paddle/operators/sign_op.cu | 5 +- paddle/operators/sign_op.h | 5 +- paddle/operators/smooth_l1_loss_op.cc | 5 +- paddle/operators/smooth_l1_loss_op.cu | 9 +- paddle/operators/smooth_l1_loss_op.h | 30 +-- paddle/operators/softmax_op.cc | 7 +- paddle/operators/softmax_op.cu.cc | 9 +- paddle/operators/softmax_op.h | 10 +- .../softmax_with_cross_entropy_op.cu | 40 ++-- .../operators/softmax_with_cross_entropy_op.h | 18 +- paddle/operators/split_op.cu.cc | 4 +- paddle/operators/split_op.h | 2 +- paddle/operators/squared_l2_distance_op.cc | 8 +- paddle/operators/squared_l2_distance_op.cu | 10 +- paddle/operators/squared_l2_distance_op.h | 10 +- paddle/operators/squared_l2_norm_op.cc | 4 +- paddle/operators/squared_l2_norm_op.cu | 8 +- paddle/operators/squared_l2_norm_op.h | 14 +- paddle/operators/sum_op.cc | 9 +- paddle/operators/sum_op.cu | 9 +- paddle/operators/sum_op.h | 23 +- 
paddle/operators/top_k_op.cu | 2 +- paddle/operators/top_k_op.h | 2 +- paddle/operators/transpose_op.cc | 6 +- paddle/operators/transpose_op.cu.cc | 9 +- paddle/operators/transpose_op.h | 29 +-- paddle/operators/uniform_random_op.cc | 2 +- paddle/operators/uniform_random_op.cu | 6 +- paddle/operators/unpool_op.cc | 11 +- paddle/operators/unpool_op.cu.cc | 13 +- paddle/operators/unpool_op.h | 22 +- paddle/platform/device_context.cc | 12 - paddle/platform/device_context.h | 17 -- paddle/platform/device_context_test.cc | 5 +- paddle/platform/transform.h | 24 +- paddle/platform/transform_test.cu | 8 +- 319 files changed, 2624 insertions(+), 2546 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index daade439e5..b29238432b 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar { return 0; \ } -#define REGISTER_OP_GPU_KERNEL(op_type, ...) \ - REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__) +#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) 
\ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) @@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar { #else #define USE_OP_KERNEL(op_type) \ USE_OP_DEVICE_KERNEL(op_type, CPU); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) + USE_OP_DEVICE_KERNEL(op_type, CUDA) #endif #define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); @@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar { USE_OP_ITSELF(op_type); \ USE_OP_DEVICE_KERNEL(op_type, CPU); -#define USE_GPU_ONLY_OP(op_type) \ - USE_OP_ITSELF(op_type); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) +#define USE_CUDA_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) #define USE_OP(op_type) \ USE_OP_ITSELF(op_type); \ diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index f1444eeee9..e83d754783 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -22,20 +22,6 @@ limitations under the License. */ namespace paddle { namespace framework { -template <> -Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return *device_context_.GetEigenDevice(); -} - -#ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice& -ExecutionContext::GetEigenDevice() const { - return *device_context_.GetEigenDevice(); -} -#endif - std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE(ins.size(), 1UL, @@ -429,7 +415,7 @@ void OperatorWithKernel::Run(const Scope& scope, } OpKernelType OperatorWithKernel::GetKernelType( const ExecutionContext& ctx) const { - return OpKernelType(IndicateDataType(ctx), ctx.device_context()); + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); } DataType OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 60861d9293..e60dbfc313 100644 --- a/paddle/framework/operator.h +++ 
b/paddle/framework/operator.h @@ -276,17 +276,25 @@ class ExecutionContext { out_tensor->set_lod(in_tensor.lod()); } - template ::EigenDeviceType> - DeviceType& GetEigenDevice() const; - platform::Place GetPlace() const { return device_context_.GetPlace(); } + template + const DeviceContextType& device_context() const { + return *reinterpret_cast(&device_context_); + } + const platform::DeviceContext& device_context() const { return device_context_; } +#ifdef PADDLE_WITH_CUDA + const inline platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + return *reinterpret_cast( + &device_context_); + } +#endif + //! Get actual name vector for this input. const std::vector& Inputs(const std::string& name) const { return op_.Inputs(name); @@ -297,14 +305,6 @@ class ExecutionContext { return op_.Outputs(name); } -#ifdef PADDLE_WITH_CUDA - const inline platform::CUDADeviceContext& cuda_device_context() const { - PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); - return *reinterpret_cast( - &device_context_); - } -#endif - private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 59ddbc7791..b678178454 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override {} OpKernelType GetKernelType(const ExecutionContext& ctx) const override { - return OpKernelType(DataType::FP32, ctx.device_context()); + return OpKernelType(DataType::FP32, ctx.GetPlace()); } }; diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 38b89b9eb1..5aaaf99332 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -138,7 +138,7 @@ function(op_library TARGET) if ("${TARGET}" STREQUAL "nccl_op") 
set(pybind_flag 1) # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n") + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") endif() # reduce_op contains several operators diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 2785a8c6fb..76da21c472 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index d2dcab4e54..539a935302 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { // FIXME(typhoonzero): types of T is for inference data. 
// label data is always int64 -REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(accuracy, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h index d060e6eddd..04104a695f 100644 --- a/paddle/operators/accuracy_op.h +++ b/paddle/operators/accuracy_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class AccuracyKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 7f3118f176..63490f0ec9 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -611,16 +611,17 @@ REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, ops::ActivationOpGrad); -#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 97737857ab..856d3fc35d 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -17,16 +17,17 @@ namespace ops = paddle::operators; -#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - 
REGISTER_OP_GPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); -FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL); +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index ac0e0a3b01..75eefca8b8 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class ActivationKernel : public framework::OpKernel { public: @@ -32,18 +32,19 @@ class ActivationKernel auto x = framework::EigenVector::Flatten(*X); auto y = framework::EigenVector::Flatten(*Y); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y); + functor(*place, x, y); } }; -template +template class ActivationGradKernel : public framework::OpKernel { public: @@ -59,13 +60,14 @@ class ActivationGradKernel auto x = framework::EigenVector::Flatten(*X); auto y = framework::EigenVector::Flatten(*Y); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y, dy, dx); + functor(*place, x, y, dy, dx); } }; diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc index 29434a0ee2..507811e7b5 100644 --- a/paddle/operators/adadelta_op.cc +++ b/paddle/operators/adadelta_op.cc @@ -109,5 +109,5 @@ $$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, 
ops::AdadeltaOpMaker); REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu index 9fb6185207..eee2d0a2f5 100644 --- a/paddle/operators/adadelta_op.cu +++ b/paddle/operators/adadelta_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adadelta_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +REGISTER_OP_CUDA_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h index a8c5f0c8aa..819d0845db 100644 --- a/paddle/operators/adadelta_op.h +++ b/paddle/operators/adadelta_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel { framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); auto avg_squared_update_out = framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); avg_squared_grad_out.device(place) = rho * avg_squared_grad + (1 - rho) * grad.square(); diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index d19602244b..5d00716316 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -100,8 +100,8 @@ size_t FindPos(const std::vector& rows, int64_t value) { } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& grad, const 
framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { @@ -120,7 +120,7 @@ struct SparseAdagradFunctor { {static_cast(merge_rows.size()), grad_width}), context.GetPlace()); - math::SetConstant constant_functor; + math::SetConstant constant_functor; constant_functor(context, grad_merge->mutable_value(), 0.0); auto* grad_merge_data = grad_merge->mutable_value()->data(); @@ -144,9 +144,9 @@ struct SparseAdagradFunctor { auto gs = framework::EigenVector::Flatten(*(grad_square->mutable_value())); auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; + gs.device(*context.eigen_device()) = gm * gm; - math::SelectedRowsAddToTensor functor; + math::SelectedRowsAddToTensor functor; functor(context, *grad_square, moment); // 3. update parameter @@ -164,13 +164,13 @@ struct SparseAdagradFunctor { } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); REGISTER_OP_CPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 1c870214b2..585b2d9289 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { @@ 
-92,7 +92,7 @@ struct SparseAdagradFunctor { {static_cast(merge_rows.size()), grad_width}), context.GetPlace()); - math::SetConstant constant_functor; + math::SetConstant constant_functor; constant_functor(context, grad_merge->mutable_value(), 0.0); auto* grad_merge_data = grad_merge->mutable_value()->data(); @@ -119,9 +119,9 @@ struct SparseAdagradFunctor { auto gs = framework::EigenVector::Flatten(*(grad_square->mutable_value())); auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; + gs.device(*context.eigen_device()) = gm * gm; - math::SelectedRowsAddToTensor functor; + math::SelectedRowsAddToTensor functor; functor(context, *grad_square, moment); // 3. update parameter @@ -139,13 +139,13 @@ struct SparseAdagradFunctor { } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); +REGISTER_OP_CUDA_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h index 4d4a6434c7..0d77dbcbac 100644 --- a/paddle/operators/adagrad_op.h +++ b/paddle/operators/adagrad_op.h @@ -19,15 +19,15 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param); }; -template +template class AdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,11 +52,11 @@ class AdagradOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = moment + grad * grad; + moment_out.device(*place) = moment + grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } else if (grad_var->IsType()) { auto* param_tensor = ctx.Input("Param"); @@ -65,8 +65,9 @@ class AdagradOpKernel : public framework::OpKernel { auto* moment_tensor = ctx.Input("Moment"); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); - SparseAdagradFunctor functor; - functor(ctx.device_context(), *ctx.Input("Grad"), + SparseAdagradFunctor functor; + functor(ctx.template device_context(), + *ctx.Input("Grad"), *ctx.Input("LearningRate"), epsilon, moment_out_tensor, param_out_tensor); } else { diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc index a268d05484..cf6ef6dd53 100644 --- a/paddle/operators/adam_op.cc +++ b/paddle/operators/adam_op.cc @@ -128,6 +128,6 @@ $$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); -REGISTER_OP_CPU_KERNEL(adam, - ops::AdamOpKernel, - ops::AdamOpKernel); +REGISTER_OP_CPU_KERNEL( + adam, 
ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu index 6e34f7818c..c135b37378 100644 --- a/paddle/operators/adam_op.cu +++ b/paddle/operators/adam_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adam_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adam, - ops::AdamOpKernel, - ops::AdamOpKernel); +REGISTER_OP_CUDA_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 7f7fa1da1c..45157842a6 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,17 +52,17 @@ class AdamOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment1_out = framework::EigenVector::Flatten(*moment1_out_tensor); auto moment2_out = framework::EigenVector::Flatten(*moment2_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad; - moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square(); + moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad; + moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square(); // All of these are tensors of 1 element auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow); // Eigen does not support automatic broadcast // Get dimensions of moment vector to broadcast lr_t Eigen::DSizes m_dsize(moment1_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr_t.broadcast(m_dsize) * (moment1_out / (moment2_out.sqrt() + epsilon)); diff --git a/paddle/operators/adamax_op.cc 
b/paddle/operators/adamax_op.cc index 9e7576c961..49ce497bb7 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -127,6 +127,6 @@ division by 0 error. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL(adamax, - ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +REGISTER_OP_CPU_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu index 057ef39025..2d143905c4 100644 --- a/paddle/operators/adamax_op.cu +++ b/paddle/operators/adamax_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adamax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adamax, - ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +REGISTER_OP_CUDA_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h index bf36ed7860..172c179c5f 100644 --- a/paddle/operators/adamax_op.h +++ b/paddle/operators/adamax_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,14 +51,14 @@ class AdamaxOpKernel : public framework::OpKernel { auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); auto inf_norm_out = framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(place) = + moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; + inf_norm_out.device(*place) = grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); auto lr_t = lr / (1 - beta1_pow); Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); } }; diff --git a/paddle/operators/auc_op.h b/paddle/operators/auc_op.h index e5ac57b038..b80509e2a9 100644 --- a/paddle/operators/auc_op.h +++ b/paddle/operators/auc_op.h @@ -25,7 +25,7 @@ template using EigenVector = framework::EigenVector; -template +template class AucKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index ac97bd83ab..94a972b7ab 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -135,7 +135,8 @@ The required data format for this layer is one of the following: }; template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const float epsilon = ctx.Attr("epsilon"); @@ -318,12 +319,12 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW("can't find Y@GRAD"); } 
return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.device_context()); + ctx.GetPlace()); } }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -436,8 +437,9 @@ class BatchNormGradKernel namespace ops = paddle::operators; REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, batch_norm_grad, ops::BatchNormGradOp); -REGISTER_OP_CPU_KERNEL(batch_norm, - ops::BatchNormKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm, + ops::BatchNormKernel); REGISTER_OP_CPU_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index 7b2f318700..c7adc3d80e 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -47,7 +47,8 @@ void ExtractNCWHD(const framework::DDim &dims, } template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -121,11 +122,12 @@ class BatchNormKernel : public framework::OpKernel { saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - math::SetConstant functor; - functor(ctx.device_context(), saved_mean, 0); - functor(ctx.device_context(), saved_variance, 0); + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(dev_ctx, saved_mean, 0); + functor(dev_ctx, saved_variance, 0); - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto handle = dev_ctx.cudnn_handle(); // Now, depending on whether we are running test or not, we have two paths. 
if (is_test) { @@ -171,7 +173,7 @@ class BatchNormKernel : public framework::OpKernel { }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -244,11 +246,12 @@ class BatchNormGradKernel const void *saved_mean_data = saved_mean->template data(); const void *saved_var_data = saved_var->template data(); + auto &dev_ctx = ctx.template device_context(); CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( - ctx.cuda_device_context().cudnn_handle(), mode_, - CudnnDataType::kOne(), CudnnDataType::kZero(), - CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, - x->template data(), data_desc_, d_y->template data(), data_desc_, + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, scale->template data(), d_scale->template mutable_data(ctx.GetPlace()), @@ -266,8 +269,9 @@ class BatchNormGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(batch_norm, - ops::BatchNormKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CUDA_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h index 4e80134a1a..8d99b68647 100644 --- a/paddle/operators/batch_norm_op.h +++ b/paddle/operators/batch_norm_op.h @@ -34,13 +34,13 @@ inline TensorFormat StringToTensorFormat(const std::string& str) { } } -template +template class BatchNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override; }; -template +template class BatchNormGradKernel : public framework::OpKernel { public: void 
Compute(const framework::ExecutionContext& ctx) const override; diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc index c88b2c9beb..217fd52366 100644 --- a/paddle/operators/bilinear_tensor_product_op.cc +++ b/paddle/operators/bilinear_tensor_product_op.cc @@ -159,9 +159,12 @@ REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpGrad); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu index 858d2668d0..0f48010716 100644 --- a/paddle/operators/bilinear_tensor_product_op.cu +++ b/paddle/operators/bilinear_tensor_product_op.cu @@ -16,11 +16,15 @@ limitations under the License. 
*/ #include "paddle/operators/bilinear_tensor_product_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_GPU_KERNEL( + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h index 1113a4c6f3..ba9a2c5ce3 100644 --- a/paddle/operators/bilinear_tensor_product_op.h +++ b/paddle/operators/bilinear_tensor_product_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class BilinearTensorProductKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -46,7 +46,8 @@ class BilinearTensorProductKernel : public framework::OpKernel { int out_dim = weight_dims[0]; auto x_dim = weight_dims[1]; auto y_dim = weight_dims[2]; - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the result of // Input(X) multiplied by Input(Weight_i), the formula is: @@ -60,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel { auto output_col_vec = output_mat.chip(i, 1); Tensor weight_mat = weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x->data(), - weight_mat.data(), 0, left_mul.data()); + math::gemm(dev_ctx, CblasNoTrans, CblasNoTrans, + batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); 
output_col_vec.device(place) = (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); } @@ -74,7 +75,7 @@ class BilinearTensorProductKernel : public framework::OpKernel { } }; -template +template class BilinearTensorProductGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -96,8 +97,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { auto x_mat = EigenMatrix::From(*x); auto y_mat = EigenMatrix::From(*y); auto d_out_mat = EigenMatrix::From(*d_out); - auto place = ctx.GetEigenDevice(); - + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the Output(Y@Grad). Tensor x_scale; x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), @@ -110,18 +111,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { ctx.GetPlace()); auto y_scale_mat = EigenMatrix::From(y_scale); - math::SetConstant set_zero; + math::SetConstant set_zero; // Set Output(X@Grad) be zero. if (d_x) { d_x->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_x, static_cast(0)); + set_zero(dev_ctx, d_x, static_cast(0)); } // Set Output(Y@Grad) be zero. if (d_y) { d_y->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_y, static_cast(0)); + set_zero(dev_ctx, d_y, static_cast(0)); } // Caculate the Output(X@Grad) and Output(Y@Grad). 
@@ -137,18 +138,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_x) * y_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasTrans, - batch_size, x_dim, y_dim, 1, y_scale.data(), - weight_i.data(), 1, d_x->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); } if (d_y) { x_scale_mat.device(place) = output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_y) * x_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x_scale.data(), - weight_i.data(), 1, d_y->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); } } } @@ -165,9 +166,9 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_weight) * x_mat; - math::gemm(ctx.device_context(), CblasTrans, CblasNoTrans, - x_dim, y_dim, batch_size, 1, x_scale.data(), - y->data(), 0, d_weight_i.data()); + math::gemm(dev_ctx, CblasTrans, CblasNoTrans, x_dim, + y_dim, batch_size, 1, x_scale.data(), + y->data(), 0, d_weight_i.data()); } } diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc index 3082a53ccf..42bff69a1e 100644 --- a/paddle/operators/cast_op.cc +++ b/paddle/operators/cast_op.cc @@ -68,7 +68,7 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUPlace; +using CPU = paddle::platform::CPUDeviceContext; REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, ops::CastOpProtoMaker); REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu index fb75ddbabf..4681deaa62 100644 --- 
a/paddle/operators/cast_op.cu +++ b/paddle/operators/cast_op.cu @@ -16,7 +16,7 @@ template using CastOpKernel = - paddle::operators::CastOpKernel; + paddle::operators::CastOpKernel; -REGISTER_OP_GPU_KERNEL(cast, CastOpKernel, CastOpKernel, - CastOpKernel, CastOpKernel); +REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel); diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h index 850dc8e349..a6773f13a8 100644 --- a/paddle/operators/cast_op.h +++ b/paddle/operators/cast_op.h @@ -27,13 +27,13 @@ struct CastOpTransformFunctor { HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } }; -template +template struct CastOpFunctor { const framework::Tensor* in_; framework::Tensor* out_; - const platform::DeviceContext& ctx_; + const DeviceContext& ctx_; CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, - const platform::DeviceContext& ctx) + const DeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} template @@ -42,13 +42,13 @@ struct CastOpFunctor { auto numel = in_->numel(); auto* in_end = in_begin + numel; auto* out_begin = out_->mutable_data(ctx_.GetPlace()); - platform::Transform trans; + platform::Transform trans; trans(ctx_, in_begin, in_end, out_begin, CastOpTransformFunctor()); } }; -template +template class CastOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,7 +56,8 @@ class CastOpKernel : public framework::OpKernel { auto* out = context.Output("Out"); framework::VisitDataType( static_cast(context.Attr("out_dtype")), - CastOpFunctor(in, out, context.device_context())); + CastOpFunctor( + in, out, context.template device_context())); } }; diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h index dd88f2553b..9cd758a825 100644 --- a/paddle/operators/chunk_eval_op.h +++ b/paddle/operators/chunk_eval_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = 
framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class ChunkEvalKernel : public framework::OpKernel { public: struct Segment { diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc index f73d55bbe3..0b7975a63f 100644 --- a/paddle/operators/clip_by_norm_op.cc +++ b/paddle/operators/clip_by_norm_op.cc @@ -71,4 +71,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, ops::ClipByNormOpMaker); REGISTER_OP_CPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.cu b/paddle/operators/clip_by_norm_op.cu index 2593a24ebb..acd7543823 100644 --- a/paddle/operators/clip_by_norm_op.cu +++ b/paddle/operators/clip_by_norm_op.cu @@ -15,5 +15,6 @@ #include "paddle/operators/clip_by_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); +REGISTER_OP_CUDA_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h index b26476cae9..d8db1566b0 100644 --- a/paddle/operators/clip_by_norm_op.h +++ b/paddle/operators/clip_by_norm_op.h @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class ClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,7 +38,8 @@ class ClipByNormKernel : public framework::OpKernel { auto x = EigenVector::Flatten(*input); auto out = EigenVector::Flatten(*output); auto x_norm = x.square().sum().sqrt(); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto temp = (x_norm <= max_norm).template cast().eval(); auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm; diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc index 
4ddf24dea3..6092212de4 100644 --- a/paddle/operators/clip_op.cc +++ b/paddle/operators/clip_op.cc @@ -83,7 +83,7 @@ class ClipOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, ops::ClipOpGrad); -REGISTER_OP_CPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_CPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.cu b/paddle/operators/clip_op.cu index ca9701298f..bb7dcc671a 100644 --- a/paddle/operators/clip_op.cu +++ b/paddle/operators/clip_op.cu @@ -15,7 +15,7 @@ #include "paddle/operators/clip_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_GPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CUDA_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CUDA_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h index ac702e9935..0c40797410 100644 --- a/paddle/operators/clip_op.h +++ b/paddle/operators/clip_op.h @@ -55,7 +55,7 @@ class ClipGradFunctor { T max_; }; -template +template class ClipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -66,13 +66,13 @@ class ClipKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); const T* x_data = x->data(); int64_t numel = x->numel(); - Transform trans; - trans(context.device_context(), x_data, x_data + numel, out_data, - ClipFunctor(min, max)); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); } }; -template +template class ClipGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -86,9 +86,9 @@ class ClipGradKernel : public 
framework::OpKernel { auto* d_x_data = d_x->mutable_data(context.GetPlace()); const T* d_out_data = d_out->data(); const T* x_data = x->data(); - Transform trans; - trans(context.device_context(), d_out_data, d_out_data + numel, x_data, - d_x_data, ClipGradFunctor(min, max)); + Transform trans; + trans(context.template device_context(), d_out_data, + d_out_data + numel, x_data, d_x_data, ClipGradFunctor(min, max)); } } }; diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu index 6ac8c124b9..596a878bcf 100644 --- a/paddle/operators/compare_op.cu +++ b/paddle/operators/compare_op.cu @@ -14,10 +14,10 @@ #include "paddle/operators/compare_op.h" -REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor); -REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor); -REGISTER_LOGICAL_KERNEL(greater_than, GPU, +REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_KERNEL(greater_than, CUDA, paddle::operators::GreaterThanFunctor); -REGISTER_LOGICAL_KERNEL(greater_equal, GPU, +REGISTER_LOGICAL_KERNEL(greater_equal, CUDA, paddle::operators::GreaterEqualFunctor); -REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor); +REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index afdf3ab3e0..a56536e155 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -59,7 +59,7 @@ struct EqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -69,24 +69,23 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); 
+ platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; } // namespace operators } // namespace paddle -#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>); +#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/concat_op.cu.cc b/paddle/operators/concat_op.cu.cc index ede832ddcd..7b46452d3d 100644 --- a/paddle/operators/concat_op.cu.cc +++ b/paddle/operators/concat_op.cu.cc @@ -14,7 +14,8 @@ limitations under the License. */ #include "paddle/operators/concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(concat, - ops::ConcatKernel); -REGISTER_OP_GPU_KERNEL( - concat_grad, ops::ConcatGradKernel); +REGISTER_OP_CUDA_KERNEL( + concat, ops::ConcatKernel); +REGISTER_OP_CUDA_KERNEL( + concat_grad, + ops::ConcatGradKernel); diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h index c113f19fb5..de4011585a 100644 --- a/paddle/operators/concat_op.h +++ b/paddle/operators/concat_op.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class ConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class ConcatKernel : public framework::OpKernel { } }; -template +template class ConcatGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc index 0dd8c13b2a..008bf01885 100644 --- a/paddle/operators/conv_cudnn_op.cc +++ b/paddle/operators/conv_cudnn_op.cc @@ -57,18 +57,20 @@ REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker, REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker, conv3d_cudnn_grad, ops::ConvOpGrad); -REGISTER_OP_CPU_KERNEL(conv2d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_cudnn, + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( conv2d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL(conv3d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_cudnn, + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( conv3d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index bc265dcc4f..3da0a9001a 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -118,7 +118,8 @@ class CudnnConvOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); 
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, @@ -238,7 +239,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel { workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( @@ -313,16 +315,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(conv2d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); - -REGISTER_OP_GPU_KERNEL(conv3d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_cudnn, + paddle::operators::CudnnConvOpKernel, + paddle::operators::CudnnConvOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel, + paddle::operators::CudnnConvGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(conv3d_cudnn, + paddle::operators::CudnnConvOpKernel, + paddle::operators::CudnnConvOpKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel, + paddle::operators::CudnnConvGradOpKernel); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 462e6d9cbc..7ef805fd44 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -235,16 +235,18 @@ namespace ops = paddle::operators; REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); 
-REGISTER_OP_CPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc index 546451234a..38615a8bef 100644 --- a/paddle/operators/conv_op.cu.cc +++ b/paddle/operators/conv_op.cu.cc @@ -16,16 +16,18 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_GPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index 09bff0a68d..749258183b 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -72,7 +72,7 @@ class ConvOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override; }; -template +template class GemmConvKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& context) const override { @@ -141,9 +141,10 @@ class GemmConvKernel : public framework::OpKernel { int in_step = static_cast(input->dims()[1]) / groups; int out_step = static_cast(output->dims()[1]) / groups; - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + auto& dev_ctx = context.template device_context(); for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); @@ -157,27 +158,26 @@ class GemmConvKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } else if (data_dim == 2U) { // im2col - im2col(context.device_context(), in_slice, dilations, strides, + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); } else if (data_dim == 3U) { // vol2col - vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), filter_slice, false, - col_matrix, false, T(1.0), &out_slice, T(0.0)); + math::matmul(dev_ctx, filter_slice, false, col_matrix, + false, T(1.0), &out_slice, T(0.0)); } } } }; -template +template class GemmConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -256,14 +256,15 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); if (input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); + 
set_zero(dev_ctx, input_grad, static_cast(0)); - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = @@ -282,18 +283,17 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(in_grad_slice); col_matrix.Resize(col_matrix_shape); } - math::matmul(context.device_context(), filter_slice, true, - out_grad_slice, false, T(1.0), &col_matrix, - T(0.0)); + math::matmul(dev_ctx, filter_slice, true, + out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); if (is_expand && data_dim == 2U) { - col2im(context.device_context(), col, dilations, strides, + col2im(dev_ctx, col, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &in_grad_slice); } else if (is_expand && data_dim == 3U) { - col2vol(context.device_context(), col, dilations, strides, paddings, - &in_grad_slice); + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); } } } @@ -303,9 +303,9 @@ class GemmConvGradKernel : public framework::OpKernel { filter_grad->mutable_data(context.GetPlace()); Tensor filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); - set_zero(context.device_context(), filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; + set_zero(dev_ctx, filter_grad, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = output_grad->Slice(i, i + 1).Resize(output_matrix_shape); @@ -321,21 +321,20 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } else if (data_dim == 2U) { - im2col(context.device_context(), in_slice, dilations, strides, + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); } else if (data_dim == 3U) { - 
vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor filter_grad_slice = filter_grad_.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), out_grad_slice, - false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); + math::matmul(dev_ctx, out_grad_slice, false, + col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); } } } diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu index 95e13c38a8..f7ca82ce26 100644 --- a/paddle/operators/conv_shift_op.cu +++ b/paddle/operators/conv_shift_op.cu @@ -111,7 +111,8 @@ __global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width, } // namespace template -class ConvShiftKernel : public framework::OpKernel { +class ConvShiftKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -132,7 +133,8 @@ class ConvShiftKernel : public framework::OpKernel { dim3 grid_dim(num_x_blocks, batch_size); - auto stream = context.cuda_device_context().stream(); + auto stream = + context.template device_context().stream(); ConvShiftForward<<>>( x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); @@ -140,7 +142,7 @@ class ConvShiftKernel : public framework::OpKernel { }; template -class ConvShiftGradKernel +class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -159,8 +161,9 @@ class ConvShiftGradKernel int y_width = Y->dims()[1]; int y_half_width = (y_width - 1) / 2; - auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + auto &device_ctx = + context.template device_context(); + math::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); @@ -186,8 +189,9 @@ class 
ConvShiftGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv_shift, - ops::ConvShiftKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL( conv_shift_grad, - ops::ConvShiftGradKernel); + ops::ConvShiftGradKernel); diff --git a/paddle/operators/conv_shift_op.h b/paddle/operators/conv_shift_op.h index 5a160b0f16..1a70b38a0d 100644 --- a/paddle/operators/conv_shift_op.h +++ b/paddle/operators/conv_shift_op.h @@ -18,13 +18,13 @@ namespace paddle { namespace operators { -template +template class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; }; -template +template class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc index 0192178ce3..4cb6a2ccff 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cc @@ -61,12 +61,13 @@ REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp, REGISTER_OP_CPU_KERNEL( conv2d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad, @@ -74,9 +75,10 @@ REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, REGISTER_OP_CPU_KERNEL( conv3d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( 
conv3d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc index 494904fe52..f0297f6c40 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc @@ -83,7 +83,8 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionBwdDataAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); // Get the algorithm PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, @@ -165,7 +166,8 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { // choose backward algorithm for data PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( @@ -234,16 +236,16 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); - -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); 
+REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn, + ops::CudnnConvTransposeOpKernel, + ops::CudnnConvTransposeOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn_grad, + ops::CudnnConvTransposeGradOpKernel, + ops::CudnnConvTransposeGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn, + ops::CudnnConvTransposeOpKernel, + ops::CudnnConvTransposeOpKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn_grad, + ops::CudnnConvTransposeGradOpKernel, + ops::CudnnConvTransposeGradOpKernel); diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 678b192dea..ca063e94bb 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -197,21 +197,23 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, REGISTER_OP_CPU_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc index 4165eb0c7b..b91ebd7922 100644 --- a/paddle/operators/conv_transpose_op.cu.cc +++ b/paddle/operators/conv_transpose_op.cu.cc @@ -16,20 +16,24 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( 
+REGISTER_OP_CUDA_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 1cacb770e6..80600b5361 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -52,7 +52,7 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override; }; -template +template class GemmConvTransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -109,11 +109,12 @@ class GemmConvTransposeKernel : public framework::OpKernel { filter.Resize(filter_matrix_shape); output->mutable_data(context.GetPlace()); - math::SetConstant set_zero; - set_zero(context.device_context(), output, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, output, static_cast(0)); - math::Col2ImFunctor col2im; - math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; std::vector dilations({1, 1, 1}); // convolution transpose: gemm + col2im or col2vol (similar to conv-backward @@ -127,29 +128,27 @@ class 
GemmConvTransposeKernel : public framework::OpKernel { // col_matrix = filter * input_batch // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) - math::matmul(context.device_context(), filter, true, - input_batch, false, static_cast(1.0), - &col_matrix, static_cast(0.0)); + math::matmul(dev_ctx, filter, true, input_batch, false, + static_cast(1.0), &col_matrix, + static_cast(0.0)); if (data_dim == 2U) { // col2im: col_matrix -> dy // from (c * k_h * k_w, h * w) to (c, o_h, o_w) - col2im(context.device_context(), col, - std::vector{dilations[0], dilations[1]}, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, + col2im(dev_ctx, col, std::vector{dilations[0], dilations[1]}, + strides, std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, &output_batch); } else if (data_dim == 3U) { // col2vol: col_matrix -> dy // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) - col2vol(context.device_context(), col, dilations, strides, paddings, - &output_batch); + col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch); } } } }; -template +template class GemmConvTransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -206,6 +205,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // convolution transpose grad on input: // im2col + gemm (similar to conv-forward) // input need to compute gradient + auto& dev_ctx = context.template device_context(); if (input_grad || filter_grad) { Tensor col; col.mutable_data(col_shape, context.GetPlace()); @@ -217,19 +217,19 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); Tensor filter_grad_; - math::SetConstant set_zero; + math::SetConstant set_zero; - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; std::vector dilations({1, 1, 1}); if 
(input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); } if (filter_grad) { // filter size (m, c, k_h, k_w) filter_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_grad, static_cast(0)); + set_zero(dev_ctx, filter_grad, static_cast(0)); filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); } @@ -242,7 +242,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { if (data_dim == 2U) { // im2col: dy -> col matrix // from (c, o_h, o_w) to (c * k_h * k_w, h * w) - im2col(context.device_context(), output_grad_batch, + im2col(dev_ctx, output_grad_batch, std::vector{dilations[0], dilations[1]}, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, @@ -250,8 +250,8 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } else if (data_dim == 3U) { // vol2col: dy -> col_matrix // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) - vol2col(context.device_context(), output_grad_batch, dilations, - strides, paddings, &col); + vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, + &col); } if (input_grad) { @@ -263,9 +263,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, // d, h, w) - math::matmul(context.device_context(), filter, false, - col_matrix, false, static_cast(1.0), - &input_grad_batch, static_cast(0.0)); + math::matmul( + dev_ctx, filter, false, col_matrix, false, static_cast(1.0), + &input_grad_batch, static_cast(0.0)); } if (filter_grad) { // input batch @@ -275,9 +275,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * // k_h * k_w) - math::matmul(context.device_context(), in_batch, false, - col_matrix, true, static_cast(1.0), - 
&filter_grad_, static_cast(1.0)); + math::matmul(dev_ctx, in_batch, false, col_matrix, + true, static_cast(1.0), + &filter_grad_, static_cast(1.0)); } } } diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 312264ccd4..440c427cba 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -155,7 +155,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, ops::CosSimOpGrad); -REGISTER_OP_CPU_KERNEL(cos_sim, - ops::CosSimKernel); REGISTER_OP_CPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); + cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 0cb8fd26de..1cb01f5945 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/cos_sim_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cos_sim, - ops::CosSimKernel); -REGISTER_OP_GPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 62a4e484ec..fecb5a79b2 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -27,7 +27,7 @@ template using EigenVector = framework::EigenVector; -template +template class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -51,7 +51,8 @@ class CosSimKernel : public framework::OpKernel { auto y_norm = EigenVector::Flatten(*out_y_norm); // compute - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto row_along = Eigen::array({{1}}); x_norm.device(place) = 
x.square().sum(row_along).sqrt(); y_norm.device(place) = y.square().sum(row_along).sqrt(); @@ -66,7 +67,7 @@ class CosSimKernel : public framework::OpKernel { } }; -template +template class CosSimGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -96,7 +97,8 @@ class CosSimGradKernel : public framework::OpKernel { auto z_bcast = z.broadcast(bcast_cols); auto dz_bcast = dz.broadcast(bcast_cols); auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); if (rows_x == rows_y) { auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols); auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols); diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index 291b23ed1b..1ce189fa6e 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -135,5 +135,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, ops::CRFDecodingOpMaker); REGISTER_OP_CPU_KERNEL( - crf_decoding, ops::CRFDecodingOpKernel, - ops::CRFDecodingOpKernel); + crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h index 57b5e21b3a..f6827b7b11 100644 --- a/paddle/operators/crf_decoding_op.h +++ b/paddle/operators/crf_decoding_op.h @@ -24,7 +24,7 @@ using framework::LoDTensor; using framework::LoD; using framework::Tensor; -template +template class CRFDecodingOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -44,8 +44,8 @@ class CRFDecodingOpKernel : public framework::OpKernel { const size_t seq_num = lod[level].size() - 1; int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); - 
math::SetConstant()(ctx.device_context(), - decoded_path, 0); + math::SetConstant()( + ctx.template device_context(), decoded_path, 0); for (size_t i = 0; i < seq_num; ++i) { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 6752eb8c1c..7c2a0ac7a7 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -133,5 +133,5 @@ class CropOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_CPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index f8ee18a1d6..90fd83ca10 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/crop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_GPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h index 2e72583d68..d531a19c78 100644 --- a/paddle/operators/crop_op.h +++ b/paddle/operators/crop_op.h @@ -49,7 +49,7 @@ class CropKernel : public framework::OpKernel { } }; -template +template void CropGradFunction(const framework::ExecutionContext& context) { auto* d_x = context.Output(framework::GradVarName("X")); if (d_x != nullptr) { @@ -63,12 +63,13 @@ void CropGradFunction(const framework::ExecutionContext& context) { } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - d_x_tensor.device(context.GetEigenDevice()) = + d_x_tensor.device( + *context.template device_context().eigen_device()) = 
d_out_tensor.pad(paddings, 0); } } -template +template class CropGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -76,22 +77,22 @@ class CropGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - CropGradFunction(context); + CropGradFunction(context); break; case 2: - CropGradFunction(context); + CropGradFunction(context); break; case 3: - CropGradFunction(context); + CropGradFunction(context); break; case 4: - CropGradFunction(context); + CropGradFunction(context); break; case 5: - CropGradFunction(context); + CropGradFunction(context); break; case 6: - CropGradFunction(context); + CropGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 6212e39dfd..0546964588 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -53,8 +53,9 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, label, ctx.Attr("soft_label")); + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, label, + ctx.Attr("soft_label")); } }; @@ -80,15 +81,17 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (batch_size * class_num + block - 1) / block; - auto stream = ctx.cuda_device_context().stream(); + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); if (ctx.Attr("soft_label")) { auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<>>( dx_data, dy_data, x_data, label_data, batch_size, class_num); } else { - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(dev_ctx, dx, 0); auto* label_data = label->data(); grid = 
(batch_size + block - 1) / block; CrossEntropyGradientKernel<<>>( @@ -101,8 +104,8 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, - ops::CrossEntropyOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(cross_entropy_grad, - ops::CrossEntropyGradientOpCUDAKernel, - ops::CrossEntropyGradientOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, + ops::CrossEntropyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpCUDAKernel, + ops::CrossEntropyGradientOpCUDAKernel); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 37db0a930a..5623d2ded1 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -37,8 +37,9 @@ class CrossEntropyOpKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, labels, ctx.Attr("soft_label")); + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, labels, + ctx.Attr("soft_label")); } }; @@ -61,7 +62,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto lbl_mat = EigenMatrix::From(*label); auto dx_mat = EigenMatrix::From(*dx); - dx_mat.device(ctx.GetEigenDevice()) = + dx_mat.device(*ctx.template device_context() + .eigen_device()) = -(lbl_mat * dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); } else { @@ -70,8 +72,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { const T* x_data = x->data(); const int64_t* label_data = label->data(); - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(ctx.template device_context(), dx, 0); for (int64_t i = 0; i < batch_size; ++i) { PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); diff --git 
a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc index 640b4e7744..fd29c7270b 100644 --- a/paddle/operators/decayed_adagrad_op.cc +++ b/paddle/operators/decayed_adagrad_op.cc @@ -99,4 +99,4 @@ REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, ops::DecayedAdagradOpMaker); REGISTER_OP_CPU_KERNEL( decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu index 6fce77fe4e..282b90f275 100644 --- a/paddle/operators/decayed_adagrad_op.cu +++ b/paddle/operators/decayed_adagrad_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/decayed_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h index 0fe0fc5acd..fec9705cfc 100644 --- a/paddle/operators/decayed_adagrad_op.h +++ b/paddle/operators/decayed_adagrad_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class DecayedAdagradOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index 932c0bf8fb..acd526ae80 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -100,6 +100,8 @@ namespace ops = paddle::operators; REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel); + dropout, + ops::CPUDropoutKernel); REGISTER_OP_CPU_KERNEL( - dropout_grad, ops::DropoutGradKernel); + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index db3578b9bf..10c670751d 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -58,7 +58,7 @@ class GPUDropoutKernel : public framework::OpKernel { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); + auto& place = *context.template device_context().eigen_device(); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); @@ -80,7 +80,9 @@ class GPUDropoutKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - dropout, ops::GPUDropoutKernel); -REGISTER_OP_GPU_KERNEL( - dropout_grad, 
ops::DropoutGradKernel); +REGISTER_OP_CUDA_KERNEL( + dropout, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h index d9a130fdc0..84ad39f0bb 100644 --- a/paddle/operators/dropout_op.h +++ b/paddle/operators/dropout_op.h @@ -25,7 +25,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CPUDropoutKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -55,13 +55,14 @@ class CPUDropoutKernel : public framework::OpKernel { } else { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Y.device(place) = X * dropout_prob; } } }; -template +template class DropoutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -77,7 +78,8 @@ class DropoutGradKernel : public framework::OpKernel { auto dX = EigenMatrix::Reshape(*grad_x, 1); auto dY = EigenMatrix::Reshape(*grad_y, 1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); dX.device(place) = dY * M; } }; diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc index 432b9ba6f7..a62eeeeb95 100644 --- a/paddle/operators/elementwise_add_op.cc +++ b/paddle/operators/elementwise_add_op.cc @@ -34,13 +34,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, elementwise_add_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); 
REGISTER_OP_CPU_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu index 7591428ac7..78642bb424 100644 --- a/paddle/operators/elementwise_add_op.cu +++ b/paddle/operators/elementwise_add_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index 921dc5f6a6..069bdaf0ab 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -24,7 +24,7 @@ struct AddFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } }; -template +template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -34,8 +34,8 @@ class ElementwiseAddKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - TransformFunctor, T, Place> functor( - x, y, z, ctx.device_context(), AddFunctor()); + TransformFunctor, T, DeviceContext> 
functor( + x, y, z, ctx.template device_context(), AddFunctor()); auto x_dims = x->dims(); auto y_dims = y->dims(); @@ -137,11 +137,11 @@ struct ElementwiseAddBroadCast2GradFunctor { } }; -template +template class ElementwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseAddOneGradFunctor, ElementwiseAddBroadCastGradFunctor, ElementwiseAddBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc index 7a325199bd..1c3e9e70ee 100644 --- a/paddle/operators/elementwise_div_op.cc +++ b/paddle/operators/elementwise_div_op.cc @@ -35,13 +35,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, elementwise_div_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu index de4d0c3344..502c528936 100644 --- a/paddle/operators/elementwise_div_op.cu +++ b/paddle/operators/elementwise_div_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseDivKernel, + 
ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index 8946ff3d25..d91313db42 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -19,11 +19,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseDivKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -102,11 +102,11 @@ struct ElementwiseDivBroadCast2GradFunctor { } }; -template +template class ElementwiseDivGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseDivGradFunctor, ElementwiseDivBroadCastGradFunctor, ElementwiseDivBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index 8851267a52..aadb95cbe3 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -36,13 +36,13 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, elementwise_mul_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - 
ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu index b0dfdee1cc..089451b3e1 100644 --- a/paddle/operators/elementwise_mul_op.cu +++ b/paddle/operators/elementwise_mul_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index 4469b07eaa..16fa5ec4b3 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -101,11 +101,11 @@ struct ElementwiseMulBroadCast2GradFunctor { } }; -template +template class ElementwiseMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseMulGradFunctor, ElementwiseMulBroadCastGradFunctor, 
ElementwiseMulBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index ca3542e783..7ebfc7df8c 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -59,17 +59,17 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } -template +template class RowwiseTransformIterator; -template +template class MidWiseTransformIterator; template -class RowwiseTransformIterator { +class RowwiseTransformIterator { public: RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} - RowwiseTransformIterator& operator++() { + RowwiseTransformIterator& operator++() { ++i_; if (UNLIKELY(i_ == n_)) { i_ = 0; @@ -77,13 +77,13 @@ class RowwiseTransformIterator { return *this; } - bool operator==( - const RowwiseTransformIterator& rhs) const { + bool operator==(const RowwiseTransformIterator& + rhs) const { return (ptr_ + i_) == &(*rhs); } - bool operator!=( - const RowwiseTransformIterator& rhs) const { + bool operator!=(const RowwiseTransformIterator& + rhs) const { return (ptr_ + i_) != &(*rhs); } @@ -96,12 +96,12 @@ class RowwiseTransformIterator { }; template -class MidWiseTransformIterator { +class MidWiseTransformIterator { public: MidWiseTransformIterator(const T* ptr, int n, int post) : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} - MidWiseTransformIterator& operator++() { + MidWiseTransformIterator& operator++() { ++j_; i_ = j_ / post_; if (UNLIKELY(i_ == n_)) { @@ -111,13 +111,13 @@ class MidWiseTransformIterator { return *this; } - bool operator==( - const MidWiseTransformIterator& rhs) const { + bool operator==(const MidWiseTransformIterator& + rhs) const { return (ptr_ + i_) == &(*rhs); } - bool operator!=( - const MidWiseTransformIterator& rhs) const { + bool operator!=(const MidWiseTransformIterator& + rhs) const { return (ptr_ + i_) != &(*rhs); } @@ -133,12 +133,12 @@ class MidWiseTransformIterator { #ifdef __NVCC__ 
template -class RowwiseTransformIterator +class RowwiseTransformIterator : public thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> { + RowwiseTransformIterator, const T*> { public: typedef thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> + RowwiseTransformIterator, const T*> super_t; HOSTDEVICE RowwiseTransformIterator(const T* x, int n) : super_t(x), begin_(x), n_(n){}; @@ -153,12 +153,12 @@ class RowwiseTransformIterator }; template -class MidWiseTransformIterator +class MidWiseTransformIterator : public thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> { + MidWiseTransformIterator, const T*> { public: typedef thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> + MidWiseTransformIterator, const T*> super_t; HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) : super_t(x), begin_(x), n_(n), post_(post){}; @@ -174,12 +174,11 @@ class MidWiseTransformIterator }; #endif -template +template class TransformFunctor { public: TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z, const platform::DeviceContext& ctx, - Functor func) + framework::Tensor* z, const DeviceContext& ctx, Functor func) : x_(x->data()), y_(y->data()), z_(z->mutable_data(ctx.GetPlace())), @@ -188,20 +187,20 @@ class TransformFunctor { func_(func) {} inline void Run() const { - platform::Transform trans; + platform::Transform trans; trans(ctx_, x_, x_ + nx_, y_, z_, func_); } inline void RunRowWise(int n, int pre) const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), z_, - func_); + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), + z_, func_); } inline void RunMidWise(int n, int pre, int post) const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, MidWiseTransformIterator(y_, n, post), - z_, func_); + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), 
z_, func_); } private: @@ -209,22 +208,24 @@ class TransformFunctor { const T* y_; T* z_; int64_t nx_; - const platform::DeviceContext& ctx_; + const DeviceContext& ctx_; Functor func_; }; #define EIGEN_FUNCTOR(name, eigen_op) \ struct Eigen##name##Functor { \ - template \ + template \ inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ framework::Tensor* z, \ const framework::ExecutionContext& ctx) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_e); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_e); \ } \ - template \ + template \ inline void RunBroadCast(const framework::Tensor* x, \ const framework::Tensor* y, framework::Tensor* z, \ const framework::ExecutionContext& ctx, int pre, \ @@ -235,9 +236,11 @@ class TransformFunctor { auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ .broadcast(Eigen::DSizes(pre, 1)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ - template \ + template \ inline void RunBroadCast2(const framework::Tensor* x, \ const framework::Tensor* y, \ framework::Tensor* z, \ @@ -249,11 +252,13 @@ class TransformFunctor { auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ .broadcast(Eigen::DSizes(pre, 1, post)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ } -template +template void ElementwiseCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -269,7 +274,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { if (x_dims == y_dims) { functor f; - f.template Run(x, 
y, z, ctx); + f.template Run(x, y, z, ctx); return; } @@ -282,11 +287,11 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { get_mid_dims(x_dims, y_dims, axis, pre, n, post); if (post == 1) { functor f; - f.template RunBroadCast(x, y, z, ctx, pre, n); + f.template RunBroadCast(x, y, z, ctx, pre, n); return; } else { functor f; - f.template RunBroadCast2(x, y, z, ctx, pre, n, post); + f.template RunBroadCast2(x, y, z, ctx, pre, n, post); return; } } @@ -303,8 +308,9 @@ EIGEN_FUNCTOR(Mul, EIGEN_MUL); #define EIGEN_DIV(x, y) ((x) / (y)) EIGEN_FUNCTOR(Div, EIGEN_DIV); -template +template void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -313,7 +319,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc index 95d7979e39..3e4d19361e 100644 --- a/paddle/operators/elementwise_sub_op.cc +++ b/paddle/operators/elementwise_sub_op.cc @@ -34,13 +34,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, elementwise_sub_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git 
a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu index ec23bec35f..0b2f0f7d4d 100644 --- a/paddle/operators/elementwise_sub_op.cu +++ b/paddle/operators/elementwise_sub_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index 3f40c1c5bc..731a30c5e3 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseSubKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -101,11 +101,11 @@ struct ElementwiseSubBroadCast2GradFunctor { } }; -template +template class ElementwiseSubGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseSubOneGradFunctor, ElementwiseSubBroadCastGradFunctor, ElementwiseSubBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc index 282775fcda..8b3cddbb94 100644 --- a/paddle/operators/expand_op.cc +++ b/paddle/operators/expand_op.cc @@ -130,7 +130,8 @@ class 
ExpandGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad, ops::ExpandGradOp); -REGISTER_OP_CPU_KERNEL(expand, - ops::ExpandKernel); REGISTER_OP_CPU_KERNEL( - expand_grad, ops::ExpandGradKernel); + expand, ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu index 6744562b6c..99ee584d08 100644 --- a/paddle/operators/expand_op.cu +++ b/paddle/operators/expand_op.cu @@ -17,7 +17,8 @@ #include "paddle/operators/expand_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(expand, - ops::ExpandKernel); -REGISTER_OP_GPU_KERNEL( - expand_grad, ops::ExpandGradKernel); +REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h index 4d7996ad1e..14ef8b0912 100644 --- a/paddle/operators/expand_op.h +++ b/paddle/operators/expand_op.h @@ -56,7 +56,7 @@ template using EigenTensor = framework::EigenTensor; -template +template class ExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -83,12 +83,13 @@ class ExpandKernel : public framework::OpKernel { auto x = EigenTensor::From(*in0); out0->mutable_data(context.GetPlace()); auto y = EigenTensor::From(*out0); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); y.device(place) = x.broadcast(bcast_dims); } }; -template +template class ExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -164,7 +165,8 @@ class ExpandGradKernel : public framework::OpKernel { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - 
x_grad.device(context.GetEigenDevice()) = + x_grad.device( + *context.template device_context().eigen_device()) = out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); } }; diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 892922cd3a..7fb74e2b95 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -100,8 +100,11 @@ REGISTER_OPERATOR(fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpMaker); REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc index 9e7a1eeab8..2e0e15f36b 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc @@ -16,10 +16,13 @@ #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/operators/fill_constant_batch_size_like_op.h index 339d97a30a..66da9d0307 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.h +++ b/paddle/operators/fill_constant_batch_size_like_op.h @@ -19,7 +19,7 @@ limitations under the 
License. */ namespace paddle { namespace operators { -template +template class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -27,8 +27,9 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); auto value = ctx.Attr("value"); - math::SetConstant setter; - setter(ctx.device_context(), out, static_cast(value)); + math::SetConstant setter; + setter(ctx.template device_context(), out, + static_cast(value)); } }; diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 95fb5932b8..720c11f5f1 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -54,8 +54,9 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); REGISTER_OP_CPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc index 1501a17441..9f412306bb 100644 --- a/paddle/operators/fill_zeros_like_op.cu.cc +++ b/paddle/operators/fill_zeros_like_op.cu.cc @@ -16,9 +16,10 @@ #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git 
a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index 7e7d78eea2..a6e2941f52 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -19,15 +19,16 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* out = context.Output("Y"); out->mutable_data(context.GetPlace()); - math::SetConstant setter; - setter(context.device_context(), out, static_cast(0)); + math::SetConstant setter; + setter(context.template device_context(), out, + static_cast(0)); } }; diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc index cb7ae69196..b14913ff21 100644 --- a/paddle/operators/ftrl_op.cc +++ b/paddle/operators/ftrl_op.cc @@ -135,5 +135,5 @@ The paper that proposed Follow The Regularized Leader (FTRL): namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker); -REGISTER_OP_CPU_KERNEL(ftrl, - ops::FTRLOpKernel); +REGISTER_OP_CPU_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.cu b/paddle/operators/ftrl_op.cu index 97b36dade6..abbbe7adbe 100644 --- a/paddle/operators/ftrl_op.cu +++ b/paddle/operators/ftrl_op.cu @@ -15,5 +15,5 @@ specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/ftrl_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ftrl, - ops::FTRLOpKernel); +REGISTER_OP_CUDA_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.h b/paddle/operators/ftrl_op.h index b040162f8d..4eea04cd8d 100644 --- a/paddle/operators/ftrl_op.h +++ b/paddle/operators/ftrl_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class FTRLOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -53,7 +53,7 @@ class FTRLOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto s_acc_out = EigenVector::Flatten(*sq_accum_out); auto l_acc_out = EigenVector::Flatten(*lin_accum_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h index 8d04ecd284..c806aa5f05 100644 --- a/paddle/operators/gather.cu.h +++ b/paddle/operators/gather.cu.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { using framework::Tensor; -using platform::Place; +using platform::DeviceContext; #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu index 92219d6a43..b37f0576e2 100644 --- a/paddle/operators/gather_op.cu +++ b/paddle/operators/gather_op.cu @@ -49,7 +49,8 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); @@ -60,5 +61,5 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } // 
namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h index 8276ed0d3d..1a1ba0c41a 100644 --- a/paddle/operators/gather_op.h +++ b/paddle/operators/gather_op.h @@ -53,7 +53,8 @@ class GatherGradientOpKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); ScatterAssign(ctx.device_context(), *dO, *Index, dX); diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu index 315560bf1b..ffce6f7138 100644 --- a/paddle/operators/gaussian_random_op.cu +++ b/paddle/operators/gaussian_random_op.cu @@ -60,5 +60,5 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(gaussian_random, - paddle::operators::GPUGaussianRandomKernel); +REGISTER_OP_CUDA_KERNEL(gaussian_random, + paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc index 5aa03f8916..311e7edcf1 100644 --- a/paddle/operators/gru_op.cc +++ b/paddle/operators/gru_op.cc @@ -213,8 +213,9 @@ class GRUGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp); -REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_CPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CPU_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CPU_KERNEL( + gru_grad, ops::GRUGradKernel, + 
ops::GRUGradKernel); diff --git a/paddle/operators/gru_op.cu.cc b/paddle/operators/gru_op.cu.cc index 0ceff94ec3..458630ca61 100644 --- a/paddle/operators/gru_op.cu.cc +++ b/paddle/operators/gru_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/gru_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_GPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CUDA_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index 564489d3a9..6d02dff578 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -27,16 +27,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class GRUKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -60,12 +60,12 @@ class GRUKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& dev_ctx = context.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, *batch_gate, true, is_reverse); if (bias) { - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } @@ -80,8 +80,9 @@ class GRUKernel : public 
framework::OpKernel { // Since the batch computing for GRU reorders the input sequences // according to their length. The initialized cell state also needs // to reorder. - ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); + ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); gru_value.prev_out_value = ordered_h0.data(); } else { gru_value.prev_out_value = nullptr; @@ -99,14 +100,14 @@ class GRUKernel : public framework::OpKernel { gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + math::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); gru_value.prev_out_value = gru_value.output_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, *hidden); } @@ -116,7 +117,7 @@ class GRUKernel : public framework::OpKernel { } }; -template +template class GRUGradKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -141,14 +142,14 @@ class GRUGradKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); int frame_size = hidden_dims[1]; - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); batch_reset_hidden_prev_grad.mutable_data(hidden_dims, context.GetPlace()); - math::SetConstant zero; - auto& dev_ctx = context.device_context(); + math::SetConstant zero; + auto& dev_ctx = context.template device_context(); zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); 
zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); @@ -156,12 +157,13 @@ class GRUGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_h0_grad; const size_t* order = batch_gate->lod()[2].data(); if (h0) { - ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, + true); } if (h0_grad) { ordered_h0_grad.mutable_data(h0_grad->dims(), context.GetPlace()); - zero(context.device_context(), &ordered_h0_grad, static_cast(0.0)); + zero(context.template device_context(), &ordered_h0_grad, + static_cast(0.0)); } bool is_reverse = context.Attr("is_reverse"); @@ -216,25 +218,25 @@ class GRUGradKernel : public framework::OpKernel { gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } - math::GRUUnitGradFunctor::compute( + math::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); to_seq(dev_ctx, batch_gate_grad, *input_grad); } if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(dev_ctx, batch_gate_grad, bias_grad); } if (h0 && h0_grad) { - ReorderInitState(context.device_context(), ordered_h0_grad, - order, h0_grad, false); + ReorderInitState(dev_ctx, ordered_h0_grad, order, + h0_grad, false); } } diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc index 877c969103..705de87be5 100644 --- a/paddle/operators/gru_unit_op.cc +++ b/paddle/operators/gru_unit_op.cc @@ -201,9 +201,10 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru_unit, ops::GRUUnitOp, 
ops::GRUUnitOpMaker, gru_unit_grad, ops::GRUUnitGradOp); -REGISTER_OP_CPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); REGISTER_OP_CPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu index 821c8c6421..7c752db494 100644 --- a/paddle/operators/gru_unit_op.cu +++ b/paddle/operators/gru_unit_op.cu @@ -16,9 +16,10 @@ #include "paddle/operators/gru_unit_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_GPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h index 3398c0934e..8fe60c750d 100644 --- a/paddle/operators/gru_unit_op.h +++ b/paddle/operators/gru_unit_op.h @@ -34,7 +34,7 @@ using EigenVector = framework::EigenVector; enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; -template +template class GRUUnitKernel : public framework::OpKernel { public: template @@ -71,7 +71,8 @@ class GRUUnitKernel : public framework::OpKernel { auto g = EigenMatrix::From(*gate); auto r_h_p = EigenMatrix::From(*reset_hidden_prev); auto h = EigenMatrix::From(*hidden); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); // calculate unactivated gate outputs if (bias) { @@ -86,10 +87,10 @@ class GRUUnitKernel : public framework::OpKernel { const T* weight_data = weight->data(); T* gate_data = gate->data(); T* reset_hidden_prev_data = reset_hidden_prev->data(); - 
math::gemm(context.device_context(), false, false, batch_size, - 2 * frame_size, frame_size, 1, hidden_prev_data, - frame_size, weight_data, frame_size * 2, 1, gate_data, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size, + weight_data, frame_size * 2, 1, gate_data, frame_size * 3); // calculate activited gate Eigen::array extents({{batch_size, frame_size}}); @@ -102,11 +103,11 @@ class GRUUnitKernel : public framework::OpKernel { g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate r_h_p.device(place) = r * h_p; // reset previous hidden state - math::gemm(context.device_context(), false, false, batch_size, - frame_size, frame_size, 1, reset_hidden_prev_data, - frame_size, weight_data + frame_size * frame_size * 2, - frame_size, 1, gate_data + frame_size * 2, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, frame_size, frame_size, 1, reset_hidden_prev_data, + frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1, + gate_data + frame_size * 2, frame_size * 3); Eigen::array c_offsets({{0, frame_size * 2}}); ActCompute(context.Attr("activation"), place, @@ -118,7 +119,7 @@ class GRUUnitKernel : public framework::OpKernel { } }; -template +template class GRUUnitGradKernel : public framework::OpKernel { public: template @@ -166,7 +167,8 @@ class GRUUnitGradKernel : public framework::OpKernel { auto d_h = EigenMatrix::From(*hidden_grad); auto d_g = EigenMatrix::From(gate_grad); auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); int batch_size = input->dims()[0]; int frame_size = hidden_prev->dims()[1]; @@ -186,11 +188,11 @@ class GRUUnitGradKernel : public framework::OpKernel { ActGradCompute(context.Attr("activation"), place, c, 
c, d_g.slice(c_offsets, extents), d_h * u); // backward for reset_hidden_prev - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size, 1, - gate_grad_data + frame_size * 2, frame_size * 3, - weight_data + frame_size * frame_size * 2, frame_size, - 0, reset_hidden_prev_grad_data, frame_size); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2, + frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size, + 0, reset_hidden_prev_grad_data, frame_size); // backward for unactivated reset gate ActGradCompute(context.Attr("gate_activation"), place, r, r, d_g.slice(r_offsets, extents), d_r_h_p * h_p); @@ -198,17 +200,18 @@ class GRUUnitGradKernel : public framework::OpKernel { if (weight_grad) { T* weight_grad_data = weight_grad->mutable_data(context.GetPlace()); // backward for state_weight - math::gemm( - context.device_context(), true, false, frame_size, frame_size, - batch_size, 1, reset_hidden_prev_data, frame_size, - gate_grad_data + frame_size * 2, frame_size * 3, 0, + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size, batch_size, 1, reset_hidden_prev_data, + frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0, weight_grad_data + frame_size * frame_size * 2, frame_size); // backward for update_gate_weight and reset_gate_weight - math::gemm(context.device_context(), true, false, frame_size, - frame_size * 2, batch_size, 1, hidden_prev_data, - frame_size, gate_grad_data, frame_size * 3, 0, - weight_grad_data, frame_size * 2); + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size * 2, batch_size, 1, hidden_prev_data, + frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data, + frame_size * 2); } // backward for hidden_prev if (hidden_prev_grad) { @@ -216,10 +219,11 @@ class GRUUnitGradKernel : public framework::OpKernel { 
hidden_prev_grad->mutable_data(context.GetPlace()); auto d_h_p = EigenMatrix::From(*hidden_prev_grad); d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size * 2, 1, gate_grad_data, - frame_size * 3, weight_data, frame_size * 2, 1, - hidden_prev_grad_data, frame_size); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size * 2, 1, gate_grad_data, + frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data, + frame_size); } // backward for input if (input_grad) { diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc index 1e13897bb6..373b4d99b4 100644 --- a/paddle/operators/hinge_loss_op.cc +++ b/paddle/operators/hinge_loss_op.cc @@ -106,8 +106,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, hinge_loss_grad, ops::HingeLossGradOp); -REGISTER_OP_CPU_KERNEL(hinge_loss, - ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL( + hinge_loss, + ops::HingeLossKernel); REGISTER_OP_CPU_KERNEL( hinge_loss_grad, - ops::HingeLossGradKernel); + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu index ec20b08e30..31a5bde292 100644 --- a/paddle/operators/hinge_loss_op.cu +++ b/paddle/operators/hinge_loss_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/hinge_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(hinge_loss, - ops::HingeLossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL( hinge_loss_grad, - ops::HingeLossGradKernel); + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.h b/paddle/operators/hinge_loss_op.h index c0be496f9c..91369cfb8a 100644 --- a/paddle/operators/hinge_loss_op.h +++ 
b/paddle/operators/hinge_loss_op.h @@ -19,14 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class HingeLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* pred = context.Input("Logits"); auto* label = context.Input("Labels"); auto* loss = context.Output("Loss"); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = framework::EigenVector::Flatten(*pred); auto y = framework::EigenVector::Flatten(*label); @@ -38,7 +39,7 @@ class HingeLossKernel : public framework::OpKernel { } }; -template +template class HingeLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -48,7 +49,8 @@ class HingeLossGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss")); auto* dpred = context.Output(framework::GradVarName("Logits")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = framework::EigenVector::Flatten(*pred); auto y = framework::EigenVector::Flatten(*label); diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc index 938803d5b3..11828d083a 100644 --- a/paddle/operators/huber_loss_op.cc +++ b/paddle/operators/huber_loss_op.cc @@ -124,8 +124,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, huber_loss_grad, ops::HuberLossGradOp); -REGISTER_OP_CPU_KERNEL(huber_loss, - ops::HuberLossKernel); +REGISTER_OP_CPU_KERNEL( + huber_loss, + ops::HuberLossKernel); REGISTER_OP_CPU_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git a/paddle/operators/huber_loss_op.cu b/paddle/operators/huber_loss_op.cu index 317321dc6c..d49a4d9d42 
100644 --- a/paddle/operators/huber_loss_op.cu +++ b/paddle/operators/huber_loss_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/huber_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(huber_loss, - ops::HuberLossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CUDA_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git a/paddle/operators/huber_loss_op.h b/paddle/operators/huber_loss_op.h index 4e7bc55432..4dd20e8b08 100644 --- a/paddle/operators/huber_loss_op.h +++ b/paddle/operators/huber_loss_op.h @@ -41,7 +41,7 @@ struct HuberLossForward { T delta; }; -template +template class HuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -50,7 +50,8 @@ class HuberLossKernel : public framework::OpKernel { auto* out0 = context.Output("Residual"); auto* out1 = context.Output("Out"); auto delta = static_cast(context.Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = EigenVector::Flatten(*in0); auto y = EigenVector::Flatten(*in1); @@ -85,7 +86,7 @@ struct HuberLossBackward { T delta; }; -template +template class HuberLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -94,7 +95,8 @@ class HuberLossGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); auto* out1 = context.Output(framework::GradVarName("Y")); auto delta = static_cast(context.op().Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto residual = EigenVector::Flatten(*in0); auto out_grad = EigenVector::Flatten(*in1); diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc index 02ebf02296..c0b51202c6 100644 --- 
a/paddle/operators/l1_norm_op.cc +++ b/paddle/operators/l1_norm_op.cc @@ -69,7 +69,8 @@ $$Out = \sum{|X|}$$ namespace ops = paddle::operators; REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad, ops::L1NormGradOp); -REGISTER_OP_CPU_KERNEL(l1_norm, - ops::L1NormKernel); REGISTER_OP_CPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); + l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.cu b/paddle/operators/l1_norm_op.cu index 1c206e04cc..fd725f86f6 100644 --- a/paddle/operators/l1_norm_op.cu +++ b/paddle/operators/l1_norm_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/l1_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(l1_norm, - ops::L1NormKernel); -REGISTER_OP_GPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h index 3c60dc3dc7..ae3878f2b7 100644 --- a/paddle/operators/l1_norm_op.h +++ b/paddle/operators/l1_norm_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(abs(X)) -template +template class L1NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class L1NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); out.device(place) = x.abs().sum(); } }; // dX = dout * sign(X) -template +template class L1NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -52,7 +53,8 @@ class L1NormGradKernel : public framework::OpKernel { auto x_eigen = 
framework::EigenVector::Flatten(*x); auto d_out_eigen = framework::EigenVector::Flatten(*d_out); auto dx_eigen = framework::EigenVector::Flatten(*dx); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); Eigen::DSizes x_dsize(x->numel()); dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index 8e079a14e0..896e3657d4 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -261,9 +261,10 @@ REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker, linear_chain_crf_grad, ops::LinearChainCRFGradOp); REGISTER_OP_CPU_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); REGISTER_OP_CPU_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.cu b/paddle/operators/linear_chain_crf_op.cu index 6fc8995f4c..3b105ec341 100644 --- a/paddle/operators/linear_chain_crf_op.cu +++ b/paddle/operators/linear_chain_crf_op.cu @@ -16,11 +16,12 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CUDA_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index 014bbfa758..694584e79c 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -50,7 +50,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class LinearChainCRFOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -137,7 +137,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { framework::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context() + .eigen_device(); auto x = EigenMatrix::From(*emission_weights); auto x_row_max = EigenMatrix::From(emission_row_max); x_row_max.device(place) = @@ -287,7 +288,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { } }; -template +template class LinearChainCRFGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -359,8 +360,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { emission_grad->mutable_data(platform::CPUPlace()); if (transition_grad) { transition_grad->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - transition_grad, 0.); + math::set_constant(ctx.device_context(), transition_grad, 0.); } // Now, all the inputs and outputs should be on the CPU memory. 
@@ -384,10 +384,10 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor one_seq_beta = beta.Slice(start_pos, end_pos); Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos); - BackwardOneSequence(ctx.device_context(), ll_grad[i], - one_seq_emission_exps, *transition_exps, - one_seq_alpha, one_seq_label, &one_seq_beta, - transition_grad, &one_seq_emission_grad); + BackwardOneSequence( + ctx.template device_context(), ll_grad[i], + one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, + &one_seq_beta, transition_grad, &one_seq_emission_grad); } if (platform::is_gpu_place(ctx.GetPlace())) { @@ -441,8 +441,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { copyTensor(ctx, transition_grad_src, transition_grad_dst); } - void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad, - const Tensor& emission_exps, + void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + const T ll_grad, const Tensor& emission_exps, const Tensor& transition_exps, const Tensor& alpha, const Tensor& label, Tensor* beta, Tensor* transition_grad, @@ -481,7 +481,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { auto alpha_mat = EigenMatrix::From(alpha); auto beta_mat = EigenMatrix::From(*beta); - auto* place = ctx.GetEigenDevice(); + auto* place = ctx.eigen_device(); auto prob = alpha_mat * beta_mat; auto row_sum = prob.sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(seq_length, 1)) diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu index 5244a17c3a..f7c2358980 100644 --- a/paddle/operators/lod_reset_op.cu +++ b/paddle/operators/lod_reset_op.cu @@ -16,9 +16,10 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lod_reset, - ops::LoDResetKernel, - ops::LoDResetKernel); -REGISTER_OP_GPU_KERNEL( - lod_reset_grad, ops::LoDResetGradKernel, - ops::LoDResetGradKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset, ops::LoDResetKernel, + 
ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h index cbcbf80adc..b86f8b1313 100644 --- a/paddle/operators/lod_reset_op.h +++ b/paddle/operators/lod_reset_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class LoDResetKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -65,7 +65,7 @@ class LoDResetKernel : public framework::OpKernel { } }; -template +template class LoDResetGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc index 257e5c8a49..4524229a33 100644 --- a/paddle/operators/log_loss_op.cc +++ b/paddle/operators/log_loss_op.cc @@ -109,7 +109,8 @@ class LogLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, ops::LogLossGradOp); -REGISTER_OP_CPU_KERNEL(log_loss, - ops::LogLossKernel); REGISTER_OP_CPU_KERNEL( - log_loss_grad, ops::LogLossGradKernel); + log_loss, ops::LogLossKernel); +REGISTER_OP_CPU_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu index 6c189ef341..e87ac7d12a 100644 --- a/paddle/operators/log_loss_op.cu +++ b/paddle/operators/log_loss_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/log_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(log_loss, - ops::LogLossKernel); -REGISTER_OP_GPU_KERNEL( - log_loss_grad, ops::LogLossGradKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h index 
73404fce91..743eddb740 100644 --- a/paddle/operators/log_loss_op.h +++ b/paddle/operators/log_loss_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class LogLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -38,7 +38,7 @@ class LogLossKernel : public framework::OpKernel { auto label = EigenVector::Flatten(*ctx.Input("Labels")); auto loss = EigenVector::Flatten(*loss_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); loss.device(place) = (-(label * (prediction + epsilon).log()) - ((static_cast(1) - label) * @@ -46,7 +46,7 @@ class LogLossKernel : public framework::OpKernel { } }; -template +template class LogLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -59,7 +59,7 @@ class LogLossGradKernel : public framework::OpKernel { auto* dpred = ctx.Output(framework::GradVarName("Predicted")); auto dl = EigenVector::Flatten(*dloss); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); if (dpred) { dpred->mutable_data(ctx.GetPlace()); diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu index d41239b2ca..7fef60e0c9 100644 --- a/paddle/operators/logical_op.cu +++ b/paddle/operators/logical_op.cu @@ -14,11 +14,11 @@ #include "paddle/operators/logical_op.h" -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, GPU, +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, 
CUDA, paddle::operators::LogicalXorFunctor); diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h index 6e78a7d6ed..629388cac8 100644 --- a/paddle/operators/logical_op.h +++ b/paddle/operators/logical_op.h @@ -47,7 +47,7 @@ struct LogicalXorFunctor { } }; -template +template class BinaryLogicalOpKernel : public framework::OpKernel { public: @@ -57,14 +57,14 @@ class BinaryLogicalOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; -template +template class UnaryLogicalOpKernel : public framework::OpKernel { public: @@ -73,8 +73,9 @@ class UnaryLogicalOpKernel auto* x = context.Input("X"); auto* out = context.Output("Out"); Functor unary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), out->mutable_data(context.GetPlace()), unary_func); } }; @@ -85,9 +86,9 @@ class UnaryLogicalOpKernel #define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); #define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 84b044184a..9431030a53 100644 --- 
a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -85,6 +85,8 @@ template class LookupTableGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = + context.template device_context(); bool is_sparse = context.Attr("is_sparse"); if (is_sparse) { auto* ids = context.Input("Ids"); @@ -95,7 +97,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto* ids_data = ids->data(); auto ids_dim = ids->dims(); - auto stream = context.cuda_device_context().stream(); + auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_dim[0]); @@ -129,14 +131,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { T* d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(context.GetEigenDevice()) = - t.constant(static_cast(0)); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); dim3 threads(128, 8); dim3 grids(8, 1); - LookupTableGrad< - T, 128, 8, - 8><<>>( + LookupTableGrad<<>>( d_table, d_output, ids, N, K, D); } } @@ -146,7 +145,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel, - ops::LookupTableCUDAKernel); -REGISTER_OP_GPU_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, - ops::LookupTableGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table_grad, + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc index e20340e77b..b5b7bc940a 100644 --- a/paddle/operators/lrn_op.cc +++ b/paddle/operators/lrn_op.cc @@ -20,7 +20,7 @@ namespace operators { using framework::Tensor; 
template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, framework::Tensor* mid, int N, int C, int H, int W, int n, @@ -55,11 +55,11 @@ struct LRNFunctor { out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, const framework::Tensor& mid, framework::Tensor* x_g, @@ -113,8 +113,8 @@ struct LRNGradFunctor { } } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; class LRNOp : public framework::OperatorWithKernel { public: @@ -204,7 +204,7 @@ Input(i, x, y), Output(i, x, y) represents an element in an image. C is the number of feature maps of one image. n is a hyper-parameter configured when operator is initialized. The sum in the denominator is the sum of the same positions in the neighboring maps. 
- + )DOC"); } }; @@ -230,6 +230,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker, lrn_grad, ops::LRNOpGrad); -REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_CPU_KERNEL(lrn_grad, - ops::LRNGradKernel); +REGISTER_OP_CPU_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu index e9a8671233..c6857c2b6d 100644 --- a/paddle/operators/lrn_op.cu +++ b/paddle/operators/lrn_op.cu @@ -69,19 +69,18 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - KeCMRNormFillScale< - T><<>>( + auto& dev_ctx = ctx.template device_context(); + KeCMRNormFillScale<<>>( img_size, inputs, mid, C, H, W, n, k, alpha); int input_size = N * H * W * C; grid_size = (input_size + block_size - 1) / block_size; - KeCMRNormOutput< - T><<>>( + KeCMRNormOutput<<>>( input_size, inputs, mid, -beta, outputs); } template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, framework::Tensor* mid, int N, int C, int H, int W, int n, @@ -92,8 +91,8 @@ struct LRNFunctor { } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template __global__ void KeCMRNormDiff(int img_size, const T* x, const T* out, @@ -148,14 +147,14 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - KeCMRNormDiff< - T><<>>( + auto& dev_ctx = ctx.template device_context(); + KeCMRNormDiff<<>>( img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, 2.0f * alpha * beta); } template -struct LRNGradFunctor { +struct LRNGradFunctor { void 
operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, const framework::Tensor& mid, framework::Tensor* x_g, @@ -167,12 +166,13 @@ struct LRNGradFunctor { } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_GPU_KERNEL(lrn_grad, - ops::LRNGradKernel); +REGISTER_OP_CUDA_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h index aa7539db4a..44063d3e03 100644 --- a/paddle/operators/lrn_op.h +++ b/paddle/operators/lrn_op.h @@ -29,7 +29,7 @@ struct LRNFunctor { T k, T alpha, T beta); }; -template +template class LRNKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; @@ -65,12 +65,12 @@ class LRNKernel : public framework::OpKernel { PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); - LRNFunctor f; + LRNFunctor f; f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta); } }; -template +template struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, @@ -98,7 +98,7 @@ struct LRNGradFunctor { * The upper and lower is the same as forward. The logic of the sum * is also the same as forward. 
*/ -template +template class LRNGradKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; @@ -121,7 +121,7 @@ class LRNGradKernel : public framework::OpKernel { T alpha = ctx.Attr("alpha"); T beta = ctx.Attr("beta"); - LRNGradFunctor f; + LRNGradFunctor f; f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); } }; diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index fa8e5f2da8..2db7da30db 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -273,8 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); -REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CPU_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.cu.cc b/paddle/operators/lstm_op.cu.cc index 610cbb03e8..48519bed6f 100644 --- a/paddle/operators/lstm_op.cu.cc +++ b/paddle/operators/lstm_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/lstm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_GPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CUDA_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h index a78f548aaf..14abd4bf0a 100644 --- a/paddle/operators/lstm_op.h +++ b/paddle/operators/lstm_op.h @@ -24,16 +24,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void 
ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class LSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,8 +52,8 @@ class LSTMKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& device_ctx = ctx.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, *batch_gate, true, is_reverse); auto in_dims = input->dims(); @@ -64,7 +64,7 @@ class LSTMKernel : public framework::OpKernel { Tensor b = *bias; b.Resize({bias->numel(), 1}); Tensor gate_bias = b.Slice(0, 4 * frame_size); - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -88,8 +88,8 @@ class LSTMKernel : public framework::OpKernel { // Since the batch computing for LSTM reorders the input sequence // according to their length. The initialized cell state also needs // to reorder. 
- ReorderInitState(device_ctx, *cell_t0, order, &ordered_c0, - true); + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); lstm_value.prev_state_value = ordered_c0.data(); } @@ -121,9 +121,9 @@ class LSTMKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n - 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden_t, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } else if (hidden_t0) { // If n == 0 and there is no initialized hidden state, that is to say // the H0 is zeros, the calculation W_h * H0 will be skiped. @@ -133,24 +133,24 @@ class LSTMKernel : public framework::OpKernel { // according to their length. The initialized hidden state also needs // to reorder. Tensor ordered_h0; - ReorderInitState(device_ctx, *hidden_t0, order, &ordered_h0, - true); - math::matmul(device_ctx, ordered_h0, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } lstm_value.gate_value = gate_t.data(); lstm_value.output_value = out_t.data(); lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute(device_ctx, lstm_value, - frame_size, cur_batch_size, - gate_act, cell_act, cand_act); + math::LstmUnitFunctor::compute( + device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch 
hidden to_seq(device_ctx, batch_hidden, *hidden_out); @@ -161,7 +161,7 @@ class LSTMKernel : public framework::OpKernel { } }; -template +template class LSTMGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -187,8 +187,8 @@ class LSTMGradKernel : public framework::OpKernel { auto* h0_g = ctx.Output(framework::GradVarName("H0")); auto* c0_g = ctx.Output(framework::GradVarName("C0")); - auto& device_ctx = ctx.device_context(); - math::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; if (weight_g) { weight_g->mutable_data(ctx.GetPlace()); zero(device_ctx, weight_g, static_cast(0.0)); @@ -200,7 +200,8 @@ class LSTMGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; const size_t* order = batch_gate->lod()[2].data(); if (c0) { - ReorderInitState(device_ctx, *c0, order, &ordered_c0, true); + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); } if (c0 && c0_g) { ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); @@ -240,10 +241,10 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( - const platform::DeviceContext& ctx, const framework::LoDTensor& src, + const DeviceContext& ctx, const framework::LoDTensor& src, const framework::DDim& dims, framework::LoDTensor& dst) { dst.mutable_data(dims, ctx.GetPlace()); dst.set_lod(batch_gate->lod()); @@ -299,7 +300,7 @@ class LSTMGradKernel : public framework::OpKernel { } int cur_batch_size = bend - bstart; - math::LstmUnitGradFunctor::compute( + math::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, gate_act, cell_act, cand_act); @@ -307,33 +308,34 @@ class LSTMGradKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n 
- 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &pre_hidden_g, - static_cast(1.0)); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_hidden_g, + static_cast(1.0)); if (weight_g) { /* backward weight */ auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } } else { if (h0 && weight_g) { - ReorderInitState(device_ctx, *h0, order, &ordered_h0, true); - math::matmul(device_ctx, ordered_h0, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } if (h0 && h0_g) { ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &ordered_h0_g, - static_cast(0.0)); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), + &ordered_h0_g, static_cast(0.0)); } } } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); @@ -344,15 +346,17 @@ class LSTMGradKernel : public framework::OpKernel { Tensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } if (h0 && h0_g) { - ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, false); + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); } if (c0 && c0_g) { - ReorderInitState(device_ctx, 
ordered_c0_g, order, c0_g, false); + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); } } }; diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu index e192283aa0..291f2c295e 100644 --- a/paddle/operators/lstm_unit_op.cu +++ b/paddle/operators/lstm_unit_op.cu @@ -173,7 +173,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, - ops::LstmUnitOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, - ops::LstmUnitGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, + ops::LstmUnitOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, + ops::LstmUnitGradOpCUDAKernel); diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h index 38cb298f92..61705675d9 100644 --- a/paddle/operators/lstm_unit_op.h +++ b/paddle/operators/lstm_unit_op.h @@ -35,7 +35,7 @@ inline T tanh(T x) { return 2. * sigmoid(2. 
* x) - 1.; } -template +template class LstmUnitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -78,7 +78,7 @@ class LstmUnitKernel : public framework::OpKernel { } }; -template +template class LstmUnitGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc index d7e8a0ea76..42e8961c0e 100644 --- a/paddle/operators/margin_rank_loss_op.cc +++ b/paddle/operators/margin_rank_loss_op.cc @@ -117,7 +117,7 @@ REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp, ops::MarginRankLossGradOp); REGISTER_OP_CPU_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); + ops::MarginRankLossKernel); REGISTER_OP_CPU_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/operators/margin_rank_loss_op.cu index 3a639f25d4..1c2afccc5b 100644 --- a/paddle/operators/margin_rank_loss_op.cu +++ b/paddle/operators/margin_rank_loss_op.cu @@ -16,9 +16,9 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_GPU_KERNEL( + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/operators/margin_rank_loss_op.h index 8d0830147e..9c1f96cac1 100644 --- a/paddle/operators/margin_rank_loss_op.h +++ b/paddle/operators/margin_rank_loss_op.h @@ -34,7 +34,7 @@ struct Heaviside { } }; -template +template class MarginRankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -56,13 +56,13 @@ class MarginRankLossKernel : public framework::OpKernel { auto x1 = 
framework::EigenVector::Flatten(*x1_t); auto x2 = framework::EigenVector::Flatten(*x2_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); act.device(dev) = out.unaryExpr(Heaviside()); } }; -template +template class MarginRankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -78,7 +78,7 @@ class MarginRankLossGradKernel : public framework::OpKernel { auto d_out = framework::EigenVector::Flatten(*d_out_t); auto act = framework::EigenVector::Flatten(*act_t); auto label = framework::EigenVector::Flatten(*label_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); // compute d_x1 if (d_x1_t) { diff --git a/paddle/operators/math/context_project.cc b/paddle/operators/math/context_project.cc index f82ea5d7be..980dd90df8 100644 --- a/paddle/operators/math/context_project.cc +++ b/paddle/operators/math/context_project.cc @@ -18,8 +18,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.cu b/paddle/operators/math/context_project.cu index 04eeed543c..934e3df645 100644 --- a/paddle/operators/math/context_project.cu +++ b/paddle/operators/math/context_project.cu @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index d853507188..4036614086 100644 --- 
a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -81,17 +81,17 @@ using LoDTensor = framework::LoDTensor; * */ -template +template class ContextProjectFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, const Tensor& padding_data, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Im2ColFunctor im2col_ocf; + math::Im2ColFunctor im2col_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -188,17 +188,17 @@ class ContextProjectFunctor { } }; -template +template class ContextProjectGradFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, bool pad_grad, bool input_grad, Tensor* padding_data, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Col2ImFunctor col2im_ocf; + math::Col2ImFunctor col2im_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -258,8 +258,8 @@ class ContextProjectGradFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data->Slice(k, k + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } if (down_pad > 0) { @@ -290,8 +290,8 @@ class ContextProjectGradFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - 
out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc index cf238a58e0..6011a196d4 100644 --- a/paddle/operators/math/cross_entropy.cc +++ b/paddle/operators/math/cross_entropy.cc @@ -24,9 +24,9 @@ template ; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, + void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel) { const int batch_size = prob->dims()[0]; @@ -35,7 +35,7 @@ class CrossEntropyFunctor { auto lbl = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*out); - loss.device(*ctx.GetEigenDevice()) = + loss.device(*ctx.eigen_device()) = -((lbl * in.log().unaryExpr(math::TolerableValue())) .sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(batch_size, 1))); @@ -53,8 +53,8 @@ class CrossEntropyFunctor { } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu index 651c08f740..2132d49c93 100644 --- a/paddle/operators/math/cross_entropy.cu +++ b/paddle/operators/math/cross_entropy.cu @@ -95,10 +95,10 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, using Tensor = framework::Tensor; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, - const framework::Tensor* prob, + void operator()(const platform::CUDADeviceContext& ctx, + framework::Tensor* out, const 
framework::Tensor* prob, const framework::Tensor* labels, bool softLabel) { const T* prob_data = prob->data(); T* loss_data = out->mutable_data(ctx.GetPlace()); @@ -118,16 +118,14 @@ class CrossEntropyFunctor { const int64_t* label_data = labels->data(); int block = 512; int grid = (batch_size + block - 1) / block; - CrossEntropyKernel<<< - grid, block, 0, - reinterpret_cast(ctx).stream()>>>( + CrossEntropyKernel<<>>( loss_data, prob_data, label_data, batch_size, class_num); } } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h index 70ed9ddd55..677adb5ada 100644 --- a/paddle/operators/math/cross_entropy.h +++ b/paddle/operators/math/cross_entropy.h @@ -33,11 +33,11 @@ struct TolerableValue { } }; -template +template class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& context, - framework::Tensor* out, const framework::Tensor* prob, + void operator()(const DeviceContext& context, framework::Tensor* out, + const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel); }; } // namespace math diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc index ae4e47b014..d570c68cd4 100644 --- a/paddle/operators/math/gru_compute.cc +++ b/paddle/operators/math/gru_compute.cc @@ -19,14 +19,14 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitFunctor { + static void compute(const platform::CPUDeviceContext &context, hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { #ifndef __NVCC__ if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, 
batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3); @@ -36,7 +36,7 @@ struct GRUUnitFunctor { frame_size, batch_size, active_gate); if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3); @@ -49,8 +49,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const platform::CPUDeviceContext &context, hl_gru_value value, hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, @@ -60,13 +60,13 @@ struct GRUUnitGradFunctor { grad, frame_size, batch_size, active_node); if (value.prev_out_value && grad.prev_out_grad) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size, 1, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, frame_size, 0, grad.reset_output_grad, frame_size); if (grad.state_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size, batch_size, 1, value.reset_output_value, frame_size, grad.gate_grad + frame_size * 2, frame_size * 3, 1, @@ -78,13 +78,13 @@ struct GRUUnitGradFunctor { grad, frame_size, batch_size, active_gate); if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size * 2, batch_size, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); @@ -94,10 +94,10 @@ struct GRUUnitGradFunctor { } }; 
-template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu index 0252bdbdb6..dd518cd1e4 100644 --- a/paddle/operators/math/gru_compute.cu +++ b/paddle/operators/math/gru_compute.cu @@ -19,13 +19,12 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitFunctor { + static void compute(const platform::CUDADeviceContext &context, hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); dim3 threads; dim3 grid; if (batch_size == 1) { @@ -39,7 +38,7 @@ struct GRUUnitFunctor { } if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3); @@ -62,7 +61,7 @@ struct GRUUnitFunctor { } if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3); @@ -87,14 +86,13 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const platform::CUDADeviceContext &context, hl_gru_value value, hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { - auto stream = - 
reinterpret_cast(context).stream(); + auto stream = context.stream(); dim3 threads; dim3 grid; if (batch_size == 1) { @@ -124,13 +122,13 @@ struct GRUUnitGradFunctor { } if (value.prev_out_value && grad.prev_out_grad) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size, 1, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, frame_size, 0, grad.reset_output_grad, frame_size); if (grad.state_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size, batch_size, 1, value.reset_output_value, frame_size, grad.gate_grad + frame_size * 2, frame_size * 3, 1, @@ -155,13 +153,13 @@ struct GRUUnitGradFunctor { } if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size * 2, batch_size, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); @@ -170,10 +168,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h index 58ea59f68e..ca1343cb2c 100644 --- a/paddle/operators/math/gru_compute.h +++ b/paddle/operators/math/gru_compute.h @@ -40,19 +40,18 @@ struct hl_gru_grad { T *prev_out_grad; }; -template +template struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frame_size, int batch_size, + static void 
compute(const DeviceContext &context, hl_gru_value value, + int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; -template +template struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, - int frame_size, int batch_size, + static void compute(const DeviceContext &context, hl_gru_value value, + hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc index c10c44c520..707ebf0596 100644 --- a/paddle/operators/math/im2col.cc +++ b/paddle/operators/math/im2col.cc @@ -25,9 +25,9 @@ namespace math { */ template class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -90,9 +90,9 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -149,13 +149,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; /* * im = [input_channels, input_height, input_width] @@ -164,9 +164,9 @@ template class Col2ImFunctor class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& 
im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -235,9 +235,9 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -300,13 +300,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu index bf78942439..a88e837b03 100644 --- a/paddle/operators/math/im2col.cu +++ b/paddle/operators/math/im2col.cu @@ -58,9 +58,9 @@ __global__ void im2col(const T* data_im, int num_outs, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -96,9 +96,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2col<<>>( im.data(), num_outputs, im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -160,9 +158,9 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const 
std::vector& stride, @@ -203,9 +201,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2im<<>>( num_kernels, col.data(), im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[2], col_height, col_width, im->data()); @@ -213,13 +209,13 @@ class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; template __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, @@ -260,9 +256,9 @@ __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -310,9 +306,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2colOCF<<>>( im.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -358,9 +352,9 @@ __global__ void col2imOCF(const T* col_data, int im_channels, int im_height, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -409,9 +403,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2imOCF<<>>( col.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, im->data()); @@ -419,13 +411,13 @@ class Col2ImFunctor; 
+ platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index 24fd9a06e9..38f2c9fe0a 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -79,20 +79,19 @@ enum class ColFormat { kCFO = 0, kOCF = 1 }; * \note The caller needs to ensure that imShape.inputChannels is equal to * colShape.inputChannels. */ -template +template class Im2ColFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col); }; -template +template class Col2ImFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im); diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index ae197a97ed..256f3bc9bd 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include -template +template void testIm2col() { paddle::framework::Tensor input_tmp; paddle::framework::Tensor input; @@ -59,18 +59,7 @@ void testIm2col() { memcpy(input_ptr, arr, 6 * sizeof(float)); auto* place = new Place(); - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_WITH_CUDA - } + DeviceContext* context = new DeviceContext(*place); if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { @@ -83,10 +72,10 @@ void testIm2col() { // Im2Col paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> im2col; paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> im2col_ocf; im2col(*context, input, dilation, stride, padding, &output_cfo); @@ -119,10 +108,10 @@ void testIm2col() { // Col2Im: kCFO paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> col2im; paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -168,8 +157,8 @@ void testIm2col() { } TEST(math, im2col) { - testIm2col(); + testIm2col(); #ifdef PADDLE_WITH_CUDA - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc index ad3a59bcdb..2c2e8bb82e 100644 --- a/paddle/operators/math/lstm_compute.cc +++ 
b/paddle/operators/math/lstm_compute.cc @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, const std::string& cand_act) { @@ -42,8 +42,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, @@ -72,10 +72,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu index b2122f2a5c..92b1f4228b 100644 --- a/paddle/operators/math/lstm_compute.cu +++ b/paddle/operators/math/lstm_compute.cu @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor { + static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, const std::string& cand_act) { @@ -33,8 +33,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const 
platform::CUDADeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, @@ -45,10 +45,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h index 9652399d4c..5f74e27358 100644 --- a/paddle/operators/math/lstm_compute.h +++ b/paddle/operators/math/lstm_compute.h @@ -67,21 +67,20 @@ inline activation_mode_t ActiveType(const std::string &type) { } } -template +template class LstmUnitFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, int frame_size, int batch_size, + static void compute(const DeviceContext &context, LstmMetaValue value, + int frame_size, int batch_size, const std::string &gate_act, const std::string &cell_act, const std::string &cand_act); }; -template +template class LstmUnitGradFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + static void compute(const DeviceContext &context, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, int batch_size, const std::string &gate_act, const std::string &cell_act, const std::string &cand_act); }; diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index e099a6a439..2b35e4532a 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -21,13 +21,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - 
const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -36,13 +34,11 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -51,35 +47,32 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? 
CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -99,15 +92,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? 
CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -127,7 +121,7 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } @@ -135,8 +129,8 @@ void matmul( #ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -157,8 +151,8 @@ void batched_gemm( } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -183,8 +177,8 @@ void batched_gemm( // functions of Intel MKL are not available. 
In the future, this computation // should be parallelized. template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -192,14 +186,14 @@ void batched_gemm( const float* Ak = &A[k * strideA]; const float* Bk = &B[k * strideB]; float* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, Ak, - Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -207,55 +201,53 @@ void batched_gemm( const double* Ak = &A[k * strideA]; const double* Bk = &B[k * strideB]; double* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, - Ak, Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } #endif template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? 
CblasNoTrans : CblasTrans; cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const float alpha, + const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const double alpha, + const double* x, double* y) { cblas_daxpy(n, alpha, x, 1, y, 1); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); @@ -310,10 +302,10 @@ void set_constant(const platform::DeviceContext& context, #endif } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; +template struct RowwiseAdd; +template struct 
RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; } // namespace math } // namespace operators diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 3018e50a4f..1b560a7e2d 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -22,13 +22,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -39,19 +37,16 @@ void gemm(const platform::DeviceContext& context, (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -61,51 +56,45 @@ void gemm(const platform::DeviceContext& context, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? 
CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -125,15 +114,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? 
CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -153,14 +143,14 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -176,15 +166,13 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, 
const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -200,68 +188,58 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasSgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { cublasOperation_t cuTransA = (trans_a == false) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasDgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { - PADDLE_ENFORCE(platform::dynload::cublasSaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { - PADDLE_ENFORCE(platform::dynload::cublasDaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); @@ -277,8 +255,9 @@ struct TensorSetConstantGPU { template void operator()() const { - SetConstant functor; - functor(context_, tensor_, static_cast(value_)); + SetConstant functor; + 
functor(reinterpret_cast(context_), + tensor_, static_cast(value_)); } const platform::DeviceContext& context_; @@ -294,27 +273,27 @@ void set_constant_with_place( TensorSetConstantGPU(context, tensor, value)); } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -// template struct ColwiseSum; -// The ColwiseSum failed in debug mode, +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug mode, // and only failed for this case. So reimplemented it. template <> -void ColwiseSum::operator()( - const platform::DeviceContext& context, const framework::Tensor& input, +void ColwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), size); framework::Tensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); - SetConstant set; + SetConstant set; set(context, &one, static_cast(1.0)); - gemv(context, true, static_cast(in_dims[0]), - static_cast(in_dims[1]), 1.0, - input.data(), one.data(), - 0.0, vector->data()); + gemv( + context, true, static_cast(in_dims[0]), static_cast(in_dims[1]), + 1.0, input.data(), one.data(), 0.0, + vector->data()); } } // namespace math diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index f2b025b78b..8cc03c2ba0 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -62,53 +62,51 @@ namespace math { // Then matrixA: M * K, matrixB: K * N, matrixC : M * N // For more detailed info, please refer to // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html -template -void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +template +void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE 
transB, const int M, const int N, const int K, const T alpha, const T* A, const T* B, const T beta, T* C); // gemm wrapper with stride args for matrix uncontinuous in memory -template -void gemm(const platform::DeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, - const T alpha, const T* A, const int lda, const T* B, const int ldb, - const T beta, T* C, const int ldc); +template +void gemm(const DeviceContext& context, const bool transA, const bool transB, + const int M, const int N, const int K, const T alpha, const T* A, + const int lda, const T* B, const int ldb, const T beta, T* C, + const int ldc); // matrix multiply with continuous memory -template -void matmul(const platform::DeviceContext& context, - const framework::Tensor& matrix_a, bool trans_a, - const framework::Tensor& matrix_b, bool trans_b, T alpha, - framework::Tensor* matrix_out, T beta); +template +void matmul(const DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, + T alpha, framework::Tensor* matrix_out, T beta); // Batched gemm -template -void batched_gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, const T alpha, - const T* A, const T* B, const T beta, T* C, - const int batchCount, const int strideA, const int strideB); - -template -void gemv(const platform::DeviceContext& context, const bool trans_a, - const int M, const int N, const T alpha, const T* A, const T* B, - const T beta, T* C); - -template -void axpy(const platform::DeviceContext& context, const int n, const T alpha, - const T* x, T* y); - -template +template +void batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, + const int K, const T alpha, const T* A, const T* B, + const T beta, T* C, const int batchCount, const int strideA, + 
const int strideB); + +template +void gemv(const DeviceContext& context, const bool trans_a, const int M, + const int N, const T alpha, const T* A, const T* B, const T beta, + T* C); + +template +void axpy(const DeviceContext& context, const int n, const T alpha, const T* x, + T* y); + +template struct Transpose { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& in, framework::Tensor* out, - const std::vector& axis); + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis); }; -template +template struct SetConstant { - void operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num); + void operator()(const DeviceContext& context, framework::Tensor* tensor, + T num); }; template @@ -118,17 +116,16 @@ void set_constant_with_place(const platform::DeviceContext& context, void set_constant(const platform::DeviceContext& context, framework::Tensor* tensor, float value); -template +template struct RowwiseAdd { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, const framework::Tensor& vec, - framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& vec, framework::Tensor* output); }; -template +template struct ColwiseSum { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); }; } // namespace math diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index 4dc17a4e52..3e6d833865 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -20,16 +20,17 @@ namespace paddle { namespace operators { namespace math { -template -void SetConstant::operator()(const 
platform::DeviceContext& context, - framework::Tensor* tensor, T num) { +template +void SetConstant::operator()(const DeviceContext& context, + framework::Tensor* tensor, + T num) { auto t = framework::EigenVector::Flatten(*tensor); - t.device(*context.GetEigenDevice()) = t.constant(static_cast(num)); + t.device(*context.eigen_device()) = t.constant(static_cast(num)); } -template -void Transpose::operator()( - const platform::DeviceContext& context, const framework::Tensor& in, +template +void Transpose::operator()( + const DeviceContext& context, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { Eigen::array permute; for (int i = 0; i < Rank; i++) { @@ -40,15 +41,15 @@ void Transpose::operator()( auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); - auto* dev = context.GetEigenDevice(); + auto* dev = context.eigen_device(); eigen_out.device(*dev) = eigen_in.shuffle(permute); } -template -void RowwiseAdd::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, - framework::Tensor* output) { +template +void RowwiseAdd::operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, + framework::Tensor* output) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector.numel(), size); @@ -59,14 +60,14 @@ void RowwiseAdd::operator()(const platform::DeviceContext& context, auto out = framework::EigenMatrix::From(*output); Eigen::array shape({{1, static_cast(size)}}); Eigen::array bcast({{static_cast(in_dims[0]), 1}}); - out.device(*context.GetEigenDevice()) = + out.device(*context.eigen_device()) = in + vec.reshape(shape).broadcast(bcast); } -template -void ColwiseSum::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* vector) { +template +void ColwiseSum::operator()(const DeviceContext& 
context, + const framework::Tensor& input, + framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), size); @@ -74,7 +75,7 @@ void ColwiseSum::operator()(const platform::DeviceContext& context, auto vec = framework::EigenMatrix::From(*vector); auto in = framework::EigenMatrix::From(input); Eigen::array shape({{1, static_cast(size)}}); - vec.reshape(shape).device(*context.GetEigenDevice()) = + vec.reshape(shape).device(*context.eigen_device()) = in.sum(Eigen::array({{0}})).reshape(shape); } diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc index 983c9fdcff..7c6f098ca9 100644 --- a/paddle/operators/math/math_function_test.cc +++ b/paddle/operators/math/math_function_test.cc @@ -21,7 +21,7 @@ TEST(math_function, gemm_notrans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1, input3_ptr + 1, 4); @@ -55,7 +55,7 @@ TEST(math_function, gemm_trans_clbas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, input3_ptr + 1, 4); @@ -74,7 +74,8 @@ TEST(math_function, zero) { auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::SetConstant + paddle::operators::math::SetConstant functor; functor(context, &tensor, 0); EXPECT_EQ(t[0], 0); @@ -110,7 +111,7 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, 
static_cast(m), static_cast(n), 1., data_a, data_b, 0., data_c); diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index d5d6f0c73b..32e96d9487 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -21,7 +21,7 @@ TEST(math_function, notrans_mul_trans) { out_gpu.mutable_data({2, 2}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); @@ -55,7 +55,7 @@ TEST(math_function, trans_mul_notrans) { out_gpu.mutable_data({3, 3}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); @@ -106,7 +106,7 @@ TEST(math_function, gemm_notrans_cublas) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); @@ -161,7 +161,7 @@ TEST(math_function, gemm_trans_cublas) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); @@ -208,7 +208,7 @@ void GemvTest(int m, int n, bool trans) { paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a); paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); diff --git a/paddle/operators/math/matmul.h 
b/paddle/operators/math/matmul.h index 6ba9a0ba9a..7048e11e6f 100644 --- a/paddle/operators/math/matmul.h +++ b/paddle/operators/math/matmul.h @@ -26,13 +26,12 @@ namespace math { // // Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported // yet. -template +template class MatMulFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& a, bool trans_a, - const framework::Tensor& b, bool trans_b, T alpha, - framework::Tensor* out, T beta) { + void operator()(const DeviceContext& context, const framework::Tensor& a, + bool trans_a, const framework::Tensor& b, bool trans_b, + T alpha, framework::Tensor* out, T beta) { auto dim_a = a.dims(); auto dim_b = b.dims(); @@ -108,13 +107,13 @@ class MatMulFunctor { if (!batchCount) { // regular matrix multiplication - gemm(context, transA, transB, M, N, kA, alpha, a.data(), - b.data(), beta, out->data()); + gemm(context, transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, out->data()); } else { // batched matrix multiplication - batched_gemm(context, transA, transB, M, N, kA, alpha, - a.data(), b.data(), beta, out->data(), - batchCount, strideA, strideB); + batched_gemm( + context, transA, transB, M, N, kA, alpha, a.data(), b.data(), + beta, out->data(), batchCount, strideA, strideB); } } }; diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc index c9003962d3..fea86675f7 100644 --- a/paddle/operators/math/maxouting.cc +++ b/paddle/operators/math/maxouting.cc @@ -20,9 +20,9 @@ namespace math { // All tensors are in NCHW format, and the groups must be greater than 1 template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; @@ -54,9 +54,9 @@ class MaxOutFunctor { }; template -class 
MaxOutGradFunctor { +class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups) { @@ -91,10 +91,10 @@ class MaxOutGradFunctor { } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu index c3fabcae08..6056ad251c 100644 --- a/paddle/operators/math/maxouting.cu +++ b/paddle/operators/math/maxouting.cu @@ -78,9 +78,9 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, * All tensors are in NCHW format. */ template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; @@ -98,20 +98,18 @@ class MaxOutFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxOut< - T><<(context) - .stream()>>>(nthreads, input_data, input_channels, - input_height, input_width, groups, output_data); + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + output_data); } }; /* * All tensors are in NCHW format. 
*/ template -class MaxOutGradFunctor { +class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups) { @@ -132,20 +130,17 @@ class MaxOutGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxoutGrad< - T><<(context) - .stream()>>>(nthreads, input_data, output_data, - output_grad_data, input_grad_data, input_channels, - input_height, input_width, groups); + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups); } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h index 2d9069b0b3..68f4743db0 100644 --- a/paddle/operators/math/maxouting.h +++ b/paddle/operators/math/maxouting.h @@ -23,20 +23,18 @@ namespace math { #define FLT_MAX __FLT_MAX__ -template - +template class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - int groups); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); }; -template +template class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& 
output_grad, int groups); }; diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc index 135984586a..150de6fd59 100644 --- a/paddle/operators/math/pooling.cc +++ b/paddle/operators/math/pooling.cc @@ -24,9 +24,9 @@ namespace math { * height and width, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -84,9 +84,9 @@ class Pool2dFunctor { * and width, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -152,9 +152,9 @@ class Pool2dGradFunctor { * height and width, respectively. 
*/ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -213,25 +213,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; /* * All tensors are in NCDHW format. @@ -239,9 +243,9 @@ template class Pool2dGradFunctor< * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -314,9 +318,9 @@ class Pool3dFunctor { * depth, height and width, respectively. 
*/ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -398,9 +402,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -473,25 +477,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; /* * All tensors are in NCHW format. @@ -499,9 +507,9 @@ template class Pool3dGradFunctor< * height and width, respectively. 
*/ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -564,9 +572,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -602,10 +610,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; /* * All tensors are in NCDHW format. @@ -613,9 +625,9 @@ template class MaxPool2dWithIndexGradFunctor; * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -692,9 +704,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. 
*/ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -735,10 +747,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu index ca3560f264..0243cf8316 100644 --- a/paddle/operators/math/pooling.cu +++ b/paddle/operators/math/pooling.cu @@ -155,9 +155,9 @@ __global__ void KernelMaxPool2DGrad( * height and width, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -183,11 +183,7 @@ class Pool2dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2D<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, output_data); @@ -200,9 +196,9 @@ class Pool2dFunctor { * height and width, respectively. 
*/ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -231,11 +227,7 @@ class Pool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -249,9 +241,9 @@ class Pool2dGradFunctor { * height and width, respectively. */ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -281,10 +273,7 @@ class MaxPool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -292,25 +281,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; 
-template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; template __global__ void KernelPool3D(const int nthreads, const T* input_data, @@ -478,9 +471,9 @@ __global__ void KernelMaxPool3DGrad( * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -512,11 +505,7 @@ class Pool3dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3D<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -531,9 +520,9 @@ class Pool3dFunctor { * depth, height and width, respectively. 
*/ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -569,11 +558,7 @@ class Pool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -588,9 +573,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -626,10 +611,7 @@ class MaxPool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -638,25 +620,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; 
-template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; template __global__ void KernelMaxPool2dWithIdx( @@ -747,9 +733,9 @@ __global__ void KernelMaxPool2DWithIdxGrad( * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -776,10 +762,7 @@ class MaxPool2dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2dWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, output_data, mask_data); @@ -792,9 +775,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. 
*/ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -821,10 +804,7 @@ class MaxPool2dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -832,10 +812,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template __global__ void KernelMaxPool3DWithIdx( @@ -950,9 +934,9 @@ __global__ void KernelMaxPool3DWithIdxGrad( * depth, height and width, respectively. 
*/ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -985,10 +969,7 @@ class MaxPool3dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdx<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -1002,9 +983,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. */ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -1037,10 +1018,7 @@ class MaxPool3dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, @@ -1049,10 +1027,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; 
+template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h index 19fbd8b4bb..2759f06cb6 100644 --- a/paddle/operators/math/pooling.h +++ b/paddle/operators/math/pooling.h @@ -84,62 +84,58 @@ class AvgPoolGrad { * This is different from average pooling. So we rewrite the max_pool_grad: * MaxPool2dGradFunctor, MaxPool3dGradFunctor. */ -template +template class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* 
output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -153,38 +149,38 @@ class MaxPool3dGradFunctor { * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in * NCDHW format. 
*/ -template +template class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index 514f2adef2..ab758d1e7f 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -19,8 +19,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void 
operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -67,12 +67,12 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -88,7 +88,7 @@ struct SelectedRowsAddTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); auto* in1_data = in1_value.data(); @@ -103,17 +103,16 @@ struct SelectedRowsAddTensor { auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -143,14 +142,14 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct 
SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -175,10 +174,10 @@ struct SelectedRowsAddToTensor { } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index c1dd323ba2..c44577e00a 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -64,16 +64,15 @@ struct SelectedRowsAdd { reinterpret_cast(context).stream()); auto* in2_data = in2_value.data(); - memory::Copy( - boost::get(out_place), out_data + in1_value.numel(), - boost::get(in2_place), in2_data, - in2_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T), context.stream()); } }; -template struct 
SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; namespace { template @@ -96,8 +95,8 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -117,30 +116,28 @@ struct SelectedRowsAddTensor { auto* in2_data = input2.data(); auto* out_data = output->data(); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), out_data, in1_row_numel); + SelectedRowsAddTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -163,18 +160,17 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy( - boost::get(in2_place), in2_data + 
input2_offset, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; namespace { template @@ -197,8 +193,8 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -216,17 +212,16 @@ struct SelectedRowsAddToTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddToTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), in2_data, in1_row_numel); + SelectedRowsAddToTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), in2_data, in1_row_numel); } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index d6dc6c03c9..1149075abf 100644 --- 
a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -21,33 +21,33 @@ namespace math { // SelectedRows + SelectedRows will simplely concat value and rows. // The real computation happens in dealing with LoDTensor. -template +template struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output); }; -template +template struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output); }; // input2 = input1 + input2 -template +template struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2); }; // input2 = input1 + input2 -template +template struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2); }; diff --git a/paddle/operators/math/selected_rows_functor_test.cc b/paddle/operators/math/selected_rows_functor_test.cc index a3649b6875..8c74cab0a1 100644 --- a/paddle/operators/math/selected_rows_functor_test.cc +++ b/paddle/operators/math/selected_rows_functor_test.cc @@ -23,7 +23,7 @@ TEST(selected_rows_functor, cpu_add) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -47,7 +47,7 @@ TEST(selected_rows_functor, cpu_add) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAdd add_functor; + 
SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -85,7 +85,7 @@ TEST(selected_rows_functor, cpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), cpu_place); - SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); auto* tensor2_data = tensor2->data(); @@ -112,7 +112,7 @@ TEST(selected_rows_functor, cpu_add_to) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -137,7 +137,7 @@ TEST(selected_rows_functor, cpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -173,7 +173,7 @@ TEST(selected_rows_functor, cpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); auto* tensor1_data = tensor1->data(); diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 7de9291c17..777caf5635 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -24,7 +24,7 @@ TEST(selected_rows_functor, gpu_add) { GPUPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -48,7 +48,7 @@ TEST(selected_rows_functor, gpu_add) { // simplely concat two SelectedRows 
out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAdd add_functor; + SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -90,7 +90,7 @@ TEST(selected_rows_functor, gpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), gpu_place); - SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; @@ -122,7 +122,7 @@ TEST(selected_rows_functor, gpu_add_to) { GPUPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -147,7 +147,7 @@ TEST(selected_rows_functor, gpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -187,7 +187,7 @@ TEST(selected_rows_functor, gpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); Tensor tensor1_cpu; diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc index 5b3bde02fb..88977be1f8 100644 --- a/paddle/operators/math/sequence2batch.cc +++ b/paddle/operators/math/sequence2batch.cc @@ -19,9 +19,9 @@ namespace operators { namespace math { template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& src, const size_t* index, 
framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -48,13 +48,13 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu index c5d968aeb2..452ae89510 100644 --- a/paddle/operators/math/sequence2batch.cu +++ b/paddle/operators/math/sequence2batch.cu @@ -39,9 +39,9 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, } template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& src, const size_t* index, framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -59,20 +59,19 @@ class CopyMatrixRowsFunctor { dim3 threads(128, 8); dim3 grid(8, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); CopyMatrixRowsKernel<<>>( src_data, dst_data, index, height, width, is_src_index); } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; 
+template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h index 73295ddbcb..a5c43a2c7d 100644 --- a/paddle/operators/math/sequence2batch.h +++ b/paddle/operators/math/sequence2batch.h @@ -26,7 +26,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CopyMatrixRowsFunctor { public: // If is_src_index is true, @@ -34,12 +34,12 @@ class CopyMatrixRowsFunctor { // If is_src_index is false, // copy the input src to the indexed rows of output dst. // The indexed rows are based on the input index. - void operator()(const platform::DeviceContext& context, - const framework::Tensor& src, const size_t* index, - framework::Tensor& dst, bool is_src_index); + void operator()(const DeviceContext& context, const framework::Tensor& src, + const size_t* index, framework::Tensor& dst, + bool is_src_index); }; -template +template class LoDTensor2BatchFunctor { // Calculate the length of each sequence and // sort sequence index by the length. 
@@ -56,7 +56,7 @@ class LoDTensor2BatchFunctor { }; public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& lod_tensor, framework::LoDTensor& batch, bool is_cal_batch_lod, bool is_reverse = false) const { @@ -65,7 +65,7 @@ class LoDTensor2BatchFunctor { PADDLE_ENFORCE_GT(lods.size(), 2UL); PADDLE_ENFORCE_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1].data(), batch, true); return; } @@ -143,22 +143,22 @@ class LoDTensor2BatchFunctor { } batch.set_lod(batch_lods); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, seq2batch_idx, batch, true); } }; -template +template class Batch2LoDTensorFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& batch, framework::LoDTensor& lod_tensor) const { auto in_lod = batch.lod(); PADDLE_ENFORCE_GT(in_lod.size(), 2UL); PADDLE_ENFORCE_EQ(in_lod[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_seq; + CopyMatrixRowsFunctor to_seq; size_t* index = in_lod[1].data(); to_seq(context, batch, index, lod_tensor, false); } diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc index 5913c99fdb..8fb92b1a13 100644 --- a/paddle/operators/math/sequence_pooling.cc +++ b/paddle/operators/math/sequence_pooling.cc @@ -20,9 +20,9 @@ namespace operators { namespace math { template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { auto in_dims = input.dims(); @@ -60,9 +60,9 @@ class MaxSeqPoolFunctor { }; template -class 
MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -80,7 +80,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -93,10 +93,10 @@ class MaxSeqPoolGradFunctor { } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu index 5ed951402f..4c9e6b375c 100644 --- a/paddle/operators/math/sequence_pooling.cu +++ b/paddle/operators/math/sequence_pooling.cu @@ -46,9 +46,9 @@ __global__ void KeMaxSequencePool(const T* input, const size_t* starts, } template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { auto in_dims = input.dims(); @@ -71,8 +71,7 @@ class MaxSeqPoolFunctor { dim3 threads(256, 1); dim3 grid(num_seq, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePool<<>>( in_data, starts.data(), out_data, max_index, num_seq, dim); } @@ -91,9 +90,9 @@ __global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, } template -class MaxSeqPoolGradFunctor { +class 
MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -111,7 +110,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -119,17 +118,16 @@ class MaxSeqPoolGradFunctor { unsigned int blocks = (num_seq * dim + 128 - 1) / 128; dim3 threads(128, 1); dim3 grid(blocks, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePoolGrad<<>>( og_data, max_index, ig_data, num_seq, dim); } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/operators/math/sequence_pooling.h index 35dfe26de1..13ffb2ebef 100644 --- a/paddle/operators/math/sequence_pooling.h +++ b/paddle/operators/math/sequence_pooling.h @@ -23,18 +23,18 @@ namespace math { #define FLT_MAX __FLT_MAX__ -template +template class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index); }; -template +template class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad); 
diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc index 3e2f15d6c2..72f10f35f4 100644 --- a/paddle/operators/math/softmax.cc +++ b/paddle/operators/math/softmax.cc @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu index 4dbab51d46..9e73f6a371 100644 --- a/paddle/operators/math/softmax.cu +++ b/paddle/operators/math/softmax.cu @@ -21,10 +21,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h index fe10746502..471f44d340 100644 --- a/paddle/operators/math/softmax.h +++ b/paddle/operators/math/softmax.h @@ -19,19 +19,18 @@ namespace paddle { namespace operators { namespace math { -template +template class SoftmaxFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* X, framework::Tensor* Y); + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y); }; -template +template class SoftmaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* y, const framework::Tensor* y_grad, - framework::Tensor* x_grad); + void operator()(const DeviceContext& context, const framework::Tensor* y, + 
const framework::Tensor* y_grad, framework::Tensor* x_grad); }; } // namespace math diff --git a/paddle/operators/math/softmax_impl.h b/paddle/operators/math/softmax_impl.h index 05793eeb3e..82f597ff79 100644 --- a/paddle/operators/math/softmax_impl.h +++ b/paddle/operators/math/softmax_impl.h @@ -32,10 +32,10 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y) { +template +void SoftmaxFunctor::operator()(const DeviceContext& context, + const framework::Tensor* X, + framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); auto softmax = EigenMatrix::From(*Y); @@ -56,19 +56,18 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)) .unaryExpr(ValueClip()); - softmax.device(*context.GetEigenDevice()) = shifted_logits.exp(); - softmax.device(*context.GetEigenDevice()) = - (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); } -template -void SoftmaxGradFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* y, +template +void SoftmaxGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor* y, const framework::Tensor* y_grad, framework::Tensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); @@ -89,8 +88,7 @@ void SoftmaxGradFunctor::operator()( .eval() .reshape(batch_by_one) .broadcast(one_by_class); - logits_grad.device(*context.GetEigenDevice()) = - (softmax_grad - dot) * softmax; + logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; } } // namespace math diff --git a/paddle/operators/math/unpooling.cc 
b/paddle/operators/math/unpooling.cc index b57d3dc141..ecd3a647e0 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -17,9 +17,9 @@ namespace paddle { namespace operators { namespace math { template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; @@ -48,9 +48,9 @@ class Unpool2dMaxFunctor { } }; template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -82,10 +82,10 @@ class Unpool2dMaxGradFunctor { } } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 37c3c8b689..ecbde0f6a7 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -67,9 +67,9 @@ __global__ void KernelUnpool2dMaxGrad( * All tensors are in NCHW format. 
*/ template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; @@ -83,21 +83,18 @@ class Unpool2dMaxFunctor { T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMax< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + KernelUnpool2dMax<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_height, output_width); } }; /* * All tensors are in NCHW format. */ template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -116,19 +113,16 @@ class Unpool2dMaxGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMaxGrad< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_grad_data, output_height, - output_width, input_grad_data); + KernelUnpool2dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_grad_data, output_height, + output_width, input_grad_data); } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class 
Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 7077d7c227..0f0ff1371e 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -18,18 +18,16 @@ limitations under the License. */ namespace paddle { namespace operators { namespace math { -template +template class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output); }; -template +template class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc index 99eb7fd46d..d574ed9234 100644 --- a/paddle/operators/math/vol2col.cc +++ b/paddle/operators/math/vol2col.cc @@ -25,9 +25,9 @@ namespace math { * output_depth, output_height, output_width] */ template -class Vol2ColFunctor { +class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, @@ -111,9 +111,9 @@ class Vol2ColFunctor { * output_depth, output_height, output_width] */ template -class Col2VolFunctor { +class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, 
const std::vector& dilations, const std::vector& strides, @@ -190,10 +190,10 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu index dae3be858e..b029442fe4 100644 --- a/paddle/operators/math/vol2col.cu +++ b/paddle/operators/math/vol2col.cu @@ -68,9 +68,9 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth, * output_depth, output_height, output_width] */ template -class Vol2ColFunctor { +class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, @@ -117,9 +117,7 @@ class Vol2ColFunctor { const int threads = 1024; const int blocks = (num_outputs + 1024 - 1) / 1024; - vol2col<<(context) - .stream()>>>( + vol2col<<>>( num_outputs, vol.data(), input_depth, input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], @@ -196,9 +194,9 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth, * output_depth, output_height, output_width] */ template -class Col2VolFunctor { +class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, @@ -245,9 +243,7 @@ class Col2VolFunctor { const int threads = 1024; const int blocks = (num_kernels + 1024 - 1) / 1024; - col2vol<<(context) - .stream()>>>( + col2vol<<>>( num_kernels, col.data(), 
input_depth, input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], @@ -256,10 +252,10 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h index dc64d1d977..dcd80370e8 100644 --- a/paddle/operators/math/vol2col.h +++ b/paddle/operators/math/vol2col.h @@ -63,22 +63,20 @@ namespace math { * \note The caller needs to ensure that volShape.inputChannels is equal to * colShape.inputChannels. */ -template +template class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& vol, + void operator()(const DeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, framework::Tensor* col) const; }; -template +template class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 62c3152304..f46db3c567 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include -template +template void testVol2col() { paddle::framework::Tensor input; paddle::framework::Tensor input_tmp; @@ -24,18 +24,7 @@ void testVol2col() { paddle::framework::Tensor output_tmp; auto* place = new Place(); - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_WITH_CUDA - } + DeviceContext* context = new DeviceContext(*place); /** * input = [[0, 1, 2, @@ -88,7 +77,7 @@ void testVol2col() { output_depth, output_height, output_width}, *place); - paddle::operators::math::Vol2ColFunctor vol2col; + paddle::operators::math::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; @@ -113,7 +102,7 @@ void testVol2col() { CopyFrom(input_tmp, *place, *context, &input); } - paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math::Col2VolFunctor col2vol; col2vol(*context, output, dilations, strides, paddings, &input); float* in_ptr; @@ -130,8 +119,9 @@ void testVol2col() { } TEST(math, vol2col) { - testVol2col(); + testVol2col(); #ifdef PADDLE_WITH_CUDA - testVol2col(); + testVol2col(); #endif // PADDLE_WITH_CUDA } diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc index 5a1a615420..ee0bc0c370 100644 --- a/paddle/operators/matmul_op.cc +++ b/paddle/operators/matmul_op.cc @@ -206,7 +206,8 @@ class MatMulOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad, ops::MatMulOpGrad); -REGISTER_OP_CPU_KERNEL(matmul, - ops::MatMulKernel); REGISTER_OP_CPU_KERNEL( - matmul_grad, ops::MatMulGradKernel); + matmul, 
ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.cu.cc b/paddle/operators/matmul_op.cu.cc index b7e66382f0..6a3772c004 100644 --- a/paddle/operators/matmul_op.cu.cc +++ b/paddle/operators/matmul_op.cu.cc @@ -15,7 +15,8 @@ #include "paddle/operators/matmul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(matmul, - ops::MatMulKernel); -REGISTER_OP_GPU_KERNEL( - matmul_grad, ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h index 1e4aa48b70..de9da487b3 100644 --- a/paddle/operators/matmul_op.h +++ b/paddle/operators/matmul_op.h @@ -27,7 +27,7 @@ using DDim = framework::DDim; using framework::make_ddim; using framework::vectorize; -template +template class MatMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,8 +38,9 @@ class MatMulKernel : public framework::OpKernel { bool transpose_x = context.Attr("transpose_X"); bool transpose_y = context.Attr("transpose_Y"); - math::MatMulFunctor()(context.device_context(), x, transpose_x, y, - transpose_y, T(1), out, T(0)); + math::MatMulFunctor()( + context.template device_context(), x, transpose_x, y, + transpose_y, T(1), out, T(0)); } }; @@ -68,17 +69,16 @@ Tensor CombineBatchAndM(const Tensor& input) { // Reshape a rank-3 tensor from P x M x N to M x (P * N). // (Warning: This requires transposing data and writes into new memory.) // Identity op if the tensor is not of rank 3. 
-template -Tensor CombineBatchAndN(const framework::ExecutionContext& context, - const Tensor& input) { +template +Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) { Tensor output; auto in_dims = input.dims(); if (in_dims.size() == 3) { output.Resize({in_dims[1], in_dims[0], in_dims[2]}); output.mutable_data(context.GetPlace()); std::vector axis = {1, 0, 2}; - math::Transpose trans; - trans(context.device_context(), input, &output, axis); + math::Transpose trans; + trans(context, input, &output, axis); std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); } else { @@ -112,7 +112,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context, // // To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N // to X: (P * M) x K, dOut: (P * M) x N. -template +template class MatMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -178,24 +178,23 @@ class MatMulGradKernel : public framework::OpKernel { Tensor Y = Reshape(y, make_ddim(y_dims)); Tensor dOut = Reshape(dout, make_ddim(dout_dims)); + auto& dev_ctx = context.template device_context(); if (dx) { dx->mutable_data(context.GetPlace()); const Tensor& dOut_for_dX = (x_dims.size() == 2 && y_dims.size() == 3) - ? CombineBatchAndN(context, dOut) + ? CombineBatchAndN(dev_ctx, dOut) : dOut; if (x_dims.size() == 2 && y_dims.size() == 3) { Y = transpose_y ? 
CombineBatchAndM(Y) - : CombineBatchAndN(context, Y); + : CombineBatchAndN(dev_ctx, Y); } if (transpose_x) { - math::MatMulFunctor()(context.device_context(), Y, - transpose_y, dOut_for_dX, transpose_x, - T(1), dx, T(0)); + math::MatMulFunctor()( + dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0)); } else { - math::MatMulFunctor()(context.device_context(), dOut_for_dX, - transpose_x, Y, !transpose_y, T(1), dx, - T(0)); + math::MatMulFunctor()( + dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0)); } } @@ -205,18 +204,16 @@ class MatMulGradKernel : public framework::OpKernel { ? CombineBatchAndM(dOut) : dOut; if (y_dims.size() == 2 && x_dims.size() == 3) { - X = transpose_x ? CombineBatchAndN(context, X) + X = transpose_x ? CombineBatchAndN(dev_ctx, X) : CombineBatchAndM(X); dOut = CombineBatchAndM(dOut); } if (transpose_y) { - math::MatMulFunctor()(context.device_context(), dOut_for_dY, - transpose_y, X, transpose_x, T(1), dy, - T(0)); + math::MatMulFunctor()( + dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0)); } else { - math::MatMulFunctor()(context.device_context(), X, - !transpose_x, dOut_for_dY, transpose_y, - T(1), dy, T(0)); + math::MatMulFunctor()( + dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0)); } } } diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc index 44bf402e95..011616e615 100644 --- a/paddle/operators/maxout_op.cc +++ b/paddle/operators/maxout_op.cc @@ -101,7 +101,8 @@ class MaxOutOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL(maxout, - ops::MaxOutKernel); REGISTER_OP_CPU_KERNEL( - maxout_grad, ops::MaxOutGradKernel); + maxout, ops::MaxOutKernel); +REGISTER_OP_CPU_KERNEL( + maxout_grad, + ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc index 
decd43913d..2904f0ff96 100644 --- a/paddle/operators/maxout_op.cu.cc +++ b/paddle/operators/maxout_op.cu.cc @@ -15,9 +15,10 @@ #include "paddle/operators/maxout_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(maxout, - ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_GPU_KERNEL( - maxout_grad, ops::MaxOutGradKernel, - ops::MaxOutGradKernel); +REGISTER_OP_CUDA_KERNEL( + maxout, ops::MaxOutKernel, + ops::MaxOutKernel); +REGISTER_OP_CUDA_KERNEL( + maxout_grad, + ops::MaxOutGradKernel, + ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h index 44a0d073dd..e8b12552b9 100644 --- a/paddle/operators/maxout_op.h +++ b/paddle/operators/maxout_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MaxOutKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -31,12 +31,13 @@ class MaxOutKernel : public framework::OpKernel { Tensor* out = context.Output("Out"); int groups = context.template Attr("groups"); - math::MaxOutFunctor maxout_forward; - maxout_forward(context.device_context(), *in_x, out, groups); + math::MaxOutFunctor maxout_forward; + maxout_forward(context.template device_context(), *in_x, out, + groups); } }; -template +template class MaxOutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,14 +47,13 @@ class MaxOutGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); Tensor* in_x_grad = context.Output(framework::GradVarName("X")); int groups = context.template Attr("groups"); - auto& device_ctx = context.device_context(); - math::SetConstant zero; + auto& device_ctx = context.template device_context(); + math::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor 
maxout_backward; - maxout_backward(context.device_context(), *in_x, in_x_grad, *out, - *out_grad, groups); + math::MaxOutGradFunctor maxout_backward; + maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups); } } }; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index dcc5b4286f..8932d700c2 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -76,8 +76,9 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker); REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); -REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel, - ops::MeanKernel); -REGISTER_OP_CPU_KERNEL(mean_grad, - ops::MeanGradKernel, - ops::MeanGradKernel); +REGISTER_OP_CPU_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CPU_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu index ca089938c0..93062bf540 100644 --- a/paddle/operators/mean_op.cu +++ b/paddle/operators/mean_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/mean_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel, - ops::MeanKernel); -REGISTER_OP_GPU_KERNEL(mean_grad, - ops::MeanGradKernel, - ops::MeanGradKernel); +REGISTER_OP_CUDA_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CUDA_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h index c99286a5b9..351b345959 100644 --- a/paddle/operators/mean_op.h +++ b/paddle/operators/mean_op.h @@ -27,7 +27,7 @@ template using EigenVector = framework::EigenVector; -template +template class MeanKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,13 +38,14 @@ class MeanKernel : public framework::OpKernel { auto X = 
EigenVector::Flatten(*input); auto y = EigenScalar::From(*output); - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); y.device(place) = X.mean(); } }; -template +template class MeanGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,7 +57,8 @@ class MeanGradKernel : public framework::OpKernel { T ig_size = static_cast(IG->numel()); Eigen::DSizes bcast(ig_size); - EigenVector::Flatten(*IG).device(context.GetEigenDevice()) = + EigenVector::Flatten(*IG).device( + *context.template device_context().eigen_device()) = (EigenVector::From(*OG) / ig_size).broadcast(bcast); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index 4684c20208..27f0c8de20 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -102,5 +102,5 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { namespace ops = paddle::operators; REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker); -REGISTER_OP_CPU_KERNEL(minus, - ops::MinusKernel); +REGISTER_OP_CPU_KERNEL( + minus, ops::MinusKernel); diff --git a/paddle/operators/minus_op.cu b/paddle/operators/minus_op.cu index a8375cc630..3b202ea92e 100644 --- a/paddle/operators/minus_op.cu +++ b/paddle/operators/minus_op.cu @@ -14,5 +14,6 @@ #include "paddle/operators/minus_op.h" -REGISTER_OP_GPU_KERNEL( - minus, paddle::operators::MinusKernel); +REGISTER_OP_CUDA_KERNEL( + minus, + paddle::operators::MinusKernel); diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h index bd9a2790aa..78e1e1be6d 100644 --- a/paddle/operators/minus_op.h +++ b/paddle/operators/minus_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class MinusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -28,7 +28,8 @@ class MinusKernel : public 
framework::OpKernel { auto* out_tensor = context.Output("Out"); out_tensor->mutable_data(context.GetPlace()); - auto& dev = context.GetEigenDevice(); + auto& dev = + *context.template device_context().eigen_device(); framework::EigenVector::Flatten(*out_tensor).device(dev) = framework::EigenVector::Flatten(*left_tensor) - framework::EigenVector::Flatten(*right_tensor); diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc index 28528848af..f0a42491bf 100644 --- a/paddle/operators/modified_huber_loss_op.cc +++ b/paddle/operators/modified_huber_loss_op.cc @@ -115,6 +115,6 @@ REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp, REGISTER_OP_CPU_KERNEL( modified_huber_loss, - ops::ModifiedHuberLossKernel); + ops::ModifiedHuberLossKernel); REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad, ops::ModifiedHuberLossGradCPUKernel); diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu index 8854e166cd..40a8447da4 100644 --- a/paddle/operators/modified_huber_loss_op.cu +++ b/paddle/operators/modified_huber_loss_op.cu @@ -71,8 +71,8 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( modified_huber_loss, - ops::ModifiedHuberLossKernel); -REGISTER_OP_GPU_KERNEL(modified_huber_loss_grad, - ops::ModifiedHuberLossGradGPUKernel); + ops::ModifiedHuberLossKernel); +REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad, + ops::ModifiedHuberLossGradGPUKernel); diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h index aba75efad9..157ae0682e 100644 --- a/paddle/operators/modified_huber_loss_op.h +++ b/paddle/operators/modified_huber_loss_op.h @@ -46,7 +46,7 @@ struct ModifiedHuberLossForward { } }; -template +template class ModifiedHuberLossKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& context) const override { @@ -57,7 +57,8 @@ class ModifiedHuberLossKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = EigenVector::Flatten(*in0); auto y = EigenVector::Flatten(*in1); diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu index be0c8ea071..00f1253465 100644 --- a/paddle/operators/momentum_op.cu +++ b/paddle/operators/momentum_op.cu @@ -74,5 +74,5 @@ class MomentumOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(momentum, ops::MomentumOpCUDAKernel, - ops::MomentumOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel, + ops::MomentumOpCUDAKernel); diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 3c39ae10dc..bc4a5fdf0b 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -149,6 +149,7 @@ REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, ops::MulOpShapeInference, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(mul_grad, ops::MulOpGrad); -REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CPU_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CPU_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.cu.cc b/paddle/operators/mul_op.cu.cc index 66dc3d6d10..6095de58d0 100644 --- a/paddle/operators/mul_op.cu.cc +++ b/paddle/operators/mul_op.cu.cc @@ -15,6 +15,7 @@ #include "paddle/operators/mul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CUDA_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CUDA_KERNEL( + mul_grad, ops::MulGradKernel); diff --git 
a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 0eb9df41e9..1b467dca83 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,15 +46,16 @@ class MulKernel : public framework::OpKernel { if (z_dim.size() != 2) { z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); } - math::matmul(context.device_context(), x_matrix, false, y_matrix, - false, 1, z, 0); + math::matmul( + context.template device_context(), x_matrix, false, + y_matrix, false, 1, z, 0); if (z_dim.size() != 2) { z->Resize(z_dim); } } }; -template +template class MulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -77,6 +78,7 @@ class MulGradKernel : public framework::OpKernel { Tensor* dx = ctx.Output(framework::GradVarName("X")); Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto& dev_ctx = ctx.template device_context(); if (dx) { dx->mutable_data(ctx.GetPlace()); Tensor dx_matrix = dx->dims().size() > 2 @@ -84,8 +86,8 @@ class MulGradKernel : public framework::OpKernel { : *dx; // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - math::matmul(ctx.device_context(), dout_mat, false, y_matrix, - true, 1, &dx_matrix, 0); + math::matmul(dev_ctx, dout_mat, false, y_matrix, true, + 1, &dx_matrix, 0); } if (dy) { dy->mutable_data(ctx.GetPlace()); @@ -93,8 +95,8 @@ class MulGradKernel : public framework::OpKernel { ? framework::ReshapeToMatrix(*dy, y_num_col_dims) : *dy; // dy = x' * dout. 
dy K x N, dout : M x N, x : M x K - math::matmul(ctx.device_context(), x_matrix, true, dout_mat, - false, 1, &dy_matrix, 0); + math::matmul(dev_ctx, x_matrix, true, dout_mat, false, + 1, &dy_matrix, 0); } } }; diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index 8e7f544e0d..b1ee8051c4 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -119,7 +119,8 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( - multiplex, ops::MultiplexCPUKernel); + multiplex, + ops::MultiplexCPUKernel); REGISTER_OP_CPU_KERNEL( multiplex_grad, - ops::MultiplexGradCPUKernel); + ops::MultiplexGradCPUKernel); diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 10dff8d021..47986e9ff8 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel { CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + platform::GPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -60,7 +60,8 @@ class MultiplexGradGPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } @@ -72,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + 
platform::GPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { @@ -87,8 +88,9 @@ class MultiplexGradGPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - multiplex, ops::MultiplexGPUKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + multiplex, + ops::MultiplexGPUKernel); +REGISTER_OP_CUDA_KERNEL( multiplex_grad, - ops::MultiplexGradGPUKernel); + ops::MultiplexGradGPUKernel); diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h index ab3cafaa32..3443151161 100644 --- a/paddle/operators/multiplex_op.h +++ b/paddle/operators/multiplex_op.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -template +template class MultiplexCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,7 +35,7 @@ class MultiplexCPUKernel : public framework::OpKernel { auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto index = ids->data(); - Place place = boost::get(ctx.GetPlace()); + platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -47,7 +47,7 @@ class MultiplexCPUKernel : public framework::OpKernel { } }; -template +template class MultiplexGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -60,14 +60,15 @@ class MultiplexGradCPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto* index = ids->data(); - Place place = boost::get(ctx.GetPlace()); 
+ platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc index 4f0a2a79ed..6ca6db7253 100644 --- a/paddle/operators/nccl_op.cu.cc +++ b/paddle/operators/nccl_op.cu.cc @@ -204,6 +204,6 @@ class NCCLBcastKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); -REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel); -REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); +REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index bb7ae20286..d747cc0cf5 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -33,9 +33,9 @@ #include "paddle/platform/place.h" USE_NO_KERNEL_OP(ncclInit); -USE_GPU_ONLY_OP(ncclAllReduce); -USE_GPU_ONLY_OP(ncclReduce); -USE_GPU_ONLY_OP(ncclBcast); +USE_CUDA_ONLY_OP(ncclAllReduce); +USE_CUDA_ONLY_OP(ncclReduce); +USE_CUDA_ONLY_OP(ncclBcast); namespace f = paddle::framework; namespace p = paddle::platform; diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index 952da10434..5ad1610fde 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -67,7 +67,7 @@ class NCEOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; @@ -170,7 +170,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( 
framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index 0a8a95de5f..6636dad060 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -28,7 +28,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template void PrepareSamples(const framework::ExecutionContext& context) { auto label = context.Input("Label"); const int64_t* label_data = label->data(); @@ -67,11 +67,11 @@ void PrepareSamples(const framework::ExecutionContext& context) { } } -template +template class NCEKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PrepareSamples(context); + PrepareSamples(context); auto sample_labels = context.Output("SampleLabels"); const int64_t* sample_labels_data = sample_labels->data(); auto sample_out = context.Output("SampleLogits"); @@ -135,7 +135,7 @@ class NCEKernel : public framework::OpKernel { } }; -template +template class NCEGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index adb75df6ef..936dde22c3 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -134,6 +134,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad); -REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_CPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CPU_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CPU_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.cu b/paddle/operators/pad_op.cu index 555a7dba23..c309fb625c 100644 --- a/paddle/operators/pad_op.cu +++ b/paddle/operators/pad_op.cu @@ -16,6 +16,7 @@ #include "paddle/operators/pad_op.h" namespace ops = paddle::operators; 
-REGISTER_OP_GPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_GPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CUDA_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CUDA_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h index 9534dbf545..1b95942af3 100644 --- a/paddle/operators/pad_op.h +++ b/paddle/operators/pad_op.h @@ -26,7 +26,7 @@ template using EigenTensor = framework::EigenTensor; -template +template void PadFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -42,33 +42,34 @@ void PadFunction(const framework::ExecutionContext& context) { auto x_tensor = EigenTensor::From(*x); auto out_tensor = EigenTensor::From(*out); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); out_tensor.device(place) = x_tensor.pad(paddings, pad_value); } -template +template class PadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { int rank = context.Input("X")->dims().size(); switch (rank) { case 1: - PadFunction(context); + PadFunction(context); break; case 2: - PadFunction(context); + PadFunction(context); break; case 3: - PadFunction(context); + PadFunction(context); break; case 4: - PadFunction(context); + PadFunction(context); break; case 5: - PadFunction(context); + PadFunction(context); break; case 6: - PadFunction(context); + PadFunction(context); break; default: PADDLE_THROW( @@ -77,7 +78,7 @@ class PadKernel : public framework::OpKernel { } }; -template +template void PadGradFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -91,12 +92,13 @@ void PadGradFunction(const framework::ExecutionContext& context) { d_x->mutable_data(context.GetPlace()); auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - auto 
place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); } } -template +template class PadGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -104,22 +106,22 @@ class PadGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - PadGradFunction(context); + PadGradFunction(context); break; case 2: - PadGradFunction(context); + PadGradFunction(context); break; case 3: - PadGradFunction(context); + PadGradFunction(context); break; case 4: - PadGradFunction(context); + PadGradFunction(context); break; case 5: - PadGradFunction(context); + PadGradFunction(context); break; case 6: - PadGradFunction(context); + PadGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc index be9fcc5661..77407f5cdf 100644 --- a/paddle/operators/pool_cudnn_op.cc +++ b/paddle/operators/pool_cudnn_op.cc @@ -19,19 +19,21 @@ namespace ops = paddle::operators; REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool2d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool2d_cudnn, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_cudnn_grad, + ops::PoolGradKernel, + ops::PoolGradKernel) REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool3d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool3d_cudnn, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_cudnn_grad, + 
ops::PoolGradKernel, + ops::PoolGradKernel) diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc index 66dd194ccd..fc2b37bd0f 100644 --- a/paddle/operators/pool_cudnn_op.cu.cc +++ b/paddle/operators/pool_cudnn_op.cu.cc @@ -162,12 +162,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); - -REGISTER_OP_GPU_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); +REGISTER_OP_CUDA_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, + ops::PoolCudnnOpKernel); +REGISTER_OP_CUDA_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, + ops::PoolCudnnGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, + ops::PoolCudnnOpKernel); +REGISTER_OP_CUDA_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, + ops::PoolCudnnGradOpKernel); diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index e26ffd86e5..45fa20280c 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -216,19 +216,19 @@ namespace ops = paddle::operators; REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_grad, ops::PoolGradKernel, + ops::PoolGradKernel) REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - 
ops::PoolGradKernel); +REGISTER_OP_CPU_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_grad, ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.cu.cc b/paddle/operators/pool_op.cu.cc index 1010cb7622..39a9dfbf79 100644 --- a/paddle/operators/pool_op.cu.cc +++ b/paddle/operators/pool_op.cu.cc @@ -16,16 +16,18 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); -REGISTER_OP_GPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index 63492a89e8..ab85d587a3 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -50,7 +50,7 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker); }; -template +template class PoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -67,41 +67,41 @@ class PoolKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = context.template device_context(); switch (ksize.size()) { case 2: { if (pooling_type == "max") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool2d_forward; paddle::operators::math::MaxPool pool_process; - 
pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool2d_forward; paddle::operators::math::AvgPool pool_process; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; case 3: { if (pooling_type == "max") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool3d_forward; paddle::operators::math::MaxPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -109,7 +109,7 @@ class PoolKernel : public framework::OpKernel { } }; -template +template class PoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -130,42 +130,43 @@ class PoolGradKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = context.template device_context(); if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); auto 
temp = framework::EigenVector::Flatten(*in_x_grad); - temp.device(context.GetEigenDevice()) = + temp.device( + *context.template device_context().eigen_device()) = temp.constant(static_cast(0)); switch (ksize.size()) { case 2: { if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor + paddle::operators::math::MaxPool2dGradFunctor pool2d_backward; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; case 3: { if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor + paddle::operators::math::MaxPool3dGradFunctor pool3d_backward; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool3d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } diff --git 
a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index b9c42a6912..1a2383f8b8 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -266,12 +266,15 @@ REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad, @@ -279,9 +282,12 @@ REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.cu.cc b/paddle/operators/pool_with_index_op.cu.cc index 335064a7ee..4c9804da63 100644 --- a/paddle/operators/pool_with_index_op.cu.cc +++ b/paddle/operators/pool_with_index_op.cu.cc @@ -16,20 +16,28 @@ limitations under the License. 
*/ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h index 40766c7e82..4f4087d1dd 100644 --- a/paddle/operators/pool_with_index_op.h +++ b/paddle/operators/pool_with_index_op.h @@ -24,7 +24,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MaxPoolWithIndexKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -35,6 +35,8 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + + auto& dev_ctx = context.template device_context(); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -44,23 +46,23 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor + paddle::operators::math::MaxPool2dWithIndexFunctor pool2d_forward; - pool2d_forward(context.device_context(), *in_x, ksize, 
strides, - paddings, out, mask); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor + paddle::operators::math::MaxPool3dWithIndexFunctor pool3d_forward; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, out, mask); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } } }; -template +template class MaxPoolWithIndexGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -81,18 +83,20 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.device_context(); + auto& device_ctx = context.template device_context(); math::set_constant(device_ctx, in_x_grad, 0); switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor + paddle::operators::math::MaxPool2dWithIndexGradFunctor pool2d_backward; pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor + paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); diff --git a/paddle/operators/positive_negative_pair_op.h b/paddle/operators/positive_negative_pair_op.h index 2efd3777e0..977e59b7d2 100644 --- a/paddle/operators/positive_negative_pair_op.h +++ b/paddle/operators/positive_negative_pair_op.h @@ -22,7 +22,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class PositiveNegativePairKernel : public framework::OpKernel { public: struct PredictionResult { diff --git a/paddle/operators/precision_recall_op.h 
b/paddle/operators/precision_recall_op.h index 4a871ce674..c0d55405a3 100644 --- a/paddle/operators/precision_recall_op.h +++ b/paddle/operators/precision_recall_op.h @@ -26,7 +26,7 @@ using EigenMatrix = framework::EigenMatrix; enum StateVariable { TP = 0, FP, TN, FN }; -template +template class PrecisionRecallKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 055c471b45..317a2a4015 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -85,7 +85,8 @@ namespace ops = paddle::operators; REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL(prelu, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL(prelu_grad, - ops::PReluGradKernel); +REGISTER_OP_CPU_KERNEL( + prelu, ops::PReluKernel); +REGISTER_OP_CPU_KERNEL( + prelu_grad, + ops::PReluGradKernel); diff --git a/paddle/operators/prelu_op.cu b/paddle/operators/prelu_op.cu index 9e391dabae..12033dee0e 100644 --- a/paddle/operators/prelu_op.cu +++ b/paddle/operators/prelu_op.cu @@ -14,8 +14,9 @@ #include "paddle/operators/prelu_op.h" -REGISTER_OP_GPU_KERNEL( - prelu, paddle::operators::PReluKernel); -REGISTER_OP_GPU_KERNEL( - prelu_grad, - paddle::operators::PReluGradKernel); +REGISTER_OP_CUDA_KERNEL( + prelu, + paddle::operators::PReluKernel); +REGISTER_OP_CUDA_KERNEL(prelu_grad, + paddle::operators::PReluGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h index 5ad31c2203..56f9a553ec 100644 --- a/paddle/operators/prelu_op.h +++ b/paddle/operators/prelu_op.h @@ -39,7 +39,7 @@ class PReluFunctor { const T* alpha_; }; -template +template class PReluKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,9 +54,9 @@ class PReluKernel : public framework::OpKernel { 
int numel = x->numel(); - Transform trans; - trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr, - PReluFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), x_ptr, + x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); } }; @@ -76,7 +76,7 @@ class PReluGradFunctor { const T* alpha_; }; -template +template class PReluGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -92,9 +92,9 @@ class PReluGradKernel : public framework::OpKernel { const T* out_ptr = out->data(); int numel = dx->numel(); - Transform trans; - trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr, - PReluGradFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), out_ptr, + out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor(alpha_ptr)); // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready } diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc index 36e460103a..cc350f6d26 100644 --- a/paddle/operators/proximal_adagrad_op.cc +++ b/paddle/operators/proximal_adagrad_op.cc @@ -114,4 +114,4 @@ REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, ops::ProximalAdagradOpMaker); REGISTER_OP_CPU_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel); + ops::ProximalAdagradOpKernel); diff --git a/paddle/operators/proximal_adagrad_op.cu b/paddle/operators/proximal_adagrad_op.cu index d0ae039518..42a178f94b 100644 --- a/paddle/operators/proximal_adagrad_op.cu +++ b/paddle/operators/proximal_adagrad_op.cu @@ -15,6 +15,6 @@ specific language governing permissions and limitations under the License. 
*/ #include "paddle/operators/proximal_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel); + ops::ProximalAdagradOpKernel); diff --git a/paddle/operators/proximal_adagrad_op.h b/paddle/operators/proximal_adagrad_op.h index 7a1560e8cb..523924d80e 100644 --- a/paddle/operators/proximal_adagrad_op.h +++ b/paddle/operators/proximal_adagrad_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -45,20 +45,20 @@ class ProximalAdagradOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto m_out = EigenVector::Flatten(*moment_out); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); - m_out.device(place) = m + g * g; + m_out.device(*place) = m + g * g; auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); if (l1 > static_cast(0)) { - p_out.device(place) = + p_out.device(*place) = prox_param.sign() * (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) .cwiseMax(static_cast(0.0))) / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); } else { - p_out.device(place) = + p_out.device(*place) = prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); } } diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc index 5693d0ec9e..0b26beb3ac 100644 --- a/paddle/operators/proximal_gd_op.cc +++ b/paddle/operators/proximal_gd_op.cc @@ -94,4 +94,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, ops::ProximalGDOpMaker); REGISTER_OP_CPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.cu 
b/paddle/operators/proximal_gd_op.cu index 26f4ebaa0f..b7dd840d19 100644 --- a/paddle/operators/proximal_gd_op.cu +++ b/paddle/operators/proximal_gd_op.cu @@ -15,5 +15,6 @@ specific language governing permissions and limitations under the License. */ #include "paddle/operators/proximal_gd_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); +REGISTER_OP_CUDA_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.h b/paddle/operators/proximal_gd_op.h index bebda02041..64648b3cca 100644 --- a/paddle/operators/proximal_gd_op.h +++ b/paddle/operators/proximal_gd_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -42,7 +42,7 @@ class ProximalGDOpKernel : public framework::OpKernel { auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); auto p_out = EigenVector::Flatten(*param_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 912f88f455..b80b175792 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -123,7 +123,8 @@ namespace ops = paddle::operators; REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, ops::RankLossGradOp); -REGISTER_OP_CPU_KERNEL(rank_loss, - ops::RankLossKernel); REGISTER_OP_CPU_KERNEL( - rank_loss_grad, ops::RankLossGradKernel); + rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL( + rank_loss_grad, + ops::RankLossGradKernel); diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu index 5382e3a629..5aee66443d 100644 --- a/paddle/operators/rank_loss_op.cu +++ b/paddle/operators/rank_loss_op.cu @@ -14,9 
+14,9 @@ #include "paddle/operators/rank_loss_op.h" -REGISTER_OP_GPU_KERNEL( - rank_loss, - paddle::operators::RankLossKernel); -REGISTER_OP_GPU_KERNEL( - rank_loss_grad, - paddle::operators::RankLossGradKernel); +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h index 703c77a0b2..ea24b61fd9 100644 --- a/paddle/operators/rank_loss_op.h +++ b/paddle/operators/rank_loss_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class RankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,13 +35,13 @@ class RankLossKernel : public framework::OpKernel { auto left = framework::EigenVector::Flatten(*left_t); auto right = framework::EigenVector::Flatten(*right_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (1. 
+ (left - right).exp()).log() - label * (left - right); } }; -template +template class RankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -55,7 +55,7 @@ class RankLossGradKernel : public framework::OpKernel { auto* left_t = ctx.Input("Left"); auto* right_t = ctx.Input("Right"); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); auto d_out = framework::EigenVector::Flatten(*d_out_t); auto label = framework::EigenVector::Flatten(*label_t); auto left = framework::EigenVector::Flatten(*left_t); diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index 2589a54cfc..b754637bf2 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -180,12 +180,13 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, ops::ReduceGradOp); -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL(reduce_type, \ + ops::ReduceKernel); \ + REGISTER_OP_CPU_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/operators/reduce_op.cu b/paddle/operators/reduce_op.cu index d306e1a240..a10ace5253 100644 --- a/paddle/operators/reduce_op.cu +++ b/paddle/operators/reduce_op.cu @@ -17,12 +17,13 @@ namespace ops = paddle::operators; -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_GPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, 
grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type, ops::ReduceKernel); \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h index dd6547542d..47ce910f28 100644 --- a/paddle/operators/reduce_op.h +++ b/paddle/operators/reduce_op.h @@ -32,55 +32,55 @@ template ; struct SumFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.sum(dim); } }; struct SumGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim); } }; struct MeanFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.mean(dim); } }; struct MeanGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim) / dx.constant(size); } }; struct MaxFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.maximum(dim); } }; struct MinFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.minimum(dim); } }; struct MaxOrMinGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const 
DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { auto equals = x == y.broadcast(dim); auto ones = dx.constant(1); @@ -91,7 +91,7 @@ struct MaxOrMinGradFunctor { } }; -template +template class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -139,7 +139,8 @@ class ReduceKernel : public framework::OpKernel { dims = framework::make_ddim(dims_vector); } - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Functor functor; if (D == 1) { @@ -152,7 +153,7 @@ class ReduceKernel : public framework::OpKernel { } }; -template +template class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -201,7 +202,8 @@ class ReduceGradKernel : public framework::OpKernel { Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; broadcast_dim[dim] = input0->dims()[dim]; - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Functor functor; functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, broadcast_dim[dim]); diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu index dca6c15007..b7329238c0 100644 --- a/paddle/operators/reshape_op.cu +++ b/paddle/operators/reshape_op.cu @@ -14,9 +14,9 @@ #include "paddle/operators/reshape_op.h" -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( reshape, paddle::operators::ReshapeKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( reshape_grad, paddle::operators::ReshapeGradKernel); diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index 73fd1da642..92d8cbbb56 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class ReshapeKernel : public 
framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel { } }; -template +template class ReshapeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc index a9c45f639c..fc3f9b8988 100644 --- a/paddle/operators/rmsprop_op.cc +++ b/paddle/operators/rmsprop_op.cc @@ -116,5 +116,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); -REGISTER_OP_CPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CPU_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu index 52634a5481..2a9fd6e104 100644 --- a/paddle/operators/rmsprop_op.cu +++ b/paddle/operators/rmsprop_op.cu @@ -16,5 +16,5 @@ #include "paddle/operators/rmsprop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CUDA_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.h b/paddle/operators/rmsprop_op.h index 7bf2129010..16a561835d 100644 --- a/paddle/operators/rmsprop_op.h +++ b/paddle/operators/rmsprop_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class RmspropOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,7 +51,7 @@ class RmspropOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto mom_out = EigenVector::Flatten(*moment_out); auto ms_out = EigenVector::Flatten(*mean_square_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git 
a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc index 2b5e66c96b..75fcea8401 100644 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -157,9 +157,10 @@ namespace ops = paddle::operators; REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( - roi_pool, ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); + roi_pool, + ops::CPUROIPoolOpKernel, + ops::CPUROIPoolOpKernel); REGISTER_OP_CPU_KERNEL( roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolOpKernel); + ops::CPUROIPoolGradOpKernel, + ops::CPUROIPoolOpKernel); diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu index 9a4c8ca752..a874befe4d 100644 --- a/paddle/operators/roi_pool_op.cu +++ b/paddle/operators/roi_pool_op.cu @@ -177,7 +177,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); @@ -199,10 +199,11 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - roi_pool, ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + roi_pool, + ops::GPUROIPoolOpKernel, + ops::GPUROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolOpKernel); + ops::GPUROIPoolGradOpKernel, + ops::GPUROIPoolOpKernel); diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h index 3812c66c65..09a9d3d870 100644 --- a/paddle/operators/roi_pool_op.h +++ b/paddle/operators/roi_pool_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class CPUROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -126,7 +126,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel { } }; -template +template class CPUROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -145,8 +145,9 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { const T* out_grad_data = out_grad->data(); const int64_t* argmax_data = argmax->data(); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), in_grad, static_cast(0)); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), in_grad, + static_cast(0)); auto in_stride = framework::stride(in->dims()); auto argmax_stride = framework::stride(argmax->dims()); diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc index ea0bb99f8d..5203a5079c 100644 --- a/paddle/operators/row_conv_op.cc +++ b/paddle/operators/row_conv_op.cc @@ -124,7 +124,8 @@ $$ }; template -class RowConvKernel : public framework::OpKernel { +class RowConvKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -169,7 +170,8 @@ class RowConvKernel : public framework::OpKernel { }; template -class RowConvGradKernel : public framework::OpKernel { +class RowConvGradKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -251,7 +253,8 @@ class RowConvGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, ops::RowConvGradOp); -REGISTER_OP_CPU_KERNEL(row_conv, - ops::RowConvKernel); REGISTER_OP_CPU_KERNEL( - 
row_conv_grad, ops::RowConvGradKernel); + row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu index e0d7ebda7e..3fc5eabcf5 100644 --- a/paddle/operators/row_conv_op.cu +++ b/paddle/operators/row_conv_op.cu @@ -292,7 +292,8 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, } // namespace template -class RowConvKernel : public framework::OpKernel { +class RowConvKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -327,7 +328,8 @@ class RowConvKernel : public framework::OpKernel { }; template -class RowConvGradKernel : public framework::OpKernel { +class RowConvGradKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -347,7 +349,7 @@ class RowConvGradKernel : public framework::OpKernel { size_t *idx = batch_indices.data(); auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + math::SetConstant zero; if (dFilter) { T *dfilter = dFilter->mutable_data(context.GetPlace()); @@ -402,7 +404,8 @@ class RowConvGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(row_conv, - ops::RowConvKernel); -REGISTER_OP_GPU_KERNEL( - row_conv_grad, ops::RowConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h index 525e83908d..80912ad8f7 100644 --- a/paddle/operators/row_conv_op.h +++ b/paddle/operators/row_conv_op.h @@ -18,13 +18,13 @@ namespace paddle { namespace operators { -template +template class RowConvKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext &context) const override; }; -template +template class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index e5c10fec4d..d848be823e 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -75,8 +75,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); -REGISTER_OP_CPU_KERNEL(scale, - ops::ScaleKernel, - ops::ScaleKernel, - ops::ScaleKernel, - ops::ScaleKernel); +REGISTER_OP_CPU_KERNEL( + scale, ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu index 0d70775159..0c7980430f 100644 --- a/paddle/operators/scale_op.cu +++ b/paddle/operators/scale_op.cu @@ -14,8 +14,10 @@ #include "paddle/operators/scale_op.h" -REGISTER_OP_GPU_KERNEL( - scale, paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h index 4931294c9d..02a8c97a83 100644 --- a/paddle/operators/scale_op.h +++ b/paddle/operators/scale_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class ScaleKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -31,7 +31,8 @@ class ScaleKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*tensor); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = context.GetEigenDevice(); + auto& dev = + *context.template device_context().eigen_device(); 
eigen_out.device(dev) = scale * eigen_in; } }; diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu index 3b32ae2fb7..6b43a1389f 100644 --- a/paddle/operators/scatter_op.cu +++ b/paddle/operators/scatter_op.cu @@ -59,5 +59,5 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index b862056ad4..ede9754697 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -148,8 +148,9 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker, seq_expand_grad, ops::SeqExpandOpGrad); -REGISTER_OP_CPU_KERNEL(seq_expand, - ops::SeqExpandKernel); +REGISTER_OP_CPU_KERNEL( + seq_expand, + ops::SeqExpandKernel); REGISTER_OP_CPU_KERNEL( seq_expand_grad, - ops::SeqExpandGradKernel); + ops::SeqExpandGradKernel); diff --git a/paddle/operators/seq_expand_op.cu b/paddle/operators/seq_expand_op.cu index f1e4b82a76..8e67ce9ccb 100644 --- a/paddle/operators/seq_expand_op.cu +++ b/paddle/operators/seq_expand_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/seq_expand_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(seq_expand, - ops::SeqExpandKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + seq_expand, + ops::SeqExpandKernel); +REGISTER_OP_CUDA_KERNEL( seq_expand_grad, - ops::SeqExpandGradKernel); + ops::SeqExpandGradKernel); diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index 4ef0d02cf8..fbee0db454 100644 --- a/paddle/operators/seq_expand_op.h +++ 
b/paddle/operators/seq_expand_op.h @@ -23,7 +23,7 @@ namespace operators { using LoDTensor = framework::LoDTensor; -template +template class SeqExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -37,7 +37,8 @@ class SeqExpandKernel : public framework::OpKernel { "The size of last lod level in Input(Y)" "must be equal to dims[0] of Input(X)."); out->set_lod(y->lod()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); size_t element_len = framework::product(x_dims) / x_dims[0]; T* out_data = out->mutable_data(context.GetPlace()); auto out_starts = out->lod().back(); @@ -50,7 +51,7 @@ class SeqExpandKernel : public framework::OpKernel { Eigen::TensorMap> out_t(out_data, scale, element_len); Eigen::array cast({{scale, 1}}); - out_t.device(place) = x_t.broadcast(cast); + out_t.device(*place) = x_t.broadcast(cast); x_data += element_len; out_data += element_len * scale; } @@ -69,7 +70,7 @@ class SeqExpandKernel : public framework::OpKernel { * Grad(X).lod = Input(X).lod * * */ -template +template class SeqExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -89,8 +90,9 @@ class SeqExpandGradKernel : public framework::OpKernel { d_out_t(d_out_data, static_cast(repeat), element_len); Eigen::TensorMap> d_x_t(d_x_data, static_cast(element_len)); - auto place = context.GetEigenDevice(); - d_x_t.device(place) = d_out_t.sum(Eigen::array({{0}})); + auto place = + context.template device_context().eigen_device(); + d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); d_out_data += (repeat * element_len); d_x_data += element_len; } diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc index d1de0b4447..9c7e5456e8 100644 --- a/paddle/operators/sequence_concat_op.cc +++ b/paddle/operators/sequence_concat_op.cc @@ -129,7 +129,7 
@@ REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker, sequence_concat_grad, ops::SequenceConcatGradOp); REGISTER_OP_CPU_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); + ops::SequenceConcatOpKernel); REGISTER_OP_CPU_KERNEL( sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatGradOpKernel); diff --git a/paddle/operators/sequence_concat_op.cu.cc b/paddle/operators/sequence_concat_op.cu.cc index 9ca99c2258..144bdb5af6 100644 --- a/paddle/operators/sequence_concat_op.cu.cc +++ b/paddle/operators/sequence_concat_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/operators/sequence_concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); -REGISTER_OP_GPU_KERNEL( - sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, + ops::SequenceConcatGradOpKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h index 09212070aa..8445224f46 100644 --- a/paddle/operators/sequence_concat_op.h +++ b/paddle/operators/sequence_concat_op.h @@ -59,7 +59,7 @@ LoD ConcatLoD(const std::vector ins, const size_t level) { return out_lod; } -template +template class SequenceConcatOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -119,7 +119,7 @@ class SequenceConcatOpKernel : public framework::OpKernel { } }; -template +template class SequenceConcatGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc index c5533732d4..f5c4f1c133 100644 --- a/paddle/operators/sequence_conv_op.cc +++ b/paddle/operators/sequence_conv_op.cc @@ 
-179,9 +179,10 @@ REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, sequence_conv_grad, ops::SequenceConvGradOp); REGISTER_OP_CPU_KERNEL( - sequence_conv, ops::SequenceConvKernel, - ops::SequenceConvKernel); + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); REGISTER_OP_CPU_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/operators/sequence_conv_op.cu.cc index c8136dbcb3..eacba79ace 100644 --- a/paddle/operators/sequence_conv_op.cu.cc +++ b/paddle/operators/sequence_conv_op.cu.cc @@ -15,10 +15,11 @@ #include "paddle/operators/sequence_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_conv, ops::SequenceConvKernel, - ops::SequenceConvKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h index b8fbe2647c..bb584b7bfa 100644 --- a/paddle/operators/sequence_conv_op.h +++ b/paddle/operators/sequence_conv_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,21 +56,23 @@ class SequenceConvKernel : public framework::OpKernel { Tensor col; col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. 
- math::SetConstant set_zero; - set_zero(context.device_context(), &col, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, &col, static_cast(0)); - math::ContextProjectFunctor seq_project_functor; + math::ContextProjectFunctor seq_project_functor; - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, false, filter, false, - static_cast(1.0), out, static_cast(0.0)); + math::matmul(dev_ctx, col, false, filter, false, + static_cast(1.0), out, + static_cast(0.0)); } }; -template +template class SequenceConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -95,7 +97,8 @@ class SequenceConvGradKernel : public framework::OpKernel { int down_pad = std::max(0, context_start + context_length - 1); int sequence_width = static_cast(in->dims()[1]); - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); // use col_shape in the im2col calculation framework::DDim col_shape = {in->dims()[0], sequence_width * context_length}; @@ -104,38 +107,36 @@ class SequenceConvGradKernel : public framework::OpKernel { if (in_g || filter_g || (padding_trainable && padding_data_g)) { col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. 
- set_zero(context.device_context(), &col, static_cast(0)); - math::matmul(context.device_context(), *out_g, false, *filter, - true, T(1.0), &col, T(1.0)); + set_zero(dev_ctx, &col, static_cast(0)); + math::matmul(dev_ctx, *out_g, false, *filter, true, + T(1.0), &col, T(1.0)); } - math::ContextProjectFunctor seq_project_functor; - math::ContextProjectGradFunctor seq_project_grad_functor; + math::ContextProjectFunctor seq_project_functor; + math::ContextProjectGradFunctor seq_project_grad_functor; if (in_g) { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - set_zero(context.device_context(), in_g, static_cast(0)); + set_zero(dev_ctx, in_g, static_cast(0)); - seq_project_grad_functor(context.device_context(), *in_g, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, false, true, - padding_data_g, &col); + seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start, + context_length, context_stride, up_pad, down_pad, + false, true, padding_data_g, &col); } if (padding_trainable && padding_data_g) { padding_data_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), padding_data_g, static_cast(0)); + set_zero(dev_ctx, padding_data_g, static_cast(0)); LoDTensor* input = const_cast(in); - seq_project_grad_functor(context.device_context(), *input, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, true, false, - padding_data_g, &col); + seq_project_grad_functor( + dev_ctx, *input, padding_trainable, context_start, context_length, + context_stride, up_pad, down_pad, true, false, padding_data_g, &col); } if (filter_g) { filter_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_g, static_cast(0)); + set_zero(dev_ctx, filter_g, static_cast(0)); Tensor filter_grad = *filter_g; LoDTensor out_grad = *out_g; @@ -145,12 +146,12 @@ class SequenceConvGradKernel : public framework::OpKernel { padding_data = 
context.Input("PaddingData"); } - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, true, out_grad, - false, T(1.0), &filter_grad, T(1.0)); + math::matmul(dev_ctx, col, true, out_grad, false, + T(1.0), &filter_grad, T(1.0)); } } }; diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index bfda8649cd..3526e45a1b 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -123,7 +123,8 @@ namespace ops = paddle::operators; REGISTER_OP(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, sequence_pool_grad, ops::SequencePoolGradOp); REGISTER_OP_CPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); + sequence_pool, + ops::SequencePoolKernel); REGISTER_OP_CPU_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu index 66850772d5..fcd6508435 100644 --- a/paddle/operators/sequence_pool_op.cu +++ b/paddle/operators/sequence_pool_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/sequence_pool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h index 7f136d8cf0..7519aa1d72 100644 --- a/paddle/operators/sequence_pool_op.h +++ b/paddle/operators/sequence_pool_op.h @@ -30,7 +30,7 @@ template using EigenMatrix = 
framework::EigenMatrix; -template +template class SequencePoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,17 +54,18 @@ class SequencePoolKernel : public framework::OpKernel { auto lod_level_0 = lod[0]; out->mutable_data(context.GetPlace()); - + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolFunctor max_pool; + math::MaxSeqPoolFunctor max_pool; auto* index = context.Output("MaxIndex"); index->Resize({dims}); index->mutable_data(context.GetPlace()); - max_pool(context.device_context(), *in, out, index); + max_pool(dev_ctx, *in, out, index); return; } - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { Tensor in_t = in->Slice(static_cast(lod_level_0[i]), static_cast(lod_level_0[i + 1])); @@ -91,7 +92,7 @@ class SequencePoolKernel : public framework::OpKernel { } }; -template +template class SequencePoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -105,20 +106,23 @@ class SequencePoolGradKernel : public framework::OpKernel { int64_t w = in->numel() / dims[0]; in_g->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolGradFunctor max_pool_grad; + math::MaxSeqPoolGradFunctor max_pool_grad; auto* index = context.Input("MaxIndex"); - max_pool_grad(context.device_context(), *out_g, *index, in_g); + max_pool_grad(dev_ctx, *out_g, *index, in_g); return; } if (pooltype == "LAST" || pooltype == "FIRST") { // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; - functor(context.device_context(), in_g, 0); + math::SetConstant functor; + functor(dev_ctx, in_g, 0); } - auto place = context.GetEigenDevice(); + auto& place = + *context.template 
device_context().eigen_device(); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { auto in_g_t = in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc index 255683a572..481db8f9e5 100644 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -125,7 +125,7 @@ REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker, sequence_slice_grad, ops::SequenceSliceGradOp); REGISTER_OP_CPU_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); + ops::SequenceSliceOpKernel); REGISTER_OP_CPU_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu index a9f59dadba..43a21d619f 100755 --- a/paddle/operators/sequence_slice_op.cu +++ b/paddle/operators/sequence_slice_op.cu @@ -15,9 +15,9 @@ limitations under the License. 
*/ #include "paddle/operators/sequence_slice_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h index 428ef556da..14bcaebbb4 100644 --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -39,7 +39,7 @@ inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, return out_lod; } -template +template class SequenceSliceOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -108,7 +108,7 @@ class SequenceSliceOpKernel : public framework::OpKernel { } }; -template +template class SequenceSliceGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -143,8 +143,9 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); x_grad->set_lod(in->lod()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), x_grad, + static_cast(0)); auto out_grad_stride = framework::stride(out_grad->dims()); diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc index 32c1502566..37d5452e6b 100644 --- a/paddle/operators/sequence_softmax_op.cc +++ b/paddle/operators/sequence_softmax_op.cc @@ -99,7 +99,7 @@ REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, ops::SequenceSoftmaxGradOp); REGISTER_OP_CPU_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel); + ops::SequenceSoftmaxKernel); REGISTER_OP_CPU_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + 
ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.cu.cc b/paddle/operators/sequence_softmax_op.cu.cc index 7023795a3b..5f65b4daf9 100644 --- a/paddle/operators/sequence_softmax_op.cu.cc +++ b/paddle/operators/sequence_softmax_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/operators/sequence_softmax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel) -REGISTER_OP_GPU_KERNEL( + ops::SequenceSoftmaxKernel) +REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h index 1b68dd0662..e889e88cb3 100644 --- a/paddle/operators/sequence_softmax_op.h +++ b/paddle/operators/sequence_softmax_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceSoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,12 +52,13 @@ class SequenceSoftmaxKernel : public framework::OpKernel { framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); - math::SoftmaxFunctor()(ctx.device_context(), &x_i, &out_i); + math::SoftmaxFunctor()( + ctx.template device_context(), &x_i, &out_i); } } }; -template +template class SequenceSoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -83,8 +84,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel { out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); - math::SoftmaxGradFunctor()(ctx.device_context(), &out_i, - &out_grad_i, &x_grad_i); + math::SoftmaxGradFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, + 
&x_grad_i); } } }; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 5576d7b8be..121bf60b27 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -62,8 +62,8 @@ $$param\_out = param - learning\_rate * grad$$ }; template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseSGDFunctor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output) { @@ -90,13 +90,14 @@ struct SparseSGDFunctor { } }; -template struct SparseSGDFunctor; -template struct SparseSGDFunctor; +template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); -REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CPU_KERNEL( + sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 7b6c5ec306..a3c0db7e50 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -41,8 +41,8 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, } // namespace template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseSGDFunctor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output) { @@ -62,21 +62,19 @@ struct SparseSGDFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<(context) - .stream()>>>(in_data, in_rows.data(), - learning_rate.data(), out_data, - in_row_numel); + SparseSGDFunctorKernel<<>>( + in_data, in_rows.data(), learning_rate.data(), out_data, + in_row_numel); } }; -template struct SparseSGDFunctor; 
-template struct SparseSGDFunctor; +template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CUDA_KERNEL( + sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index 78b595fc6c..c920025a91 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -20,15 +20,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output); }; -template +template class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -46,7 +46,8 @@ class SGDOpKernel : public framework::OpKernel { auto g = framework::EigenVector::Flatten(*grad); auto o = framework::EigenVector::Flatten(*param_out); auto lr = framework::EigenVector::Flatten(*learning_rate); - auto place = ctx.GetEigenDevice(); + auto& place = + *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); o.device(place) = p - lr.broadcast(grad_dsize) * g; @@ -56,8 +57,9 @@ class SGDOpKernel : public framework::OpKernel { // It's better to find a more elegant solution. 
PADDLE_ENFORCE_EQ(param, param_out); auto* grad = ctx.Input("Grad"); - SparseSGDFunctor functor; - functor(ctx.device_context(), *grad, *learning_rate, param_out); + SparseSGDFunctor functor; + functor(ctx.template device_context(), *grad, + *learning_rate, param_out); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); } diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc index 782f4c7936..b8a1bf122a 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -142,7 +142,7 @@ REGISTER_OP(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu index 32a39956a1..1b569c93ed 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -16,9 +16,9 @@ #include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::GPUPlace, float>); -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::GPUPlace, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>); 
+REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h index 2a9d9bbc77..8fe7c5ba82 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template +template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -32,7 +32,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto labels = framework::EigenVector::Flatten(*Labels); auto out = framework::EigenVector::Flatten(*Out); - auto place = context.GetEigenDevice(); + auto &place = *context.device_context().eigen_device(); // term1 = max(x, 0) auto term1 = x.cwiseMax(static_cast(0)); @@ -46,7 +46,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { }; // dX = sigmoid(X) - labels -template +template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -62,7 +62,8 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { auto labels = framework::EigenVector::Flatten(*Labels); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); dx.device(place) = dout * (sigmoid_x - labels); diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc index 
08bf2e4e7c..d5a7ccb77e 100644 --- a/paddle/operators/sign_op.cc +++ b/paddle/operators/sign_op.cc @@ -67,5 +67,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker); -REGISTER_OP_CPU_KERNEL(sign, - ops::SignKernel); +REGISTER_OP_CPU_KERNEL( + sign, ops::SignKernel); diff --git a/paddle/operators/sign_op.cu b/paddle/operators/sign_op.cu index 4d0638cb97..9bc1c65d21 100644 --- a/paddle/operators/sign_op.cu +++ b/paddle/operators/sign_op.cu @@ -14,5 +14,6 @@ #include "paddle/operators/sign_op.h" -REGISTER_OP_GPU_KERNEL( - sign, paddle::operators::SignKernel); +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel); diff --git a/paddle/operators/sign_op.h b/paddle/operators/sign_op.h index ab5cd4bac0..2e476ed665 100644 --- a/paddle/operators/sign_op.h +++ b/paddle/operators/sign_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class SignKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -29,7 +29,8 @@ class SignKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); eigen_out.device(place) = eigen_in.sign(); } }; diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index 50543fcc14..56e8d9058f 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -138,7 +138,8 @@ REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker, smooth_l1_loss_grad, ops::SmoothL1LossGradOp); REGISTER_OP_CPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); + smooth_l1_loss, + ops::SmoothL1LossKernel); REGISTER_OP_CPU_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git 
a/paddle/operators/smooth_l1_loss_op.cu b/paddle/operators/smooth_l1_loss_op.cu index 1c3172f438..8e94ebac64 100644 --- a/paddle/operators/smooth_l1_loss_op.cu +++ b/paddle/operators/smooth_l1_loss_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h index 39d0070b6c..1a70c9c63c 100644 --- a/paddle/operators/smooth_l1_loss_op.h +++ b/paddle/operators/smooth_l1_loss_op.h @@ -44,7 +44,7 @@ struct SmoothL1LossForward { T sigma2; }; -template +template class SmoothL1LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -57,7 +57,8 @@ class SmoothL1LossKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto sigma = static_cast(context.Attr("sigma")); T sigma2 = sigma * sigma; @@ -67,12 +68,12 @@ class SmoothL1LossKernel : public framework::OpKernel { auto y = EigenVector::Flatten(*in1); auto diff = EigenVector::Flatten(*out0); - diff.device(place) = x - y; + diff.device(*place) = x - y; // multiply inside weight if (has_weight) { auto inside_weight = EigenVector::Flatten(*in2); // cache diff, reused in bp - diff.device(place) = diff * inside_weight; + diff.device(*place) = diff * inside_weight; } auto in_counts = in0->numel(); @@ -81,12 +82,12 @@ class SmoothL1LossKernel : public framework::OpKernel { context.GetPlace()); auto errors = EigenVector::Flatten(ptensor_errors); // apply smooth l1 forward - 
errors.device(place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); + errors.device(*place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); // multiply outside weight if (has_weight) { auto outside_weight = EigenVector::Flatten(*in3); - errors.device(place) = errors * outside_weight; + errors.device(*place) = errors * outside_weight; } auto loss = EigenVector::Flatten(*out1); // first dimension of 'X' is the number of samples @@ -94,7 +95,7 @@ class SmoothL1LossKernel : public framework::OpKernel { framework::make_ddim({static_cast(in0->dims()[0]), static_cast(in_counts / in0->dims()[0])}); auto errors_mat_view = EigenMatrix::From(ptensor_errors, mat_dims); - loss.device(place) = errors_mat_view.sum(Eigen::array({{1}})); + loss.device(*place) = errors_mat_view.sum(Eigen::array({{1}})); } }; @@ -114,7 +115,7 @@ struct SmoothL1LossBackward { T sigma2; }; -template +template class SmoothL1LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -126,7 +127,8 @@ class SmoothL1LossGradKernel : public framework::OpKernel { T sigma2 = sigma * sigma; bool has_weight = (in0 != nullptr) && (in1 != nullptr); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto in_dims = in2->dims(); auto counts = in2->numel(); @@ -139,7 +141,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel { context.GetPlace()); auto diff = EigenVector::Flatten(ptensor_diff); // apply smooth l1 backwoard - diff.device(place) = EigenVector::Flatten(*in2).unaryExpr( + diff.device(*place) = EigenVector::Flatten(*in2).unaryExpr( SmoothL1LossBackward(sigma2)); // compute weights @@ -147,11 +149,11 @@ class SmoothL1LossGradKernel : public framework::OpKernel { ptensor_weights.mutable_data(mat_dims, context.GetPlace()); auto weights = EigenMatrix::From(ptensor_weights); // initialize to 1.0 - weights.device(place) = weights.constant(static_cast(1.0)); + 
weights.device(*place) = weights.constant(static_cast(1.0)); if (has_weight) { auto inside_weight = EigenMatrix::From(*in0, mat_dims); auto outside_weight = EigenMatrix::From(*in1, mat_dims); - weights.device(place) = inside_weight * outside_weight; + weights.device(*place) = inside_weight * outside_weight; } // compute gradients @@ -167,13 +169,13 @@ class SmoothL1LossGradKernel : public framework::OpKernel { if (out0) { out0->mutable_data(context.GetPlace()); auto x_grad = EigenMatrix::From(*out0, mat_dims); - x_grad.device(place) = gradients; + x_grad.device(*place) = gradients; } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenMatrix::From(*out1, mat_dims); - y_grad.device(place) = -1 * gradients; + y_grad.device(*place) = -1 * gradients; } } }; diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 93e0525bad..0988c83d43 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -89,7 +89,8 @@ namespace ops = paddle::operators; REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, ops::SoftmaxOpGrad); -REGISTER_OP_CPU_KERNEL(softmax, - ops::SoftmaxKernel); REGISTER_OP_CPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); + softmax, ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.cu.cc b/paddle/operators/softmax_op.cu.cc index 013ace19ae..7b9882cbcf 100644 --- a/paddle/operators/softmax_op.cu.cc +++ b/paddle/operators/softmax_op.cu.cc @@ -16,7 +16,8 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax, - ops::SoftmaxKernel); -REGISTER_OP_GPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); +REGISTER_OP_CUDA_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 44d1e63f1b..0f8998b99e 100644 --- a/paddle/operators/softmax_op.h +++ 
b/paddle/operators/softmax_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class SoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -31,11 +31,12 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. Y->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), X, Y); + math::SoftmaxFunctor()( + context.template device_context(), X, Y); } }; -template +template class SoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,7 +47,8 @@ class SoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. dX->mutable_data(context.GetPlace()); - math::SoftmaxGradFunctor()(context.device_context(), Y, dY, dX); + math::SoftmaxGradFunctor()( + context.template device_context(), Y, dY, dX); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index b1faddac3f..6100c63f9a 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -69,10 +69,10 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, + math::SoftmaxFunctor()( + context.cuda_device_context(), logits, softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, context.Attr("soft_label")); } }; @@ -98,18 +98,18 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { if (context.Attr("soft_label")) { const T* label_data = labels->data(); - SoftCrossEntropyGradientKernel<<< - grid, block, 0, 
reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + SoftCrossEntropyGradientKernel< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } else { const int64_t* label_data = labels->data(); - CrossEntropyGrad<<< - grid, block, 0, reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + CrossEntropyGrad< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } } }; @@ -118,9 +118,9 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel); -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index c4ab3f74b4..9c3431605b 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -40,11 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, - context.Attr("soft_label")); + auto& dev_ctx = + context.template 
device_context(); + math::SoftmaxFunctor()(dev_ctx, logits, + softmax); + math::CrossEntropyFunctor()( + dev_ctx, loss, softmax, labels, context.Attr("soft_label")); } }; @@ -62,14 +63,15 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const int class_num = logit_grad->dims()[1]; auto out_grad_mat = EigenMatrix::From(*out_grad); auto logit_grad_mat = EigenMatrix::From(*logit_grad); - + auto& place = *context.template device_context() + .eigen_device(); if (context.Attr("soft_label")) { auto lbl_mat = EigenMatrix::From(*labels); - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = out_grad_mat.broadcast(Eigen::DSizes(1, class_num)) * (logit_grad_mat - lbl_mat); } else { - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = logit_grad_mat * out_grad_mat.broadcast(Eigen::DSizes(1, class_num)); diff --git a/paddle/operators/split_op.cu.cc b/paddle/operators/split_op.cu.cc index 93d1fc3c44..dbad0bbf68 100644 --- a/paddle/operators/split_op.cu.cc +++ b/paddle/operators/split_op.cu.cc @@ -14,5 +14,5 @@ limitations under the License. */ #include "paddle/operators/split_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(split, - ops::SplitOpKernel); +REGISTER_OP_CUDA_KERNEL( + split, ops::SplitOpKernel); diff --git a/paddle/operators/split_op.h b/paddle/operators/split_op.h index fa26e5f677..a38c435d53 100644 --- a/paddle/operators/split_op.h +++ b/paddle/operators/split_op.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +template class SplitOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc index bec2a2c18a..50bc6da196 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -115,7 +115,7 @@ REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, ops::SquaredL2DistanceGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_CPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/operators/squared_l2_distance_op.cu index 3fe62f1a9c..ecc82ed1e4 100644 --- a/paddle/operators/squared_l2_distance_op.cu +++ b/paddle/operators/squared_l2_distance_op.cu @@ -17,9 +17,9 @@ #include "paddle/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_GPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h index 259ef40296..5bd5f4819a 100644 --- a/paddle/operators/squared_l2_distance_op.h +++ b/paddle/operators/squared_l2_distance_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SquaredL2DistanceKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& context) const override { @@ -51,7 +51,8 @@ class SquaredL2DistanceKernel : public framework::OpKernel { auto sub_result = EigenMatrix::From(*out0); auto z = EigenVector::Flatten(*out1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x_dims = x.dimensions(); auto y_dims = y.dimensions(); // buffer the substraction result @@ -67,7 +68,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel { } }; -template +template class SquaredL2DistanceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -89,7 +90,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { sub_result; // propagate back to input - auto eigen_place = context.GetEigenDevice(); + auto& eigen_place = + *context.template device_context().eigen_device(); if (x_g) { x_g->mutable_data(context.GetPlace()); // eigen matrix diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/operators/squared_l2_norm_op.cc index 3c10e6159f..3cff61a02f 100644 --- a/paddle/operators/squared_l2_norm_op.cc +++ b/paddle/operators/squared_l2_norm_op.cc @@ -72,7 +72,7 @@ REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker, squared_l2_norm_grad, ops::SquaredL2NormGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); + ops::SquaredL2NormKernel); REGISTER_OP_CPU_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.cu b/paddle/operators/squared_l2_norm_op.cu index d384e9c28c..2d6567d090 100644 --- a/paddle/operators/squared_l2_norm_op.cu +++ b/paddle/operators/squared_l2_norm_op.cu @@ -16,9 +16,9 @@ #include "paddle/operators/squared_l2_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); -REGISTER_OP_GPU_KERNEL( 
+ ops::SquaredL2NormKernel); +REGISTER_OP_CUDA_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h index 48d7b1c2d5..0ced7e7d70 100644 --- a/paddle/operators/squared_l2_norm_op.h +++ b/paddle/operators/squared_l2_norm_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(square(X)) -template +template class SquaredL2NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class SquaredL2NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); - out.device(place) = x.square().sum(); + out.device(*place) = x.square().sum(); } }; // dX = X -template +template class SquaredL2NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -53,10 +54,11 @@ class SquaredL2NormGradKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); Eigen::DSizes x_dsize(X->numel()); - dx.device(place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); + dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); } }; diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 744b2fe3f2..cd52672f78 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -195,7 +195,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, ops::SumOpVarTypeInference); 
-REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CPU_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu index 5c30dd4d47..873155076c 100644 --- a/paddle/operators/sum_op.cu +++ b/paddle/operators/sum_op.cu @@ -13,7 +13,8 @@ limitations under the License. */ #include "paddle/operators/sum_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CUDA_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index ed6c80ce60..eaa36aa1ae 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -43,12 +43,14 @@ class SumKernel : public framework::OpKernel { auto result = EigenVector::Flatten(*out); if (!in_place) { - math::SetConstant constant_functor; - constant_functor(context.device_context(), out, 0.0); + math::SetConstant constant_functor; + constant_functor(context.template device_context(), out, + 0.0); } - math::SelectedRowsAddToTensor functor; - auto place = context.GetEigenDevice(); + math::SelectedRowsAddToTensor functor; + auto &place = + *context.template device_context().eigen_device(); // If in_place, just skip the first tensor for (int i = in_place ? 
1 : 0; i < N; i++) { if (in_vars[i]->IsType()) { @@ -60,7 +62,7 @@ class SumKernel : public framework::OpKernel { result.device(place) = result + in; } else if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); - functor(context.device_context(), in_t, out); + functor(context.template device_context(), in_t, out); } else { PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); } @@ -82,14 +84,14 @@ class SumKernel : public framework::OpKernel { out_value->Resize(framework::make_ddim(in_dim_vec)); out_value->mutable_data(context.GetPlace()); - math::SelectedRowsAddTo functor; + math::SelectedRowsAddTo functor; int64_t offset = 0; for (int i = 0; i < N; i++) { PADDLE_ENFORCE_EQ(out->height(), in_vars[i]->Get().height()); - functor(context.device_context(), in_vars[i]->Get(), - offset, out); + functor(context.template device_context(), + in_vars[i]->Get(), offset, out); offset += in_vars[i]->Get().value().numel(); } } else if (out_var->IsType()) { @@ -112,7 +114,8 @@ class SumKernel : public framework::OpKernel { PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); auto in = EigenVector::Flatten(in_array[i]); auto result = EigenVector::Flatten(out_array[i]); - result.device(context.GetEigenDevice()) = result + in; + result.device(*context.template device_context() + .eigen_device()) = result + in; } } } diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index 7851c71bbe..453bd07267 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -317,4 +317,4 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h index bc8563717a..e9cd9bbd4d 100644 --- a/paddle/operators/top_k_op.h +++ b/paddle/operators/top_k_op.h @@ -27,7 +27,7 @@ template using EigenMatrix 
= framework::EigenMatrix; -template +template class TopkKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc index 94de3d5069..de5ff561ad 100644 --- a/paddle/operators/transpose_op.cc +++ b/paddle/operators/transpose_op.cc @@ -112,8 +112,8 @@ class TransposeOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad, ops::TransposeOpGrad); -REGISTER_OP_CPU_KERNEL(transpose, - ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose, ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel); diff --git a/paddle/operators/transpose_op.cu.cc b/paddle/operators/transpose_op.cu.cc index af3f581462..7d23f1493e 100644 --- a/paddle/operators/transpose_op.cu.cc +++ b/paddle/operators/transpose_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/transpose_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(transpose, - ops::TransposeKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + transpose, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel); diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h index e296032f41..d995271a6b 100644 --- a/paddle/operators/transpose_op.h +++ b/paddle/operators/transpose_op.h @@ -20,33 +20,33 @@ namespace paddle { namespace operators { -template -inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, +template +inline void TransCompute(const int dim, const DeviceContext& dev_ctx, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { switch (dim) { case 1: - math::Transpose trans1; + math::Transpose trans1; trans1(dev_ctx, in, out, axis); break; case 2: - 
math::Transpose trans2; + math::Transpose trans2; trans2(dev_ctx, in, out, axis); break; case 3: - math::Transpose trans3; + math::Transpose trans3; trans3(dev_ctx, in, out, axis); break; case 4: - math::Transpose trans4; + math::Transpose trans4; trans4(dev_ctx, in, out, axis); break; case 5: - math::Transpose trans5; + math::Transpose trans5; trans5(dev_ctx, in, out, axis); break; case 6: - math::Transpose trans6; + math::Transpose trans6; trans6(dev_ctx, in, out, axis); break; default: @@ -54,7 +54,7 @@ inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, } } -template +template class TransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -64,12 +64,12 @@ class TransposeKernel : public framework::OpKernel { std::vector axis = context.Attr>("axis"); int ndims = axis.size(); - auto& dev_ctx = context.device_context(); - TransCompute(ndims, dev_ctx, *x, out, axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *x, out, axis); } }; -template +template class TransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -88,8 +88,9 @@ class TransposeGradKernel : public framework::OpKernel { } int ndims = axis.size(); - auto& dev_ctx = context.device_context(); - TransCompute(ndims, dev_ctx, *out_grad, x_grad, reversed_axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *out_grad, x_grad, + reversed_axis); } }; diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index fff1dc7ccd..2a49ee471f 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( 
static_cast(ctx.Attr("dtype")), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu index 8b20bb8287..cfe9d293cf 100644 --- a/paddle/operators/uniform_random_op.cu +++ b/paddle/operators/uniform_random_op.cu @@ -63,6 +63,6 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); +REGISTER_OP_CUDA_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 89c48e071c..49df2a530c 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -135,9 +135,10 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 18aafb7dc7..9b002e35c4 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -15,9 +15,10 @@ limitations under the License. 
*/ #include "paddle/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_GPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 243eb7e532..ee18b118c9 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -32,15 +32,16 @@ class UnpoolKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); T* output_data = out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); if (output_data) { - math::SetConstant set_zero; - set_zero(context.device_context(), out, static_cast(0)); + math::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); } - math::Unpool2dMaxFunctor unpool2d_max_forward; - unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(dev_ctx, *in_x, *in_y, out); } }; -template +template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,15 +57,14 @@ class UnpoolGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); - auto& device_ctx = context.device_context(); - math::SetConstant zero; + auto& device_ctx = context.template device_context(); + math::SetConstant 
zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); } - math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, - *out_grad, in_x_grad); + math::Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); } }; } // namespace operators diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index ae4f0bf896..2c7f964216 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -15,12 +15,6 @@ limitations under the License. */ namespace paddle { namespace platform { -template <> -Eigen::DefaultDevice* DeviceContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return reinterpret_cast(this)->eigen_device(); -} - CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -37,12 +31,6 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } #ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice* -DeviceContext::GetEigenDevice() const { - return reinterpret_cast(this)->eigen_device(); -} - class EigenCudaStreamDevice : public Eigen::StreamInterface { public: EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index ef5f19214d..596d9d0bba 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -27,24 +27,11 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -template -struct EigenDeviceConverter; - -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::DefaultDevice; -}; - class DeviceContext { public: virtual ~DeviceContext() {} virtual Place GetPlace() const = 0; - template ::EigenDeviceType> - DeviceType* GetEigenDevice() const; - virtual void Wait() const {} }; @@ -62,10 +49,6 @@ class CPUDeviceContext : public DeviceContext { }; #ifdef PADDLE_WITH_CUDA -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::GpuDevice; -}; class EigenCudaStreamDevice; diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 8bf5174c4a..4893cd92f6 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -22,9 +22,8 @@ TEST(Device, Init) { int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); - Eigen::GpuDevice* gpu_device = - device_context->template GetEigenDevice(); + CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h index bb9d59ec0a..148ebaed3d 100644 --- a/paddle/platform/transform.h +++ b/paddle/platform/transform.h @@ -31,7 +31,7 @@ namespace paddle { namespace platform { // Transform on host or device. It provides the same API in std library. 
-template +template struct Transform { template void operator()(const DeviceContext& context, InputIter first, InputIter last, @@ -45,16 +45,16 @@ struct Transform { }; template <> -struct Transform { +struct Transform { template - void operator()(const DeviceContext& context, InputIter first, InputIter last, - OutputIter result, UnaryOperation op) { + void operator()(const platform::CPUDeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { std::transform(first, last, result, op); } template - void operator()(const DeviceContext& context, InputIter1 first1, + void operator()(const platform::CPUDeviceContext& context, InputIter1 first1, InputIter1 last1, InputIter2 first2, OutputIter result, BinaryOperation op) { std::transform(first1, last1, first2, result, op); @@ -63,27 +63,25 @@ struct Transform { #ifdef __NVCC__ template <> -struct Transform { +struct Transform { template - void operator()(const DeviceContext& context, InputIter first, InputIter last, - OutputIter result, UnaryOperation op) { + void operator()(const platform::CUDADeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); - auto& ctx = reinterpret_cast(context); - thrust::transform(thrust::cuda::par.on(ctx.stream()), + thrust::transform(thrust::cuda::par.on(context.stream()), details::DevPtrCast(first), details::DevPtrCast(last), details::DevPtrCast(result), op); } template - void operator()(const DeviceContext& context, InputIter1 first1, + void operator()(const platform::CUDADeviceContext& context, InputIter1 first1, InputIter1 last1, InputIter2 first2, OutputIter result, BinaryOperation op) { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); - auto& ctx = reinterpret_cast(context); - thrust::transform(thrust::cuda::par.on(ctx.stream()), + 
thrust::transform(thrust::cuda::par.on(context.stream()), details::DevPtrCast(first1), details::DevPtrCast(last1), details::DevPtrCast(first2), details::DevPtrCast(result), op); diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu index c76cab80e4..d36eac8379 100644 --- a/paddle/platform/transform_test.cu +++ b/paddle/platform/transform_test.cu @@ -39,7 +39,7 @@ TEST(Transform, CPUUnary) { using namespace paddle::platform; CPUDeviceContext ctx; float buf[4] = {0.1, 0.2, 0.3, 0.4}; - Transform trans; + Transform trans; trans(ctx, buf, buf + 4, buf, Scale(10)); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); @@ -54,7 +54,7 @@ TEST(Transform, GPUUnary) { float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); @@ -68,7 +68,7 @@ TEST(Transform, CPUBinary) { using namespace paddle::platform; using namespace paddle::memory; int buf[4] = {1, 2, 3, 4}; - Transform trans; + Transform trans; CPUDeviceContext ctx; trans(ctx, buf, buf + 4, buf, buf, Multiply()); for (int i = 0; i < 4; ++i) { @@ -84,7 +84,7 @@ TEST(Transform, GPUBinary) { CUDADeviceContext ctx(gpu0); int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); ctx.Wait(); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf)); From de8c4627776b2b9a60dbcc64c48e858c80a2715f Mon Sep 17 00:00:00 2001 From: QI JUN Date: Tue, 12 Dec 2017 14:42:33 +0800 Subject: [PATCH 267/275] Update new op docs (#6505) * update docs about how to add a new operator --- doc/howto/dev/new_op_cn.md | 98 ++++++++++++++++---------------------- doc/howto/dev/new_op_en.md | 93 
++++++++++++++++-------------------- 2 files changed, 80 insertions(+), 111 deletions(-) diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index 6cfc9536f2..44dbeecbbd 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -30,8 +30,8 @@ -------------- | :---------------------- OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake Op定义 | `.cc`文件 -Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,GPU 实现在`.cu`文件中。 -注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中 +Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 +注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** @@ -153,7 +153,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, `MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: -- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 +- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 - `typename T` : 表示数据类型,如`float`, `double`等。 @@ -165,7 +165,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 下面是 `MulKernel` `Compute`的实现: ```cpp - template + template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -173,18 +173,16 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, auto* Y = context.Input("Y"); auto* Z = context.Output("Out"); Z->mutable_data(context.GetPlace()); - auto* device_context = - const_cast(context.device_context_); - 
math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; - ``` -需要注意:**不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** +需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** -`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 +`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 -为了使`OpKernel`的计算过程书写更加简单,并且CPU、GPU的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 +为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 @@ -197,9 +195,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, ```cpp namespace ops = paddle::operators; REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel); ``` 在上面的代码中: @@ -209,17 +207,17 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。 -- 在 `.cu`文件中注册GPU Kernel。 - - 请注意,如果GPU Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: +- 在 `.cu`文件中注册CUDA Kernel。 + - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 
`.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: ```cpp // if use Eigen unsupported module before include head files - // #define EIGEN_USE_GPU + #define EIGEN_USE_GPU namespace ops = paddle::operators; - REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); ``` ### 5. 编译 @@ -236,71 +234,55 @@ make mul_op ## 实现单元测试 -单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 +单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 -### 前向Operator单元测试 -前向Op单元测试继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator,需要: +Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要: 1. 在`setUp`函数定义输入、输出,以及相关的属性参数。 2. 生成随机的输入数据。 3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。 +4. 
反向计算已经自动集成进测试框架,直接调用相应接口即可。 ```python import unittest import numpy as np - from gradient_checker import GradientChecker, create_op - from op_test_util import OpTestMeta + from op_test import OpTest - class TestMulOp(unittest.TestCase): - __metaclass__ = OpTestMeta + class TestMulOp(OpTest): def setUp(self): - self.type = "mul" + self.op_type = "mul" self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - ``` -上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: - -- `self.type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 -- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 -- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 - - -### 反向Operator单元测试 + def test_check_output(self): + self.check_output() -反向Op单元测试继承自`GradientChecker`,而`GradientChecker`继承自`unittest.TestCase`,因此,**反向单元测试函数需要以`test_`开头**。 + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) -```python -class TestMulGradOp(GradientChecker): - def setUp(self): - self.op = create_op("mul") - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - def test_check_grad_normal(self): - # mul op will enlarge the relative error - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + ``` - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) -``` +上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: -下面解释代码中一些关键的地方: +- `self.op_type = "mul" ` : 
定义类型,与operator注册时注册的类型一致。 +- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 +- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 -- 调用`create_op("mul")`创建反向Op对应的前向Op。 +而反向测试中: - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。 @@ -328,5 +310,5 @@ ctest -R test_mul_op - 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,这将会导致编译出错。 - 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`等,这将会导致单元测试出错。 -- 如果Op没有实现GPU Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 +- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 - 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md index 1e88e1f5b4..510233306c 100644 --- a/doc/howto/dev/new_op_en.md +++ b/doc/howto/dev/new_op_en.md @@ -28,8 +28,8 @@ An operator can be differentiated by whether in has kernel methods. An operator -------------- | :---------------------- OpProtoMake definition | `.cc`files, Backward Op does not need an OpProtoMake interface. Op definition | `.cc` files -Kernel implementation | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files. -Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation. +Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files. +Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. 
New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. ** @@ -151,7 +151,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w `MulKernel` inherits `framework::OpKernel`, which includes the following templates: -- `typename Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). - `typename T` denotes data type, such as `float` or `double`. 
@@ -163,7 +163,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w `MulKernel`'s implementation of `Compute` is as follows: ```cpp - template + template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -171,16 +171,15 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w auto* Y = context.Input("Y"); auto* Z = context.Output("Out"); Z->mutable_data(context.GetPlace()); - auto* device_context = - const_cast(context.device_context_); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; ``` -Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** +Note that **different devices (CPU, CUDA)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** -`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. 
To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md). @@ -196,9 +195,9 @@ The definition of its corresponding backward operator, if applicable, is similar ```cpp namespace ops = paddle::operators; REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel); ``` In that code block, @@ -208,17 +207,17 @@ The definition of its corresponding backward operator, if applicable, is similar - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`. -- Registering GPU Kernel in `.cu` files - - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as +- Registering CUDA Kernel in `.cu` files + - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as ```cpp // if use Eigen unsupported module before include head files #define EIGEN_USE_GPU namespace ops = paddle::operators; - REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); ``` ### 5. Compilation @@ -253,62 +252,50 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass 2. Generating random input data. -3. Implementing the same computation logic in a Python script: +3. Implementing the same computation logic in a Python script. + +4. Call check gradient function to check the backward operator. 
```python import unittest import numpy as np - from gradient_checker import GradientChecker, create_op - from op_test_util import OpTestMeta + from op_test import OpTest - class TestMulOp(unittest.TestCase): - __metaclass__ = OpTestMeta + class TestMulOp(OpTest): def setUp(self): - self.type = "mul" + self.op_type = "mul" self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - ``` -Get its output, and compare it with the forward operator's own output. - -The code above first loads required packages. In addition, we have - -- `self.type = "mul" ` defines the type that is identical to what the operator's registered type. -- `self.inputs` defines input, with type `numpy.array` and initializes it. -- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. -### Testing Backward Operators + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) -A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**. 
+ def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) -```python -class TestMulGradOp(GradientChecker): - def setUp(self): - self.op = create_op("mul") - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - def test_check_grad_normal(self): - # mul op will enlarge the relative error - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + ``` +Get its output, and compare it with the forward operator's own output. - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) +The code above first loads required packages. In addition, we have - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) -``` +- `self.op_type = "mul" ` defines the type that is identical to what the operator's registered type. +- `self.inputs` defines input, with type `numpy.array` and initializes it. +- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. -Some key points in the code above include: +Some key points in checking gradient above include: -- `create_op("mul")` creates the backward operator's corresponding forward operator. - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested. - The second variable `"Out"` points to the network's final output target `Out`. @@ -338,5 +325,5 @@ ctest -R test_mul_op - Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file. 
- The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures. -- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. +- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. From d918ccded35d972eba42b07767972bc4c2dd0921 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 12 Dec 2017 15:22:52 +0800 Subject: [PATCH 268/275] Add fill_op (#6477) * Add fill_op * Fix bug --- paddle/operators/fill_op.cc | 111 +++++++++++++++++++ python/paddle/v2/fluid/tests/test_fill_op.py | 24 ++++ 2 files changed, 135 insertions(+) create mode 100644 paddle/operators/fill_op.cc create mode 100644 python/paddle/v2/fluid/tests/test_fill_op.py diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc new file mode 100644 index 0000000000..382e161c5d --- /dev/null +++ b/paddle/operators/fill_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/framework/data_type.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +struct FillOpVisitor { + FillOpVisitor(framework::LoDTensor *tensor, const std::vector &value) + : tensor_(tensor), value_(value) {} + + template + void operator()() const { + platform::CPUPlace cpu; + auto *data = tensor_->mutable_data(cpu); + std::transform(value_.data(), value_.data() + tensor_->numel(), data, + [](float dat) { return static_cast(dat); }); + } + + framework::LoDTensor *tensor_; + const std::vector &value_; +}; + +class FillOp : public framework::OperatorBase { + public: + FillOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + auto &out = + detail::Ref(detail::Ref(scope.FindVar(Output("Out")), + "Cannot find variable %s", Output("Out")) + .GetMutable()); + out.Resize(framework::make_ddim(Attr>("shape"))); + auto dtype = static_cast(Attr("dtype")); + platform::CPUPlace cpu; + auto force_cpu = Attr("force_cpu"); + out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(), + framework::ToTypeIndex(dtype)); + + framework::LoDTensor tensor; + + if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) { + tensor.ShareDataWith(out); + } else { + // Always make tensor in CPU memory. 
+ tensor.Resize(out.dims()); + tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + } + + framework::VisitDataType( + dtype, FillOpVisitor(&tensor, Attr>("value"))); + + if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) { + // Copy tensor to out + framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out); + } + } +}; + +class FillOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC(Fill operator + +Fill an tensor with `value` and `shape`. The type of the tensor is specify by +`dtype`. +)DOC"); + AddOutput("Out", "(LoDTensor) The output tensor."); + AddAttr>( + "value", "The float values of tensor, which are flatten in row major"); + AddAttr>("shape", "The shape of output tensor"); + AddAttr("dtype", "The data type of output tensor, Default is float") + .SetDefault(framework::DataType::FP32); + AddAttr("force_cpu", + "Whether the output tensor must be at CPU memory or not. 
" + "Default is false.") + .SetDefault(false); + } +}; + +class FillOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim( + "Out", + framework::make_ddim(context->Attrs().Get>("shape"))); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker); diff --git a/python/paddle/v2/fluid/tests/test_fill_op.py b/python/paddle/v2/fluid/tests/test_fill_op.py new file mode 100644 index 0000000000..88337598c8 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_fill_op.py @@ -0,0 +1,24 @@ +import unittest +import numpy as np +from op_test import OpTest +import paddle.v2.fluid.core as core + + +class TestFillOp(OpTest): + def setUp(self): + self.op_type = "fill" + val = np.random.random(size=[100, 200]) + self.inputs = {} + self.attrs = { + 'value': val.flatten().tolist(), + 'shape': [100, 200], + 'dtype': int(core.DataType.FP64) + } + self.outputs = {'Out': val.astype('float64')} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() From 8f7d0b18145c8310fb36ad017aa0f1d4bce98c65 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 12 Dec 2017 15:54:38 +0800 Subject: [PATCH 269/275] add param_attr for nets (#6509) --- python/paddle/v2/fluid/layers.py | 6 ++++-- python/paddle/v2/fluid/nets.py | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index fd8a2ed18c..1f45487902 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -1732,8 +1732,10 @@ def conv2d_transpose(input, h_in = input.shape[2] w_in = input.shape[3] - filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0] - filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1] + filter_size_h = 
output_size[0] - \ + (h_in - 1) * stride[0] + 2 * padding[0] + filter_size_w = output_size[1] - \ + (w_in - 1) * stride[1] + 2 * padding[1] filter_size = [filter_size_h, filter_size_w] elif isinstance(filter_size, int): filter_size = [filter_size, filter_size] diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py index 05728ad75a..7ef524318e 100644 --- a/python/paddle/v2/fluid/nets.py +++ b/python/paddle/v2/fluid/nets.py @@ -9,6 +9,7 @@ def simple_img_conv_pool(input, pool_size, pool_stride, act, + param_attr=None, pool_type='max', main_program=None, startup_program=None): @@ -16,6 +17,7 @@ def simple_img_conv_pool(input, input=input, num_filters=num_filters, filter_size=filter_size, + param_attr=param_attr, act=act, main_program=main_program, startup_program=startup_program) @@ -36,6 +38,7 @@ def img_conv_group(input, conv_padding=1, conv_filter_size=3, conv_act=None, + param_attr=None, conv_with_batchnorm=False, conv_batchnorm_drop_rate=None, pool_stride=1, @@ -57,6 +60,7 @@ def img_conv_group(input, conv_padding = __extend_list__(conv_padding) conv_filter_size = __extend_list__(conv_filter_size) + param_attr = __extend_list__(param_attr) conv_with_batchnorm = __extend_list__(conv_with_batchnorm) conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate) @@ -70,6 +74,7 @@ def img_conv_group(input, num_filters=conv_num_filter[i], filter_size=conv_filter_size[i], padding=conv_padding[i], + param_attr=param_attr[i], act=local_conv_act, main_program=main_program, startup_program=startup_program) @@ -101,6 +106,7 @@ def img_conv_group(input, def sequence_conv_pool(input, num_filters, filter_size, + param_attr=None, act="sigmoid", pool_type="max", main_program=None, @@ -109,6 +115,7 @@ def sequence_conv_pool(input, input=input, num_filters=num_filters, filter_size=filter_size, + param_attr=param_attr, act=act, main_program=main_program, startup_program=startup_program) From 3ef8ec37bb80427ab8abe23384c42a9e9056c87d Mon Sep 17 00:00:00 
2001 From: tensor-tang Date: Wed, 13 Dec 2017 10:35:57 +0800 Subject: [PATCH 270/275] fix the new_op doc (#6540) * fix the ending symbol * fix invalid content link --- doc/howto/dev/new_op_cn.md | 36 ++++++++++++++++++++---------------- doc/howto/dev/new_op_en.md | 19 ++++++++++--------- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index 44dbeecbbd..757a5840bc 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -1,17 +1,18 @@ # 如何写新的Operator - [概念简介](#概念简介) - - [实现C++类](#实现C++类) - - [定义ProtoMaker类](#定义ProtoMaker类) - - [定义Operator类](#定义Operator类) - - [定义OpKernel类](#定义OpKernel类) - - [注册Operator](#注册Operator) + - [实现C++类](#实现c类) + - [定义ProtoMaker类](#定义protomaker类) + - [定义Operator类](#定义operator类) + - [定义OpKernel类](#定义opkernel类) + - [注册Operator](#注册operator) - [编译](#编译) - - [绑定Python](#绑定Python) + - [绑定Python](#绑定python) - [实现单元测试](#实现单元测试) - - [前向Operator单测](#前向Operator单测) - - [反向Operator单测](#反向Operator单测) + - [前向Operator单测](#前向operator单测) + - [反向Operator单测](#反向operator单测) - [编译和执行](#编译和执行) + - [注意事项](#注意事项) ## 概念简介 @@ -43,7 +44,7 @@ Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU ## 实现C++类 -### 1. 定义ProtoMaker类 +### 定义ProtoMaker类 矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。 @@ -100,7 +101,7 @@ The equation is: Out = scale*X - `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 -### 2. 定义Operator类 +### 定义Operator类 下面的点实现了MulOp的定义: @@ -149,7 +150,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中 -### 3. 
定义OpKernel类 +### 定义OpKernel类 `MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: @@ -177,6 +178,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; + ``` 需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** @@ -188,7 +190,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。 -### 4. 注册Operator +### 注册Operator - 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 @@ -220,7 +222,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, ops::MulGradKernel); ``` -### 5. 编译 +### 编译 运行下面命令可以进行编译: @@ -236,6 +238,7 @@ make mul_op 单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 +### 前向Operator单测 Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要: @@ -273,8 +276,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp def test_check_grad_ingore_y(self): self.check_grad( ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - - ``` + ``` 上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: @@ -282,6 +284,8 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp - `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 - `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 +### 反向operator单测 + 而反向测试中: - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 @@ -290,7 +294,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp - `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。 -### 编译和执行单元测试 +### 编译和执行 `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。 diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md index 510233306c..fe86936bc1 100644 --- a/doc/howto/dev/new_op_en.md +++ b/doc/howto/dev/new_op_en.md @@ -1,8 +1,8 @@ # 
How to write a new operator - [Background](#background) - - [Implementing C++ Types](#implementing-c++-types) - - [Defining ProtoMaker](#defining-protoMaker) + - [Implementing C++ Types](#implementing-c-types) + - [Defining ProtoMaker](#defining-protomaker) - [Defining Operator](#defining-operator) - [Registering Operator](#registering-operator) - [Compilation](#compilation) @@ -41,7 +41,7 @@ Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePadd ## Implementing C++ Types -### 1. Defining Class ProtoMaker +### Defining ProtoMaker Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output. @@ -98,7 +98,7 @@ There are two changes in this example: - `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0. -### 2. Defining Operator +### Defining Operator The following code defines the interface for MulOp: @@ -147,7 +147,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later. -### 3. Defining OpKernel +### Defining OpKernel `MulKernel` inherits `framework::OpKernel`, which includes the following templates: @@ -188,7 +188,7 @@ This concludes the forward implementation of an operator. Next its operation and The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**. -### 4. Registering Operator +### Registering Operator - In `.cc` files, register forward and backward operator classes and the CPU kernel. @@ -220,7 +220,7 @@ The definition of its corresponding backward operator, if applicable, is similar ops::MulGradKernel); ``` -### 5. Compilation +### Compilation Run the following commands to compile. 
@@ -284,8 +284,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass def test_check_grad_ingore_y(self): self.check_grad( ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - - ``` + ``` Get its output, and compare it with the forward operator's own output. The code above first loads required packages. In addition, we have @@ -294,6 +293,8 @@ The code above first loads required packages. In addition, we have - `self.inputs` defines input, with type `numpy.array` and initializes it. - `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. +### Testing Backward Operators + Some key points in checking gradient above include: - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. From 8ad36cdb5d2c3a79c82c36cd7c2e79bc2d4cc4bf Mon Sep 17 00:00:00 2001 From: "Yang Yang(Tony)" Date: Wed, 13 Dec 2017 10:41:31 +0800 Subject: [PATCH 271/275] PaddlePaddle Fluid Source Overview (#6485) * first commit * Update read_source.md --- doc/howto/read_source.md | 67 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 doc/howto/read_source.md diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md new file mode 100644 index 0000000000..383acb0c82 --- /dev/null +++ b/doc/howto/read_source.md @@ -0,0 +1,67 @@ +# PaddlePaddle Fluid Source Code Overview + +Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book + +Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework + +Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators + +Optimizer: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer + +Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory + +# Compile Time + +The following **defines** the NN. 
The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). + +```python +x = fluid.layers.data(name='x', shape=[13], dtype='float32') +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +y_predict = fluid.layers.fc(input=x, size=1, act=None) +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) +``` + +- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93) +- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py) + - Every Layer has one or more operators and variables/parameters + - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files: + - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h) + - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) + - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h) +- Optimizer: `fluid.optimizer.SGD`. It does the following + - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)] + - Add optimizer operators. 
[[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)] + +# Run Time + +The following **evaluates** the NN. Instantiates all the variables, operators. + +```python +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +# Allocate memory. Initialize Parameter. +exe.run(fluid.default_startup_program()) + +# Allocate memory. Do computation. +exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) +``` + +- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h) + - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h) +- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)] + - Feeds the data: `feed=feeder.feed(data)` + - Evaluates all the operators + - Fetches the result: `fetch_list=[avg_cost]` +- Other worth looking files: + - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live + - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live + - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). 
Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory) From 697facc92f58e6b65c76db2f1b6efc282b3c57a0 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 13 Dec 2017 10:43:20 +0800 Subject: [PATCH 272/275] "add registry interface" (#6449) * "add registry interface" * "move function to registry" * "rename with meaningful name" * "add exposed layers" * "fixed based on comments" * "remove unsed comments" --- python/paddle/v2/fluid/layers.py | 185 ++--------------- python/paddle/v2/fluid/registry.py | 186 ++++++++++++++++++ python/paddle/v2/fluid/tests/test_registry.py | 22 +++ 3 files changed, 221 insertions(+), 172 deletions(-) create mode 100644 python/paddle/v2/fluid/registry.py create mode 100644 python/paddle/v2/fluid/tests/test_registry.py diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 1f45487902..9f5a219b20 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -1,12 +1,12 @@ -import core +import contextlib + import proto.framework_pb2 as framework_pb2 +import core from framework import OpProtoHolder, Variable, Program, Operator from initializer import Constant, Normal, Xavier, Initializer from paddle.v2.fluid.layer_helper import LayerHelper, unique_name -import re -import cStringIO +from registry import register_layer from param_attr import ParamAttr -import contextlib __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', @@ -14,6 +14,15 @@ __all__ = [ 'batch_norm', 'accuracy', 'split_lod_tensor', 'While' ] +_REGISTER_LAYER_FROM_OPS = [ + 'mean', 'mul', 'elementwise_add', 'elementwise_div', 'dropout', 'reshape', + 'sigmoid', 'scale', 'transpose', 'sigmoid_cross_entropy_with_logits' +] + +for _OP in set(_REGISTER_LAYER_FROM_OPS): + globals()[_OP] = register_layer(_OP) + __all__.append(_OP) + def fc(input, size, @@ -309,174 +318,6 @@ def create_tensor(dtype, name=None, main_program=None, 
startup_program=None): return helper.create_variable(name=helper.name, dtype=dtype) -def _convert_(name): - """ - Formatting. - - Args: - name: The name/alias - - This function takes in a name and converts it to a standard format of - group1_group2. Where as per the regular expression, group1 can have - alphabets and numbers and group2 has capital alphabets. - - """ - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() - - -def _generate_doc_string_(op_proto): - """ - Generate docstring by OpProto - - Args: - op_proto (framework_pb2.OpProto): a protobuf message typed OpProto - - Returns: - str: the document string - """ - - def _type_to_str_(tp): - return framework_pb2.AttrType.Name(tp) - - if not isinstance(op_proto, framework_pb2.OpProto): - raise TypeError("OpProto should be `framework_pb2.OpProto`") - - buf = cStringIO.StringIO() - buf.write(op_proto.comment) - buf.write('\nArgs:\n') - for each_input in op_proto.inputs: - line_begin = ' {0}: '.format(_convert_(each_input.name)) - buf.write(line_begin) - buf.write(each_input.comment) - buf.write('\n') - buf.write(' ' * len(line_begin)) - buf.write('Duplicable: ') - buf.write(str(each_input.duplicable)) - buf.write(' Optional: ') - buf.write(str(each_input.dispensable)) - buf.write('\n') - - for each_attr in op_proto.attrs: - buf.write(' ') - buf.write(each_attr.name) - buf.write(' (') - buf.write(_type_to_str_(each_attr.type)) - buf.write('): ') - buf.write(each_attr.comment) - buf.write('\n') - - if len(op_proto.outputs) != 0: - buf.write('\nReturns:\n') - buf.write(' ') - for each_opt in op_proto.outputs: - if not each_opt.intermediate: - break - buf.write(each_opt.comment) - - return buf.getvalue() - - -def _create_op_func_(op_type): - """ - Create an Operator for a Function. - - Args: - op_type: The name of the operator to be created - - This function takes in the operator type (sigmoid, mean , average etc) and - creates the operator functionality. 
- - """ - op_proto = OpProtoHolder.instance().get_op_proto(op_type) - not_intermediate_outputs = \ - filter(lambda output: not output.intermediate, op_proto.outputs) - intermediate_outputs = \ - filter(lambda output: output.intermediate, op_proto.outputs) - - if len(not_intermediate_outputs) != 1: - raise ValueError("Only one non intermediate output operator can be", - "automatically generated") - - if not_intermediate_outputs[0].duplicable: - raise ValueError( - "Only non duplicable op can be automatically generated") - - for output in intermediate_outputs: - if output.duplicable: - raise ValueError("The op can be automatically generated only when ", - "all intermediate ops are not duplicable") - - o_name = not_intermediate_outputs[0].name - intermediate_output_names = [output.name for output in intermediate_outputs] - - def infer_and_check_dtype(op_proto, **kwargs): - """ - This function performs the sanity check for dtype and - instance type. - """ - dtype = None - for ipt in op_proto.inputs: - name = _convert_(ipt.name) - val = kwargs.pop(name, []) - if not isinstance(val, list) and not isinstance(val, tuple): - val = [val] - for each in val: - if not isinstance(each, Variable): - raise ValueError("input of {0} must be variable".format( - op_type)) - - if dtype is None: - dtype = each.dtype - elif dtype != each.dtype: - raise ValueError( - "operator {0} must input same dtype. 
{1} vs {2}".format( - op_type, dtype, each.dtype)) - - return dtype - - def func(**kwargs): - helper = LayerHelper(op_type, **kwargs) - - dtype = infer_and_check_dtype(op_proto, **kwargs) - - inputs = dict() - for ipt in op_proto.inputs: - name = _convert_(ipt.name) - val = kwargs.pop(name, []) - if not isinstance(val, list) and not isinstance(val, tuple): - val = [val] - inputs[ipt.name] = val - - outputs = dict() - out = helper.create_tmp_variable(dtype=dtype) - outputs[o_name] = [out] - for name in intermediate_output_names: - outputs[name] = [helper.create_tmp_variable(dtype=dtype)] - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) - return helper.append_activation(out) - - func.__name__ = op_type - globals()[op_type] = func - func.__doc__ = _generate_doc_string_(op_proto) - global __all__ - __all__.append(op_type) - - -_create_op_func_('mean') -_create_op_func_('mul') -_create_op_func_('elementwise_add') -_create_op_func_('elementwise_div') -_create_op_func_('dropout') -_create_op_func_('reshape') -_create_op_func_('sigmoid') -_create_op_func_('scale') -_create_op_func_('reshape') -_create_op_func_('transpose') -_create_op_func_('sigmoid_cross_entropy_with_logits') - - def cast(x, dtype, main_program=None): """ This function takes in the input with input_dtype diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py new file mode 100644 index 0000000000..6f5dd365de --- /dev/null +++ b/python/paddle/v2/fluid/registry.py @@ -0,0 +1,186 @@ +import re +import cStringIO +import warnings +import functools +import inspect + +import proto.framework_pb2 as framework_pb2 +from framework import OpProtoHolder, Variable, Program, Operator +from paddle.v2.fluid.layer_helper import LayerHelper, unique_name + +__all__ = ['deprecated', 'register_layer'] + + +def _convert_(name): + """ + Formatting. 
+ + Args: + name: The name/alias + + This function takes in a name and converts it to a standard format of + group1_group2. Where as per the regular expression, group1 can have + alphabets and numbers and group2 has capital alphabets. + + """ + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + +def _generate_doc_string_(op_proto): + """ + Generate docstring by OpProto + + Args: + op_proto (framework_pb2.OpProto): a protobuf message typed OpProto + + Returns: + str: the document string + """ + + def _type_to_str_(tp): + return framework_pb2.AttrType.Name(tp) + + if not isinstance(op_proto, framework_pb2.OpProto): + raise TypeError("OpProto should be `framework_pb2.OpProto`") + + buf = cStringIO.StringIO() + buf.write(op_proto.comment) + buf.write('\nArgs:\n') + for each_input in op_proto.inputs: + line_begin = ' {0}: '.format(_convert_(each_input.name)) + buf.write(line_begin) + buf.write(each_input.comment) + buf.write('\n') + buf.write(' ' * len(line_begin)) + buf.write('Duplicable: ') + buf.write(str(each_input.duplicable)) + buf.write(' Optional: ') + buf.write(str(each_input.dispensable)) + buf.write('\n') + + for each_attr in op_proto.attrs: + buf.write(' ') + buf.write(each_attr.name) + buf.write(' (') + buf.write(_type_to_str_(each_attr.type)) + buf.write('): ') + buf.write(each_attr.comment) + buf.write('\n') + + if len(op_proto.outputs) != 0: + buf.write('\nReturns:\n') + buf.write(' ') + for each_opt in op_proto.outputs: + if not each_opt.intermediate: + break + buf.write(each_opt.comment) + + return buf.getvalue() + + +def register_layer(op_type): + """ + Register an Python layer for an Operator + + Args: + op_type: The name of the operator to be created + + This function takes in the operator type (sigmoid, mean , average etc) and + creates the operator functionality. 
+ + """ + op_proto = OpProtoHolder.instance().get_op_proto(op_type) + not_intermediate_outputs = \ + filter(lambda output: not output.intermediate, op_proto.outputs) + intermediate_outputs = \ + filter(lambda output: output.intermediate, op_proto.outputs) + + if len(not_intermediate_outputs) != 1: + raise ValueError("Only one non intermediate output operator can be", + "automatically generated") + + if not_intermediate_outputs[0].duplicable: + raise ValueError( + "Only non duplicable op can be automatically generated") + + for output in intermediate_outputs: + if output.duplicable: + raise ValueError("The op can be automatically generated only when ", + "all intermediate ops are not duplicable") + + o_name = not_intermediate_outputs[0].name + intermediate_output_names = [output.name for output in intermediate_outputs] + + def infer_and_check_dtype(op_proto, **kwargs): + """ + This function performs the sanity check for dtype and + instance type. + """ + dtype = None + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + for each in val: + if not isinstance(each, Variable): + raise ValueError("input of {0} must be variable".format( + op_type)) + + if dtype is None: + dtype = each.dtype + elif dtype != each.dtype: + raise ValueError( + "operator {0} must input same dtype. 
{1} vs {2}".format( + op_type, dtype, each.dtype)) + + return dtype + + def func(**kwargs): + helper = LayerHelper(op_type, **kwargs) + + dtype = infer_and_check_dtype(op_proto, **kwargs) + + inputs = dict() + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + inputs[ipt.name] = val + + outputs = dict() + out = helper.create_tmp_variable(dtype=dtype) + outputs[o_name] = [out] + for name in intermediate_output_names: + outputs[name] = [helper.create_tmp_variable(dtype=dtype)] + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) + return helper.append_activation(out) + + func.__name__ = op_type + func.__doc__ = _generate_doc_string_(op_proto) + return func + + +def deprecated(func_or_class): + """ + Deprecated warning decorator. It will result a warning message. + Should be used before class or function, member function + """ + + @functools.wraps(func) + def func_wrapper(*args, **kwargs): + """ + Wrap func with deprecated warning + """ + warnings.simplefilter('always', DeprecationWarning) #turn off filter + warnings.warn( + "Call to deprecated function {}.".format(func.__name__), + category=DeprecationWarning, + stacklevel=2) + warnings.simplefilter('default', DeprecationWarning) #reset filter + return func(*args, **kwargs) + + return func_wrapper diff --git a/python/paddle/v2/fluid/tests/test_registry.py b/python/paddle/v2/fluid/tests/test_registry.py new file mode 100644 index 0000000000..f8328f31cf --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_registry.py @@ -0,0 +1,22 @@ +import unittest +import warnings + +import paddle.v2.fluid as fluid +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.registry as registry + + +class TestRegistry(unittest.TestCase): + def test_registry_layer(self): + self.layer_type = "mean" + program = framework.Program() + + x = 
fluid.layers.data(name='X', shape=[10, 10], dtype='float32') + output = layers.mean(x) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + X = np.random.random((10, 10)).astype("float32") + mean_out = exe.run(program, feed={"X": X}, fetch_list=[output]) + self.assertAlmostEqual(np.mean(X), mean_out) From 1ba8f7fe71fc8df40669a24fc5af88d0904240f1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 13 Dec 2017 14:18:19 +0800 Subject: [PATCH 273/275] The comments in reshape_op is wrong (#6565) --- paddle/operators/reshape_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index 39bf2118d6..7fd33bf662 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -84,9 +84,9 @@ Given a 2-D tensor X with 2 rows and 2 columns [[1, 2], [3, 4]] and target shape = [1, 4], the reshape operator will transform -the tensor X into a 1-D tensor: +the tensor X into a 2-D tensor: - [1, 2, 3, 4] + [[1, 2, 3, 4]] )DOC"); } From d069f2ca0a92dd07eb06c9a02528c41b0702f131 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 13 Dec 2017 14:37:05 +0800 Subject: [PATCH 274/275] Make fluid.layers.fc support multiple param_attr (#6532) Fix #6531 --- python/paddle/v2/fluid/param_attr.py | 2 ++ python/paddle/v2/fluid/tests/test_layers.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py index 86088fdd7c..7952a5ea51 100644 --- a/python/paddle/v2/fluid/param_attr.py +++ b/python/paddle/v2/fluid/param_attr.py @@ -36,6 +36,8 @@ class ParamAttr(object): def to_attr(arg): if arg is None: return ParamAttr() + elif isinstance(arg, list) or isinstance(arg, tuple): + return [ParamAttr.to_attr(a) for a in arg] elif isinstance(arg, ParamAttr): return arg elif isinstance(arg, str) or isinstance(arg, unicode): diff --git a/python/paddle/v2/fluid/tests/test_layers.py 
b/python/paddle/v2/fluid/tests/test_layers.py index 57f6a362de..9b88080158 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -29,7 +29,10 @@ class TestBook(unittest.TestCase): label = layers.data(name='label', shape=[1], dtype='int32') hidden1 = layers.fc(input=images, size=128, act='relu') hidden2 = layers.fc(input=hidden1, size=64, act='relu') - predict = layers.fc(input=hidden2, size=10, act='softmax') + predict = layers.fc(input=[hidden2, hidden1], + size=10, + act='softmax', + param_attr=["sftmax.w1", "sftmax.w2"]) cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(x=cost) self.assertIsNotNone(avg_cost) From 0a8addf802e2198084b9cc66e49ca4ae2fdd8125 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 13 Dec 2017 15:24:53 +0800 Subject: [PATCH 275/275] Make cast op support bool (#6562) Also add `elemwise_sub/mul/abs/clip` layers --- paddle/operators/cast_op.cc | 3 ++- paddle/operators/cast_op.cu | 3 ++- python/paddle/v2/fluid/layers.py | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc index 42bff69a1e..d641b8fc9f 100644 --- a/paddle/operators/cast_op.cc +++ b/paddle/operators/cast_op.cc @@ -74,4 +74,5 @@ REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, - ops::CastOpKernel); + ops::CastOpKernel, + ops::CastOpKernel); diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu index 4681deaa62..91e6fb391c 100644 --- a/paddle/operators/cast_op.cu +++ b/paddle/operators/cast_op.cu @@ -19,4 +19,5 @@ using CastOpKernel = paddle::operators::CastOpKernel; REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, - CastOpKernel, CastOpKernel); + CastOpKernel, CastOpKernel, + CastOpKernel); diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 
9f5a219b20..f67d6d08c7 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -15,8 +15,9 @@ __all__ = [ ] _REGISTER_LAYER_FROM_OPS = [ - 'mean', 'mul', 'elementwise_add', 'elementwise_div', 'dropout', 'reshape', - 'sigmoid', 'scale', 'transpose', 'sigmoid_cross_entropy_with_logits' + 'mean', 'mul', 'dropout', 'reshape', 'sigmoid', 'scale', 'transpose', + 'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div', + 'elementwise_sub', 'elementwise_mul', 'clip', 'abs' ] for _OP in set(_REGISTER_LAYER_FROM_OPS):

|JvuKVG&L2LEIn3UOwIcl1T5!U5}J?Y&k@LyV}ntGxN$ z<2!ic$a?2%3Sv*CZ|j_}sb=c`O{|AS9&ZvEdU!AHOvekTmFs5{%kQ`FY#2$6Daqii zrd{tS5NDVDon|QjcQ5>H6B0LbbXiNje1mJ~Ap~<$a5dEW71OHBu>Wd%`yfY`;ZHC5jGACKRflnkum z^RJu0D9f$B+-T!ZwX)7-3iJ=o_qzy>J6f#Y(s}EC-P^aAo_tDC&4L>`8yZ?m)|eg zcqKectWcgbMk-$Q&w&B;`lQ3jhT#@E{Qc2GM7dPn4WBIvXeN zw0F4Yg6k7l9=7)pHU=aEXSDPFI`~?zz z)-(TW60~jko6*JfDqPY|Bn2)$JZrVthgz(TmFt{?Q5-IQR%?J7Nv*j3+y!Yseg+7T zJ{n#BWPk8^@wgfy`I7QE_Fn7OPVD4w*X8lF`yn+XRQVf97s*jPFk+1xJtvoaR^*(v zl@$9OHfs;>L@x7i&wS@&%}|vHUTs4ra@A-1>=>nhS{A`v5(@L?Z`q%(RGj+x$dAd@ zuon|i)Vf%Sl?{D9LYeHWQH1+McAuFNOpSqgGI_srP};b8d(PZ`hys}xWmnd{K5&Fk zRQ6iMk@sd_oJ=&Kq-r%$zzZ`}0AH*%Q)2H`{8G~~b5@7)2!(^ulr`xrgy=O8n~Fa> z!ro&D)%2?n`kI!0_Lj1=6Qm=Clc1uGTo!a0a=oa=l8}eC&h@Of1_62pV~%G^U*WMQ zYi-eE@H=7f0Fwj1%DZ)a@D$z<|EGuR3-2#_=K~{bz#%bS_tc>`PMXq9GMrnXD!;BY zJV_odb=M zPh;`qlaWmyogDm(EN+}eg*dG~Do`t5ro%SWH0%%1Kf5`qYN~jVF^@zG)kUm#Iu{j0 z6eMA^n*nVX7T&r>QA{l5=WXkd7|FR8-R_nCS_^hQXpj{N#ePOC}bFguFQTHEvSm>~Rlv$LV$4l8W_i zrtt_8zaZ1P{B%ItgY&M+ZYJ{v3~O_}Kyf~8N^L4`Wef9Y8(ARKr3N;s5K-^FVGPMV zE6wXhNNus6essgFkKJSqH27f5DgBJZpLEzrLI(}YY#Us!PwuHE`gxO3Tc z`T9g8d-~>cW9+&rx#g=Ig(-Hka*|H^M$3c12QxEOG(T7%g=4dg4V*&Z7<$sHoHH7Z zxER)|a)(ah)!KQtB)|FBYasiaf5v&Pm-r*)EOB3q*25W(>HG(njxevD^2tZO&u{Kh$myKq&zNItfNhjtOw|`e3X7z?-RPsKx@HR2`{7AV z%v-6b&en301>uSFk)~5AW^SA$n|0r5-_Sk%k(OJ=Y8DTg)iIY5>BZs&A{z=Ni`b@@ zln0iv9w!)ElbF!;=JH48^z58;$;(=Y6hF3me!+a`f=LkfsS?EkMrPmTq!pPkbZPvA zk6$}OwY(m{w&R<%a#2)hK05jLzHc)Qi1{( zustxbGRMnKz1tAKl!}g*#T12EIYA-P18d;p26;Fg((F~bW4^WX!6FOg^;1R%Y=nXm z2bO>2kPLIvisWS>($en{dRi)SvwV(bnM`mZw|99jOhRXfqK2ED_uRHl51|&j94v%U z*DQ2Yg_E?ZeEeap?LQ8`vAzaDlJ9~UvSm+%XmD2E8$nj64O>eNm-x9<#Ju7B3O3pY zV?=zu*RZwhF|XNby{Bb1j*tSQGnrn^ar`b&&whRdN?Dek&@{O*$;(dGz%9b8HPpF+ z2dsCFEv)!Wz2KA080>x5k*~yw;yE`&JiX5b2Wt&s=pIXw7B}j2E|981XMVhAWMz6g z45ZV0D>hdCUd%sRrk@@xB8?;Dx9akVvmw_UW=mv~I%h)WCVDC&73VuD3w zFIHKBU)j>z%0lO#n9rJRU=ub4O;5jQNBt`KdvR$%b1^whb}dTn7c9EW(Qt=Do*9oj z%$CHF4IKraS(xGmxb?&dOZDvz-d-mNP{*bGYZZD~5JzvI8NO4M%=0<2RK@QwTM5FF 
ziw%VI;1NSRBX<{`=K16*!tHt=-qYGgAq_M@|0pgzklMZQD<3_kR9~uAh)_mbGJ%_t zhh@k}^BFyN(FcMKn^$I+c>%S<7kdnV3xw10Lx!lUxO`7ik&IpctM7*4oUd|@qJ+lE zg=`V=YFWj*M&!D1pyVm$T!uYyG1r7PKCXmKtSFzh?qr;fpEl=|#q$#%cDOmB6kKPQ zvpa+E7le>)|3v|)By!gSTOj$>Ll%hGoTFgvNj7<#a#vQbo54jB2jHZuGUt4f8h3hs z(D7c2kMwFvtAZTK^LtwaRXc$wn`Qfi&k+UM#e3{uxqoK?D>x(aSy0478;QBys9y;fav=n5_H96%kKYI^3 zkeoaTH78tyBos5CNV>|YP+eozGDn~3* zsnO}LVZpSrV0+FdC`Bdka#6!KV*7UaVJsHdNiQj!c+Ue)LU!leW^%g`#s)|lx4HEV z$j<+Zjz49Msfw~-Y_x3P{6t6b?vnW`6u;!;hrAu!G&;Z;GoE_P0@uHOga8Ef4`=x%5}0|tW1 zvv>L~e<{IbsvN2mp}5T*92#tr$op$ThD9CX{eI1vuUHHpeA{sptnRgv#q-l4x-%Xc z7EK1(19H#O(1bK@;Q^~)sBD=P4k?uIR_74;a0Br!y}sR__$i=7T_gPiS?xr=a*#c{nw){_iW`9X~qhVdRj+D=#YQlnH4t;oTmTK|g-Ih48YRTfbb;D?*= zO0CKgQ<%XeE+Y?!$3EQKG@dcp?+aCvb$UKJ|fn%Bv)6>0@3~lTL9_KJn9g{oFJ;a4)UzEjD!yo>O1!*A(J*Rh2-B)a3uY z(SXG=Kc)!$GD9f-7Sh5zs1s>jG5@KUuqZ>V5}gM~Ck&D8-Ymz0iAfV(Uu=>P@U_+Jh_-$EjJaj zRudpv$OU=~q9gnqmj@*!5k(o3i?aB8)BI&a9&~ZLTbrznUu2m+8a{K3>)3Q$#S9b@ z`4jJ3mK$p09(ge6l^jX0=>D$*LD^nA)0nY*XjAaEJ@%3l zImUT^&pU&WifyoeE|!}_@uuN$PItiOeIWFHKF`N38u88TT(d69@XN&K6)ZJhkv*95 zb&yxm$?jYAM-<2>vLXz*lw6dalkTwubdo>oq)*TJm`&evzs1Mqgt8YQcRE|wm4(aw zm?<{ryt$=}!-j13?V4}t_`IZs4Pw%`Gc49pol8{c7Qu$Ul${f+qsZl<)^BOvdd)wJ zbDG|v;6l_mbgfEjd5puD>5bx43bG))tgjx)@P>Ee(`&zrB*cs$Q$GZfl z_BZxfcS;Sf6(=Hfd@!2gRZhktBZ|GJgu{I#rY5c}BhAT;mVQ7o@&86y-2d5R zGI<8-zq?HnU?iy?j(d>9R6IKY=OrX79aEdhFf7K;seZFL3!e{({Bi+E#sXTvWxAkdp}O6ilb`%AAaqh6)btksBr)_@ zkNFYbYk-FK9}a=rN+!V|I{@ygVkM-a)*jHt!bH6GO5xuw6*8*kI<&-*t<0L$4&|Ce zSG=!`Q~NYD=s!?LvDHx5>Y2<1jPgPsdou(08v>>NTE1+XPisWN(#XzQ#X@?N+AJtE z4lRIc^Bkk>zwkp#WT>tGWLmeveksu}*ngLbU2B&c+wFQ5swgQ7Z?uGG0E9W>S%}|s zq;)7SIx#!(Rh}(PxeeJ_X_W-1>qM0L!f@d3_QH|Cbq(5hxTFg@OT=$FZ2@o?hMC6A zT&iq%Ui|~(GGK_s>bZjUZ0KsJ6@!KV@RjLd0Ad1g^uc?mj+ez5b|dZ=KofyGzO>k~r&|E-BX%`K8b|M^zG8gM#X zr7yvKZgXXdg~8E@@xh;$WB?AhGnMG-0;=fNkW&G_ikqs#L;`%eZylu{IHGbAKqAhO zI@;xDtff_Bx#1<}M3SsC_jh~8pDu3qX8@rXfMmKv%cu5P2QX`NXSiqTKySB&&BnF0 z$?I~xW>6lS8I*^|I@h5_qRjxu#1n%I;hD>g$6f-|*Vh9es8UL8jjUfHxK@Ld#Gu^@ 
z(Mls5Vqo6+S57TXOeJFRZ@I@<8Xn)(1W+V*2rmcW9w&iay=fY%(d8e98FRpQJ+I%n zHgRi~!8c&$0B#*U^-5(=s3!gn*cLX8OW1i@%j=#f|Fku zw6RD%c&EcZa^{+GF^m_k!#5(;CgL}NAT_5%oOpY4qOem6c}neuw>*IS5ECf0a8JaA{!xHE)5pe*NY@uzasoAj-R2SKc#L!eW5ELE8Db@0Z7S zr#M&JkCL~JS3iKqRY9CyTsYcx+%^Ra2?`t(vEuG#L}6VyeWNpQ_}>(l)!6#2N!ukV z<8){{XS)MMhQ(j}9IO2LzW!PE((A@;y~ahS&tWc+xd#qC&dsju-#sA(==iis4(!r69!lvpdiQS30OP0N=y9;*hlNlPm zRLR)-057jZiX(#0fde3!9*6OPGm(XbURFJ3BHMLUNgn4xh;<7A63+~yhakr?$-_mI zF@rejQ^3-9^3bH@E|nU;e4jfBPao-&ixpYE2=s^ z;Rh^$W&V}lVz*>M>$HDv4@?tw03dJ=eTOXzoS$oR${h>J$0g~FIH1Qi$1l9PItUB6 z1tKe@R1h=2A$@{g7Bb~1s)GFCG?WzuU;m}h4pq5_6j=q4*xJjhB45L*%lYO>SN*bGciX#zkD%97G zBYjg>w;anm@izq4V^O2Djh&#;=AB|4#BP7bS6=nFt)NC!*cfSg>7BJD$cIw>+Y~gb z6wrdZ#gvnsRn8aHF}`G@tQX(c{MFaeaOQUc0(7cGx|4hsmISk-xSbyg2{z?)ivQ$Q z-w=cgB^~k|>2pB{`vNfMef>;u0#fyLhcv^eifxA6!qZm~kq5)^o{^_aiTnCYJW7m6 zto z;ZD=umYFBIpR-c_P;6M3=1Ju{jM0!i!Q#yt%t#!}ovfV#-TOX78Xd5}b`S1K;Z~e| zZ@eB?eCJvSOrJBwRC^wSMBKw4znyiDv)A*B`#1TbvNiZGsjtKzhHE&r6Gn({_DI#< zu;_a2uS#_iD)KV+&I<9JA($jX7u`*{4u^o!LON1R6~)h@ZmT^az9 zgF}RA1TvqF-AdB9C;a}WLibE*p)cP;r&Qsb?S`p$Yg)sTA&DOOi{U_9CH%}60%B9y zoC-bL0DKq+UqC{flzGVx;;}vDhDO)z>#Pug>lUi68}Sq~HNDHd-Y$`KFLG!t^N-ZW zr6^XJ_GK7rte-M_!(TpUogB#NL^NPY{klVqu0g0Pr}S@>g9$N%*T$ z=WY=nDxbJ}eMr-4I{WiWIxYqQJQg5-v-q~JEM^616%*e5z(xSFBLg6gfoni18^>o? 
zy)Qp;Ddiw?BZTuv?%7e+1!?m=bX>V%lqy{rz;;7nX%rn*JwQw{#rm6ZH2l&EXu74O z&oGki%$AvuQC@H|&zY551EyD7rn!*RgPr_`r#y&^4FcPEOTCj=s$cVKHx6iJmP|&G z3wvP9Mp`$`9GeCFEIWKT?Wn$rzg3#kdpw_NNIsc8@2zvQdDczj@Z@od2bLsfKyFtY ztZ+(&Ez?ORpJlNY zr~}+Uw)QTzq7u7?U}RB;?A z$I%7l4yr;s>rmQ>`n*oF;G(_bvya$O?{)W{5NkqpfzpbMH|$yg{s-;mt~ZfEdD+=wHhFT+JaI1Oqs;*YV#$#*1(%tsl~SH zt7bIZ3aWH-aNwifC3aC>I?R=_!T+B8Mr4;{{oYDY7uDg)BbKG+>ZPie-BT^$RBAcX z$G9mH!R^7FJNQ7)qBj~|Sx877$f*FZ6a!(H(Kz2WX79o=-z`B`&jc&W$GB6P@{RFU z#;IoH6KQXMbduztRK>xpj;eMVWx3PJ`M`J++3&RSMWmgCFWUl)BWiM;wK(XlRH zP$%PbO4P%8b^MsovoP7VeI|<@BZa@h>o0^U(#!suAv3o#(x(m4Lk~^k?merE;-uxI z*KcS9`~I%z_==@j_g9M7KYiV6BEB5gi>z539Hc6w8E@eOUSqyBX(+ieT+O*V=Z9Nw z$1CoSIdi$mUL4GK`p!cB)}TX|P1A z$U+IH($$lj=GlXBxeZCm!&pYmo6XHX$i+D^U@@CSsCiVs!6zFlB!{@N39zO(_RS~^ zOAp=jCyPCc%%Nzc)F|H{(kI0SyNs}v`l+Yz_`dV~7T(C8-A^L^7z4N9@1()WYHr5$ zG#1dYbE~NV#6^7)$uPt*+1~Hfu+R3h=TFeiLNKAh{K7M)bfvB|l!Lg)-ulP_vp+Xb zh<8rsM^nB+BS-;19f!^lts5uQJu_ZUe3pU;7#2w0wRc5SNy&bhTy?TfR$MB@Y>hkJ zHuEVU@_PJQHGP&J#Wn}2?v&__1fViI#cwF}Y-(%Q0}vl_I`!&dht+%gsvVa;n2Mx@ z=|Ls$Ov8zJ=!jaK`v*>a-Cng9X(e(OE0Xsh!s9SUvI4VUxZrDdj?m}Ti+7k`fGxgR zwx(<{r}kiOL-X}SWzSNh)I*H3T7Y#^5S2LY%Ue;ZS8kG;dS585UG~a=&QrSMbif0I zhypXU_7%lsF0LfiH~D^DXWxoAF!Ber*qpSg?yh1`%{Ww!$Rf>V|CyPehq={$p}`uF z+;326e|+o%`Zvi>gW*_xq@>d~^fY|N3oK|Q^a>%J+yJr z-acgFhkg?c(E=2OCklnwP(!Ag89Ig0%SrL`GVJNp@kh6nKU(i8}%TppOXWmZ{1 zFsqo`ZwA6Rv|2w)9%`s^932$%=YBQw8=-Y)S-D(_iXDH|k>E*ra@FqkAv2Am>ROSZ zv4hu27f909)8BmsX9G%|2abp?hJ?Xt0tojv?i(Ar^AQTXk{@W7l|0~1sf44iJ!vlf`oeUfd5|xtd9tmqz2R#GFIL%sfC^08M+ zFWsIx!!C0_Ht0bE6p*qAOMnl#tP9!cPTc`t(pRP4qk(MPt76E@%$vX=@6ii5NVXD( zBAx$*GpsORNAG!DuZ#4#1I6sk($H3?gALG9UOjR4-PP1xz{uR|T3R9;sR4w)nx~WR zO{(oud^73ot^wUcfQ~o>I*4ld%0W-a)xm#3l%Tg}qd|Vi!^j)7o;Wc-P7TBiyL8LydSv1O{o<6i#4Tw;RV|~ zm%sAL6S^7wFW$5vZ`b|Hpm(Q)_3M8?l`RQ?nbix3iyIA+wM*A&jRLM7pkrv*_ESv$ zZHQQn#8Gq(%NL*t_1#)J=%JD4c^9Af`5vog*v1)=?z3^tItTcw{{y-K5F9gH1TZBR z0A+GW=go^FohZ=VG&I>7DSlkrf;Xe%s%K&*)CT2AMuph49${@I 
z$#Gb0A=X@!FXC))^iAu6CpvH97^^)7%t4U4|NI{|<`qo1_lF!sc0``J^10biP*tMXczQkHb1ph6A$g6F# z(ca%H4S2k&Ol7hn=g#xaM~t`$v=&{az3j9tk|4EI_cN1UA)NDcqdkVdfLou3`xW1* zDSmCSNZ06-d5z6W#H%V7*h}x;_1YK6^I+6@T^Q+|o8EK=~i!;&oQr&^bAURnZh5KP?1T*bnU`aRtS(DZ&--4bh%l zAyw<$+(jICrxa&D?E5libIQ0fdPBq2zn#DT?H~I~?6;_jwh~X0Ct-ejI4oG_3+M>7 z@oPMrj8#ON74w*6$W~L}@FnfMMl)Y*xQ8%~#Vz?PJ5Q1?IqKRWHE0*C;tW&S&ygMI_2IcbybB2eERbB-^reOMU0M1SyvI@~jw;u&0@Zs&v-umIK`Ds#W z*V_zP_@`$TOK}TVA!5_znwLS}&%XA1S2RP-=3+8rPwlnHfm5o}wk+ zm!D8`$?xVrwSU&i`i!7xo_ohJ@Okh0&7Q()tF3Wf(5{vtJ0`B8dD%x^Ty0S51o{Db z6tq|P9Kz(-V?FP5CA|KhS_21|x|>!$X@H&%TzR2wC|u&Z(|L*SZ9KO3)0JbJVPt2-~C|;`i2!Gac#tPP_5WmiLD=G#}6|| z>>FR&L@+^zn7Przp>uiK3`XlCFu@Cs5_sk<6>vu-o|+SItt|${sk!RZ?O%2bc7G3m zi(x!!MxWnFLTvd8a7^~kuLM=&T)+o^HA;eO$VoFSUcK!eD>DZ~1@e@)pinGq z41#?DZ9_Zk<4C)v6IJ-#f6jSmELRxfblIz`i?0)D3h zaJMUAsYl&^pGz!M=Yz|?hIw4g1kh-x&?D04J}7 zgU5te!4GjjdDPYd{0r??Kx63Q26;-CGy`=F0YAzBiS&%nssI16|5V5c>d_f?k$n%Zrme$Wb@wjkW2*!1gm+pRn6w+3N) zlvKEB_~y<>d3-=feQt%?=2N7ic8wIfqgpMl?O z0&$jTxH#x-nh%5fc-Ow|G2yk^`rw_>6xn0H(BjBoTW`HQnz~*-hJj6I|23%ZVGs^OAGF|e!|`dX0k+M z>GO%bJpnZ#@Cy($?!4r{-WYF3EVCfT^F_bf*&OScb5ti3#V}^5wZB7$Lx$S{jd+d9 zy_viINl^52736};koQP!!I}&^f|Vh4czKx!dTgHCfAvt8<#y|&t4rR9#Ag*gy2-oP1s$Bbt;Cmm{^nI;L zT%;+dXOo$8Gq1Lwcqjc8%0bmwBg#2-^R#$9*M!`FNt6ItSlJf!XJGEUoj`4NpKWr} zH)#INWKD=|GIc-no_QNx;;N@AmhG{Kgl1KxJuc=zYnPPn1(FO-sA{~Agi$pSIy+XW zGxOM?J1M+!trJ>X=Pxu`l|m5sa&787nfXS?R5V81EX>@*ZEc;QXQsfzFH@>cq=cn%If}eyv|d__E?#+258emy~!Ux8QCcU0+C`z7y`QVg2>l{@7cJ?qx%I~pSb<} z(@^p?))`q}JxJuKv%Cu9EhmL0(A`dTI8WiR;e&lIbP5JOKVg3)@QX*iJEhW9E#{c$ z0bh|y$@#XqNV3No1YKcd^{}ErgTn_|XiYVrNHFNs+UfLKDM69@C##hv@ks#2t(G9o zBZuJibDJQ!bW5MzyofIpOPvxdX&3PgO zs>JUP9TKh|*x59VP-UhB0Z-07x0)30#WoiEq)~eWa9B?pUh-B;Lx0CtOh}X7hRfk% z@_1KC&6=d@bqO~+sB#I-0@2`1)&u4-MoV7QO+##hS`WD>H-cn;RpP!$*M%zp4B zmzurFNcfmVL7vg#Qy9bIkfNOnO+|Q=x)(>XfPJYV4cqjFzh+@YC0FtUTDXQJmz_p2 z5F9brn9uFpP1`}Q>GnWm`xL+==sZ zOC-a4OlVkcj8cg-wZ}s!lwZjf*QoT$KqU68lDJR|+z0Q6;Y_A^hIrprND}y&IRH#X 
z!ONCKS`aeLc`FP-c__}W-8J6NYVA_*yFcDV%sCg}dEwOJgrgx+6yI1x4u#^&IGHeK z*?YMo)42ciJN2vP*-y;l+fDI3dLMD@j=CI!*zgS^ebz~uy;GhdIC<02GKF8+F4Hn_ zYQO|ikmfvnbCozZE(g#H)*8Xp9NLg4Q!Tm&PZ^eoLIPAK`@l78BUMrZ@N=O%kOE%! zT-Swr7It&vhP@z!dZ1>4;A7AOvs{A5|I{O!waz)ouy-0|{h`Bp#q$j47M7V+Y@`<9 z2F>Y*39@C&k3Qf88Ea!u0smUt$c663MleKa6x5s+|411jBFJ!-EIu@Ap81oR*QTyQ zIES9^p~aS?n-Wvh_a2w7^llI1HlO=4Fe8>aR8xdCpf9*dfoVHw7m@=xzH5wena8@j zS=fuq_e%8kRMD`#HTY5a$?CoaH%hgR@~rYBR1Zu)j^ExabJ+W36K(-9w^7kLVGe|v zR$kajFj6KrHR*`MVcz{lLDLp!1)ltHNPdN7AM!^( zvW?^;r6PwOoAFVj%sx%5{)m4j==a046Uayg7m*STZWMR%c>2g11tBn28TE*MwXPDL9RzaDs zPzo*j8E}MAyycWY|3V_ZG+y;D=R;>DZhCDIXP;6sm{UG>q*@e6k_RjB1_eR(v!2{o z-qU44o@+v>K*)wcM1UL>Pj#D&D~}Pln;MLzeE(~;D0{Xjt)b3rS*AyXq7%5s=YKM_ z$(~JWuY;E0&CP$(T|^ZqbX(7Ui?yeWss(x62N{7(CI#4gqsJU(Z&8lkx4II>FILW~ zD_&!6Qc>kvw>I@-NwN1Pk#_*=zc65&{ z3U*peQd>{o^o>?Iu;y{C(E```uZ#4b`~u?rj1p)~wvbQ{OfSq1l9Gw1%bXa7%_>#e z>O64Kg9?;@vm%jx)PI(3fKA=T0L_5a%d4isfHP4QuTZ7>Cfy2v`YsC3 zZJtJ#<6=}L84-ZQIpkLqQdg0$q6$i?SRTI~s~miRcX?pAc&Rp$A?`0px=UG2PRcIdz(=%Wr#H}YOZ@UI!MBNwG*iD7=1>BA5Ho)_ci zl=-{9sSUXByt}l?pnPv%OLJbo(l_PFhURT|x8g$HdVIKvv2ugOS{_A zRF-8=$Gp>@;W5x^dT=Dl8`1A9atY`}MoWkz%NaY_0g6*6aIG!oJto~6spGakGrbtc zXhbzuDdEee87*&idsC_6Rk*10?u?^?tqIfg$Vq*6Lf)I{2HhGWY|=TG^G9YV zeQH)&eparjW5$1;zuJ;H88A7I?CAMXk%rJk6?b+P<11sz3cPE~-y-Pl92|O&HwHA^F54r_1J`3GOk-Q` z_r|;J!fH>xq&DHORMTa;Id&oz$86qbTxYu~(^Q~#q!F|Im-N1Y{k+0qTS*ZVLlpNF zPXkIE;{x)N?!@xrXV15q_JsHJeHG8+eU?3tDRsHt*=@OvH!ByM05SZPSwwz`XJA{X zJx8TJxNHv0y7Dfd_3y9y^V1 z4}He@!7Iu7>p|3cmUM;`Rr4>cb4o#bSU!H=8J4n`#Ps90=mkfTkVe5g6tUI{(%Z;T zys^)~*@tPkBgn$=UuGi@H6&u-{rq>$=GyFA`-)wt2R)UIHrdK$ar%EM7#q>kx;Y7D zsBdgx=gZY8ZfdsRfWz?ElV=5-NB6gAn@jpQg|iuPzVW3SX-R%knq^@oJBceFpU*^< zEUxnpbabq2wLjEb?I#{lPCtjKV8UN!zuCJ7i(paImh`gs)$m!zCm z+V*$8YeLjNzB(2LhIs(0ju|$fL~&_0D~RHsYpTTb*D(1#qU%%RdA6`tZM&BM9gl%S zNVChRa6=}zR;VZY|L3yg37xPpM)y9@5C=nT%>Q>UPsq5G61cOG+^_@~ybeHtm;N{3 zqyXb-fTq`wL*Np#A!DFcP+avFF!TTL-3o|?YzcUr;V=sU%l6v|dZ;yre&a<>)==)?|Bi4e)&zEWx`G+OLWqD9;i`Gu{>D 
z30^s7NMpPatq{C`VTy1nN^Nu6wu6~9xsm$?-zf%qk7Zr_pv#A~4;}O2#RP>hFn3+R z0&fF2m)q(C9AGXw^P+seO7|!=oolChHJq;IMy(D zWksSPe2Ps*DDud~dfM>Q#5u8E{etwzy*C@6$9WwZp8Q|!eR(+4-}`rqqKqxojD1U# zVur|AD_KgBXtOpX>llnRhDbwbwAl)UvNlO!QW`P#Wki-TGWM}#9otx+Gd`cs_xs!a z{r&M=&*SQHX_j;DbD#4*@AtW1_x*aMHP$$D${hS}naq}gSw`>x`0$SAmRZFfHr9ZR zEmeB?=-=TjwH^1~1H!q0r9U5AJhF_cA{n1Nyw?wX^wv(@7m-_!Qtu@@_}t^x+>^qs z{ZNvhiP8{W1i%kQ_?&%cUl`}1RB7tRBbG}YQ|cww(=f=*2H4J>!3p91%}M8LPDBQW zQ8B+kX0TW4LwM7-oj(*{H`qjBwPj*;MqIQx%OJHO=7qMgsEK1gq{V~&6gI!*_#kKA z3Sf*c&Vc0M_Fp6q0^EviWs|iaxw-5>-TmhU0vR$ll~vdb4;tDjPGQV<4J6W3!SAZV z;m?Mbh9@-?sT?SSqp)8Ofmh$^!w9YYv2=ad?Hf~+Z#yR>Ed--(oTS}6QRO!Ii@zo8 z&eWB{l-Dfv$N>#wkQ)3%yZ#l(eC@k?Ix>KBiVc1gOL~1osGVmMTyU#0RY$kMj*Nsi z%e~t9Frny0u*oi}s%LLlu5r}M!NpFT743VlXk!56({AtMw_JBZoynIUo@(5-qwC@s z#jZ<=on;RbXEi+>gc5*1>Xdi#MYtK9`nfyKOY0p`ZOmtgDOZKO+EQ^xQ92?1gk6dq z1mb!8FDgHQ2xiVq1}do`lVV+jmAu_90~$*;lsk1fJKbmR31RDRvS>M~jHj(vh*w$| zmhg&0kvHt+V6c1YQ)jgCk798_g68R@m=j$F?;Z!wut^hC-ldjN*l2|wKVd>aM*MAz z+p#!>IB>++7O&$wjGKkANb;WA3`k3Hqh>+RJ?Uk*)qI%}`0FBJNoZzcX8Rr|Jk@SZqGy~lfp^;I_C zzeh?)>iZLX#3jgU!&A}A#LJ_)Za2`yyI@4_4V3!PEaS7VEHi6}(M*}^h<(Mb2vcol2Q+vt*>qdD z>CW@d4^u^`QdC*$5uG9mvZl0VS9pw`PgP*=k#ZS%tgW|~)GaUDxnjGt#8rXXlj0QQ z9L{P2B!ko{QBPAdfmP)LyIfN4`Fvj!VMwjZh#~uvSBIc)7ODxaL-`JQrp-)27#GF2 zMZ?P)wfe2Wxlg14l~nVnEztKuIAh0Yr$+sVY0A)Q-?FVbS%NqCTFUD&b%sYMEZ=!!m)~T3zt~>ro4S^&xfI}Qsh$HQciL>P-W(B zr(h7uWncnJcqxdB;nYL}#F^@R8w29#+s#4U>3E;>%f0%F{MMo{G`Q00raP+w>-Do1 zuHg26j#1fRazf)ka3hDC8k9;ZIZ~1)=g7h%rLZksxGIlt8S`p z$5oqxNij?@h0K3v%vLgA$xtFt`;hf1Pg9GiMhE0H3ev2geCHbbq${e!t~JJ9m_46N zNMs6F9~vmTH6Km#FU=|Hly%WrL9S@7Xj4Cqrc8WMiQUVr(9F8(N%h5~5*m;&)kdPnvFO*XIBu%iW>76G(1CVk4E` zxZu#;23g-WkxI!1r{$WAlrgN5e|NP~u60GaT+(sBLb)W`ckW3|2k@X>ircN$dw3K+ zTj=3{^fxR0CWoP0FNTCZeem0I@NYhjl>1G@fu9}{2TjZZ zn@mXN6Y2rCzUG+7*N#0PtKbFlO4(#Xnmdo`A`<^4~9z4N5RiJQC!FcouG zkD-#d^>`>`qxb}(MD*O6=v<({uoiZ5as& z(i-DUi&R3R;Z18t6d|w72~6e6_ez)F41Z@^=kGMf-Nr>>?Q+R~nt0+JdE1*fnR|d; z>9vzIL`CBODv9vfclJ?HEZ_n~tArk{^bAgtj6`WLxF}=?nd1t*sp<60%|?_uwL;_m 
zBkcQ*dSl^hc6SY6hhv2CZw`RdjKj*4>7Pc8$@}nQA~KL#?zXFPN-oyg3;;qj015^Q z_y`mEb9_klA)CSk(29X*-}P=KKm;cUK7s-tS!eU5{a^A%>I8i4_p~2Kp|D$8xi)@Y zmyVkbnskf=j5%Ej%byi(-IbuU@h@V!%N6B`K?4t-l3Gi0(A~(9uS;axz)a^KRv27| zAHiy3+AJlDG_%zfx1#GYirYJey?P{o`o5{3q^3FE5?NFaObi$FD zm4Ui07&}Ri`=}|JrJB1S9=QRss-K|(%&(Qe3&5=PvnST_BJ6g%vTK9YiX>N!k>Y8D z>kD}B@Xru)oJ>w6gc61$K(|@Y$s;8r!;ziZau9|uuUL#cWhUz#xaQamfb|er{2bo2 zDHFTUe1qt{?eHOkW*!RST7t0f*xO0;@sBY%>EDFCJ<<1F}KHGxrSW!}N2$Lu;5OA>zRgOExJMgE8E_@rKK z31$dq0vlmyY4kk58vftx$%8jd+jmp02e|$|>FNrjN+gLq4;8VbSziW?Kq`YQQk{Og zx|IdIvSfZ>;QK}EZur}Xo8kK}+We|yXa0idSOTw`FG}6?hV!lx{?qWdXoHJ;C>(UI zNIi4T4_AW{_p7Qx7@n4fx2@34zfff+=6APdl7+V4mpzpfu9kTnuf zL|#;s9S3zD6}ov-L8AUgH+29`Gsh|9rkOD!%GmkOwc2@<;<@qu+5~Y4_TVjM0el~| z!-_YAh%TBh_CLZ7O}8#pkkAzPz^>hK7t?z8Il_9| z(PberblpXOoR;M(SS4Kpw#=~i8=@M7QA$L#B_+mfOVq!}L#f+7-7u?N7_52_?tdVj zb~0kophHr(vr|i%%Dp{=SQK2p*W0gO zqQ`0FZ}HSStL-JqM=um^vNY4I8cp_4kgu8QRop!P^%@$7TkE-6q`HY@jNnm2CGVRx zCSNJ4?1z&Ms2;>KgIxjZUUDwFSxJ4a(Brr!a12SQ)8)U;lNkQ-y#pQ44gTVbFazG8 z1*>Cl)tep)c^x_9|g9+TJHN^?=Lz#IQ|>Ew_6P}caB zKZc2$tvE)xxSQff13YSyn<2)KsAm1Ny#0{s3!|Ps8H%$!u)SE-@ylrt7P*Ux;ldw0Foe{=QM2+nJ^noq6Kc4N-2 znPzo8p*u_E&H&N$=fAG=X~Q4;EG{PspB*yi7`&Ei)aM^qxN z9_xaVvkr;{Pi*+Kft}ncV|2)h#MiH5KJoTG$`DTjw5cPrDRXBYdIt38E}8a_b{CPs zhE)}aSV(E04T*Yy9$Zt4;eYBp0WUDP+s^D5<YMr<*^&N!}Zu@*ZufV(!(_gZ~cPeKBjms$6Q-syTrQ&ME z@yX}+tEvs(y=2{znGP5IWM|=a_6Pi}4T!#hXMpLuH-(GattQR%QY}MPNq(JR35+LE zJbB}3|RAU9(Vhw{Zcm45d2EmiTOQ+06W{2xW`D_3*R@A^o$C7 z3;D!q*0X}66J13tCmp^<6u(J3X{<2<7iE6SJu=-^G;~vAez^+!a{LQ= zJn%889{Z-a=}``PN~EhWVD_8Wpud)3ZooT2GYc?nVP9WaNU92Ky)*m4g7)22Vw2%d z)465>m?YRHC2*;mSvOfVy34rcm=BiZIPg+Ri9WaWQsw)Lih0E&k4UK>rJw9J8Kyft zC<$9NsQrB{Vy+X%H+-GqsUt@U)WH|u@9(*q)`)Al7D9(=WNWo#jw2qpisj38;{&{C z*Lp%nwR%CAlkN56lih&}4}>k9ViPX;o|TFGw6-c`Mx;7ln0jkT;;TnLG$GIhTt8`# zU=4~5Ft7vB4NN^IZqJ88>alK<4qAe<6)zXZqE>Vhx^&XqAZZC0F>{$Q)D9O_dbCXH zRa+`k=SHcg<*$iUJYoa|+sJfuUrTu4P=Omv>R2r?2odPexca4p2g%iCc8mHY0MY!p zrGcyPZ%24v7aoT*qQ@uNKRq%m)`;tQcgN~wPOg~$!PE}5rDpUSOBt0?nS2R+#$dFo 
zSb-?qa)MA|zEEOA!(Z1;^@$igsxd$&tnm}T&EaNIV@%9i*_bAYSz_7pwUnXXd)p1? zkWo^K(c$BYAPRJ@?I`O_Yq{4s+rYT2;BHV%=y-=7EQ^u+4Xtr8KSud%+4$%0s~}-+ z7rB5E%k7?DV{pzjyJpu5dbP%?2L506C1~cSyZVNg5{Yo3jB&(^eU=&Hvu8bmzM{KI zhF|sWZIj}B&^Z{$RV$2VI&>}tC)tk%Ry#Zc(9{HA`zW zh<9YMgDEAkw6+TO(fZ42$6wkJ-7ZDdKAUizSY$JCtb)5{+cSc{Jj9&13ce{BTS&;2 zb#<*x9j3bG)Kq%X5ogPGS&|gZ>u5wzA5`bZBZMybW&Pq6Zp!CX1 zC~i*8s-**A4sJpozd^yoyAq|Xz86K~z&f&_9hCC_lbTqF`}xl@8jfM45d8lP^j_n* zIn`KaXvZ0gRvh12Nq<_@mQwFPR!bghR&Yp^G6b9=|xbrdx{Z52E-i~gAVE&>4eActG9ayd} zCC8`emP>3xrNu^g*}dTOv~Cx|-@`Su&kD}a;f%7;XhUm=lt7Cw_Qh^H-RD@&ER>Ca z+j)-eI&DktEpemG4Z;^ny{?M7!FI=Kb}V^(+do^DQG)x!HE3quT{yv%bxp)3x&9W< z7*`vxmYSK{6pbD9*Q=GxOVj&VjAnV&mFrIYHF*C=n$78t)7F)S#a7TLrcS2_H$i^gF- zOsWAFp7vA?)5`M2NwEy{1ZxVu*Vtk!m= z5q1TN=!qdxccvLc(bFT$Y7!(C?L zRtYVqAJFvy3bbP~3L6s|8GfCYN=sWZqp>%|I`;54%-@BAC4_Yb^Ij3wxzYj{QrJ7? zuAx&GYHPJRyC2M9Z3(>Ubn%Rm$^EA}`Na^`epsib$+w}dPTGew^3fW?-TbQiHC9R< z>%ygJzTiV-kEk!LG^=NHPuJf2x=u$q4Kq&_zEv#%jOzv4mQ(M9~T<3 z!Kg)#7D_yQ`4TYLS@6cVeO3gCW(YE$dad^!o!!KuP-*ilKZFFXeiyZhUfX@n( z8e@DP^?vF|*RC{O+HO=+uErfrwoybkbMP&J=m}h=CFA=(v|dOn!otb%{8LmwmR#EM zia^RG`l@Y^Smcw5RXRo~)7%MW%bzNgDp8T;R~3O!tn1NfAJeZbZP!P3Kh?aoAt94R zTx~)uqIpZl7w77JU6RZuTJXg7$Svmkg^N0bUzp->GZaNTiLsWz3Ki)uz^Jx{emUj7EFvbbT{=5wPaZ)05LeOwU6FfcYM<&hI)*aIp9d zEvYVjw)EAOGh=dV<{iDXbGmMK*Z8d%yyL5YYq1a9>&_-Q9}nUSw8$^KZ!N!qP&6!C zE9Nj0i~M2nuD{mlMr2 zwwUlsy#?#E(95j661z3dNW~CXX*5ZH3{dNh9-_R@vbQ|b z{{6(gTt+_G9+p-Qe=8=0_bnPgUyE+-*D8O?e}R$0xSaL|8RnFXx^6!FsF77?lD}rfI5q*EcvL3$0TQ3(adM(k=zc%A6aJhc8ODWc@g*V6Fq1ehszFY zSc>7>tX};1rW=2C_4;%?BT=G91u_veTjI`JrFTWQC^H0)YJO}O85xcBeZY6kGd;LcdZMJrL{kz! zp-V6gRLn5@>Iv(10NqhfREHMs>wN0 zT{1Tf3gSpER=lM|9H04Of670HVUVVl#q~@TVuDchTtqv#&#Q7acTN`C=51}%>)D;( zx#o~yyWfi390Q)fp**8)b~=$C^5qS~lXd%Q%D)8P^{_+zbOG)>2YUVvdS19+aU_9@ zGHX+T)90V!PTp$o7K~cnMFJGYsqE@KkQDIa(j~EvZhb>K`@<6XN~^uw;~_t^H`iu@ zf0qTbH_X*9xIn?%WH&tV9y3mcLtW#B6GGMA?IIDur zlc`YvrThPQTtEn1!2;KFJ0~^d|Gg~Y&TdR#eJwsUWOfm0zL$XO<-n1Hq70FEu-4%i z@w-#`=1N7p*2g1RvPB84r`akje|bIltUVcCJ?fc8Gv>P8i%H9FY+-F^fVkxo6Oc3? 
z=?K%24nC#GIgp+w(fcDAWD@*i$79r#=el{5*&5-g^HKGAJvfMf3w^A1Zhni_$qpMY8pJio8K2Nn@b%^P%RG zpau}g4}coL!G|Z+P|IxYEH;l0#&_lkP_xQZ%115F4L3T;L4LgFq7Y7<%Z?$aNA`o_ zyj3d@E^gF1+F_aanl&zeC88hDk=>Gs><8*(JX7gkuKhpxT*Q*z| z_{&LzV<5n+0omZ)G>c(FEZPh0<}WJ5Y=dhw5h-q=o2zs_DDVqF1QLBr&dU*>cAuLj zvT+bkLTe#;zRegkzK2|9tj7sDSWbBGuLZm@sDU_*bSCtylk|~_j?b&Ak2u6OMTilc>G3N z(8Pe0i%{&m@=?IczlZFFIz|6$C~glgd-V!L2+0rFg%JE*M!(%im}y#18aSJj^^}v5 z{W`lG`8Ns%TSlFiRMHx>fl&*RqzfBQp1Wif*}rVCN5fVFby-7BGgXDh$zW9dNMWvn&b1Q z=Or`AA9HA3gt3dk$=Ji-X7KFPEiSjRw@7*}hCiZKSL2X(Hs#N2-`)?7oJrY6k)*ohOej zTpzKP-H(;%EtV4fQE?ASeA2rylG3-f4fJ?180XAe8TfAIG`mMed^2iVZ1K-5@(!xWq+3p=vobggkEBcp=r zVE^?tKlMoUMD^@}cyA6A^($9ptAL>-LBQVjVr^-%;p>5T^?u7XQZ;g4=9LuP6vL(; z*_l~S3DC?Ge;=wq#2|fCZmbF^Cj&DUu4PpQQdmzpadoeZflx<2{6gXES_8@NGmH4D zhjspWBvy`^IU0qm9jEDAi%LVV9&^}Oqe9VCwWai(jEyi8o6F_FWX}yx@!2y)1dD1U zEKGgH$Wx^8F0XcF;cJu^xeA%k@=-+q@4~^85hj2yJq53_p}pF?2HWMf_@4^BeyX-sh zjC%)^%U1QrjUQerdGU0Q0krKo<`6at8;y;@#+n_~7|~F!-X*OuI2v#RPZdeY-fi!q znWBm0HU9f?xlMUog&Aw&d|-Y%M5J{3+NIUNYy=Mc+Xi9%3+Wnz@5nZ!X(AC zpf+-J?G>RYRXvpLadnbm8W>*wRx~`L{QQ?JK4~-SyODiX(NI?Xo+HUyoV-NaANa zd+pfj6t*=^^3c=gU?+p$ZPo9)tML#|Kyya>sp1vFYyj zyyphh?G8;|o4*$PMhXYRi7TBNqm3@vOUiQ&e8r>G8fwM@T=*Ym7J`(VBFj+nJI zhix48iU&Dt6d|(-`!?ssldt{gsR^vtR84uYR(zif{kDe^ zd$tv(aZAHSxRH*<`1U8Bu`s_!w)}8 zJTzAKJ+d_su`7`rcdE%cWOtao&$1*cDNy*3w}u^_=ps6{4wxv>exN}wP}S~ki(r~S zGRw=(5cou@+M|*_R8SEnYXJ`EBuHr9Y1U^n~beb}yT2~kJU%xsdmEg2PdL)qm* zyktIgPZb;cyB@ePBZ^8gNivfk@%vr}sxI??g&AAcr26-kvOL95iI~K`aQ>IBP8Lbm znwmZi?0g$5Ef|3*9ZvYWhj|po0J{)UWu{@@ikp0W(2O~pH~I2sC+;I>B&p+A;(@t?Clu;CVTuI8!I0fRS^q^H_dbH zKx9_M9%QQRW5_6(V92ZyHg{n)Nm231A+dX#te2Z(9)OSZV$Cghl)r({a{BaO{#lma zN3`x)e*jM9DM3^)h)VchR{w>guS-?Ju@N#5!C0Z=x7(ArcQe2<6cAnkf=~3%pNyXP zK-h$WY-y+$8*vqqyMtluR`q-U@3`YiC7g@0rbzyEpZS=Z3;LvUm%)?Q9|=0KVR+GiUlQ1aP!lDzu+-Iuiv z-}UpCos~cq6J-WMU!QjRBGRm41-D-K;ZqC6X2Ar;zKK9lgb4ZpaH2-Xm zONZ{%3LQ{4cc_hGK&o+;q?ZT5t>a_BBSeK5@>}tZa-JQXSHwD(A@7BtDsVmw0`(4g|=k}#%aV#b1-6tjF>chqN ze$oO2fUT{Tid7C(1p>hnWM=B(2R(oeDcl+eqOaZwQ>fmYn55NJ 
zl=ub#%$TPPk0Rbh1?(^o$mBvIo%h{_^~)SGQ=K6noIKbMEv67FG@>N|-N}`<9u3Fv9$GDi4J?e5TiLV)2JP>}r7}$Gg(gh)TD2 zo!$c9mr1^wwC4V@15KGD5$Xv<)sq};fcVOH=+&>ijkLN+%7}70s~q+lAKlP0*wa~;-0G<#aU+H37V6!&>d^EZicAQ z;POXzy2X?9T6tY^p{S~C6v;#PI3BeSEzF&c^EPTq|5W_vIxGaFYItO)1!R1^(4NAd zNV)Y#4vjuelUqDK?U2voBn#mKRbsm!l$~5!0tq#};?sh~1@Krc&VaYS5w-EYBc~eW zv49&0(P2*;EGo(=984=;&seaR<7``7o+tW`A{v%jPM1y3o`6g=J&@k7|K07P>8RlL z#9Y4FP*))40V6!V5PiuS!)Jy)Bid*%JVTv-&^dR7H?B052r2>IsDLn=BJ@f@Zp}no{GLTa3JSI)d}*uf^&o#nK*BagSU8mj zK@1ac4Cy@#5zLWE`H)t~YWT|+v)D93g-+M-$KVUFD>@1A6uX=4P&Fd%CW~u*f7~w8 zg0?uKQe0x(L4-A1@Wc9g+F)At;H9;)DnG#N_|ob)YnSMR$MH-o+k*yjr1eum8Cek@ z3h$njgUIAzSe=}`bIx80e~Qn3pVTx#w$EkkDIu)OE_Mppj8hJMI0x zAd`Znc!tcH8Ae*sdO0cY@968xN1rM4 zvTy#9521!RyK8&V)x~S~;h8vBO>dDsPg&KciR8yCWg!G7Eog?fe6!S-0>9tXcBuLv zO#4r6DKS;b%-x*a{*SV*9`bIx%Fc?f2$y+^!XMbq`U;>;#>1jzOr%AECAMdl{Wms~F7uRF|bS)Lhi zVW65z3K7{DQ1+p0L)}12ZeT+z&i=*}Uz#avm+5bli2#$|CLz6@W@E9nF9v2Yzu#d& zv7n@@>$3ZU=ldFJcJyBQ?kZ%d6Mj`;W*;GQ+}SZck(l|qPUa<%wt99!j8y1wKy~<& zOrD}d?St`ET<^wr#IzfB1J@fuFEIr-hOZgk{K@-Z_Cl2xK(UM<_%?}&5l?@?^z*e) zP+zDII~oGTl$#?zIR+-f^tIwp>bm$FAzIideOp~s2I4I~4TCKaoam=_PX5ue;BZZ~ z!uM}CGp9HiTkzaCjzb-R>>?(RTa9CJeaEM)vXv|l(LG&R=yFHtf4Lav9O7#y>BvFy z4UQDgOjoO2pBwf3$2u;t*)O7}EW;GzDu+RZct6+i{h2r-sdn}3@FJmpRj}OQ1)|O9 z%iNrlK^$`uX;jkv2!j#(f-dUTy*-L(LjR%p57m}Vx!LN zlh`z@#(B2AxYL+uKqu>dtd9qnnNf zzGmI?M|9kB4vp+ik$4&q1}aXI1>-5anXk-Ot%!_}hyT-U#doWhbXX5`U#BZ|r$(C;LgL}-ib z8`-t}=`x!-JCx!LTfHy6!`x`uLt#0&j?+5Liofc4ziKF#*lQ~A%&kv?`w45&M$dlK zZSzs^%Wgs;u1@;YlrTP!cQ&{`@W6!om9O-k$tq`^%nfjnD0%pJ2;n)0T3gA}<6?rC zH%qvHV~tFVl}rgX+|eV*=30@9>jKXBY>|`mQGJfGk6IU{L+=uM)eaglxp!=UOsT`$ z483{)q=C(Fm6wK}9eyQ@Q6*)a1Q$+V-j0qtZeODFCdHoYyz=#U_zhmuA_bs7a?EIi zmK{Mlg72ow(X&e&+$BD&B;8M6xTf`v&J41H72xK&f6CJZT z+PHpjY@{RG9nc0kBdkrLqbW$1_?lsvF24g3_V^_>0DTdiBm#1eglIzCN6m@N3Txc`a+3lnmtWmTcB~dqVB;%%t`Y~+J3#s>8I1? 
zG(-vC4GW&fdJ`>{N@{uO>n}d1#@0TR-o{)W;oGr@!$wiXzz9T5gP@ywq*3vnhePJ_dn@MX8OlpJ< z<0)8urYGN6g+RTW)45E%DO7!DZu$maW3Atj&aRW^(`c=qwTEhx#)D!rEYZAyFU*L; z>@=3I8C+ruB`c>tlg%pmQHjEzzxr+l)2~_q7)$v;qd~EU6Fp<@193)mYgiOoQsE*A zGHK}HpfD3DD!r$eRZ=*(KdT7moSaLm_x`e{Z+3ZZ4Sl=-TMyUTo_;*;?Bap#IR|dB zYTBX5^ntitL-c+wP4(K3j`bnJeYQF>%S0Idi z*SWw$VI^Jq5`kLOj@xVgZ-a?6tkJ`9k$##ighBd|OYRfw=;}=n&9NjgJYgNs6kxF0 z5ZLTU2!}~b{Zj5^&MNJsucU68mJX{VeZk40v4d5$u$W8BwNH&$Uk;rRIn2H)ouWug zX5FFm`(`!io`To$tBhBa&&xJq)#!Sxk(+?1OYQeN(vLp6Pb?m^<*M4Kk#{S~Bpl?E z>Jxa2{WiSuR>2_JY6e38^IgNgM1| zgT(;(x}z_k0c_3{*k6%REb6z1QYZCrvYqugYTOND%xgiU31s$~`5rnkq{iNrUkdi( zFSvxpgGKe22CH`1XKtrH6;YRSHA>Vnv_>OyIVvPfwM<`4@0&uxHZ)a;OytW>yF)K51LrjgR0@ueNz#zo(`@X|aWA_;T z3swV3CE_njY4vl9_%u3u&w4Ja6Hqcunkv=>=iB_JhAG+PneU~bjPn>mKVxCLb}D$@ zIF~OdGfx~-(^&c&)Uk|Tt1!(rdJ7=3t#fhy6Gp10zJ2^ZZ$DhyUXhMbY%Tp8=WBBv zIQ2%11JzIxFHrDk2&`Xs7DyqEC{dbrH_FamtLNoSaqXmA9 z`12Ah1kjC{Mxdy*A|9XE65|?{5vbjM(hyw$Mr_HvjDd-0+q-bTpDl7Nhz~4ox<3GP zXx4z5&A+23ud;1?tBp@6DO!;kF9(GCw#tI@|r!F*ll9NAHE ze;_?le0-{mNuA1i;W56!2;E>tQdQj7{`_~Lx#@vM~RSDBoqg8#C zQ_)2z#UVIAl6)?FOdusPIAc*Ml+}2czqY3zY8!z1c&wkr76U5FyB)m?OOQRMkw2P0 z`f-$*v}kB&z@LT5jC0_ZH+`&b1lw4pZ<1N_;)01|v4L)jK*YTVVz(9A1tHx>jS3DU zTq@E{fptZ)iV%)LAk)18m-yfiI3mCdk1wsUwg5x;tL-nNl+nh~ANH=2J5NCxl+{Jd z4=oA<6_6t-C#Zm)GjrgcCWpPBq(KI(^YCv?VcBMAK`atP@IRe8%YXRKTtkv1=*3Fk z8pc&^eTb`mmPfGcy5sp>4EcgQVE@GE#3+f~gT3WVj(!`%`?Pe@6xpwW(M+RfnPzRlxPn+2F6?PZE~X-ngygnO0Bu0?kBQLL1?(s_`uZT6cz7lE zuNFst)TP`q4Tpc+UzbTE>@^24nU=-{hDeyYLB?oKawAN}?9l4ffy5I8fMy)A6~Yp# z+oJ*;o4Cyt_oh7E0{gH}-;0{d?^t4o&KqzCao5Xs3gc{Jxu!uWFOlNOVK4l6XQ7N) zoA*GO%U9(Q>%o-49+@>|yS#>qq~T9@K`_*vJvM2;BG@5W)&;)+if4hi&#}0(2uuC& zmRxfDiY!kfeGe2A{O-Rm)aUG~K`1D` zUsV)jb$m>Y+A+;u%==a$h}yk_P`XxuL?1ZbwX1XMH6#*) z8o9eV{q5e1$&P~uVZ^g~y{oT->HSX99(~7-6Ngz|MzLV8gT4_3EoKP_*mzm5-Y#W% zO_|GKF@zGz#R|A>6#UOePU7r;pBPz{ZE^lyjLf7jZ}yU?_z zcy}i*Luh8ui;DgFtvKkM%$Mb6LPqdc=Dx-IZXM1+O`LH`T5Jtuk#+yQG<)EYa*S08 zYv>*pgMy>&$$n#F1xGbC*BM@tP9z7UE98m%A0SxZ!=bI$ 
z#l>>|ke9>ZL1TdTJtfDfFcaILs>gdr1D`BgdqW^h;eF$5c>$Snuf5H#2{{65Ea`3?V^dzILxX3conr3V3NMvCkUUd!T`cX{TyJw z>&kZ{cAxbXOB}##mhGJaJtIh3+f5*^j=tM_1DbnIT!Kl0IHFF|Q(c!9Z(t9sozJeK z3Kid{Y*s*yczaM3&*CzEz|xqw(+NdJbw#^y$D2J1A`KcBXg>Z}R%BlDZypk+llFkl z8wE;tl&^ZdaPio*&*tV-p*~w+rH(ul_m0nBT{h#Tu?I`BEw%sN^>2SScQ8(}DZ#xz z5|H~+@}%P<)=}6-MsiZ(p%E4Xgrgjxgz z&A)KKD%os@olZ)(XWT3A_NAq#wFP*G+4kH+P`hRodzx3ubYng*E64HU_&Atly5=~H zTvW@orvn5^YE--y?~1uR-H{*N(9%lDf8i2F88x?lwO10Np}RWq{K3+4%m(^hS!*q* zQPN^(*Z(J%20+0l& zz9yH_xxg)JYYs~8)tQhkFjq;n+*15mxMm<=rP^f0JDvY#csv8A~`q9Sr5>2ECH+sHZ!-3{*6IrIkD zPk**%(;oBUwHVy;AO^U8Bq$Sznp}RjYX6hQ?i2t(qVi_b;4RuOG}6wlg1I%$@P#=VY=BIZr5w$&uHkjy!LD4O9fZ zrek>alg_DOG$QAyn&dv|E_Ms*|83Fa6F|`QNZ^Ep?W|yN{`cM_Pq&AuRck>bG%65` zMk9rG^+Q5yy!m$g#Nljiz|2(9KPR%Nrsp!jut3s|7&4VSma5`${=OaMXkTW(hv=(- zlH%Ve2MH~sPl271x>QOUtnF8NF&_nK9lLJE!v2hqRsfk|dO;tKOe_Hmz0u9)=HSst zw^F|J+M6c308@f_F+Q>qz*O=U?BJ<>Kvtfn?U6g9bA9f3OOM+?)ZEx=0B`qNAQVpq z+E_Jl{haaQdo|G4S@y?s(Hg`3F;sxZue=nLP{#h;wB&W@j1GZMT@f^(xo)+8Vzpg- ztq~qS9OF0&Q)e@1ZJWK*)4@W~9Vi`>?mRLxtuoEEc%%6%f$WFK#YNav9Ace?p15LH zOf!gh{?OF=tDus?zqWm)XAb9q>G$#aHS8|`Ft8nF~8c=Z;ddfd>tvJN*r{T=QYV!(* zHZhAx_B+0Nmp$rMRKTywJm>nta(|pxwPE!EHISBZBkW606eO#eWA!eYGhldW{ zlH>*^QGJ5dukXvDyK~j~9pPAh32WPatEYgc;@+Q@`bt=BU+2F;`HJG^aXux<#PovU zImGVjDzDz+8MLE~fJLA1CDUiKUe9Q?n=(TOt!6vkr@NvIVLN2xJ_7)X2QyN1-L>h% zzCPv%ikPZ-l|W+NXe(d%sjkDx{x#2)NbWZUOI}`$>(j*CMlzApP1oUCNjf@eLJ_Cz zl2$9_+-_|e?9(bvH37%o*tb!vZg&5G2tYe%;#ZhSYMSk(CIzFO>)^3#QFth!QNAx_ zluffdCYra{pbSKvSf;7*$5!qTPF&b(2WqEOg;mpJL~WL_kjyffhSqsa3|2!;Y^(cK z%Lc(1ntjT{pUZ>y*rMvpGlV@9*qeD#Pkv+mkfZar3ee_i5xbeV8+9|=)!lr*hqXJXr{u%hZRFU zC?hM$$mrY~d-1OIm(4co@^cRX1lDUy#?FARRx2x`@}FPH+wO`^zTqe-Ox<0%f4fN% z_0sa&$tSslcgc!9`iCtK4B%6vLYJtF6~jfMd;!6dW)*2&fiVsOu3qSIEQ0 zvb65oJNhL9O2S%$(&fH4V#PS0DL7Lr8RMTX{9OHp_*FZKi#J@-0;062noRwliTOFn zGd=AqaG0#;eTRF~Vn*TMr_H#L(;DUr`sic5q!Z%~ODz%!3fywm?r29F0NbSK%nhVn zCO2F*a?ArBahb5W!OR_(KHy15yVWu&StB9nN9Yu=-a=Vc%DHEi7hQ=*zg+MKgSd9U 
zuRm8+QJ+xVbclj7$9#*aFobQZzYd7#o)BkeMvTH_mKOF%Z}M{`A{k_C3Lu1w!CIZ3;ZjEwRkqYlZefZkUgaB2{C-YEv|;`>%=m>wv@ zn<0UfGvGVPbfg&0?1}diQ3JwfO4}_E`*W-d74UC}&tn+P35px$;xYXL2}fb7#Ux#p5GCymbbelH3$fkr@qb}mqza}GM^vvR zDg8=i(`c~+V>|}0QO3^Vk1R@QV0Lzr){zrM#)9Inu=_t(2Bx98mL-Q|y9-txwcQ2y zTAxWAnLFvDUF8$3*++xkAGQQq7>gvQDl~SGgeIM4CUok%{Cwd|DfsyyHPPddT}#1Q ztPo!3u(-rgnkg=j2`Nk0%bK z(pQY4%E2;K#A>mY1@}Ic;12(vLEl<>vnb3d(@UI7_*y4vg+#1-*tRP!Cnw)lgnRVT z3g~*NZJJWZRnvsp+ghU-;Q79U4$7L@K||ZiIuK9>BQ7QZ9hFQEBP`Y88V@Qz!M(>1 zo|q;O^9fU(K?4|qYUWrW{mmdDQ(8Mg^uw_+Em|*Q06S$M!r@(ipYrRbd&W#4ax4S+q5_7*Ny$94y4RQyLf^vSMg>&z#uW z7>I2s@JSk_0ohH%Vbjz8v23x{#sTKji-SbEb1}7vUN)`RZs}?|>1>Aey#+ zC5I>ax>~6C#1m%lFry-t1$F{5wG`pazWOFNBz#0SFeWRnFrZkI$IkDu9HYIz{G-W? z5a`=hxa>YfeE%u|;La5DkIz_6l9D_fLX)Z~?=nHcsbLLh2jjR+S3l0C1P6GHEIx$> z-)MJ6qupCW)+e4Kw#Gn^>ouUn-`lK?&wG%qjS;K zowb7}G#ZNa2!bzsx@M+Uy~m0n>O!?s#9RVWT{yKVj=JxDRAy(`-5y^i2E1han+`}xTzyCZ z|7;r#xRWbsuP>i8khtqEh{S!WyLH@w3STUJ3G`E|)N0W#O3SqW?+zglI)%|gU=A!z zDsFCNj?-c3%@c25Cy#{o>f$bwl|{3-N~zT;E{M)Q!plktLZE@mD`rhR=8s-^{ct^^&=1(Kh$2J&h@ z11SjsFaJS@6m$hv6?R_7pN&0i;@do(kl$W=yFN}CQgsch2Zq2MIr8^U{*ij-w-k~L z0ohJs6OL>Lt=%5u+h;6dS5enowXHvSYGJXEKPxfg@RMw-!0Sn?tKLVK+3)*(p10{a zVgc}*PTEWx0wYtvNABSo?^VGb7ZMlAL{~7d zcmZGF&XiHk$7C{-vwK+`&2f!0A@1uJPM7Ju>23pkrh)>{RglAX<95k;er=L&56rU0 zz+W$+YAC!<@NPLEmTdnImYvJqqtWq&zISzD-qDbt9*+Y+=#1RBZ!10>*pFr9+jHh( zy(C;* z8jZbm{P(W0_W_Tdy&c9}DVmte*let5wJUe!yEaQe=R7+Jg2tt-pR-G)BGj<;DEtCQ z>b}BqRLjJf!>-LO$W&aS=g1>_$BNA_DLO12WSxBs#+Kz^OK*2{1H-v`=z8v07CWHw zRfxIxfMGYd0v|Ax?Qm-DthrE{-97kIHhg~CdrG&-*CYD9)-!I(MNE%8+fGC@V=LlJ zZrB^1noR2A^^trLjXI{CU4hV~Q+!Q3Wvim9xu5rO1GpzzlfHv)o=##; z835(X*zq%u{H0)4=>fRd=e`VJE&^6%9q)~Dsk;SpL3dlR$))53da?EF{V?cPosA6F zdzS(wb#|{Y2}d=Y+OKw#bjzI%YG$36Pr3k2jPvQEUPTK|A_Cl9?nbTN;z^AoQ8EgQ zPZB{3#&ZW@Jn1@e=w#{EHwfMlm0G5>Jl(Y^3*42{fbB{Te^$|*FI5KlhtU1uUCX;G zvS!Z#d>fqtZ>VICaK%U+XZp^9DqHL#=3NlNa9`nf-v{50BiMxk=YZy!t`&eUf{8(T zENU%v!sj>nuUamM51(!!$9yv~$3^T1$Ef&GIE-6X6ZE?a`fwV!V#j-2;l254)Q}$y 
zr!M7aFXso$!gRqd@VY%jH#fsk&8Ez7_0?&L9{%G&4yI#&q4i1W!^1Qi>?F|jCbm}k zZ^S%=FcuNN6L}F?^MY;r;Q<~2(|QCZAmH*!N3B}<)d|fY>&}jwX{LajLo#b1?gr*@ z{!Z_CG4K2WTW5Vg9XWzuujyBKS~IvSY9ZwL(Urs_slChNsW2zEGB&Td$%?bYf@9!i z9NJamnkQ&I6gH(#dDT6J0Ut|wZlWjM(!3;nU5RGW7LP91QCVkd6%YBU^q0-CqPl6y zFcfAl-&F+07LTWX@LZTSJh=ZRWc49$ZgSQ|CF&HJ%ZOp*iqlgi`qY$!_VMyizNNvf2RS zt2oi}#L}S!15s_~g3;DBusX@NI(95S-~MB(6C%aou;6iCwbj@7Gx^GV< z-`~7&J4PW4g-IYj>m5hw$q-lM`g{FRXq7tkt1RfaR0u=BdS6PIEbnI&R%){S3IUl# zj(-hp^6&zBtXy#Pb3-iuQPaLCXusjn=o#7JK3&t7_XFMik+0d>t(@9qby!(L(TSeK zkJul)_B-7THTt@)?$uew9Ctjay*A|lA6{-h%&#~XWot2kPFp5)8dv}m8hp#w-X<#7 z`N@`QwJ`8_?j_9$5W7~B5j$k(C$CrLl#^TDZZloUP|b&#yw(-DSQt0`ab=MraO*Sw zLvy}tYpuMVBBzc8QWEx;9K@1Dc08S)J}Pb;Y5g+&L(ErLK&Ge4B-Q3tf6EliK8;=f z)?F)mTwBTZ*h4PyD>c`*rUW;Eb1@ZBOA}0bGNMH0!Nl)0%{I(S^juMKZHQ>O|u)!uKD}tMa=lvn3Mh7p@D$HtLpSE)8RjDygGE6;{78 z0^d1X&E|EqO_c=#YsiCQM|wy{-y0slPCq)dn;`%+@+%e6ngrB-kJE-_PQw>%88`*$dk?>Y<4ieTQA^MsmPWnz+SW=6_ zUPtbOPirC!{rG-Uu;49(KLAVfTECej(=i0*10J=3kvMzc)O&xrY(~VLlMP!q>&K_b zU2)M-dZ+-NyA`|<83KIid^|@h`pfJ-ChLE zYcvYs)^SZ@YcjCn6o9^tRAuH^c~_0#{*N|6!NyZMp|IY*RV%k#+qu|rHhlK}>C8Ch z$_|!>(SnRWc#3U5@^mBQdIWqpzip@zaQ4NmSkCB2><8NeA=SSzF7o+GO{`Ifr?X^j zR+OQ?r%Q2rZP}n9TkmbiB?;#%zRFNBWy|Qne`HvlIP#WhBTTO+c4it|Y!8Pep#wgH zE%IUEbM(YFsm*)(e9zllb*OR%MIzg0y&&ar~cip%$ILc|#PoJ5ftwS7PJ zlN>U5M<)jaGO$S7_nrp%6+$Ln3cpu>{ts6te^&F_hv6-+Ls*{<_x6TOX(6}2h0F4D z$JsV@(>)#W|64HJeBljb)cE7`ymx!rMxul%Gl!OfC5H3lOG2Dy_iJ+MEEjR)Nb--x z;}IUx=J%!w^m}#D#BnIv%bdXkyl@G}Jjt?{uNrDdwUl!2osnPjLpJp5y4<*`TKz0; zEvyS@y(~2B0jks>0xI7(ds$T>x>|SPuQ2+IKN^+(Z$SQ3KV&%9oSrh|z9X(TVC1X; zw+`_tdP-`G00cT3k_7h03Bd9lN`XmPN!eet6#+y4;qn(Nzx9OY#XkG?r@fJOKEL9O zhGp;gg1-qu$$f2doukHUWTO}rn0x2RN7Wk-b$1pk4{>3~<;IUDCbs2!Hn*ATm)|po zY%4q%LXq=lS8opu&6>cO3wxY0`}60NxwXJ4!-XM-R@f*Lx7WH)>#Xp#o(+<5dDX6Q zGbyqPaIf-f%gK4j+GSUo<~l7>Mx}3}f8)vo)bh{OjQyj4xtUmU0>sA;&~~)BX_577 z2_BxYWa82+!KvS0P6&4(CIF~<@};&qYTrx29dTqobKn$VdHX7=wt1tbutV0GfI@MS zCCv|j3zj<##`lq3f0N^p9C({pBD`+7wU+VExUw#2QWQ!FL;lSmB#8wC1O8I0+$KG~ 
zY|~{OyGkhH3hWuSA%+y(s2}!_nGTAS58xY4|6^nZ#z*nbRZM<(N?eaBb+ykLM>eyyk)mOdXK zg2NE!PR;?UK2@URe;;*IJ^Xog4ngBW*(!39KrDBajmtm>h1C;Wi4&5@?fGvB(nOa3 zFCuU4SXkyJ`=S4Rlb-&bVLW*0bGGK;tlpknB*k9=$HCfq;l6w3Pwqz6lw1naPsih{ zVtUVu3&Z<*yACvR4XwIVNMT&NlQ9B(sE~8n4D~W^!YKlcyA1a!UCiqJG(ujR|{i)4#%*@9H{s_ z_sA&#wsGX?Rb4v0+R-slm0xhwYaqBB-*C5c_0d~TS2ehVwbz${g774u4Ec22XtEJz z_&)C5As|GFs>QzV@w4q!p{HP^{>Rc=Kvv(m^uYzGgPxHkVXg#JWM%Cy`#75xgMc1s zp5jDNLH^|u`NMnGc?{uVCW^`Su5x}`n=e|Od^^Z%d1)ML>Du3Iq5oEaL!>-}4n|m_ zLw9f7YC?BEihNiKYTc7;s$u@OYI21#kU#zT;4K&R<0YnNstgIsG7t})LdFIYFOlkee~5xjlbdeJ8LmD$~l z;hwLhBXE;eai(V2@cA z*NFN>H_@Syul&cG6oKz9Ck@5N0!G7n_tnh!m|M!7eqR@b>Flq_I$1O=JI;sp(+iSp zr^8J>l^0m&lvk$JzpI>HZ5*mOK`YeD?T^7od3%FJD`i{3?K=s$B8D1a3tjIQp>PcG ztog$D)~y%m9Y6xW&`@g%qsg!=mGq=!vxr zuL#j=4TjvH3DlMQra%3-o0{G)T+YcB2lj0Y9oh{uoaP!X7t}N>9@iV}PsUshr9qCb zQLxN$WuV%J%Q)X$`)?q};Q~t~&z7G3nri*|14;{hXVbZ$zh{aQaO2ki>gR@UiP(?X zYW?lSVkJ2Nb%1}aR%%v$nyrSWC14V1b9fd=^`ue!+%3ygnr#@japN`KMxXTCX>*pv z!M_2ul6@2_Q2ZegQvWMT9~=|YTD&(xh@x z7Hlid*d&*j_0L(CH65ql1h&PJ74TwoSW`XysjjhikkVo6iu|<5I>Fzk?w}|0 zykn;-f;FMqXnGo$7@7vf%Uazij9I%6&|0o>GP06e!XTP{!}dRnQFhFwVrhOy-7gGb z5U$hjdV)mTlgkvGaYvxlv;_8t-*Ez38jU{(>YAGDt}gNwkrV1PU5|fNAbG2e1^m5u z-cQBGn_fIBu!Ost4w>Sv>>ceni@JfFsfWKf3fKoY>=J; zfQ={_3qr3+(iKk6AT7WvWv=r^mB_uMnIiytwBw|^6?zyJyhRa@OeX|-}MXRbqhjPCK6t#6sL%-7y;y;GbDvxtJI$@lKo+!8RKWO;lzei+kG zL(18!{l!xHHm;`=Y;6X5oOKl{l!8^S<>A*EH}3^%cQ{RYk|cj=+U)V(vKg_CR36VULOes96ty1jbegYUz3H39Y;*lG_;9|s#l&-+ z_wmR59&;T=8%4wM)mhg(;+X!h!>zuzIBQlKhSAZ= z>~1~uB75-#`VfB7tOge&&!;M0bjqTyf+CA#WN}=AHL=gFSj62@4=RgYQX6h~9V-~p zi5jKkoY=po=A@g|oxgAMOpFPqR@E5(9s25tpQcIl31&BziP*}f@$w#8aAKy)fhPuu zrxCVPB>F~F8i|9Yhf?Ryqrn#7%Vp>qsKbkGgMgWwX!mulwLGvuYRfq}$tlGW(#-OX z^KLh5{f?VOHY1FX1l5G>`;+eWw7b3)T>M$Do2Iiudcp8*896V#2%kT5)ixL>Sl2X5 z$k=(FAGzMs8eA4bzJPLk5@9cXFU3YWDJ_a;FUuu*&0=zaT>{9?%Gs0w-j8AcErC$V zsOVo;`178(An1q5fFJ56g8kGjAVyA=a?}bnw<9xSN!gI9!z(F2W4Yp?0|}6fTR~nl zeOutZ38Bx}X`a~cvGZH1kEM)$MT(z=9Fb5h;j)KDEg457g5+3ET_g5drJwRs3S)0ey;Z;Q(F 
z>8@z3eKaLh-ZT$)<))g{_1s2>IoU;d$#>Z*4BRBj-8N(AlgZ^4oUFhQPcA}y%wrES z5#t@Ux!z*65i_Bx4qB~iS&vmHHuF1~?e!WBhPe*QMMmNAW`&N;G-aqC+q79K_8^F2 z80bb_uYV;$7+R$#$1UUhlr~Y?E_nB3}bDu2p$mGaehq8jcTp3o;jwS+VM? z*%SoEhjM)@W^yRWu#6tx(gT4&gG%i0^%w$#3 z#(-+vwTke{FLvVbXWK@$hsSkZ-U3A^56g;4K|OnVDAl4KXkAO{Tt{5qb@<^kkj=KQ znKiRp_s^`WG2;mV?tJr6GLocN*#*v(p1i+o*2cdv%ZWbQkk=`oP~mb9WB{M($v6J| z;14Z&VfZea_Tkh0y2+?i&JPBfc7zQ}kz8r(JOG9QaSv%diAy&#j`yxOjK`>`hPVeD z6!w&rcHUZL@sg9j9++Bd_C*2?l6(&;qQb@iMf#N_zIoI}% zC`U!o!df+$+{z-Xsq=%tG)UGHvK}a7y1^TJ8Z0UwJWZKH>n$}X?db7Us<%DhyjIfG zpSJI8PAPbFNk8jhG{5mrO$Q&w+s@Rerb2Q8;oA;;h}7{9TT#~dI3M+NmceK}HLj-E zfb9%7naDjz+Uho~IE3Y>LbQ%Mx{lE zuuJ{YqNS2q(Vp@3>$ z@1H~0M{!TDLR`^0)b6)k+>ZZF$*S|GVRk05ZnBA|lF4DNqP8*A=ue_z(Chpy+gMrs zYNrE_c~n_2PO>+6fkt{B z=R{6DfkK2C7npNz?xHu~R)=hprPrvjr0_ z>7=A+;SdUpiT8<~nA>NiIicRop$;Og4@Y;mHF=jdj;&Lp#h~bUTn1KYq_Vn7((4)5 z;4mI_^bf^$9@w*Ci(FnTFK7mcyTlfoGsDSy&d?JG)HuDi!VtsD_mIDooO;9dQ#` z(yYlu*RChzEkFq}3TL=3wy13z^nhR=8X1?+tMu`KKX1l~a)Z>! zKleY4eEV`h{o?a}Q$TRkq+^qhCU@f>*iK}KzJB&`WeOzO3=|jI=v|Sk_aVwSHgs!! 
zVc`K|{_3td>wz^E>uD%^R?-Wt3ypM7G2{wO@9}&dsdE)F8Or1B#5l*45Ho0V+Uiu} z0=xdySP-eLz;?9K6xuNke)`5PotB8MC!axZk;4ZD(7;U^2yss zBh?~qv?RJa3s6h&*?oVhk!TV?TJPhRuS_XVnDc(c$@bSwK6^s<>bEghjFQa?v_$le>3gLDU}osNfpbG*Tf!7 zfFQ$mA!glQ0X!O7E@dm=E>_WM;aR8tsmWZXu5r-WBwJ$J+x9Kkkq^EXC=5UB6owCK zt*CtWauMo19;_2=`$gaO(6YGpP`SAF*uMB)tGUcKVsR&8~dJm`Z4(;y_3Mari4l1@$jx+B)!7T^ND8s3+^+l z*58cL+2CXC37=Ua`7B%Yt_ihAjX~0m-@ujgt=&Y4c!kRcR|SY|6AV4Xn32eALN-}5 z&$WSqsP4>wr<>ZAKI3GH&5P^B!uYs=;_+90u9}(3ZH&zkd#o7LSD&5u6E;Bmyj+qD}UY80#S%~3?K@u%ZA$D71vC#w^Jr0j(mrd~sH zQ8jqiuMQ6d$}68CNCCO@9E9aGyl=4xE36;}+6*BXriv8Q&wTVL>?sIFk^FNtjr!0p zxhaw8L)(mp3$@W))*7z~jItuwXGU4%LOBuQ+t;#jyVs*u75I1>Wl=`#nyFmGn8lV% zsX3C8su0U@sU~KU{Fdmu58fhgH|9-l=J3HBEqtV>jO24Bdv*t0q1E#;*U%)W>5sB) zH1@#YL=L!>VJkiSKn61GWX?Jv$a$YnO|;q2#2Ch z0^_RhlfH(AohbdB$IJdMO%YNFlzx)Ub!Kxd-EeaEdT{s>e8gsiHb7MH>F^`GGv{cT+#p_bD)5f2PRoDuVlQd> zW2ZgXM$17$=*XK!dwGzBDXPb?|q7epu~kufw<4~BF|Ccdt#el;7h zyG~Y$u~YR8hinTqy`5&qNg6d0k_KojNGc|HhLQhbbAo8@C#~ISlt?j)yD|Snf3341 zbJ3K@1@Fe&E}hgYpC8*sL5(xk%zlkx$3SqL&mQgAuq=JW_oo_}Xn}04z49M%gB7rQ@(k*7<+CfGu$xa z>r1Zu$$rHVP9-5J2|+6AK+FQE_OdP;M2Z;QGi=?+fzTV#H5t*tsP82eCe}eBlK4H z^kf*O2ar_bF-}##tTM0?wotSU5NsN$c{$bBFHKhKFZV8K@#HaJfGWDxf4X$y9;KqV zaRuN7_1vJu2S>e0g%o>toPW}ls44V@KEI<;kpyx?JR|LgP8@yDW( zfZCGX8TL02reqLp^Gb%P=tdYWQ*ry_5@m5!2p%(DWqy{OS`1On>FN)Pqbk~HGh?ot zo=P1UByH3-ZbuMa^}LWsSK5>!rdz8hX7jLg-1Ug!Gj4I znRmxAv(uOWpVB<$Az#9PhVrgX75@WnkN-rSp-lgSLZ+)l_#Sl+ERKpRJXNH7G!A}u zcPQ^{19r~b$WBH+NpSAAZzm6bKw0`7c2jI%((Hd+nZ@cQ$7?zHyupRj5JwSXb(P?q@nIN>{ zThln5FLV8Uzl`%$7eP2WSJiDNyfsk`apXeKn}p%&*tE9S0^Hb>Cakw-@A&>m&{NsY`JEll_=$`sB*Ghn0PqwnU*{2a&GLF7$O||&*DE7}?@t5r9J<9yC znOk*}c~~^xr2tYyaeAxlh(2+wn!c+JS>tiNJH2r6T+$>u`%F^>lL^{Gg>GRnj4$^T zPxy#ckIg8*DzdYL>R}LJP9C~TCRGwN^1!2o(mN>d4#-F(R7@wvlluEkUu{wwckeMf zoe2avJ?z!>SGLtG9~&i|V9rxi)Kl$+@W;SxCOTm@?%jgCfRN;nF84_3JGW|Z-;?Fy zuSEUQI?AZCShcxWyDRy=eXKHnMf42pw*Tacfug@_ptg2ocbq^UEy_s&W?sU4OzgmV z{71h|F1_2<6-2@t<(V|%X*4IxKUQ!bwP=d?pv=4(e-h`XXJZ{c#PUX4kAnCIYo0?AlTY>1v*JW_F-j%6l7zHVHnO) 
z+ittS+QY;DCPw}B%!Z$}A;b$qj;Lrp&KcC?StH2EjJa&ex`YR@xTtZHykO$3x{oEz z3ZhqlOr7A(@TI=m_OH$xc-ZmoW@J1xz#MjS9{Xtd1VQXiz?i`4&X`X24tPaLGWpka z*+&pc&Eo>#R~hRXjT_GDhIKw1TKmekp{8&_LiOV*)aR6Z6g4fu0_OTJqE=QCFG<{# zrwDaFR^&i}oS{_i4HnqF+zGDFHl^K?lw5K^ec66J&*V=!2?0891*q-GAZ>`0xIkygyrpxWpD6&vz3!5$j!vEF{-Ln=Y?hP3!P$ zQbkrN+#3S-By6+_?ot;U=EFZyg;!O*wPWH+!Ww+MAAeg*7@Xp?{zQpXz7050<8*Vh zCJf&s$6OPh8~v5X8!F1CT(sy}G-?=J+OQ~L zF2s7ItpZm3L>{gU8oouFAFUo=ky%6PdIQ!wyx@Y>V33LSG6yYpt zBf3BFn<0;|aXDu={1{im#aE5p?f0aTy#Ah^q1zzSC%2zl1_|gju|gMyYdGVCa=Dz$ z=Y;Q7R9FEFw-V{q&o;#tS8Z9-$+u#Vz9xWwm)uR2cb?`wgKVVFK+@~qh(4E0s9*Ua ziB>58_~Y*cO)Wg*s8$IODT_wrC^Kqg?7P}gq7LshnrzpjEnT*h+#`C$?P4j0~Kk~%2aV!*vFY;Bd*!Yd(Dw@C@1p{!;N+dhnq8Y>*# zln#ejMh81_7#MOE%I9?@?c+bFS}QB-#<4YUyb9_cy67+&aU4zD+@}50eY63&J73xK z%r$GRnhS!`c>2P{8F;8HxLuf$iaY6EZrE-K9*>v3XYzK>pHw|rl{-}8&~J}}%CgsM zE?0#Fc|7=m`9?qFrtOsbRUAAwli!&j9o4XjD6eH;#i-_tr@n>5PxAaYr;vGHQk!H# znNfFsVs1{P|5Jd(3F%Si&Pp?nu`UH7&j_Nk^=_1@xJIi)9w3(~x?5q>QjH05j7o*= zYx$5xf&jwW7C?z*eUS>@)3VkUc;&Q)A->>pAmaWTVleQiCJ=6y_1A`XB)MY0BV;@` z!;>Ut;F>nyc%i57S>`0M07!7I0tY^vn!^#i$F;mNej}H5F^!}GfJDOgG)Vh2WIIH3 z9Fq>O_XF{v3h~>w5h+Q*ghjE1W3H*6OFW1-aV;7zm&)PUFuFh=$X~DDkt^dj8lo8q zvvv?0f>q7HPb5R4SKfTQ+K8 z6Q1}PO8??Jta=of4a@jBvw(D*RTeg|z38Ba%phST9Rc^Zu^bTw6_F%FXp`O zHYd5vd3Pl7SL_JA&0YJN0DN@qMT^fp^RJkZJ04(f6wS((A%P&!A%|xJ7Y({HF=gQb zbE#D_#gDpQDs{f>%uHsdRK~+u!%;0DobL_IiQ`f$(4aI?G-^pSjD7lao>pjESzX!l zJwR?ne-Jx`pYk`O8wWF*+p6Ib@MuuP9;#!586OcbPfGqQIh59Q&$=htNKH%QEmDk` zU_Ug<75U5p`vQD@B(rLbw4r)cv`oJ|Y^|&-TZBo5+NMtYg*B&xXThsmtr zBcB*pUlM0;=5|5J(CcOH+~eBy=K!7>Qkq8iHx(Z$vL)*;7FsT36&Zz7QZ~7SI99b+ z#)DzZhSF@)$I!Qy`gTzd0<=NI*<1Z@$g}6k3?o`~A2;9;AZ?t}%+0AFcjd+^-s8A9`S57@_#SZj!;_bYi-{G}hcSB@4vRyKdNtMdbiRsKi`WI5ESs)6FB3HA zxvT|k3iYNU4;}ct3vM|3gLI*b{#oD@GRb_!m6s_==%E%oV1cx#Z#j$WOdfO3`mJmD zZ9ApHlNkm zk83LyNJsucXwEYZq~_I9zfYHZ z?1R|_;mS&XeKuK!j^5*FxL{Uj-|`J0i`X?%UbOSKU))Z zZRlnCX2*N(JkZrr2dl#&j3CgRmARzN$)ctmJ~#EQ;KU>BmDw?x|KoHxVJvgm^A?9c zLsrdKhTdbf1Zxug(yrHz@Gm*icXMLGjm)FG9vzltY)gpaCb~8oaAjtW`MGhcOJ_hh 
zbX(!**5u>`)4a(1rLyr5mO6ZBxuQKr@A{8A1FP6+fmdjtOo)6#o);3yIqMPA(&?!i z5Mo0trRypK)cK5=Q`-)ADX*4~QBSLK)Q(miN|*my@P}^ob)geZ$io^vY9`qg*#&kd z|Es&VjB6_X|Gy0+LPU z`7bK{CM&5Lwh!DVZjou++>xywmqrY@1PPhUhOQDUh@=Zk&k-1Mp;Z&Kw7Y~2v(=cY zl^>`zw=QFY_V~F?4!Z{zQd-w-T%(7X+`+}yTF^Fvmg9vTcNgsfMWpMUsR$K-j+xcQnDtv7k zhj#j5>eVi1|LUpUjm%cxr!o#5>c1G&ezeel$**99t$zJJV;blc;yZXEd)c&KM5FT- zVQqTRCE(Jh?nyfIE0Wh`LR7aryJ@MLM(0b46(q6xbP84>hZXBMErJm{g;=JY&zv%I zc?MCw_AzaxE3vN=NfexRt3w$#pHGgDH~>)4oVDS^s8e1D{BD+RI=zYIKlh5{P^?ZZ zBG=N%yBs_~A5vqZQsHs9X9QWPyp5viN;Ryls`}B$KGdH4E^fr?3VfYg_EnC2M{-fY|mK8+|pF~ z(1g;lJ^JWVXIpQ%;Ln(NHX-#I+Y6g$K}-DMuD*(PHs9TRc8k>fD2IdVOl@wUjA3V4 z68oXl2Q@~CyB|(|?x_9M28c@{RTJyTLKo5L4XQIBzd9Slf-u!2K*CsYwSGP+ ztpxY+H>+|8%J{s4QvSgyESLo*tr+zMi}`1_8cJSV?AG4q3Ao;P9*ul>yi% zjwN(?VUHPUU8^n8vGjvJV{yT@0DWO+Z!c~%H59%P^Y&p8T(+b z_R1WLoa&Fu3_uQP5=@>yvL<=^-AQA8B`|nT5P`Tuwz)$nN#2tqFAcrQdy|=7aaB%* zkR-nBBn$#ifet}jzx0>6PLngZwM3}5MRcjB64U!~U*x@2z`RpmB zqi-|wyi-DDD9V17eUKU6&NB}G2fv_WBUBDl)(r%%O+ImR!HW6FRlLQ~=?vjQ#%aD* zI8j2yOL{ULj@jU`oZ&pv^nb`dtb{_fM^l?oSxxn`{?*g3U)TPr9LQ-k$@-dRqh@P0 z^KTj>xmGk&w#4JT`Ii~?n)LAS-KvE?!2M|&SR1FI-Q^Ouci$)&3W}8V?V~JbdkA~Z zdk|>%BcI>6M1fyq`!c>mv<%K}VBp*@zw=|`LLz85My};D&el3S!U{sA5tw#D&m`S& z5$m3OZc7Mv_c>og6Y4Eac8x}}eaRQNohGv~K^CU>RWsG8VfqX;BQ6b@A(x=e9BS42zHYJmH*Ik0QaJU?k6}SHN4dVUOt0 z2~U3+KT*I5Ie&-`BNowuJ|P8Fd`agYAnIlf8T;du=v&?(%o!IRaqDLKlJlUC$@}F9 zvwTDPJ!I#Nq}cht>csl0$>fXKz3P>-@d_?|W+wn-=PvgQX(Rd|+!fLu!%L4}zqRcu z=g>l7-FEvXtQW1}%k;)AWtXw*)!TXc8K zZfowX1bY-ylt(**)vLWOz1V_jxD{JSCzo-S{T`%0t>Dhd5~Q>Y2r(+MHp!pF&&k0B z`D3+WvL}1({sHEu&F3F%Yow|m-B_LKnU6SsZr;Ii2dXs$-S1*VvqKyj`+m#`oxSC% zwrC+2zbbgNI6NinP4xm!&nF_|m;blr@_>l%FPTALU2%g$!VpSw^dC=zp12>;evEjD zJjV!Q_s>?`8FqLl&R3Y4i6DuJh%cdO3LubtM|y?YY8Y7WFe#h_vkV?*V}s8EfB5)W z3P>(a%8v`|j`mAgpUmDfeF5^_MQT_DUuIAQw;~{?_z7{#(ls5%VO80LGRR3)8Pc1N z+s@Wzn?vP}2_4F;O~AWPhdBt(7^DL_7&cwYf|dR)O1jM^r(bcEVqNXO6q&HzN3Jx) z(Na+MGG^|NH+`W`7ed)A>P5UjOR~GrO)ux8-nbrGK-vXM|oeT|CkU-rM6{g9IpOYDSc{CvluvOi{aLh+GqNjp>qsw28K 
zf-hN^l=xNdW&WkO{n)eKzI@0+%&XpG0*MIG@Otq8q_=xZ*Ci@oV|nE>wC09M%PloJ zkg^G;8%D0orMKKrY}6v!1J5{HH(ZudN*lIx>tS3fw26!WzdXn@LOlR z$vkp*Om_?$W3-wAUeKNh)d>L&C>pKtjwdU}{PL_-cc!PMyp!12qGUDgJA&bS%^8KS zdZW%rXPqZB5&HU2RE@^!*X)ufD$bTBe1eR`$Z@h;%K`ldXK;`XZ$C?z!=W zQ9=qPeb_VVPren0H!tf8B(bcrN2?P8>HqA)jlaNBu)$epU}z7%0;lIXf}I_Psg;h! zOcu}m+=m~AT;xq)A(u^K0eplG3ySOQ5F4BWo`VFXuSp9%(g>}_(Toi0DQU)iH3i`U z+5zg(S_bv7GE%AYFx2p|?lGJ}24l(YH{J2+vIJ#d!3S9zUivfXzYxWBR(M%I2HCy2 zDZ?aclvUrO+N-|~<&dKm2KQFw)dvjznUx~^+zvoNd-=EZ=-dW8=sZjp*|3l6xEHru z8KloUx(kz5q8Tnbwz-dX=lEKu*KvsflQmrXXl={|G-w`l7$IGi--cKS($#`&GRM608>?ob1qCiN**F^ZTjA}8b!fG*6`K3v%Xirb}^9My8 zYK(LEvqyHac*2O8Whl!Lm&vTP<8yaNDyER@JqFYnqAYl{ieF-kB_kmA z$%0{QH$+~D9OUNS46$6nAGksolNgJV^7P)^Ju^m2pE!z?`huu1FEVndk41#oM9h6~aydy628YpeKA zUO5$ZLSNqSNMGm4?8a{!YkPNXdy`*3*t;R9s-aVE9c}xv08Y10(P_ncZE;TyMRp&4 z=|5^N*7RLawGmsqmw%ieusQ{(sxI!3x*G12$Lg^~p2A&n7btMdUXM^gi}dycn@?*5 zWjwbX1A0i<=A<;K2WbT6A9yxWDYZmF11yEQGXKI}5WOTs{EOtx>sJog5}|~(gA>9( zzwwXJ6jhqt0f!OX^f^=c!rBF+7uc*vs$x3#BpwpJ9vl-qa`}RMU@^)Rx0=fC*~0;P z0AsOMP*VFu;Zv{XmqI|#%L?uHpvd?c%@6(v4f#>bdVH*iDbai9JHgNrA@5)PBT`HXnKr6AG+dUJ?-}OB7 zYgFD%y^Zo(Zdcrp9%f5|f|M(q=|@Jn-C$1cD#Ld@7QdItcH!2KLL=F311ll?J|e3k z-i(}$TEr&q7ME2SBJEp3TY>83&bMx_$frGW%SY`_q_P7jJxQ_qMj;bZL>h!eMr#qIe`*IL+s__W`pe%&%&<#aanb4`Q$oPzgbyER%{NifHc zXZ%ZlHNb3(xHiuOl|M)Xs@x26&v$$e?bWH@H(kR_cUtG{BRfBJ$4zEM8J-`8nE{Qj zz5A}`<_pl7AvJ2s0jDT~CY3lJ6cKmIc;j+_tok!J8(apCk5~<$)+28ys`#|W3?pWN zvru;4m@zXY3vHlLy_hMa11W}U{fgPlNE5;063&ZZKPv9Q?;hQpZ2uL*&vp*$nNP0u}T7X>?7w@7$_u3gMdp2e_f1x7a z{&&wT2yb$3gZ>q%#DkdMwDn0B69k~+dM@YnXl5g&%Ofs;?q!9g`j)7}@i!+SPlR~+ z!B1H^nY?|e7j@WM?zTrgTKcupAPsxC$v&ygcs~`a8ICq>35m&5&XF`Sg2pR#xOxJ5 zj768zo05`K+k9s`UwoY28j`}1q_f_)gO^sdAa2GNoLu~Jhnt_j2Bb+lJ&=gG18P}b z(A`vEC)KMob>rD{Wim-15I&DmN!*f2&UlJR9M0aH!Ohl@0$IX_qF&p^+@G6RXIxj+ zXpoa^a#(q9;6Ev3D+CHS|VN z&F8m43nq-|_uti&B--Flop~!^H9z~?3u{bY6gie3^`no<)?zW5F zClK)8w?4U;S&@#CFpZ9Z>RDb1Z>Zex7AaU__#YGN;k(^gcy^z$C62mSMdT5scx(Xf zK`_!0>R$NK#Q=VXU^Hb4TfEm(>qnr<1~3*t}@N{+}d>;zui0$gZk6_ 
z&boCI?;yUHKbqgu3H9=)>Bd?tP~&zBX-11qT6v+RT3V$!^8I4tm<03Y#@0=TD`)X9 zQAhVrJcQ|0>=I>sKU~0X@5kr!S7hH67@__>{4Ty|$71DF$WMBNO=~9}pZg%%BqBmXEQdR!4aH5>pfL}!g=|snryu7CuXE5}fMMJ10{16Fkj^Ia7*Yjx#`Fg% z?H9xNc~ygYitI+sKy|@Onl|@OnteTh)X|BV-_X@y(|E_Fe>ES?S(a;1pcY1YQ*yJ_U-)mqM|e+zrQ%T zVv^{Oj3by}RB56qOFz|{zY{rSVbTjdk#`h7^w?#8`d^5d>M`LRiwkO^sCubnyl&XD z=Q=94z6rC|uy!4{pDZ_#Q}}PLM6?*)BW)6RT+ziwxJ}sfl;XW01M2e`~;!`#icKRkRS7%it<}6=i z0XMGKaCNFQ?a}dmP=yIQ^jLn1_z}I%;Lw2(`-2?dKbu?ZvbqLS|J6Ct7I+U7u8UuK z@~19>H#o5pE?Eu?UY}AP`1KCv>ZksLt*@_&y6vpJ5Nc0LEl^^rxfycgwC^|<|4zn~ z*<-K$urKZoi^tnBt+nIPk7*whFps}aH5;v3{K^M=xKV~aNgS0+U16FI|DGbdXAfmd zkTA!zvL1bUdFz(ELFB&&_T}$&3iKHZ_BI%%UZY>xlb(Q7OW@3N-%!Rf45tNMv5vW$4+FKyzHs7&2>-P6) zo#1#w$}p`dZ|yMNGx9ftIp!3q-~cC={StDs#g4Xy;xFfo5!1nhYWQDz2MH4%vqKe> zRuZ7~pVA0I`}jXM%4l$l8rN&MO>E4*<`LbFw~=3j=jc*SH+D@2N55(q<-2$ALVn+* zg0%;OD5Z9V_x78v+3%GrC)(rGAsjBVR#;Eo20I(*&g1s^P_0w+^)1jI%RR9U?dX!k%jjR^r_z%81 zR5vkA66b-QTxkFt%|Ip!`cV1mJyFZ;11N6ohFO)@QL-E$QK?`Nks84me09<7AGr4) z^t(tDXmP>1$TE>Zwr!Rb4d-=eL+K8mk_`xCkKr@J5T^}3ZceQGF<-y z$7&M-qt!-(x zoq%DxV|gR{#kUR~pi;hVGZ*W?b=!{= z6~xB87FjL|FCrN^kw~nYF1r01fZ!pQ0gCpG*DR@~W|aYaPQ<0_8)2>)^(?2_C6V-Xj-HhPq!MXGowk2w7JAvtdaUTrX*-^fKP(YER>rs6BR3GgG_G7ybg3vB>;JDYakfXQaV{n?@;5(F$x+| z@!@;GG}6_m*C6gcSo=|_!@ap|V06~%NRGcVPOAu^yL?hn$S`MOi}EcQ#lDNgbG>n^ zaRN<&Gg!gW-JE)C{+7h7wc^+F)u0=jqcX*lUUDL%kDq+(dhTz`?GN5ugP@AZX^&DM zPq?{+=7_d#h2+k;DJEI}fFdq_-4F0KO+kC|aa(v^rE0>{cNlMIHio<((EEm;mKo>` z&WFWy0D5d008ut{ptg=(_w_a{YmaDEJ3JBZcK={ z65SXfM!Ef1!OB^B+>yXRyu%@zU5sfEK#jLtUl*gjVGG@KJjIixUYohph(_(Caj35$ z0IWLeB|88K=e;!|;yI*-r~7@2PO3L(fymc z7(BTq3gW80WaWo((fxoL`AQhD>DtQ2qgn^ZO$^+0w0vXsi}_#lm3doGhy)pEHBn^I zncPzf`;v>PMa+$D^Zfq$8o8jU#O{uCTCp_r5ge~M^iOJ9vD++(A&ZEOMw{8&%)LE& z@%Z%Q3q?CvR|mdPTGb{tYl>Vr60`FO@I&aYQ8@?%MMAPX`o-J#lF$jD_*7qrrF9Ec zRy#4#+G?Ys7-;i?9qR>l(y+ezrOT=Nu2O!V0WA7_2Ei4DgBzluD4-BTe}nDgdg;(ln%GRP%CQE+Q{?2I>0hi&WL0k)eR` zoG0N1tUXU4Tg8SSwoG%M9Zg!EK7pU_c01z+SD)d!ff6y2uhqAsPL{fph{GaGdeJCk 
zf8`bKZ`>nn7~Ek?*(G}}jfzw-=aH?SG;5b;6`n}U$hC3AqC+{t-8ii8~Df#7htcNX58Lze*ht zRo5t=#J%~b0vK`8KLaw_7HPL&2)HIdh*9ZjCS|A+k!KYuUK^~#34BER9^c7OmG-lN zjexXyEyxW3zRcz8fRxUcB!Es)_6}P5q}W@&74V05*COS(N$(Ri)FARJnUAOE`cpad z+{7Je^>5vD#M8TlD}eyP^Ly2L`g}9iacR+rFEs<`WZ80l2m>_8y+(PuU;afH_*uef zm^1yN2WZ6lKpWVq{usrM+vV@dvc7I0FRCGJMCT4vE60{?&*N4^$e~=YK`+F8Y=ra* z#I5CvLIhh)OEVlYv_ICv7x-VF5MYlE@IB8~cafENgIrL&`KHV!i-Pyu`NEMy&So{y zapCT(6VIcZ1O0XI6Ng^uywzH@jsPos-D^A zbQHYf+qp#+7u^Is7Tt6^?_uA|X|ts<8hZaa|2y~W(V4<y~9H)Z}Kb z(kAyotEoeLkd&YTT&KYCEEev z{|yJWA5pHKwYv$`*e7KdPs(kQmqV?etL#wUpIwfQVdv1{^a{H}0yE^6?YFq&>ED-} zGynMSeP#5};ufoEQ4IHCMQUfFiO6U~P}wjX@@z(fZJvK5xLTBt(HwRdWU2hrFIsOv zUMNS}k3&jqP^U3SO*3{sMmJhQr+DZ{CszX_ORFvPt*6B8uc6|n^diu+G2c&ymw9Ch%9_}uKPb~Ht2Vzz7j#OceFYliSv~1eG#00#*4}nP5mAvi$n0eJnQT=C>PCu zAEghnb!*A*um@EEohnt7k$*~^Tg5wd39KJ7KjQbjUn@hK~m zZ^Pdftkz1&MolGU$bIJa)0p>PEm;MKA51R~K7hRy_y`ULgT|kr-C*Oi zpzz#BFP?89bsyoVThs0flNw~&O4|S1+W+(*ZfS92qZ4GxtbXISK?=q26$|;#K-u)k z3wPwzXaKx}2Ka;(l;wF>Oy3#clU&dpAM?J~; zo)+|;PTC$XDJ3J5zbKsEsDkibte53Mr+eLrtH<%!?(N@<*1m}nyNZF9F%+L114PSz zy&X$&y;0fH^PJvqxe@Zr)jTiYwaV<@mmvgz{2XeaM5@0*Q9EnP4HNVGpc$swXyMoK z@Ke`VHkoyLf6K*92JA~CTPs9+YATr_(SC2Dc)$vfuj3Hm?yJbIt_TVNipJOP5g+fd zTmqB=99C~iv{HD%R=5=Z2yf`MM!iLa{5t_}QI`E|NY#mYFKCMD zXb;f@z0@MlBXy5vj=jbi0SS$$X5zT_)Iz2=#LIf*i+<-4Nkz2efuE=*+fx^LH|!TS z{!T->+sDndr9Fn7uCTcZW)#QWn`IWLH!UM?k=)^W6de*jx z0^=*p-H-i5|9cm^MA$KGOSs(u9{%#pF^YLBROm}9O}?G7;&09BjEz&E{I2wKOAGG~ zP*826`?*mq!Zj1L>oe0mk^A+r%TX!UL80tT)a`Hvgq}`vu#TZ+sO5r0c~4E+d<$P{ z-7}w_$Fzq0C%(o$D#ST?p14V{?BNF_7s$A-RB>-s4z#y@p{}MX)j#vsAbg&6F8f{a(*#fms}aG11`}YajC3z4oPME2e&$cu-P@Q)tNH!-mCq;afoXpI;AL=G2Yk2*MWhsoipMz2 zOD1tchrWf_gKEc^?^Y=D_FO14tm%CtiDaO2v?@_qoMrcQBRX|>wmYr-w`t}4KSPMF z7cWcEzz(oedt}@xFQB@g=*%A5cFGL#HrKckgadWFY=13a^h`Jz)twJXbo$=(%F=Nr?$9z0qj6 zAUNl%IUou<&F8dKl@Ivyx#GL)rOKFqg#lu)x#%Y~?X|C)wqe|O?ILpfBI@Ob&8_{y z5ymgk0sA#i_{KFf$;r~0lB5TyVun-_>cYd{fPXYNdH38}>674i0Hd^sy zDL1b!X7$lPEW4`CrH!~xO4$%S(fnFxVi?`Urt|vw=YW7dD82NUPBzK(Kj}gV&cRVF 
zxq_?W3{O_dhJtPFBZ-l|wb3k#%5T@Z`(nbsHD=9$%PSv5CZ^O}gaIGw0H7c@_ZpAb z+Dhw{RCdkr@jMC`F#$qzF7);_KzZuQjy$Dl`m>q?^|K}tM89p4_Fh?C@qem2_UQF0 zCmXWD~D}|491aZ6I~sJ^zbhSRfjL* zf=<#_bl$?Pn_1#LnSm`KMS3{T+1}U39MWupUE^}sYFrxDi>BJTzT7zjTjk&4fwcL{ z16^Tck&Q*ph$hGgu|}jrs`>xe7RhB*l`CO&3?Fu5ILG@QQZaNJ@%`D@v#6?U;bIl; z%p@Yu9Eoa36^m7M+Xj5|v|eG!H+dd8iZ;he_TSMfV-EjRNi ziNb?}U5n}Lds#1}CV-%gE0Y7mMVPI$1_5kf`&9S>FL=&?DT6LO-o$*b~X$?LG#oAt_0A+buQY!IyOV zKtiCV?PBLw`U8G_NpQ(Ta& zErce$k6e+&*+#}}Q(0T^`KlJ;z@}NMvZ4E`l4$%F>dt&R{}q`aoe|AiQ0K&r-QTN9 z=4Rn(KRv|<=hwi4clwE7eJbU^ruf`74t4gv!3T+|7e-NniUJ;ViPyJjCoh7FN)T7t zaUdgFp&i5TNP6?MEX!s}cQ)`9|J#@JTpT)YW^uI2CN{~)a`vI*lttur%;X04P{BCO z2k?9-nDB4iulT%mw28|+6UH=E_LdoemJmRG*$0=okqBuwnVYbSA#(FZoZiCCEcFB# z!i*Va7eKa#+4U4&Ib}IErlk9|Fk)^Np?2UjIS+nIT9y9;9(=q-WC)063{|NbEX!Qb zyDRbZbMKrCNriDik7EghLG|9mPJs11&NkkUiJkB9yple*S_|hQ<0@u2X-)4Mp!L=`SV07Ge0fEz9puQy>|~&) zGxY40*CN+gOs|!T*nzx}_(;s}^yg|x%i<%w-m39TWe<~x&Tn&^d#l)Voc6c_LgH!s zI);8*g~Gp)yqD9m99vU=czyak;*b@h3;ivOW@3qg9G|A0K@`!P8b7yYZ@S z{g4})CH{oYJfOZ3S=Ym#tY&NLnCwMj zcHE5R3U&uJf!Ve<2`8bFi6OqU&HB|e1+|E-Nfu8|YJApL(|J5k__lH-X2M943wdQH z=jgusd9J%K%b`i=utemaBd(n?VBa;4p*06h#vq0uGQO8BQp&S~7NA}B$px1e*G>A5 zNazu+>^0ETddt@G$%oG)mx}D2WfeSH!FTQW5@3%SUO+oISu zvAR;`=T#Wbu8$Mmfnp@<&aU!Z7wdpm_BK3i>eHy_C5>4PbWd;dj&a`ANCHE8PAdkI zNe%97K*czH>1X?t#{Ohho$no-yVw2J>vIJ=A_OM^aKr zQtAML(d*7ZB5SwiKJ)%ia5)UfBoV)8ZmOHPpKDiXiusAfthGbOe*pIZqXj~7qkR{8 z=bFo7zljHPB?TaxD+%c8+ z@dhv;Vm>M{zNTCfDq0z60}48;J;8=PS)B@_vNMq8g4-+P`pk5vD#Bdg!D)sl&pFgszoGUv;A&68Jwv+HIZ ziRc%3D<5*^?-eO=cv`;SkrrZ`v1-73;2* z)23Fbe^LU<%USN4KYl*Yqkgf9et|Cg9Sa7s;yuqX&m3x&YH~mza@+0Z((#a%Z-N7- z%y<^ofb3KJtxEDBTK6Ho|2RdpJl5OIxw6?>zQSb*p1lsWPEeC_{i@vQ9rI*hKz(oX zD(4KTyq$~G*|uc-WBk=&+Z~;Bo+uGy_GLA?dDf$PAWiQs;O$;69C^u1KB@m9buMVY z(~y6unmFX?b6_Rtr}g~He~&dQ6X4~4wU^8@a!z}1zVaE`yY={hw_YY{Za#*oyzDhW|}HG?oCi=*)qaAr6s$m83Tc{04FNJpZzq zpU)c3+2Cfwy#waZEGwy}rvo;WKaN*SQHU)VX~pYBsT10YIJ4kEBJG*0Pv_moF7s~0 z95UN7?9(RgC^5B|#xV&5lxhxN0{xYHQ~jS(UvU4H4?Rrf_GP;r)0cSF3V`Bq{4DfC 
z{R#~Dpq4X)z5nV?4rwaxB+Q3*2h_zNZMfm^CU$orvgQN}0uT`Z50Ng@jgI+r{nUr7 zgUA8ZUk(2H7%^M9IO3j}JNhkwqsLKVgl_gs*F!mO#HMnvs#kr=W;Mknxz!C(feYf7 zJHz9QvSnCrBpE46J}tbbYV*J5>!*6T$+WhcRBOlgHuodq9Tr)A=%wlMo(_`UZB40% z!fs^j$*Ft)_j1Y2R;Ks7sV~sx;IZ4rOKuxrAPh24-)_9XYzh#0zi1>TVuG^X`psTA z%boGn`9+-8IcjmS)4sRwo;Ak9c~qNRBO#jgzXE)AnpC1j!2EwD*k`CSG71C+;Gy@} zrm=h)9oLf5Hcp zB*@}zh=|-w9Ivi<5MaGiYMgHUVo&?*O+2$VBY#*@s&Lt09j=VWyVdqMI88uXUC6Tz zXosoZEr@JP=u>s`Mnzf#>|mz7k0um_fAe=;&EEmYv9u|3ny!1fPY#|;T0+_6$VJ5D zbxnZV=UfP!8#mls)fq9(_PlFcLH^Cv2@%Egl;FP;Ry~Ag*J%(aa~HGu?_P)ZMMJ02UDmpqt%ndl|HtukE+d|omhXxVmVL8w(;Tnj^DdW~K=1YW z^%os0V31EkvY34MQGMOly67{FLO@78>4UF#<;hZVD(=4Sx_XI+cit+*7kwT}?Um$+ z9=dn>Mf5e$vmCr0(e?k)v%GEx02{FX<6PJ`?cTteeLgV2qXzxcu8L;*s#gfeuAgN# zqA`G~c`4f0b0L?y7^><%G{Y+vwYXe*bS9 maKMJ~|4%c6sm-vLx8=dUE4pw|R$z$p4OK-Ag)%v_PyY{0=?-cD literal 57433 zcmce;by!th_b#j=4Vw;OQ_?9Qwdt1bE=lR`R3tXtrF1D>(xr5FN_V$3g5N?PpU3z8 zJ=eL;`L64nKg6}!Yt1$17-QZe=C}z~kb8lOM1b_*!2?uD2~nj74<1)Oc<_h?;Sung z**KS6;J=5CN-u;TlnjEmfe(*Og=B>uJSdMu{%!yZd`7gB&~$w80MvH>?_rm1p7DbR ziwBaTLMm>0drk08gkOC=LWn;TK}{o955mR{_WIqX&WolR7L116TsGYHCwDKoo2G6J z;j8Mk=XY{!3gR8UO~DKL&yLJue&J|abUMCvejLyxWUNC@+-Xw>7( zMb|qhG_T%ycR{;D*t=2n;w}`Q%pYACfy_VZBXKX;U)SjWb|na*_;n|`@V{I=qJjSY z_TR1^tVjKP3-IILt}?)w_j3Tg&HR_EhjAi*eM|OlS8p67emzU(|1VdM6+no8E$iQ| z9{k%<|M}hjrA7BV_i70mz^Y)yM}Z@khrx5#*}=lQE&3(clMv*Agl7>n0z5hT%=jqc zdT>l9wd2%G*O~!8*SCh#QS||qvVc20wgtGs{Yh>2s#1FGmMPB9dB>=y6tEXJ<;Z%G z`lH9VE)ZPQuTX!fJG`%CRT2=_y;asE=2@#Fu4*F{i8oAoavf8`}B zd~vyb6zJ6=K|vr}AM0Rh<9{|njF7mF^$kP+do6ZsEw{%NjyumH zY4`;wx`tp)x%=Mtzfxx|=Vq)eoYeB-v2}j!-AI)D+DGDeLUH|!qi|r%_M4^kf`^+;6b%_SHqAFn?>Hc8NKmb4h#sLmE#$8eUAJ&pb9R*+Q7~D;Z3Aio7J5 z_8@fvl?^gW`S^RZ>k3sZ7gSz16DTuP`~csRZxH=qA_c}0*CwS*-a>pLc5V_lL+nNx?11& zu~frk(ht~GM74{thTfR*bT=Wa?3kZxYMdtmO5~IUF52`m<_!61rcYOUjn8B=D@S;D zjd_%O%H#t>PN#1qk$XwO`MQk5gy7nCrI-YLE~JS?&ldtUwX3Tz)?DIdf|bk>c!TsQ zC|ul&3&ZTLXtY0!{eOYdy|HHpwh(hEnZF5Ha(Qy#KWj3_wz?Xy(ae95&4WhNJY3;u zk<+DixF>hHyiqula91<)(IQ;qZZK`~ZFY#7w?K>IweZv|!?0rcgHS2%V+>@h!dz=t 
zw2FPfvX^_h3~(rRV>ol(1Rc|_OHJEM1{%kRvI4R`yv5O(YC(hNXVp@&eQz%u#YS63 z7RFLuFW?$iZ97;H1-DXO!JrDdnBVo)mb%jatX;Z2gW}h~NpZEiDn;S>IvoWzkb_#S z{i~-sT2qUbN{bYv|BWEmsP)-v*9O8akClbVNKobSV;-%E&~FA)`v(E0+~toy9(0LW zFV61}?_@+_{DcSpdnkw?7)Fx_qJfg^+uodJ=r|>XtFuM!Ngu;Km4JcA@>V*Dlm)gLnwjtbof_jrabe2-l(!i^#A ze5Z3OAPEAWboYwegw_NJw#PgCi*Rnb1}pD|ARoEQ$exNo+I4GZNfzus-DJa9D^R&< zjd-_xsrg(%3B~kPSd(k8mdri*XuHWAmScA#Em*vIfYXln9I#Lj7}PuJXdzZg z9V-+mq4tNxiC2JyrPbo{P17;;Nc~w;t(mDYQGJWL_-_4`*xGV8Riapo-*zj2>@Ax& zBkn@ox>i5Jug}_U^_MJt@u@57Bg+{lxe$qiq&fC&Lf2?v6DUw;9z4AJUN)S0z0Cjh zOqcIOQlj9srYNb8X8G@g$~`w(pt1_XocDiKY^`afo9UA>%-H)=4KH*#r~>?@E44XeMvsjZRAAry=qtXd8c z_li8QYTA}*qIHDqIl|j^7fYZ$_opKBR{n9?6BY%2JHyn~%UYG>CYfKLv@TJEPio%b zES@9!gzCUrSF1qkog%piM6|GVE5uS3K)vOKShPx_0%rW{$QSQ*A(fsY4t|@}tuA#F zN=mlE(oYrMjKY0WeU~e=>%UAL(`m>>!>$Wr8ITRA^4!Yj-(edZ760}X?l~P(^JnTu zHHXz#g({%SZ(lt=CE@tAykKZA@I5JIb}MP`bCD)%L_!dFVP1}i6#N{CJ8|?;Bl;u`UJZXOtnEh$pnRT!=u;%`pDPZQT-x}+PN6G;orDkr71cr5K7O(oEkBnDw? zcpFtn#OeG=uKKmThA*F?h`&Yv;9MtPP6}e=u7-L`-dZ`zIcF#t^7oKZR)`_5&RuQ^ z&R$Cse0(-Cw1FxbCq5Lsqi4op!M)vbaD>cnNTx>iFeZ3gY_Ri1Ku1rY-^9!+zm9>R zd55*V2Ln%OXMT*t_2sw=gz{?9LpSjiaU5W>d0ZjKvb5AuV7|XYnB_bc1<<_DvZ~+~ zd_q4zC2v^b91lfS`K8PJ^L(UN0RalB#^&w}kakR!tG?{ucW4)=XI#5w>qqbVD~@D7 zPR}Clzf&3yEavByYWHcr6J_QT24;V}I)kT^n4($9U#4bmzAt9AXMi~1_*THj5c z;e}LP3(PEbE}ki@!FrDT4+Fg_^Pdc+G z=Wh7{FAU|t7E0WB^1>{AqfxL{B0C>jPgi`Kcq;OH3S8FQNTyH+giya7!R3Th<-?|I z+vm0Ah^!2UmA17-zVuH?))MA!sKfWK&JPlt6WLJf$2Xbh?~ZCXozmMkd4x7j;9*Ku zYONYdM9xOIfL&LkH?^RmQO@(&=c|)ysG&X%rk86WApz|MT>JG(`CKv*#rs3$bVoQN}Na@!9iq=4i+&{ z`vA9&jsDAC-rw4Ue(O2%c#%0En=LRUgvVlQfhFsX2=2WbzsnBC!Bq(3lcIRv2~ zN6?Z|(`ehzx802U#=s%J9Ul;2Y^1(AIG9`;@gPEei^cl=DbJGmNPVJO#qiQGo4}ZYe`lf@|8^8Y7o! 
zWcPYDB6)ta=#B8pEeu$PWbHZorh$U(Z<6jSi6#6b71yNvF44@){STgWI+Q~l&pMbY z!#aqt(_gZ4KWckIOm&q4J`E5F_U#G^d-MlP;L|`?PO#WJOCykUEoya}r)*-sGh+uA z^y|@}+pL!t9F^!bdJl<$qG#itF86d^Ae$%@z0K~OmC`=x^Ymlob_hD1Ry5p3kAbs( z11r?u{))D7*614o!M9om1UB47K%-?04C$`*s}6-?I%#4MnW6Fmy)7LnwFUjhsBa%F zZRx+5POa5>3{Mbp)?u(*l;oq6bS9(|A$-u3-9=R)Gqn?8QP&TG=}Ztv&y{`f`Nb& z(jRp3FcYQw4kgXQ0qyVlGW+mK)Ogx4yrX766l!s)a$z!BvxSb)g;j#3aAbOVw~_d}XV0q%AU!DW(k#D=`JkA{)o!;+nfsP7`zf_$n3-dehLj*efM4^A zBul*tDN>x(w%dvo?pPM{)A40MjF&E6car>S+!lYt?nX3_$EJk-<|y`@k?XSDcobv= zIBA&v2(k;$!c6K{U%5WrUhQ5!|J7W5IC-nF3F+XNx| zj;px|-@sd+30LQF@M6EPZ(#K+mD=5kTNmjw^qupepCR#d}OE6u9o(d0#5TnOb^)5Gquc5b(hJ8`J>#!+?n79h)IZ3gK4hR~+Z(UE< zFXk&Qc*Ex0pwoQRLy#F2NeKnw2b%_K7q6vsw2PjEbafuD%vrQFg@FX{PEqS|z|b=W zMQGeL)W0frjss~N&hy`0Q&nVt&lLxUEYkJpvq?AdJeoNI>&!Hx7#@N*Bl_>^@? zz0d+7A8yCK2#^)B(vUdJ`UEa^I{h=`BY^mye?e{cw6q5u6F2-&2rxMD$sDsC!?8YT zw}011_IPL?O}zlT;uwf^fiPrB6{?9uP7Td#U>hQ%Ks8LzV5tO$WJt7%o*zTWF6KsD zwQmi{NTfj>zOP*tcJ68=k|tm*M-V8_K3Mf+b`Uj^)s!^l`1-6y1*d+Q=AeWduvlzL zqZMnlW6%4LHMNS2G&Yvan zWbJI4Lduy38pt<%%C@W!82gbOFCPBJ9UzfgGcd}jZq*#ga-|&F{viQaNNi;3Vm#h`_}WWrJIJ*>0D$jf`=YD zkjH6)_|@M!3}EAoq@uyv`2&9aRxH8}X5ah=Ig&bR;0+XjJ>5fc<>-8nJZp*a4wHB_cQ1g3H1|tu3+5Iq~t!R3Pjj-y&cpwz4ml^vZO~D`KkF;OC?0 zD!*IH_^y+E)u26ZMh~W43;gsnng*ohVD43^r1JXKL>AhU{K_p88&H@~(`4{!==YDr zYC2{I(%bLFS%?9@cfm!NpbSn|wI9on1j~ImdK?^#wjKebea$CZWZ4hQ<->-LmaPI|S`a znWrePlBYf$@bnCT^J0atal=raQLQ}?h^y#L01JDP3c#$FZ8bl9m(x=qwXfkJr@*gb zw+*S{vICt*d?UNEH++!Ht!U)W9g>=^Zjum3o)j3JAX3-D$HU)BH%J0LhSU8_E#2`j zfwV{BXvCKTfq5sUd^%?NSGDk0dVA`C;Qo9Xri`Kd8#;@<{O-ypIuqN^HX=vH4ha1v z`u4@!ec4EP7;6@10VOjfKCVZaaUww3MXC^%4q^6cEdwmYoGalrV70Vb6d5N%>Rc$C zgdy=Acv#bVyskYFl3Nt`46422SCrxQT)sNpmNh%I99`g9jmG)hh`hi;#OQ|}Dbq8e zd>P(YNUj7E$o$ii6E%a!QVMzvF|Tnbm99z_w~j%c3Z(rVVL9%APt$cR0c{}U@l_b-ys)CnSQW>4GVn`mGfw^p;NTN`d)5uw*~6acl37N#S6 z{S<5FnsWrd45~gdW2^hR3%^X@z_Pw_B+Zgo(A#V&73*PNJJx8CkI!^+9Rx!}`-FMl zfgdq^ru!rq)xN0$ohgmOnb}bYAgi-bieo>UFi-KdJJnDi{PH%1C(+Exf zCg++#X)kC`6Fs4zwt?#3$Fpx3t-sN2nkOeQY<<>tw0fKOHp0hQrEaB>0Li05oCdR6 
z%k6{XwLP%ezBkrO?Mqm3oH%TuiqD`Nm~4)d9N-NysdxcJypaOEa+Uk6NJ3%jtx+#| zd=nQMR&unW@85OX2!f`F2Mod~sfO3b=g-gbFD!g7oxJP{Tvx~BmT+ID%#xO0Ka8h? za`jUgE>%A4#+`i%HSdb^8>GqDJ6kHz-3pkUV7)QEx$1;E7KJ(p z35pIW#JGp74?e$TF9t#!*}$EUI5%il8egVqRQXmK&8DYWQ_C zsH>|$3I_~NK#-3TMVC)#FY10Lln=_qVmHZGVIK&CnizeLJ>B^_XE0gfJ<7X z`WAO*9qMx#CY&ckf5e8qJSVeAyFk5YF7+AD$L2W;$)YW^gYD1!3;esrH)%X?-6#^M z9`?Nr@gh#wzts;QGig1xv{tK~94BvOnhC#E3r^B)`6j#Z^`pOGKV-r_YbHqpBUDHN zH*Fi+k(e%{Xp@*15**{@ta5clCMHGjxcAK?tHvyd*4NqR8m1YKz3>YHWl4RM24OI| z#(@f6WQyX6CN+=TIKDE~=`(BQHqUYw#bCOD$GcHSJjbc68A~Oh(khG|>b>3XxVULv z--{EpzTDd$mF(aL-x?@ux2N!+@<)*}k>xXs5X%7J1okt7<DY=d^=50q0Yr zGe)U~;wyk(H$Gak`r#n?6J2Gp3I*F>2?)<2dK+;_^z8U4Uff02_@bbGq(x7*aM38 zOs}VR5Xqvr#(z|?nfX zO-aRT&22D0qJk=7O1J{??~IzlxPP13BQX_g0pexqx_eGIPaBDZ#VYgX^P!yR`CFCp zIBB%jsa_fTybY+5)qkA1gfUuta{X~jY_C5A94Oe*76s#Ehd#rPUeNU_vb%2eZCS9P ztb_UEjaK<;JU~_*>QH+vte)EWPJ^W$z%0q^G=4R!+M*|gtg$HnF+At`Mo~f5SXy_$8pYq*jfyH4Gda<-I-oCko@%@E{5(CyjxGAE zb{Q&<9yM!YS)b3)lk|UleiVce(`H9OfFSPmknB;WnLo^YB*rb8C`Bu|Q1=-8Uu%nI zg!udWw<>Ny$FM98T{xY%4yrUt$CIQFFK`z*z)dC3>WgMw75nf9xSVotrk(j_et6X# zYo*pTPRvX%D-iNB$xHC+If)6U!Tlxvpyh4@HiA&mY~e)}n$fXMTUDOJ8jYcN!kRR- zQr>ZXTi4lf#sQKd^UIEy8TCyyz8mK&f!EIIEbouMpN33rs{U{=pI>thhn>~J-HR{f zoooJ7Sy(iuLHnu7Tt_^4k))jkJA#}&I?C(UAzoCNn<2zs`U@gannLNs;=(?^b0Pe} z@i$uw3(tUML1j`i4Q(#3>~qD^YHjgJ|Q*T2SdMTDNy4le9Yyl*@bS(a25 zmT3NUFe)K#%mh6OP~fttye^%3=bE;EZKeE;-mySQ#*f8kHCHn}-O6l6si;}Udi3JT z%a<7^I_mpxCk|AvJ3u0;k(;MPZI!m#dP$1M{SO-S*MtNy1oWodwn9qs#|&KI^P&E0 zf1Lmui&8n@2X{95mUV8;yI1daAr~k+y^Vf-FRk9TeN}6YM9w%aL;y!0%~l#?sp%b>ZNG1?9Waj$BPdJNOZeF|8kxZ&!Ffq$@Ci8WzQH? 
z8tck7;K5<1<{^ojDn8A1S4R^zP-;;T3ghp;W3QZOz*uKY(dKL_8R%3Rd6PwA_`{D| z`$I0&Il+xtWAD87geixhpEMz-BkK@%P`y8z^-5gKnbP^5=Ru8lZkG z(kJyfa=rr0b(7x1&wyW7s+-I2uk`^n9A*(HkJFLVMULv(kk0_T)RSO0cbE0OL*saB zj>u>IS7?Y&Ui=P}X&q=#gjdnwGOx^ZS3{p)3DRc2{BGuRHu}`$sI z{S?}wP^r4F;~;q4it?U}oIUMo=g!ixP;PFRkfE8z6BwBK_rN_4LCM~w0$#4&eT;(^w1tY zi4048rE0^UyR&oVV2CX2K=RjM!f|b8Y`6&GAxzs;C3*5A58#Z3FP)5(x4q>&CAztO z8*tRxFkVpqEaQ(BHWxxx)(~JYCV@PZqY))xhrgOBSb?GRx#)Hsyvyq#Vh`}cGFRTc zy#YXiU7TN2N}}X9JT65ToQZ`G;P86r{un%|$9||ErX!=H`TBd{om}jCEUGMRVW)Is zO~u__lO7b>1Z)Ymzi_yHn|rKYTI&;ES((D!Ok+V@!;h|z(&}n2^_eQ;A6&_TKL!HW z7x&by7}XAG#||23M3-Ur&tpm^UU2iz51#^Cop$>ZPP(l{f-}T#d24Ro`tow+r~P_q zXNh~&+k0)T?0zc*kl_?^Uk!8sj*Ra6DOgtwKbdq59N2t1Y`V83z9KwOrYcB&%O-aH zY*}(6ujAVoJs`bBx2#X)Z6gc&HfoT<8LN0!ZIOm-*;NZwo=DxP*VX1Z#8tAdm(DceS}ge^8QhC zzc-QDNJD607`4rLBdckxcp|w&?OCOk9m2(g0*r7=Pn|s5UUIli2BwlELJ?qG*x?^GL=(AB$ttS zo?w3F-Q0d9iqP}>Tp|dRT~pb4s+<4n^4ZjS5Exf)`1%n>z$9v)lqO}zx94;lhw(Le zsZ`oaUI7@dxD3WOk0heoFv1gGAvsQ3dNehu{fM@lr?rG)9ZjEg@$h)^M$@D-P*S}3 z9RpP2zSJys9~g;PzEI2UULHyuXJUvuOvs!Ut&Z=WMUf)z~n4-=w&vjur9E z9FKJflm9T3WW-dd{ldbhc6 zPI!l&eFPjbf48vgtI~2L#m-loUeZS;iqOi{&2)Ze<(quw{oJ)T2lbiCdzte$I=kGS zrFcCBGybQ#O$&p>L!GgyMhI=#JGMxuI@^NBk1s)5}s*Ji=f29P-xV6B?z}SS0X*3`M@BTB$;{ zl>Bl@Y7wZ%o&o(KAM%uE`m~Fuk!Fb-GSh3b?%?gM125n0Kg?#gD;^BP6=&T;kdA_+ z{0I-2V4nWA-qhVfh;ZW(uH_o7$^n`Lwa$2frk)s4na<&M;ltclp1|pFHD7LhX8Mo`B^+O!&i3^9$ zH?88yuP5ccqJ4-*ql8Z5_GEC6Vy=M^_*cr6y2&sFNnNR%-dwbOjh|iv@K33c|8RNt zwIP`z9=5}noZz4)wCNG@&_{iRE*u__hP2 z1?y_B%97);aM>3nh~(|U6T|$p4rZ>+yh^_JPxeGtq+dY4-OXMM6 zI`OQDDy02L5*r0I)Yz<^b~l-Hy1P+q|NE=YkIeM|$g&RBvh2Mx${q62<#(4Q4~tV? 
z#`;G%3KiP239gA#B43maZb&%c;@cFsW+rx1KQ*BGJc@qvW3K4XkE=xw*zTc(A#b6^ zQA}G;q#h}!)+v8DID-?$xSV~KF#bv%RE`ufv58>w%$*B=JM9}mT_-GG?4w?4J}0^Z z36d#wtBCrH6f7ivLGR4yJW`+69|YY{4=uvKE`#Rt4zX(Zyt^LsHH{hjN*cCpHZ#2l z2e3H`PKrGa{B%|;mqH>{0#ckR`assfH4yi3+9T3#D-6-L2}U%U5zvHW$Yu zQoy}cCC@mhUm<^A-s``T;0n)}1WKb8?t8 zZ7SqDW>+>0^6f!VYAfz`ZMCC+tayi1Ad&Y=nVty4c2uoIDD1stD)uziCM)kFY@V{h z5LmF-`qbcuM;EU2=yEoUsJMU_ZN`)a;$5kgv;6#c`zgIvaj*cRu*AWyuxwOu(ImOCp=2v!!%gf$nYcss zdLM86vj(e@1463hu7mJ?J6lXHN?e<9i^#oS@7ML4B59r<=k)fpoLn!HT|K#MgO<9W zfj>Aa#1m9TAyg7-*&h1@u#uu>HO&&Ch~>0jcC4VJ)X$S!2jFXDt3bkDx65Vf8CDz4 zSyyYM3S~16%Y%r5BMmbS%pS&CudSB}b~h@$lKyUvA?$$m(}%}I5X4)bq;VJWL|uNz zUOfr0W!GBJR{de+(j`OQ6{#B8wk0<5^vtb2aqe$viPvf!cg1nCF_t-`+b8`YL5#`qiPi z^>Syfod;?}1I;3wqS^B*i2IGEIerc_2F~!MB2m?0<8RpB^%>;$pSPkI5O~-xLoEqP zCf|Gec6ZQ`f0tFt7Yjh@Fj4>!V7&iRN_y5c6uel@uMf1ujfV#NK6D$ zbRqk*xLipcWW4NJWyPvGX)%|2pC+~ug*MwG5%v!H7#Wd@>F14dku9S>jchDB6)YO% z(nbe^k=oW$wp0lHqpd3Fpu<1F=>c?-$3%204zuyks*&Jtkk7fT;GN(PhR(cSkt{ zon*s;50(Z;q1hWKVKQszX8|O_pFc4{^G+%+RdBi3f|WziV~l>{ZIqJ&h_ceVxk779 zLgwv{T8FKZz+E1&Wu)_WW5ofyC`4>ZJ4il*@n624^(Y=h@w3=*!qyx1smR=h zexiGsrhGL~ov?f6t){MEbb8>Q|admaJZwINb67Yd*T~e;; zQ+cSwO-ED0l=k?b4BXj>UP_WwWKI zLeVD*>ZlzvF0XrwS12Q(tMlC5I=l9Y8AhooPhp{8gW}SEjp=8wOhFVU66}&)lQUD> zlw-M6%MeNE*JnWU2>IG-HG3U>K`FaTwd$+f{7#*g<-vFZnF`JH(C2(Hg4*4 z&9aryvwt9;s*lteGdkR#w}}Uu0V9Mw$T0$(+(leTPz31$RSE!C86e=?0H0i$mr6rn zK|y)v=DiA-F%8|ymNVZd!nc`SzIiGVFsm471T?!6yCeeJaYc-yg%<+7#?_rPnDu1D z9e3^e+D_(oXMknyMZf|VFT%#~xGH-#TxKxwB7~@{4Q1JPVl_2S+W#Bih712O3$3#Q zW!28?snb%F1`_k_hm=-!L+%4;rpTI2pX(n!a%>oJIC8k)>D%pK8yd<>`mFAAu6LMO zIRDbd8UruHA#c1+oy7dJhKax4g3Vrof_Yp8=4!wb9C}bydY#c1tsLJzp#+fbHd7(< z0i(1Gm#>?p<~$H?t6TJ6yk~TM&(JVr$*;v(3x2w!j?TR1SU&3XQ89|i%K1~~p;v(Y z4>9W#4R*lh$ijP+Uj~sSAT0zG`&;kcNF6-U4>pPKTx$nX7k)QQ*X#wa(vFb0;ZNRp z&Sp<4ho}kPx3k?xix2e*I3(+X1nD!8G=eI~{43%KIn$Mz$}>EKeUA>-8~IN9CaCUZ z2E?DRL}G1&i?K;EnB{h*-`i!I6b5nWrD8s4vZ5K-#DvzT@xNa2dg{MM{Et1&Fm3kG zkvDFgxr;Psa*ncWVI_yzW0VdQ*Q;4p*S!t?7K|R_`H#4nYP5hJBG=?bfgc~#_JRp+ 
zf!@loFNohhbyRW39=$dB@8O@HZJZ@m0g?MDQSmcszBb5vAqe>6@y=X9UBj%*xuE|i zK%H&>jBnRfwvpAjfpkDSc4rvOeA>4Kzh<|j@hogE(ludEG`DroB8+v ze^PJs@=Y?AbR!qYGXsK>N$QE0cOnDKW|uZ$O{+E=1{krbgd}& z*pFg4SovDohqo7h^QcuN+&@a^R|EMyoRjsE`5*>l5z6QJu)Xf@WH1RUb@SAmUqIG+ z9d$^E8MFJ(s1y*C7g>NgJa_3;0>5d&#|Lt;mufk2&UJZ#{%iby%NtP=WFKo=0qc@F z*2ulNth0mCnQ=ThTEEJ9%493S8;U?y_bXeJgkM(>juAnR5Vd7L#j&!KWC4l-5Gw4r zQvT7A>E)%sl0zoHNtC}xiXjSDgH=ls{_yC~4 z1_Av{L+=L$strOwjGr*eueXRGAXEMb4(52Jzt_2oJD%H>rr$GDF5^aT4TEQiAXhA1$TJh^b@_n?6}vZMNJ0= ziZF^fpj7RDlnNYWMk{bB6;=XC>fKIC0pi$YkU4*s|04tbwtQ`Erm3XwLJ_i3-L=0; zP50u7&ICV-6kwRDKYuxXYh{mm%ly(vSP1g{_5Mi2tN6CQL=?oHKgDcF5g{@!O9mb;ZDdudhTCG|AIz7V#bWE z=+ZEhNH|5je`zPX>NrM!`E=zn+!V>**OaQb;H6{b>R!_VM>suxNT7YvS5t}K#n*c& zx-mfF5CXDW>?)gPy;oih0Z~J~w$_5C-jA6%x zD-~YOrbqQ$23n8?+%CE_T6A9Y7t2@MI9yMk4HS5p$gM|is*7wS?x_P+cp9e?eVVuW zFUiQRkpA8{hFrkeI+k6C0@u!|KSM;u;98+s)js1`iNHdgS$fve>|~E60buljw$u@- zU*KNX<+$`-f7s?RUIZ&x@qTq*({c@;dsTUTvnP~D?QcX4Zf`%P@L&7I3;OBb4FE(# zfx!U1&|QdX9-8{P8tn`LLQ`ic&#F1K5sRk9R(!v_IL*wrTu9-m5gg&tCP$}Qw<}@i z(U)Z8{%gMuuYQhX)nkAZZN`R;`;@;D5y!YcNP4Nw$afKo9)V2$kNt`fbI(qKJo!_4VOjNq2#?o zAliF6Q7Fq{%m2mxdBw1f&S{rUtfax0*=`CviAnBiPA(4!81Wu*eiC~mh=Faa>HqtYd%W)v zp@C9_0#WAu=KIAAatENn7z5D8yZ9c0^$7bcoD(!TQoBSLikuB;O1 z$<=pcDEGvA2W|CROG!aQS_QO3&pOQ!02F?4VKj67lY+eJ7 z{c+(Vg+!ns;3cu5mybk5I`W`X0{gpx5`9BwttA?up&sbyEsd-(`Zk0{+ss|ixbUjV z{o2~6c!2wx@y$3n(q7*DmMnv;u<6-hQ{gkN5294`edC;KlkFiW)v}qYhvH8ZV%*%X zVUg;}M5_Sa%AF_Qw2`4-i`g)wW^wOMsr$vWT_3x3_Fz1)0FzSqfER{)u3ZB*Y!ZfI z;Bi>?kWF&7!DT7GUS4CygX-Byy}b}TRd<^&iZpbb8AxN1HMVsHek4>0G4{wm?c=_f ze7`D|WNIg6gWJ=L@B8zNw)++>LqG#5vNSgtZ!os`F&ueERp&PK%;Nl5$NGEz+DU4~{EAW+q*t?d@(vwVJ5Jm`CPo_L`T zVf%S4wpb9 z>i=Z?q%S!Kpc60uL;3M15Q=h{+q>HmB#c9ZQ+4|M@#ANZFVG_CFM#bk$VjW&QW1qB z;JWx1-zU!fp6}x(Rf&eQikj2Bmk8jJW@bQQLcu!R#TR1f-eqNgV+jD?Z^OP_L)WUv zlHt3FJ9m;`BF;yUE9=VUHvBWg#1J1nFQuCd2?`{0gD+- zEC}%`_=RrwJg7j#b!Jr%r``x)rl~44ybg66hU&$g?<6=M#6DM;F~e0lu+VhQw48OG zH|indJ@U%F0(v!rJ+(e!R~a}KcW`cfN6}0a!APcljCJ$0#NtSsP56W~3yrc^0nSg? 
zM-SB-hy6rJIz9KtY!YxrRPc3fU&~X{Ph6y*@j&6+BhVuy-rt znprPA_g!evMS;H15TLhQwEiK`T}oJD)_$=yIGH>+o(z;$_+5_z=R!`IMa(cqInU(N z`w@D`^7b8yfKH&`#JB=FRW$0cI0{roKMvZG=T9kn1?+8iN0n52!w8=AE}KHvxT1d( za!82GcvK9SPe~%#8XU|n0@vV%tONEP3UhN7{1Zeo9^|5K6tYVDO(M3*Q-pWeDOl8l z4c{jtSzZh)doM+1RgQR7w}gdI4@n^YulYCzVroZ@ak$ADQC!&o2r6lj4*jKAB212r3^t1{ZBL={Dr#*NP#rPnc>#fTCazjbpi zO>*lDK2Mf>E|b$uR>7xdouWhmMR~12<2UrB0aKDh8KekXNv9MNx!NkxB$)~YI{rS< zBh<~SRnna!wgxXG7^Vuv?<5UYf*I#=9wLi_+x+5ET zFZB0nTWWLiqXX|)92^jK*aux#VxMIF#jWWk)5@1x*U&0>t7$-{LanJqEDb7uta^81 zX1LMj4UogQtssq5pBQ+@g2g)xTB(#nW)$AYqxFuQ+|z4_xd3|2$+7i;i?U7E<@HyqRX<6swhdU|1r~#ldTQuII+Y z>?maL1-Qi?=u`g#1pK=HCez?_2G@PvCt$WgIu%zDGDG{5OT!$7+JB@FkU6_UGX4Br ze^9SM=mOtakg!6cu%_f~+e0_F-pr9FH2IuOHR^6%m{m-c9%eC-^w4l_JT^%NV<{3{ z+;w!^n!Fq=q?)}Hk3%BwW_Tf)8t~L5E0X_|< z+@E}!(Km%$OtU4s3f^pkl}5eAQ&pTeYI`8xmmIniS@;?OkAW8KK=Gf8=c+X`Jyz`p zI`i<7Z#;;6N8!e{J+NG&CEiE=3fS&R{cZjZTtQ&r39Gc>1k2WJzgkUbAL?lo%mr#X zaSZuGh7Nta4|hNbFB1%~c4XyT)8Sf>u(nZ+$eUBEjJPr>2L<23;6f1>4M_%{y4JBV( zl4x0*ctzrm3jZjC|JIEMR8xx>w^d{SnHG6Hv0RwFI9pYuL!A1XpL_N*jt{@}I^Kh#iu?X`cDvGjB znv8a&=Z2&~NF1%$Ew^s&wc$ict;YEo2_gqT_`eU>|8gI85|E+^t|^_pw}V1%96FPE z1HmPX_xx|uFf_3-R$x1seB?tvyuL5Uc1am<{X1XAb_^aUD^O~?$b|>wA)l10Koo9) zA{szclt`%q*Ly9~Rly7(&KOhAu%7KF;(Rt4T8q%n{#3I* z%J3r6-)5G7?$TS6@2PE(p=%}s+i*D;iEZKXlSnpUt@5-nwoM;Cd5XX_ydEOr!!%`r zepyGe1={KtZ*O+;PE^822obXFaXXV)v>k%7HxT_(o0&(m^SHbW!TEb)j8WPl08gl$ zuCjlmSI@rt2`Q)BMhd3K4}}LQge$_NRjO1hLNDJ^k7+O2~;d80g7PCD|)i0%9cqCVmG6z}(NR+?eyI_8!TO7+sJQFv~1FbGo*! 
z&x$|eRjC5%CT};-q4cVtV@O-Dr~l^j*bat(=va|)-S?XSxr{}+2ww{%ra)ov^rm)v zzb^Z*`np4yKLH#9^JA}Z;?`a<`4W?yATwtnjuBE>>j&gMRf%lFwqVwBEd1n;Ks_yF zuVLzWt^|GwTzJOT+2z~T(^HyAno(wPe4nT0O4>r-hXc9fM@3@?O=PZL0Dsv8h&zA& zS&g4@r<23#0ET*L55tlF)$ua70ce=){|+oWtC=#8INuU1Ql(KKw2uBB$+>F=a7v~-vQ(xSS!2z$Ip6j#j|%YK z-^4(}qi1$vpSp)G#PfdqL7oX#0!dc?Vt@e3NRk6RG)B>T2>ZUW#u?xRg)(}t_&J(&Yo+_U<)i^TpysYNVQ$r>Jz#|ovI$I(Ur zJ(^F)VA@S=xSBdjB$UUkaVBWz^C#^jRfjR0;^{De29nAN>mA^UTnbV2i!L2B}RdQziUB}VFimnp*EerB(#P^h&K~(!3<)FEL<)rv{ zC^X-SUkE}N>TLzxqx5MYMM^vgX;^V+I6ywa2ukq_S@M$bNx{Dqi3X_kdE`oluD_Tm zaJ^)Jhx?}%5g8@Yo!mnF#w0iuioo^de@#1q9^Uyf^k#k-a?PE1gyX;O<)8o}0?0qr z=4D-)08dD8M=gMvJnFJ%4ZUI}&oTI!qm(A7!~v|lX7c#f(6}JL9PK5NVL$R1jBwEd36R}xi%7NEs2F8I8&059yhutN zhULiK)NGOIjtqe50A*d3B9%8z1n>m7e<$iBWY8}_D%SY*c3UVj5%GB0=O1dx19vW+ zuc548Liw32=o<^w-;rF$*D|VkJ7u7{_(sC4TKUygy}*CWt_jNIODLMH zh(3HKj15-JvhV}>#|8t;qh)|4szjlMy8^6L|4Q{8EF!=y!7yJ#@8vuzd&HQa!%nOJ z*{n%{!lS6*qK~}wCMBxAt^??W@hrm?*u?qCEfEMKIn7)fCPYYdo#}YG{Dq9f>Mb02 z8WFL+*ax4V0GGDB0RSNN6m=0H`AGz^RyE`yCpm!a(L*pM83gA}8j$85o>uEk<%Vg# zwJK+HNOF;R_y6(smSI(HYy0<7l9}PAtl`ng3{fobV&+GNjFTo zyKBA!*WS-w`#GNX>;F54&U@VB9^cUH;0oZivaqC8d3BK-Cs5c1&r*}zB8nslP_ zXvhaBN4W79+Z_FJv>1FiekfxBhFmG{-mlTRJ}>!F^kon- z?u}0JQv}2j6ed5ZEabOtxPMJ&rSMHuEbsBY4;v~PNi6DE4Oj{Jw|Zm-K9Z37ag^k% ze&O-^Ua_m2fGQv$VWx8qg*%mvf!we#@_r{FJBI+(xb{U)cdzE(B~Cz-iUzBN3%}Qt zTQSJ8|6~3Nhzoy}e(nstUrM{*S=j?sA&|+tU`)6?`G-QJeg{L(>YKPu-ZL!c-?qE8 za-F-#X?7e#!Zf%hKrVnY-rX6hg)08Q0(e-p7lIc{y|VGEAO{DPJhc&W6?k036}*Ap zeG6%AKw-Ajp~7v=@q%H)C)i|rx4Bq)kp7A}zNtHjBh%Z`k7yyR8byBO1qw1!E6ySV zD)N3q-OKST7cpU+;*|{TVZL5&V!{IQV=O!++e6W*iYGtOpNigO`~OoNlJy^Th?ES& z7+0k+qG1l88aU65c{*tXP*LI`MSROrXx?kse_zUw>tNrK=^AC7@-X23pE`_$5u4ra zs}^ddtMQ=3)0+16W%4idNWzq05u&3y2pq55>7hReHe%b9Vo;N*H-zYW5Ns$DJPZe? 
zAl2PEw+i2scg5xkwOpyCB-Nb5%-u|1W%%|;&tWP?90$7-z$Kdy^dJOuR^tCl{*mzi zBmYo~=`{c8=+GK6<`DfxAtjH{4RjX&5#9Tv%KlUCk=N3R91H5-4`uLw?Gb8I8G-Nx zvkiY2^u_7%l}G_WO|ld;bKCSZeBybj7)${J@{?b{>gt;3!OWsJagnSxIqyA%=#o_aH_OzUpSo)wKlD?FX`9J<*BX3S%<>k$6qwa$w zxp`pQ^?Lma;nV;1lkH#A9(1j@EDMZYdXXNYf`4QI(U=_O|NR7rM4bQqZ}^83 z{aw}vWv3qYhbCS(>fh0Jh(LtZ2Fr{47SO5ii%Ike^Jtz(_!qj&wExF=#GhD7CF^^o z-8~WBecj)G209wZiPxQyBE=6&YXT@0;5B4Exi2~VnQ$qJNI`u-ST9HqqR9WL=KsI$ zF#JDndqCeVBl%y_2E3rW0`zE1*zOBs>%BW zJq^HNSRQ-NU#mZ9Q&-X!`Ch{taLA-Amh`fTfcg()pNmGXGGZATEM|8U;lLHN!VE963!=tw>EhAE#*mezBFZ3lzv&DsYMwNb=SOy#wV{Vy|02G9e{)mK-uuLWePn#2W9xbk z4f-Bvr}M>jPy@3=#!cTp?3n*_c%TK3e*Ul-0Rbnd`@30bqIfASEY_UI!iOdR{@9$2 zL4x*r`hH4XEnYI!!u%-mO*@M#rDV!Cv~}44j*{!^y1zREqUnOJr2`y*x`eGTH;Q~! zI4K<8aXOp#*BVnHFGb};Uw}fj5#Y%z(zicV%xg>q7OkVze>l>9sLWphrJlB)6&D&b zE*i>B&=`H$++14B_Wr}y>ovEc2|mz%=tp>}`+7us0o1igM73X$^##V0?Z%N7auM#{ zod*X>LKhe2InIJ9CEV$-`FQuMXMuDU@NyzKkt2le>W;;o$OfS=kvSS3w)lT~XZB;8 zAD%?2pLx|O*J%b4wKO*3?H~R2sR2|CTF+NX2CnloXd@;}{sp4RI-x;`Nc3eHnFPg4My z*bvM9Ck?=bkWi)Dvk8cNR>r-8brfVW83bxo}_`DN*c%=x`A5s$)_$(*|;?8t|wB!Kd`hDRbxHPf!Cc*v#fcevlB zSmy2?!q3MeZ`ACL)zE)Fumk_mB>V#%>p^hf_h?mt&%{1Ru3a4NMT=s8&ZizV4_i~E z$w7bPKhR|Z7zg9#2m7=0?JVwkazOUU(Cntn90WRJ3(DKse(;*_XQEycYa|qcN_I$d zSVhUa%c*CXVJMkgEdJsLl=qL#X5ETO7*VEWD4HR1QgmkO_6lxU-F(9`kJi|Co^<@i zZ?+o6+l!~n9Q|P2I#o*^$KKrz8S{e^=v)2^+mES+sz z3=$xq`I{y1Bw3`tlST){az8ePk1ycHnRFfLu|lLIMP5Q;Id&ZBeM8BZc|G*T&YksU4b6HF9tp>P7oT>iZ6Nsk z9-JHSAuN1(C9iWMyS`>ZvPx|ktmWm#aV~1I+CV3P3*Vqmt3l!VxmOj_wPP`m>^nert7k^gl%P$~GpM1}c%>*wq1G_@wsU~`}k4!lg&~5j8oI}$g*bnM@ zJ61pQaZaFkD8aOzk3L%cxlh()oH^I)(ykx~adnw6=6IJ}a9shEA)zm3ZskJDQ492AnO|kk zlaGH>uFmF;T@>oif?P|)@c7+msk1#H#HUtT_=H)^zT)o)V1_fSMxR9-F5X6XUVN;; zw=`_`8?5!o>A|;nIvDW>gSHtfpsI?pNj4aYmdMSgMke$>?F<>Wfn#ewxAl;7(18vc zbQXsW@_td(aS&BadB)T?HZI$phDRUtM`!zw$pq;LA23!M1TI)lY_b`wSQU{eW@ufM zh=3(UWEe-C@JC*gvpmxy_r+eblrzg)5GxFaF07{ir1 z?xJhy$7zBT+I(M?r+QK03yu$fAJog%j>ox#9&In|Q^SF0U$Ci#4tvxo^ElWsIX{_i z4>Zm|)_UA1VF}1-69rq2!6zOd!}h8)i{@2Qy%gLN7bHh3s@T!s)sxrD{dRpZH=Z2{ 
zaD5iBx${s0k52J;H&CJc15X?Kl1E0cikTKZGr&jD^oh|nEFEQZ5b4TP1>FhZf1lBC zYr_QLdY09}-)_aeko%;GZeub>mziyhv-?M#*R~aw>>I8-*6qidFKj8f;;ya&REZf| zm9mSAFzD!fZPY*XcX39E8%*T+iUV+IToGL^b?xMXXCfdER*aXqPI3&Uv9G|}7}xUE zQY$J_L%ZVJv0pt@3o`>});nHM01y22wU`5CKj=64VP$SEBbPAx;zkf|uW$b5S;p@S z3q;6Q=WJ;`Rv!s)-1a6J@h+33+PjDsL7RRXr#vZ*OZXf zYvw+RP?uM>%~NwOweu4>PW+zY*M@wd&DoT{hKjqg|00T8*pU||0%4L2z9E+B<57vn z)?v;yZrG#33mQ}Asn$`yTCuM+XV;?TA`Oo zm+@1iI1f?2gyuA1msS^!p#N(DVs_!OAfdm&`6(>I?Cs*RqbZGB^{kdFEr<`JGw|1p zY(N#b;#}s&>X)4pLm=&sXn!uM`^AakW#HI<^BLvOVgC;)%)A02^7Xm+kV=fNiSJXl z$L%EUpxc$vv8u0{C-srpK&Pl-GR<}V`9u4o1FzWtqLE{|b{tSV7Fj5bYs_Y#S}&%@ zx{M*IV!fGeuK`hiR<6XAIDjMmO<`sh*<=pPkj%r{RI$EuDp5mTnz|R7MgMfE-2RpA zTa^2nf9L<&?Gn9Ew0d<&FAnzQ9$M-tJvsA{vHT*y&epnA_X{1_t6heFL|=(YU=A;{$8N5Y$KJ+2ac%L4YmAaHVpf&Va0= zgL>CL=C~l-fw2vb=lgm&;yo$DSWIWURdOkLNlGHH*keK4Bg>EcdKq7#PE;zI-2RXPL9 z=w*zr!r*@N3w#;gE#?4FUvbXJ1W1;^&HkB(msSBeHo);oULnY3TO3U*sUF(fE}>#2 zrrWTG*%gFlp&W=SzA~1J8P;cx3f4k_PPr1pdW;H!3SU(2<4Vg>cSWDg`SUPj;(=!3wQ zTB&&~`zvFdk)%7(^ZPe%0uHs;a+-!UUo8M88(##1@0rp8AP|R>2$BPYwMxW&0TNfq z;PP{O1K+*>yPDPEO;pj37@tYVS9Qy_CiF*wDZ;UcMK&3nhU=VKI27Tc;9y zwqs&u_S~Sg_vy;5k|yc>Vm2!Tmp~(O%~(P|Bd`jF~cm9fk_=8FTPz~FCVUfA~yH79s(2U zH{}n8lW`b`o-Tg?W5~4pZmq-oLUh7QD(n#MDL4Q!cUfVxXpeBPN|pIKrok76u1+|N zYx&r`K!Nzl!@S6YKzfy;o+gNer4mCV+!J3ziJf9WR@+k(!(e<6)E#5Ncy2N_?+^g~{vU+9!;TSw+}Tvwcs4 zn9zFXJ;D8|McMR2V>-y&x)4Ha{17QN zyj}zyxosm1Aoo!VGaySgF@Do}(e|$Tp;dF4M?V<4>MLt=)+<&&vrEdyWKK*Uq|Eu} z6OGk%bUk>pOLk9jUVfjD;oN%6ROx-iW&*o5__;n5_2E4=c21jZlzt23#^642>Y|0t zIt+3j1fw|jzE7{5t;Ip=>5+ICxr`(Rm3Xek4H z2p}PPo{FZN&yDNO%dVsg8TzkcqB}hd4(4Qt7A&>6zetrW7tRBN*k;$q1$-rusJ@je z&lmaQL52$QexLkI)Q6!&cZtMOAm&;8oo(Y#LGJoCE*5_D0aL~lO!GudWumzhO!ufz zjS(;gWFK~6Ny@{rKW&Hk0l1@?;=1spiGr{gzXyoT!L1rS7b_7ww|E;t{z0Yjz+2?J zv6bYe+92$1R0IhlOc*Fk z>=6}D!fN&P{oUyS(kkZ1iJ6r*YDFJ;g`JiXr4u%Y2i70G16;PL3@6)$BD_-TZ7j*= zi7!HLx2e`DR5x_IwKqS!{Kd?oJ|r;hvk3Yy9Wk3Wxo9?FPw0{QRd%aLS|H1WP{THi zOg9x3`r$v13i%ofZ$vA_B}lySF&PhDuuvrsxvU{tN6ZqFlzPfbB46?A%6*Iqdb;_a 
ziC5~4RZIc8KB)I?RAuX(Zsy71g%_Lip<(~gp+jL?TMcXyMq?e84!d7`h5qx&!$__D z7$8+>xE#tAuVvr)M*qmBJ+Sjb@dU#^EWvR_@Cf8h70VsL-aXt$G@Jxn*Y(>OaXU?r04mIDd(`x!i6^eS}qEfKZXWBw<^ zVF&gBVjdJKeeW(E5$Rs`fvcOsdL)Ab+i%b*slOTh=9V+{sOv7C*I+ zq*R4m-rnE5VE1@Jyl30UvWA%F^Pg{-DQM^gfji6-O3-<_Rk^UucWuyZUn;|L$S*L+06+Z%EBQw zi_dNb7I8fe$Y9T%w_j5ZWX#p$xL?e73Wf?gcjhT%LL!m=b#A{w_u;FA##6S*V#**T zlF-Y8qQ+nHp!3qkWJmy_)^yNj716sEMGq)E1QR964L+dYb~befY!S54RZ~IFYt4dAo5v0FSx9l>4~_0GF2xm!G98p!+;v?x({{vxv~E z4H$|akZD!1#k7)us%dougq_x`N>wm`S}Q^_>d?8>Q=fQ>BY{rOQbpX?m(p?wViu3C z*WWd`Ci9qdCJmc?L*ERg&4y?~90CCX>(O1y@SJ9;vtj<*bHGvsDK2FjppYlo8-G)c zk$`TQH83HfApKRrE+yWoIZg{7ai>#u<;JJjoHQ$8%?`-mzYL|0Kz^7YtUcoO?wR* zyXP;BH}O|rjT->K=(c3_{cUtR&uMc(2Z^tKhfY`fO7IVBv~#bUfY0)ugOCp%%Q}dFctOn2P!uaNA)Nuk4F~V+2x45s(*-$a}L!~gUMdCu^ z-<~(Odx81U_EntOPGi1B`*#$IUyV_5Ov7HGNXGVYzU~LT2~c5u{Zw}1@AEt5b4N|h z)L3EV;x$XA{9(QbUWuKGp^AYgjrG}VD~o~|MWTofdg;VewUzkBs>r>`?hs7q{HHhM zW>f?Z8@4Ik&J{9chs@rgm~0F?Yehy5?KcXPj_Q8ntv_`>bg$t0%b_bYNiIPKb&ZK7 z!t+RSxAl$}x+iTa{PuR0uJW4&PI7SFd}Vj{)N)s4qCwKM}JlQGyW1f?jH`9SZIPqbKureg68vRMJIM6PsxvzqEnbk@XZ|<(A z6)t~ayisZ^Y1g`;G{k#mq)XTi0wtTfbYeW8>}O?M!A>Q>0mQ3dE?SFXu#3!FkX2E} zkq}uLOmSSC)8aDGs042POaPxiC`P&X-eb6HAm@VxXf3 z%otSK_U~!k{}czib=bSCXgG{!8G3B)MnKaD#|?my zw=4jM6V71B2aFS+QuZGfflN5R9rx zosT|WsmtpSj$#0ID}oYdBw9K0%Sc|{daZA25!M{d*?;1`h=UF5D@UZtzHcMzykBNB zZoSRTuwfTP3Zmk1vEB8)GcfT!mG;fspQci%Ke}m6PvTi|@uK5#tBPhB-Z62%=)S-4 zCa}i&lKr~JWB78zJCao7+0ctfCAw4ZEXsur*=MR)`IcK>{BH zDiCh_d|O9&P-mzRbRAT|9~ zLOwp^PrW}dl9#)FDC>_{S*k?hH|v7WGx`~mDH1Pt;m8N;|F~9>@zZyO<5Vo%U5Mt}9DL~3F}#}vg>}$>t#}n* z(XlF-)*JMf&D6w7Ab#kLVooI0#AZwcCnTDM<0os*F)AKC5*ZVz;@{_G{u(&4eDRf184EHt<9@si$HVWY zRlua(htB7tX@GnO|rsjydO7+Xg-#+UTo3Q zOjPMfOPi{9^7UwEX@z-aCRbK-$9(HaV5;O@z|^?=O@?Ge5gM(9iHtN<_v{6ukYA`x z6+FyCQS@3Y z`TM9m$>eag<<$;7AOv_Hfa}s}GB`S`EhQ_r@&Ib$WE3mHZ2Y(OTsMGm2N+@Yq;K_NNIB} zKONbezUU;w5s|aWOMlbL-}vqV#)^7}y~~T?=sTJp&@6iBb>U({z($J?;IYiM(;S(E z&_@TPA&!^L2S_cUp1F`5aOL2zw`0U`5YaQ0U<}sV%r4pCwY#f|S%EK$~DKif} 
z|MxT7Cc|7mu=O97UV3~2LrEcc{-!`=*UL-!`z<*k)|=Lv>w{{obNw-|Zw5!>6Q2R@ zL`hE@_AB>;i2m!7>!zYa3r%5>Y~%Lt$KLnb!X{&n(IU;IS72|wxY0Xx<$=eWwuqVM zC}5ftN4-yJ7$41Y6nH@atsgzCxW;zUN0*!^i?};8pu3(MOvQP&^h7Bi02;u@jMD7k z4ZCPR4aD~AqZlcFLwd7#&ucVJcI$JvyeQ9xwBnohq{`POk6y0|Vl0vH?yyBr6}=Lg zey9Czk(|#O}00Ay&csG7XILf}S3y){d6_*mb0388f@S=z8i&FobaXNSqK=a>;*k zr$4CtgPlO`*R#~!)HSe%1+IDI<7$NbczmMe{e@qfYOrWqcT)0J%Q7WNYhMyBgYNfb zv_!!eQ#m4v_fDEkRtBw!PT^ zgqKDRGPmG!_(q$?k271($>Wf9IJjDV4;*V4cFVGt1iZG6#8IC5kJX*eW}vKl_Iy zmZ_?XOrNiiLrfHd=ZPfX1^$G6a}^~ z`^5{n0DXfddQ`R0I%gIQO@h%K90nTl(BaoTjE?sp;LiJ1JB?l3{ZFYc$9-`3&$J`> z_F57a0B~E>Qc2Ci3BN^2g2kgL1`vF=(Jxa-`wHQ8M65WsyO+|sxoKrnz>i9AJt7fz zf#fg+fS*Mi7!NyD4k3UoxJf5rQg#e>fFCu^2<_^5xI`Ndkg0}Ek3SXg=?%=#%jbcU?aZwt(O^n zcO0a>5SplubWzJ4LK^rH99z}`+U(r_6N}SH^aBON+KtfN6isNwW zYTf2L73D=ORC?V-@cavNrFx*FmKMEG1tUHiO*94a2?h+)eTT&;DmC@)wxWiI$1-+! z?UD_an~^ATS9th@*+Y$4-LjBfV#WOc>G4$*jIhyLq*5J_u;P0{A%Rq?iM*yC;L$u= zG^$Vf^*a_g_8Lq`PtF05sQwB-!j1@fnAGmiiKs}~21{I>j0SSl(F|;Cv|rON+Gl+` z^sOmrpPE_Yntsgp#@2xEgrF6PVgImCK8qi$VZ_{ID|i`CDKw5d*ved z&dtL>9!hb)gyDZh?X|+|A0%Ya@5(o*80a`PMVKtC-{X`)=5^pM?na(VFLZu6ZPC8T zjg8rbLhyvHvj#By#2(%MggsX+61kOtQfb^x*gd31Mm$}50f4;$`VC~(^vq3wn0%E1 zDMqjLuFJ!Eb_viFWP@cM4(4lsGMJY9CerH0|M8oJpX6x_bs|Pzhws^)AYqB-8ZAPN z$K&Apw%^#iAXFaP@qfs|-DsDz$MEd{XO?dJww*w$ua-Xxb%wMp?&vmZ zV6vMYUwE@M-6Z<9kqd2HZRNMKvx5S=FR~mm0`wLf*0lU7g1j0AF#As8U6-TfPA~7< zmCkmb@6|}8QLT5g;%Oo`+PkxMWWpK^<`LxnIJDH=V`3V`nkdSZa$3y3KUuXYzyC;w z)klZCW*l2%yQe%Kfl~{9rS57Oi=MNpyi~cx7{EG zvZE{m*kTpyMg|V~IhSju|tuLWhlt%=SdQfn4?G0);dLYHzI0frsZy=5?ZI zePK&6AWH7e&F!meJ5Q|JqPJ;@Ycfg;RK#Hf0T(2dY$2ke^h4_d?Osu2%FGZM*^mt9 zLxJGA-nhFt<$Rp;Iy-}Quhxqf(C5Dc13PuF$~6h8^1r`MIx-$F3d$ANSBU}f z?QQd7*xuCh9(MkWVx}5Lvt1W2=}ioeJ#J`e`>ih^q0~c`?MBGL2}^zCzSsaFn848p zb0pbkkq6*))FkhoP;p`O^cZ~kPqI8R>{v4J1Ng{sBhPj2iqTzelfG%NU;hL^U*)+m zU0{&_yO-!Qn&5MiBHV_bxUhrSD3s!_7Z3(ZQN4!?AigJq2hT;4x>AJxR8| zeU{c>_JeE-k)Dklr4MxiJTmn>#Bfy&4-igdl zb%rnz`gMnKGPO*Xt8C8omN%=Qf9viFYZuL{Xfc4DO>qu&PL$%Tr$oOIjQIg=;o5?^s2*!m?i9j!K8mw2cHNwAkle$qIF+VORyX) 
ztC`$5x;xBgdcbl2Zs^YXC7>Slb0?o1o8t`-1E$athAvodCZF3_^(A8Z#b5&--OlXZ zNKbBjhqdF5#%`Rle=i>FG76k6ok%2IKMp{w(ykB!TWjq^MmBcYN2JPIf55PDIR>}B z_>C&EICeYtUU>6}s~eyMMc$}1Z!~FSI$b@6gGUA*FyJ;*yR=<|{vuh>&GU4i-6%5@ zqO_NGT-B*%i>?4W>hsnX7H!AnX7<##q3-)pZ@GTss7p|ppsh@LE|FB&z9CONC+;TZ88&xk-+ z2&h(?OxA6eb>Q)sanMm-ZRnOhkCRQRQ7u<~;swlz_1Qaa_h>;45#$bhhdwGfP2&RI zQ7N8*sGTGzR)CNR*~b(F;A;R?lHK=BzAf$A<|UZO@z`Mp8wWr-_XYBix^5nH7+`9g z2B>TH@9)Hnz5-6qjJHtwH?-&eN$jN_$1~j~lV^Oqn*e3a29`uR*j*y7J zrk7>Kj_CIAk#P$m}DtFZuY;C|1C8^h$z#+C_9*_I(GS5A!^xQSs{7 z+yH1Cple$zE3N^Q74i0X<>@%vP+@<*?q>_+eG!rD8Cj=y7uvTv;nT_ zZam~o3AiDaQMRN!(?>}f=HGs4IzM`{4i;-xz5!FDzvSY$ce%Kf1(2E_?l*Zsn^VrG z;^`FqDrFxSKJ3FKz6Zo+?qcS9CQ&oWfBhUCgq15Z58bsA%i0#!mPb`0Oi1a z+1`M46#Y$mcxZt88eol;3e2&R4x_GsIB8>_B&;|>alQ;tw0jQ04x#>Kl>VaP!uMSI z2nEY8hIT>!4aiKF&Vsr)gD_)%fe&I3E28_`EX}F?BC@MTSu~b~xquJ)dzrGC**V_O9dpe3jx?>-{dLnj%M|#Mk4AwN zrAg!dmhH)e`3Hpk=YQQE{-D_dlt@EQwgE`a#1z(khsv|p&i8NJ`ex#zkwcoZQ>Ic? z+7b9oY?@YOsV5sBehHTJR1q{)=eCOy0xe56Q{QR^xRH!^BY}v!q>7A%%vGK& z!e0lOocuYc`)VrSlM#EDm9es^L8`1-Gp)Cv0#N=(ruj)jPIvNruQT?UFly7wpIfrD z{CgdP9fL`34e1-M;*q_R7`#6{&;kdKuK^?SkprcUxD5KkQX=g0>x>WPBE+-xdwfR~ zg%dcl)#?QBa?IHH>R^lsF15+iZ-%!OT7V5hy3M=Ku{I*3<5`Y8 z9|XXgFyFOiv8!%ak+74dR4wM5+Cc2#E_<=TjTr@K>oL3Fdfk{VJH{`b*bHF%knG9r zcvI#?TzCN(z0Rzi@A)c2=S-5p#;EOHmxiF66w zii5k)+dd*cY6;U>jjQ6J$r0D+DN$KIx8;?iUqeNZ;Lm(RU(c6@<_L(JN;fqI-b=A} zp~L~v@cW@N^OjY5Wjr*$D7FJ;&>+$9>K$d3NIbihmuC8dHu>Z9t#$b3?!}eLc@Ct_ zWvz?v_I8g5OYZ^n(j>^F<7hCQ8C$>-rKb9Qa`XMCLpJjiR6yt!lw`@XwZ!6-C40bw zHW%+XHcxLZl?wHO39}G?h?I$v^4oJ4n`{^&oJmSI@C3j=c=dz<7#INEWvtT^@tA98 zgjL2|Hml(&54opEKCPY=4MxK*=fFCEtz~`%Ai+gJp49rSX@3f>`6pS6hSmMM9f|2r zP(o&}(%wcY;e-?5ba6-%#fHl1<2^V(4^Q>AuIkzTg-r-?Z-zQ@DT;9Y*>_@E=fasB z-W!3_eYz)YH`np{F z(LF@R<}hQQenb~zLGxvT)hbwJ4Wpo3vX&V%)v=2!y#@AA#y*Per$-W`(B2**`8R3$ zA?B~08 z{8khfl_RRdKihe?!E3OLCk3P2f~S>I3GOXyKq@SC68}MxhV8&VcKH}f4&%LCw87x?!-L9)K4X2Tsjc=-eRS& zT+O$NIel*WMS#frK|q6S>k`<_Os;DT%rP|Tm>eTK6P_0ch6qn-#7j6pf83#?e(wMd 
zBg6UN1PsK%xWU^vz}mH1lq|*l))Gg7Qy6`3_~aUy#>*k^Fr2CHYNm;?fE4m!A+08z ztx$5%j-|9&#qv8J9NHebYev#Z{qdW)0gN!+I?a1<(1QeUS7>&8YU>9n@N~u1Jx<_H zVArK5+a)j$Rs?*?AoKKw@oY}F%wWu>4U5;{4Dc;PTPW>f97Aady)$M)Y-0bmhHYdn zAAbTaEhy59Ho+i~Bn@E)$-8F!Vm3nA6g7WT3{a-qC)6+)o6-~0vxmP^#y9a_wg|JD ze6};COWX7*y6Rn1Cphqq?siW7FPAC)a7R}mfLEbo>+Y^ix_|ttMF+lHtCwOwVsMU| zn$5tWfOH>hLF5{wX>(Qd^z=86)N2I%DoOw~G-nOmC3a&Mfcu|SuQI68goB|$i#GyA z2Yn2tTr#el&ZCQGAKsn-cDY2LR5A?M#V}0k{@Mm^5vVgx^fjvwiPi${$F~$8IGu$& z#PM2m7TyI+i;Eeg>eBRO{vs)gp3X@Vb!gHi%zm#DNEHOA@0wh{%rr{7!rVW?n_N0H zW!;Rn!xsxC)ycf*N-5+mZ!O{0xdWC3Z@$F+{xl--l$E&06rdOt?lF=CkB)cc;@xIy zOTV?^tSg8INAY-6)?X3GzvFaUTI&^WSOr1&<=p8p@M3B6!kV&k>`OuOc#3#XHoJ0w ziYe}h_Yfbx!*%x9Gvnxm){MS$C7zQ=!aV}yy`bbZFn4$| zQ1e$y(Gi&Q4HhLbG%KL>XE(#6l*}E$#XeH-v3H`1Ao7Es_LfEA#Kn;h7$a_WAF_($ z1@(qjrzFat%izA{l_QetAzof83x$Fh7W=npJQ;6iVC^~YbmG4 z_k5*L7niYHN8Jy{*zi;xm+uWaVn=Pz*;-4D33Qnk=SMD*JVjveiE~D>^WY4=jaXD{ zz+H9CXsK&hMmAVrY_eI{z%1*%jI0pw`}nO~du>j=h^W5TAB5zS_Ie~o&7Y>!O5~Kj zeDgUt1+Q~w>qPQl_dIxq;Q-K~=qbdw{mr{R=lEX%+o!z@9{ zfPAjRk$EAXke|Pk6s{lB3sJuFXUQ+4sICM1Am{k^xh3L4p8*sA!(gvW9g%Yjg_k!n zW0nA2I?LF&Fj_G+5-;dlTRYG(xF(b)2z9??nosi{oXt@AiRQBUkeqK8joWwajy1 zvSQ>{cL{1R(N_A3se`E&2=y;5XpTkEvC4;vTS!}L=mzQjCgk%69S&yB-5*_tFYtHP znk09xq4^IjZ~_v(uo=r^gM!jaaWu&=)*TR`@q${!Ib=pUX9KgwZ}zd)T*P$?TU;oM z)H=Y^=duea*G}h;)ep^amk7B38ps z@B9!F``CDRSE|CTX8bQtW$M-y=0}d#K7V%kjB(5z50r=<0AcQ3&-E)Rf%3%HhNY_( z?b7+jqjhyf^#fS*)K83h1u$ z-EoY`Ou<^O>BCDY-DIlgXuY~Z9k!jLx`<>xe~+LOfXah(7TW~J@(Q>sO+fZOHMKP& z*U+r8<3=v?h&%FoFg$B4lTPa$tl9hCAaUT812LYxBdG4tGdXQ{d@#gO7~%X5X0peM zizmYzpVjBV-b>)9Q6s&Yf*_wNUeRSvtX>U8jI zq+A^F9l&QStzjGv%kwdRrwtmieRJi|1Z?Ipf6B)UED@scpAnmL)0R{t2+e7rLXm`F7rlyX2d^{`6X^C3C&1nz2tcNf$TL9#>&M&Yj| zqDvZ%eoUwRG51ST1`2)vj9i6)hOU7R69$rrb)H8#D(YMYGjqJl*~bj;#e{dXup zoIgGg#;HgzzLv7uZAvv06;~Y6g=K`L%9L!K*|KOz6a&zA0NeacsYZ`%RSw7(%C?WD z=G!b6YmAYQPmWW97z2_+kiX!O7(11Sf0b9KwIlav>^=T1U?dIhE;kROV$Qm4nZ~+A zU@v5a+pfjjYfu|@-fM;D%*^jmN^(U7U&?L!(q2$a!*VRy6MS-J2nkbRT1tcmz{bD| 
zm#3EZ^)xME`Rg8^#T|M&e9)6EV3rH;q)1;De4!%t{K)??^mk$70Dk-}Nb+uq1Ki2E zW{&;#w35ufkEDCQ3cHcCemXg?JrfYM{Okxtjj}UQ28;?@&S2C}1gr$8Wqz*{!5PiR z;Oc35VMDOf5AWVuN#hU(gYGw|=OV+eIG8{grc0;>oG}n`LM(4Q%&POi4P=n=#f-mm zk&jin&T(1iZo1oC&%*pbF8u6xq3=4;LS$|}$cm)(zUB<%aNu@knwsFa5u}(7C*?Tu zWo{|C2SD<|oDyV> zs*|a4X{7N&wrLXxF6 zg;_1?TP-aJj?%4wv<}ic_?$Lga*m(k)Dni1Q|0f)q%wY2?D*P<7c0$=gmV}Yd~@z= znVJiil6VY3WEL{`9aj9Jg$dxmL$q^aHu~WvHzWg&HLBxqbTA*hS7Zl$4#I&K&pGFp zdkdYE$!Dyl-=7|G!{51b3ZwV=$P0MHT<|{zo<1P8l^Y2-P@+d@IRUm5__%!*-gLmj z;gDq$Ln?O7UNtkTvf~kwzt(aX5id^NNpuqvHXf~oovNV40d~u)FuMyah*-pLlviE6 zOgsehNIV}J3o%fR&J4=mFD+1FhEofY>|3jtPw99DNi@Cs@_ifXf6U<1siJ~AZg^x+ z%Dpm6I-XLMX}S26MF{yz3e!PXsd5$MS<8$vI@#wURr&;#W5E~nJlkn0t}vR^32IOi zn^)A+ZiT+IsBRCO<-42}A>y44!zEVgv&mI;(X(s0&y6dgIt#nWSLdu$S3+| zX5cGrp^V9Ab$WLAWSufsqwaGupt8ERg8a$q_t{4${l;u?=oCkEkhTxSHa6ES$&kD$ z;DUY(u%dPRA-VAOOgE*S>aLEmE#OUr2q~j}26X~99D%G}18`%6y4Ni;e6ZDX18gq$ zpWC=^KDUfd@p_jFx#2uvwY_{M))%Hut7r0R0-cyXXBvN#!xfmaNF|efS+uEHDnelh zfLp05+&Viwy{eG*AOe~yehPIKHMd{zO zLVZ$+@SDDWh<`DL(}!#E_z$j@ex9`#^lik4JXS^j zZLDeKe`MJ?4+z^wtOzdtm|imFz3Yj2J>I4n*L=Y=+Z47}kO(_8I1`iSwk6ZXGZ1y^m6#=@HrA zwN4KQp*i@ZHb~Z~0AohbGpm~tM|X&63)lz7k^q2?AReRCj4*Mr-VPESb+pqrP=ion z0-l$J_L=h9f^UNZI?d2DCqc1_zMGDO*t;?Z|El9!XH3bI+s&o}IGX^3{IY;$1)>+d zEs9_l&E!6M7P6CbFE}r{csnGc6k{ZIpAu+!ysRX^XWX9Idx5`rCU9Z_3N|%$@Qa^x zvJcU(K!M<>7%k-JE}+ETs8!EId~{%TKCqn5fIK{PaC$tDD$Fr7TgxQhVxO=Rc4~H< z^^;1%ex*91T29a%*eq#-dN4s7X-Dd1KLW!dlSIJf6cl)0IdDFA*EGh&t(#8NMId@+ z*v0ys3(mFTd`(z5`I3cGR*${$3erd6W(<;I%pBba{M$D3O&LpLl?VHY zYhf4&c=YTj><-Eb%A=<Q&QP+K1TCTV?bg!L8*qBx;@$E%r z38Ga+&!wT4A}5rE4MG9`zSgA?`dTm+o=V@Xj&$*Hj~ucqEvx6UPGvNE2dEm=F)sUt zIJTFAqWz=fE`mpl!BNX=*S6oDM!Hl_Fx$EV^PBgTN|w?r_FYnYhrfPXg@6hnac&QV zvqS0NoYfgHKf_CYI%%pmuDtkM3(^*d-gr6q6nj>m!nAttJ28QUX(SpnsHZ4@s22K? zZUj}}|7-89gR1P>|ItS=knY@qfOI3VX{7}z>F$!;bR!a*Mp9`33F$_*KDjeD|o$Ni%e+z1ZsyNj&lgt*E$K zTz18uJ>~8_SD~Vy_r`vZC7m^6_cM7evpE);GLNIOkwt3dPtHaVPMpBxSX)}R-1u0! 
zdq0aMDEE+;DI4Y`co_`Jq-yE95k*i~p)Kd(n?`e*n6>Ecad9Wy) zc%e(gj?$`k0D0*3JAgd!iJV2O^FR*Z9sz4l!Bq{DWlKAFR(G!@ZcqNbuoGe$FKyQ@ zC{PLAnGz6}>%B!Bs}Jg>H&U(!&eg5uNPap|1buhw(S|v_T#^BoJNbYi$i2ZE*G4$y zuH;nFln7R>=%5Y@W)5R+*5y;~!=;W9yfOX^F6(fC%Y6v9 z{waoB($GcS%=1%Y;aLD;TS^a~H^?B-Ic=nzZp@;8sPyFpa^U-9J8un1S=H}!o7c}p zBtvwbz)iM^KIdZ~m`9Vt=fy3YkF^Uw<`ZOD9evU@cGF+9lt)u|FfTSec$Z2;w{?j{ zAWTLymjk9n^$U3>G8lve-5o$2^4ivH{{9st(2)BTAwX%Ffe_eVHs5 zX???EB%Lqke#6PF#6#+9(yjrl%@}hn^x;}}0i<4_p}JUsAY~JTih0rfLnFxTxN$D} zfQSO&|3l-Hu4Q;XnXdn!DT5kXXtSZvRlPp2{w1CUT?LIx-r0z9s3H+i=5e$IkPv6G z_+yGamhz5|f8h(g40~t2;g8mlMeG+LV@v&%8v+FvNAc%_`}%D20hUF`R7NPDAIOX3 zZGS8GuOHwF2cTczg3C@J9@WL({bo?(2VhbG{1ymX^XSYVrw+x{`Wa0`mjl!>ka5k- zv)=sf_QV1dCS_xypu*DY(U;>G;HGilG=}_d+i6>%Zwh2#mP(_2Og)vn{LwvEs`Eis zdssEA49-0O!ZiStVvz>)LD10!sFX|J&EYh?at3uQ?R?rFX+v(d_Z}gf%m0QQKvYos z{tM(0`gvucon=XGbg4PxwCW+ci++^yR|T5ti`r#;Dc?0E(U@lK8Fi||qrj-I01ZjX z^DTDqK_1Hsb8sbX=(zy>g75;sm5lra_2qRey>*Pm@n$;B5rALNIxdmD#%i&Cy!|aE z9=-#)wozW1_dr@qxlR(6E*x7%0w8{;`iqU=(oUN-PnpZ;62_J?MoRa)^q60uieR^Q zP~H2eWwZKqKZgg>Z=h?B2A)9vfGem67v_!DE}M1)qZ@~L%oAh+6uTFz!vOvTz#qC= z8(m!ot2@i5P<&+(c(~Atdjt?9Yp+>2HI+ZR8|WQ@v=&m2uglwash+l9x5_3A(M>~V zfVOBM(Miy_j~z8i)zK}y#eq~avst5Vt_{n+0wtM0Femi(tJ{Yznxpr4qMT;^L>lyo zxb54rrc|A35iZUF1m$<+f{&UxJ}T8UeF7Q%b?Y{~`|vi#nZoA_Pg+bT1|^x55}r5% zkSLayuy0Q+ctPdIZKO;%HV_izE$m~MJpo{8XVlfXE-AtS1}e$m?FxT1o;c=GQSC?& zvyho5;sb+hX8`F#`7k>4Fi(94J_zTLRThPa7q5BjF zu4t*PyBhB9o?>m6b+(GAX&Sj@JKTld{(6(aN@DH|Rp!n{Vc+`SL8^s@PTBQAW*VRd zsSbUc`{A5scjD4FiJ)EK{_d?_vm)=Im5k8~iGI-GmphaTL%=!XqWrYg6-6xmv@0zh zfLeaO5mWfh17+*eSveu%g&S?-(r|1L6RAc?1_-+MkUi4XALIEuy`1hn5JKL1xDab+=+h}EG$W|7DMzb zRyS+7vNFib6DLJCU28|%u1t6WBj3KhIlzyG`0gYKu)}ISuch^gW*h)S?h(l^WUv$! 
zE#MYq6e+7CQs+ghMIdy)jr?pD+q(ephfZQ24R8D>uq6$0(5B@tZ-R|Lbw(()QVEmE zYSR;d2K5W(B!;e|xO?VJ)z;C|a_5rMxx8+W8x&rl!a*cWNzg_E4w7CUfF=HXb^^e4 zE-{FF{Dw5b(jdy+769Pza_`+6i+7F8ua$=?e%!B!!Fq!2`|-7;zXus_rZXq_gHysM=QI>v*2ZNT0>)v0!{{I|Hbn<_^YFHWIRQ zf8?pA1?UX_Wry=@Mr+Ro=}128h0cn>qUTRv7i*)s{zc>iKLAih06ZfH5IRuEvO9NF zQ@*zB>TGZ}Q(nqcpet{Rx#Ez)iPF`^+4nx=oCl2EM=!a-6PRcl<^O z7ccyM7umq>Z-|h~r;m8mPn`mM?YV)xw$9SGgq{^&s#06L6WUq{u|Z)E=%*)l*GoB< z`He-vPXLvSkmW;~L|~tXFSgy`M!zvIv*F-EA~62~vFin>EQ4T#iouvm&Z5^rg~R^b zYkwTU)9Oo*j9;M-k+G&R+T;aGxW%ZsRGSM28jsz9M_}?Jfr5SX!eV}Mfj0B?u0rfH zTkhRyEr3mFerS8{CS(hYQ$o_88?Ea|1$^HHcCS3!6^KE$ll1FnJ7g~6FHXdN2q_IR zkKVt5J}^0b>{I;F#5B8Do1h8;P?$7f+5As?zk^mizbddX0!4^&Cy91!FQ*aj3 zD;zvg*5BMve}5cE(Y5P)%NYX9KMkpl<*wK5UxPt>Be~w-=hVqB;5D{n0K3Jtabdg_ zy*D?M#cgv4k_zSJs?(V;)FU0!KWmgn+mnJ~<*Q8K<935WEG3(Eiup4_+~&(5ei8u% z$zM6^g+Pg?qbBH@UsydVisPa*_>%TE$Cj*_^3*(w>hgPHmQ%j{SzRRLZY8t*q56p| z-pJ?62sQ%Ry}5XQzQuGxytF$ z%Zc;755uYd&>Kkm1E!3f1XWrRXS_x%kqZD6>m=5!8fBC7siFIA?LE4C)7RQLv^qEW z)cF{0NYqc-AfQA~w7!pPHi^%GzBj+{ho1J3T_aA{f6Jg7w$nM_w`u=*=NY(yqUG0% zha%cK?2i$-0IJ}=KE(=jy_&--x05P9&k8=18&<%)KU)E|l;F+1c$_~9C~^q{BtP3_ zIuf-k2L?WovONd{1NCtA^3pAYOw;_-=${f^_1_V9S8R@GWX3>WS$WY zNaK@GYN>y&)ihXN>*++f^SdD;Xk878ZAV;) zz?f(x&1k`Yq-Kq34 zH)AX-dbw0q^{!qQ3j0V38Wr7gUmz$!_iY{o@T@s4TeZ7v7&{Dwye_#NTW*oLTx5oz zja%d;85XXUTwcGfLe_z2@KI;^wlx~ftcDu1{Zq8lb^S%ffP)Fy=^+$qhFq3W+N~Ut z@DA$B;j^w}@wDIAQ3R4`ZT&5i(y=aXi?m+TAz5@lpeqcL)D`S-nw$qg$)0B2emlaf zn4E~JD7X%hUIg5@X(##0b(@R4q!M6Ec@<^OfvQD8HAMtm7b~8d7XAm`#$J|IBLjG2 zz-adbC14FX6oS0QKs^j%9;F_``qSPOi}i>Fu|;|N^q7t5P( z++b7>vs{R7@i>3UknG_@*A}y^G)mt^2C(auKiwX^!$cAFX_7DrUjZw3ui9I6xB~Ur z6~pW09XxbH%V7olfXVZ+dfgi1vtECUH+K>#94VI{qMK$<6t5o3y(P?=y^Tazh{v6 zKJUpPl7G6lZYw?_2)tlc!~d)aG&*>K6*^Cp`@ISek4`&DLsZaRavm&6p|Z^89+A*K zKqhjjej69?qCGZ30OJoe0qDQKcz5rUC%FzRE8O0cy1g2}YyfIw>+tBsKY=!b|8CqA z2=H@(_OgC*i+raw=2ibCj@fy|Jb;;SyY5rIdF7<$i|d7$_k5s8>pv~ffL8fS7SW%a zAMa-A&B5sKdiWT}_4~R27o6Klh}uj&P%>e+B$hw8aRjGO6c*aQ`|s9Spqq~9PvmRe 
zR@0-DdP`7le~YyT9GhR@e~<1j-2&>FxJ*xJSppzsa2wzSl9`|HBIdu__UwJNrMc^Y z=trUTU?4Do9{{0x0`mc&<$~OD6X8LJ)K3Y1P*WGftt5Dn>9`)d3~*>``s#JvN3&a5 zfiH>tUz34KYb-?3#N_uN+U8Idra3$?MSHRLZXM^5-lFa+ny=v$9M(>g0F}0cDQkqV ztj32$*p-aT_Mj;{R_HkbX5IK$j^m7ZMIu<9$7chepMircs67i5 z@HuH;q35yrF-@2>$i>ftVn9HjUH~>2*w>)y4b+55ZXW}qN6yu!Z3kJzhUV-kE!l1R z-T!)3RG^42HI`3mzzCs*JTLvE1)zE+MWQ(}uwx*12yV`?i3tMiSpk>atV7QoAhq+` z>*=bbzu1BEgfNq3`}*?T|92lka7}~dzB~>>sssyzgoHjVp@XiM>5VQ85#qe+PZbhm zp=6$4w`u`a!L6yF@4E=lrX+4JZ~cNG{NE2`2j2ewK6JhRtsj+G@Qhg?w|$K3{{+h? z)cFxW7T&@T!jz1YJku6Rg^?bBQHw@D$-|{Q=F?JcLFV7H{_c}3*Nh+XgOAJ|_UslT zkr@HsMROe+|0?em+~j}Rt@sag+dnOb|AzGXXVyPpW5WM@%O4Dj|8ye$1D*1Jt7-E8 z{~5RKB>rw8{r}nBpc&NLkL5U-`x}PRgt)g`TlRm_>0M*!g5krz8)WZ|3_70b{WAtZ zg$t-!U@`snDfYa9_?cm~b&>;;{2zAs4dnU%epJr?OLF7%X0ir-hiY47YvE>6iu38X zp+e=1w2*`lRtfFL86oRb;kOtlneo~o#ZBxAmOau|k(6p}krufp+6MyZm_7`XLT?!@ z4&3{9mfx}yNe?cq)}+VjL1JIi#PXe+nyxSoAM;RLyRBY7_Zi)w-PTZGi>Wx*S6Mu+ zbLw^GZr`5F9Q8~QjO>+(7@XLdbsDQZ$i%cZbQCWQUoJW`7~^^D%(!e*+cac+Xp%w$ zz?Tohil8YG2*_&Jy1Rzy8JRtSPQCTI>>d^0G8``odO4e*vu}ncn#WhfSgOcJlF(k& zUc|21);uPDdLi;)bji8pJ(QH;hqRyhw`a@h_VEZ=@eFaI9Q>oM`^U}c0_d?`9nuun z#35*t&58ua&25~Mb8w#_69iI!gK)T&J#T2g#M9(%8F9*1=+iW_0GPx=0E`AK7ljBx zC(1KV(Mx&eU{;v_bSKfEZgM?DB5xDt{+z~MLkFJPc*HoZyoHO7$^zFme=E(RV2*;ZhcBuCQIXIy}apF-;GPd2crV!^104g_wdDW8nk!D#^HU~ z(rh<*Vu|Q@77j55@*8~uTtb!~uzUPz8+dZ~7)w=aTV^nc@sMxKTn1#bMnp$1%+W}# z-HkC8vtMlbS%;lSrl7hLEGH{6Hhj)^=KF5`$loAj>$L#{YmmJa`We;=jb=fUYF`gd zvaIm*Pz)^Wp=lZD!8Jv9hkJh7!?CmICVe??&Xw7b%hFmn?PitW`rz(3EgObJ^Cdcc zcYijnPs?3&7usRBLB;vI418Ht6M9bf7sj(P#$v*Qs%(&+xRl7EpU5q!xhaL4 zVZam24;FTRERcx=u#?vA40Z^+j8IoiS&1VUlrV?W?KRgLgou-%db-u(50GP?l~)k_ zc-ukj@5z^+td-iDg)`}>Rqi>7N+Iy*2W=^uiQpDB5j^NpN7xj?iWX=}qz@T_>O&+6 z;FLbr0|D;~*?K=23A5D(h}Jf+L9uZi?7p3!ldD% zcJkG_S4S|YcblgB&7yeJr6B&WNLqvVdplo$S1K9e)=!PpJ-37 zVR*F#sVBI-b|@H+cS`$&Vg{?9>272-$x7$Ja8QWwUL8vxzIe2T^A#upl@+94 zFjvj1@9j-mJO)|0R{y4mft;EUpcj+J%AdBCRo1>m&8uvYFJR1m&*gxwv^VVC;fCFs#LGeMy2DmC+G#v(*Mr8Eoa;+GS&}Pi!D&3vg8`9UC;Xz6 
z7pwD&<|a;S4%B;sh|Y#XF^c{jw|)Fwj|!V@7uPTLM1^e7<{#2SlXK{&^dzkc6a!BZ|L=tdb}GD+n;-Md0bEq=Ncs_8Y_Ca_cD9O2oj64@ID}uteq^BqCM3Y zrpo353xMwVVS{7X3q4C%r~_LZKrOEAEeiE2)LzndPu(fCozo|oMz~|B8UYIkbbH|Q zaJ^3lg2^OwaA4MFnL*DxZA!(2b`)mQwy>#YBE@vbFDF<>V5a?fE@5%+0rGXo{bajW z<9R1bBztZJ(CD?g1as6MRsri7_!mVSwgY^S4rmC~>$=#Vb~y9OF?qyT!w619l18xW8j*5U$$iFuSioI ze^-&6LGx9G)y6W$rP1u{=9UM*#+3rooqhu2I| z!0#7`Nn!iKBt(WW7T&jGF32@Q8R8;f1OZeh(;Nf*WQjy@iT4Sn6-nJ%xKTJ2tYb`U#&r{A_pDyS+Lm?xWf< zj@xVg$bxqDseli{H0bLe5 zxGWi3L?ORVi$EFs;QtVqgoN-2XMhP$IkN=rv|)OuVxBw{x3|n@{e)4^JsO;8`*5MH zWA2z|A*hGP9%tUrtIj5{Qb){u#Koa`k}29rbW^m5BS~{7pTwXkshq%F=N9W_zOmqc=D!$deox_EHjX2;l(e$ z8EoVp{W$&MA(;4jxOrVxh1nqN%mLRDKJRkI`fR3Xu7EZKS zB#r$J$&qDsgfx6>xVNCiS7Qt=HY(_(MO!S+%7gjyG(~WiG#H)uxOpgeA}SP5#I0?i zNkKX`5Sg<^PHO zB-*C8EW&Hw`~kOlj^QwCYVkw2Ap0`#B=HjPxR&(5Q4+u=N z5RPOEM-OUYroyrIJgw$uPlEb8^AK@i^RN(Mh8g~t3$e-r`IM|4$ga5W+Cmrm%wzhJ z;`S%o_gg>OYwK*o^_Gk*t^NvT!Hy=OYBVoS;!oYQBqhz}2h3|Uuj6xRzH#zWDopSb z8LI|s%c$lE*IM#64$PzYagh>hQO~ymu`HV?^m|$DM79s$Nm1Nki()sU#8;{cFr%}C zp&_>zSb$f>@ff;AgITkX4fusDoNo0$4{l1is`Ou7V)?Cwx9+e4oKN}$1|S#(m{odt z^Oe!=XYjrXIPw1BGR$5h#C`I4{lsngdFFOdpfG!5VsF(OQ>@Q-ZSPug?mMEnNMgf3 zl2}K3*oR1zIvZpeDK8Q#f8NZKdy)FlrhVq1oy>G=PP%+tDs?e=i!%(Ss%jLaRD0ij zJMV6UVQJIJuvxit^o~fnsJkYP^yi`enfY&-W$WWH{#+|r*=;Jgl`UIV38ygU=V4bUVQuo6lLE#t8fw*zGx9zTr3no65+`BzmqZW=i zleD%DvUlsr-HbB_yFiFP+(FlUmiA0u9Qm2xYSJn+TA9UKxn4qFIMl`hgFc{+f++^k z(mivW+*1*wEIj$1WzjAx59wHB6PYB_1C|Z=4$1({CLDh#wkLo1)=p~ANi9|$I z;?0tcX_-BgA+@-kyLy7gDVbRM5G+hg+~!{7t4 zF0__CB=_BYNJ|nF-V)Cwe&+LHE+{$v;@y^8$d4(iW{`2`G(KHL!NFB}2*Z`Ewc62>cI3SvI^^_}L-W8~T$?tMd# zX-3k2w~huUxcOA<2dFmC8$R$rfNXw2g%>hUn4`by>Gj%=qmOtt2?Gt^D}J2$gRSdx zb(C5p`UEG&;lQ_S(FvE?q?cs8#IcwQKwGS+{CGoK&yXAhLCmPQH4LMn)_*y>KkHk#M8PaVUUntSCX$6tzn z{07}*xsUNEAa+LFoEcls8gur<^ zgvQ8_6H zd{x_`Wk)uri2Rg!NW}=#sy@XdUNZRK=XnOekB+$Y`4~uxP>yCJMHY{Y-blBv zJ5}=-;fT~lU3sVPuGN-4*#i9MhM(e>;^;v zDiG4F+Ke6(ZQc)q1g*O!(la4fq9$sk{&4HCFBxRhhW*gnH&D=()kF4jN^`naLtWJ7%8@$OT`~cxxX2;s5cTP=gm-gy)4KsY&g7l1DEHr)?B?oWd9^Y{x4$6;>Q$6?8^9K7h 
z#Paz=jzqQfnRUtB{CDoJk@K8de0SvjraMj@Bv%4O>3B^eoOgF-M#xkL0x^-NOHBU~iD+3<0!BZcwMZ^q}K&B=u5 zqt0wO0MjoXILk;@RmlzShD7aInt=05yFzj@pjZNJCDv=1kMb`*)YpM`G0)VeXah6* z--}7*C1}X7@vwV^h-LpBL{Lmq3+bY>@9$-`WDlH)0=+(2@AoezBQT zh)AMA;P=YYlh|akC=>TcuH{#K{%D>BA~mNKtA%6r{N2D=4n0rRAStByMfM@E4n;)# zers`B2z<}5dX`WvvWkzEs`2?>i!#wxPv`nDf-uU2BLN(?ihTCm z58*A-mC_%{IM>$cJkt9&kdcQM8Z57_t4YQta?kIsK;Dg#2<+QnVCw?9%+RF$D|{1R9=DlFP#3m~3Bgi2Mv5Zp$lT3bBKMo)#^W0Q9hV|>)Nr_C?eX z;@M=3ZG3Xe>W_kfpjNd zrzv)Q>A}S2y@5ngEG}M>j*IPTBcU!6or9H>)7rUoKQknT_MYeap+|ZI+;}>xHNzW3 z=J(B}{;f|JP};2zp>1|6cquzI6MKgbmeTCiSB0Nqu&n{34e>AcKvEYQQLDrB;8x~w zV8dnSw|F|K4^vJF{pDA*>aMf1s}maZEjxxS*g4$caOI1b)bXP|?aR2*1tb(&H0&<> z(wURC2n+osRECyjyLfP{^WiEHjs{oFa@CzkC1>urBkROhH}t@4T=ubi$8FGc>-XS5 z2(%+j67mI8lo!TbOxW+AKP0ZI=qa9$w0Q&JR2(OC2yXh|^if7~^saDFiz&Gu_mj8s zIzOCL|LQ%ST(}M3fOcpir%ADD$jl>ed0iLho&=$lwwV{lt9BNL)e9EL{1PKkArvU0SlXy zt7F%1W+oAq-1G^$QtW?N`ubHk=>}qJxbEMpd(V}z`3Fsr87Q{CnGk9$ig+hh~Bw6bO zBiVT`C)J9K{Jv&*a3z|Udrl{A;RS3X+pOS1lb_H$w@~J&n95U z{->J2tjcj0s_z&!|CcHA)OG1FY^A>_y+A5m80jp(2sfrbE_sd`UPIF%lwDqh0|1WTNY=P zKh#}r`kVXXwG1a<$G-L%t*Prjy}MA>8}HPm8E^4l<2St9OAphIhkK>B3d<%LXH*V$ zhT1q22(kxwN40@pDb^53DUJ2-#paHfAw>%kOu>dvFY3HKO`cwucGZ245JnD%$h`U2 zK+M6z_FFHD&i6J&frTk;{lEA#?i2J?#?YQ2Ppm{6_L>M=l8gyy7FJ7G<8en=VcN399Br!jHT-)#9;u>dS;-Rkz}m~7AL$nQ?uVva4!yf_{@VWilg%XmU~g5xE0MId zf4s8S?hvFEyv(6`-87AzhCvG*7gfLPPI}a^FT684Gqsy?J4^6btGWT~0Ax6(T7by# z-M>%fb0<`U>S?4IyU(jNH4nwBF-o>%VJKtAY;4!^&Q2Az6B-Z2pfg>~MW87iNCo|N za&fTXG%e1rzpnS5ou`g#ELtp}_6E~$B zIKA`Bp)pu(rQ-4M%r>|?_(8BgK@&q-{-62p!U#esaUa~wNS$*nKSS1;-TQDBdZRX` z%-cDdg}RIqCET12ekSdPXw&XnX8jd02ni96)?@DNgXc6u^VFH39PT(rRW~>?D1*%;64bD|u=Tt#z2sRf1jg--9DwanVhsf9AsHR3?MQU#pPekF~->ZAdg(mNh1TKG(z2Bib%>OE&EEo zDV#>@{WqIL62k{obJio4g8TMsNq+w9BnEs!>QDn@UD}toe~%&u(k6y;yqR}k&H2as zg}8;QrM@K&G5CiuM#}iR|434O$ULzpLvppV*Xra}k_%yD6YIKak&ZmGQ+w3yO2+@O zGCotl0}WlLVSSUq&IZC^Y3*?Oc&FsCV$?t?!M~3(kN{32d?jZJgv+z;ed4!12H$TF zJ{X;jv4G~{y=mk>LQ< zg_RDYJxNUxk<=tu(#HQ#m(97ro#%j@Ly#} zO@M>+H%e~MDLxFJSz2mVbgYs&akXbiv=h+KuleIN 
zd?epl2c_0}J37ko(nDz@%#J6ExxZ%!<6o~{msy$NHBO-BG12Id^8YqSOr=A!m)&zT z7qncPMYZCbp0_tm$=0fx-fUUomsJ0;ht%mz-^$DV8|K>R1-cTk;53Uo{AhoE7qIh& ztf|}@fgAIhT~=Zl0_jJZMc19rK%xulGMNR=>o8n}>hdso-UvB0iriF=1%;&hf3 zI_8f{Big~w|L84N9*T0yFrY3aD_xAZ)YaY+PZABt4Gp>w2j2-7}Dy0a}-uEwer}us#dD& z+2rg>8yOE$5?*N5cK72gk9tpY0@yq<)$VMLxUz4`VelK;z0pag#So8dW;4}C-Oz(`a^zVvCtO9 zbJ^Ae$0q?M@)R{ps|*F%!^D$LqN}8^Pc^koB2N+_wxZ1ZkaL(USK5X)9Gm$&$~~_M zdc#Fkn6s8KvRGt*a5UEW?t+eb{|I_J7VWey;>gK~Qa)*(Dam*pX}*O#A%ZhmY)&U> zbP#f2vIrqb;WGo)db{ZlBH;6nN#E@w+*FGeq8i z3&9Hr2jL%4u+xcI>2v8_OGm<_E7NN6jR*oM_cD9Xj)2q*SmIMQA+i_S%~T;#FH&RC z-k&dpjeLG%Wh!3lQU7x0 z^Wl@IZ=QlTqZIWR(|&9s)rvDDA4~HOlG>zc3PPzb9zZbJ2WF&PqkDhAOz{`wT<-;A z24^9q*q!cm$JTp4iC;fS434lW*c2|+QyzM5!gMA~D?rcmw0pOkqUS?{V{Jn%VNYPa zkYc*Ae39yl%OmCk;u~v^i*ze|q8-NsVfU>slL(!nI5?y8fZdB7Pg~-6eXixwoBnF#}5qdVkzc&-t_446OHx1L%&+r8dG>jx) z|G=5o8FWs5UR%({B)xl_MwN#cgt^D34ADQk$e*iC&m*9*%%y(=6CLb5&BcH>-4k*; z#;Ynj5;)6f(|yG20fQ-H1twrA!p`G<+;h{JPrq zdo+;)8S4xUcERcXR#+DP=O$AHxAfv@cGl~+(Axb@@;W4gMWUs))Kit(-F1Ups0K{4 zR?zppowGp|Y<1u0h7K){x8m(IcET>SH_`FOXB(3no!~!85EGXYM|x(*)}umRV0?b} zKzje+1i)L(wQ`G$owm=x={0y9n)4?MIn)K+Y{v=Aa7f|vuu>g5#c(%GpRj;wY|(zb zPbk32uv_@H9GG0io7Wu_(xONgdV*_0kE)nbBDi~Kl2OQn2K2^Ma)=+hhxFd1kFi zaCnOo9+yXDx&jx0P!a#Cypjghl`@=IV~fC;LIgS`lwY$_xHjWnSnZ@>Cr-kK~ z4|7mdH%NTEI=GXZY0xRG^$+uzKYkrKKTK?K`p`KFQ8Kvp8{lSFZvh(V`N%Lfg?O0$ zyj<-ow2HH@9do^Yp@KVG5hrIw<|U-G@9C)IeG)5}j_Vi~F93K~sO?KPuPvjcnk?2Wm&X#PtsoFHodwz>MA z2Q^&AC4M)RHN;1!P5Z`svFH>X%L*R(Z+1RW*w*wFuPM%&5Kr!j{Xny;a8l+%tPJ6r zY!NRs38%*kXnDRIOFTJXwx@lgIvsQ*M8C{5(xrib5Ejr5!Kr*kXVu5b!HH(9dio~A zfzk2a%z_oZp5}Wx<*d=S)n{tO3(F*#diTz|zMuhe?3&(sX zStNl+a6?*+GKsw22u8If(aszZ1rK3i-JnH_PA-Ysb;YY6{X&ojSVKoqxhlUsbM&9ACnYk;hXNG$6q+!^xjF}Uf}-J`3T#AbDEHVE?%g)B1S zsfbBhOZ0x02YKGa)%e_KPtqmZeZ5MQ;Ub7EkfjJiS1DsZaV>i7tn7**O`}lF`j7!3 z_H&~#t!HiUNFk?w^7sHYrnJDVT}MtQ38%FwZu6`e*?S@5U-#oVwfnvlvh8*#+gu^= zLdolZw}0e=+c$J|VjTU*BOs*7$6AoP2RQL@Lq5;1PCww)-b3(uZ%kQ-XYD;EiLFaT zFdsc+=*-;vP>Ni}Kg)K2x|jIlibk-n+qTqB@LnMFV5zTaMeo{Dt@Nwo5XC&khxL{i 
zMrWnvaz<65OAhNVtxip)L+l>~Nt$#r+{~UF@6v@#k-F>C>j@0a&y}AZxb+z76Y5_& z6c-J$2(Mv=aCH+6!kr-llIUp$d!NcsD0b%d?jj^)WDYLxx0R+)lgGRN7822}F&C@p zWLwX7Q*t#1xm|Rwq@IcKCxsjAqr2-~7gsA8pta6_`HusHiMs?!hzBf7M;>3nWKVdx z#p5Ko3E&F$wKXD~JC!6g)O1rL5dl>6xB3hd_x>z_XPuS(xxO3I`c3ij8m=_( z(iGI!hW)IlhXVwhfa!G*`PK(v0mKgJ(Q{o?HJxn>}6g zKNcPfnbA$ulVAr)MUcn79gFzIo(Jcpqp1h*?`AX`Nk7;k8noB?hpSX9y`%!c~n6*sMzv*A47=XZw1P#(zC2kf#uC# zhz-@Q+!IqGF*^_IDZj^Uv6(e5SudOF54&TC(fTo54|=fUG|2??0RX;ovpuE zfT&w~^8zZ>YX%VN0p;2{1;MeheEv+c zor5zW@Whv2F;wsb1pZ#;W(}QEW|r{M)qJJbZ?k30=+ek@p<%RQ3z}70ozCh@*7M%9 z`&ay4UOt%+yduM+=`_i)BW*UuI(qLUqp}A!vZs})wF$WUNw^?tDYXrBm?q7Mf%Y9*x|yU z7|Q%olAZroxGbB784pgrr;_=(Uz^7a?vu*bv5Movfgc#J^*2#j?TZ+cCfX?R;P^RD#K$Itwc5kLz3$(f|3-R@u}*KfJt2cBTB`|}pS+G%TlNa;Um z6>>TxFukekA8~j);V|*<7ydu_3Qa4gJWv$Kb42L4H6PG2 zn7>F5(D85X|JR7Szcv$$__dM$9?^I!f^L8Ra|GhAg8_4Z>o)&A;?A$V1(W>G5ov-q cmj^cp!ervd`qi&!{sO<3qB0@{Lb~t%4~ATXCjbBd diff --git a/doc/design/mkldnn/image/layers.png b/doc/design/mkldnn/image/layers.png index e65e1aeca47cd2f0c5289d0bb209ef394545bd31..4f87553b41b2c38caca2e54c039e9ccaa9d605e5 100644 GIT binary patch literal 14414 zcmch;cT`i~-zG}02~rdx2nZ@jFQNA)O#)IxLPsEkj`Uuoi8KKrp;u9=AVs7roj^hr zsnU_&YhdDce($>L&TraM8&d$l+=j`+Br+iNAGi@~>1&9I<4-crJuA+yB zhmXX)q{&EdEz7EEk8yAK9(rm@c$Gu=Yr2czB-^DX*-Eai7WE)S(`DcvL_B zzVOp{sF?8Z#FsQw6b*dMw`c8>Oeeiq_rNVa!$(^@?>kzl@2iJPKMB9hXPn85(VTk4 z{r$6S;@lkGRu{OBtqX?hSaW_q|hpbDJ%R=mj28Owc@R-XG>) z^5L}ad_LF}J(|Vr5leD$lfq(mar~{y!sphG-`d>eP8rfrNkzpbw&z>}_eM+zq@jU! 
zDJZL`7z&}OX_B7>9ZRzW z(MFyNGo{U*fkrI9j5f>J*qO9XA^jyK)q%$tF*{%y9SJDTTvn9b)n4^5-cDTd?;+N1 zooLDl9L%&*{q1mq`5*uEe=(N-LbInyz|u@<;BL0uVMpm_?JJd%X@lOCL1-Cy zVMKCz-ArGO#`$;va^*9hmoT#TH$t_92F{S?vZNUZtP4CuYZ^fWt4H2AAvk|6fU zUD6U8i1M>2PGR9)*3)Zhh`?DU~H^pq+t1Gdl)yJBDM=uU~JO zoA6nTD;)8FL<-n)@3qLWwG?v25;fBT1B#{Jzf*qET^!O({Wb}eXR2MxsFS8XZ)8km z&M3(?swO}Cz0s=3k9YVu_54OTt&3hsR7g~^$kxRE}v#N5iye8x4geD zqK4%V1nQKmI9VvYn`)M+&Y!T6_&IL|Qvc-1YIng=SDqqd4~pdY8gNvXB)Pr)wx1s5 zAH$7X$g`h6MVR}k^ISksRziG&t;BL+RbQktS*N@_t9G^S^BAz`; zYr#qp-SVs~No`W-NAnj?(yvg`EeF5r$VYW|DK2CQ392)RKV z!z{W6as%u2cg#(k>mL+w%$AX{$-MQ(1SrJ z3MP2embXh1enH1Lltdyx#Gl*Hzcp+ID#LP=u@xjPwE>#8^BK1m7gDd**B1s4E)W?T zTOsZNE#wcQtGZDagljL~^=l8Cp6Zp?ay`uX#67AO(f*okbUh?Y4CTFJV!;ChJ*%$D zWno1g`!gJDQ9Zo>&&cYdc!{#Hy&OYF9I*FKBgj$kgM)KfR83)vF~>W}S{6nsRm8Tr z$pc3wCEERYxYG15kdSgZ#&p8`ZYmJuJ5y3S0asv|GUd^6Dg%d=IX)EKJg)3{$Fab+ zsrdC9O%cJ>+7qH?%TadoYQKOE0Kv`Lw)xlcDGc58AC#!LXTSn+=1TkrQxz(a1~W!Y zBw%ABySe=xyiO2J%~fA)IoocC zTvtpJ7~s4Y=NlP;yR&lper^qP>^1iDC;mBOX8dnzqTO%CJWfmQi)#8E`_>QeBI-lYr%#$LwUdFu@>c`8Td8-dBO2GR6+=z+rnR~mj^F{2B3s8P| zuY2;xP0zyBSWGMuwN#wiWepgz*sj3=A9mVUX^Mug?0KpQ{j`|W zY?FKNhv(k*ZkjjAkUqLh$1G9yk51;aADho(7ig$_AMXCRFbvu{HPqk`=VNPA1AW3S zj`s9vng3xAj*V(Yh8yygZH@8qnpo)3PxCkMa*r?8?D*-vn#vn~5MO3yNcc>XRlvwF z(T~ga@H}#WKBVn&Tn{++kid4|x1%Pq0Bk$@wfQ{DE)973@Y6t`t;u^M?}^Ltn#Zl^ z%?W|J^5BpzThvXI+{7%gb6irnQJ9gey?7 zq=NtPhk$O|HxEc+T|l@aG}}mVerQ@6sYC!cN}bBdGg^@;X0J3HsM!lLzbh!;0oD>G zYTMeF-n9|RpkZiu03WeTL9ILzOSYvI2A(~nZYz1l7e>(^x z-d}%C#cirQR&A5evM|+xF7BKA8LZTXVIJiWK4$7qELhtMzn^*4@>bKcRQ9MCdRoA6a*H=(r!U*jk*6Lw?9 zIgU{737v$KzOYecm|!SR3eO->sMQevApPD^CCu+6OcVzc##AJrPeycoNsAd@#0#TqI=gNYxHVGt2vblHcuj+tJ5E zY;6T|l4axhE>I!2l~Mv~tBvX>CyakWIuzf36Vzs*H`N)m z{_$!>nDKCQ(=O6Wx}%~#_m{aq+qFX6zq(dsC-hx|h0d2gxXy&+^k-cT=WC9Fnw+op z){|aiS#Nzdq*l55D&YBG0NI6Wmg8~a5_Y9nH8KCpm-ASni{y> z7Mp_gJ&bTPRJ38OMK~)}O>7de{f4c6!PorC?HP_sgYmH_k3yK_oUB#&?}>?+>*iOX zyB45yS8T!4;L2%E5rsMx!jWgUdE0JWa+|&VRrLI?zRfI+W82(p+PCE2_x8^Lb%TZ@ 
zgUp#V`k(ehuei=_fIB{F5OSWxS>H?uG(qlj2LWrA){n^##b31r%NqpyetI$xUh83y zhvTK93vVTHWFK)w$x790#J*Gxok=cxB{`pqw!EDzu1e5zCCpwbpJCjK|(?yEBQ6ZL~wE)!$vETQrWXpcscVwu<`t zUhNNJ4a_Uy9D1LqU(@PU*q!~w9BFY8U+mU$wJC!-oSyF8@a(J&GA~eF?LFD_grlYL zJOdJ%(^{r`QeGj%xRNMRrgZq867q`KOA#?PxKvn4PA5Yj@m(?W_>VwOWsj}q`~w<3 zWLr^5B;QZ0}iFnchtf|2P8kZk<+>goQ|oJ>G>|su#Hu^)Jn) zz@`c9tXG9vBYXtnGuwafY!)vmAN0ps9fDG{ry@Y6NwZcR9zSOv)o=J@Kd2Blwx6%gL{{MozK0ZD0C$ zbzG{%A;~v;ifvOQiXf~;ui=m+^sYI-7(%F{a+VQ$(yu`juZtaw0AF4#=DH zHriA7ye)y)1pM6bVciwX#s6p6?8h{1U&cxPiZnfAzkH=1c=!gebyxDU)m=Nq+VQB8 zq>mCe=X;@CK0No~Dx#ZTrLsIHcmWT3ulGLI>CqkZk*4=Rs*PfUrvI!oux=)O@R?zV zkI{A9QK0no^vHH{Asqzd1x~G@Owdc~vC4#|H}|1O{A3H2Gh5 zNRcgpYw<3%3*g5#8j;S_TGT(2g7OhG{BU&pp1qCqbk;TRkym~jX^_ZH>-9BO?$GaL zQ5I7po(4yyzEU3N#X})9`prUtrsrIt9|aUBRinF+PdoGTo>-6oWk=WB+_@>s3xED2 z9Ykw(@lhJ@9d092iCE29uuhTUAzkHEhq}~}UtU3Oq4b(H%bbxTfqk3d9uMQ~&=Nt< z!6gt|ptccp;L}9+Oc-Bp6ZIJSN3;SVgH70Qg~EWnVye$+B=IDlY<;oIIEfMUEC6@9 zXbcvZke&V$@rwBNDTurpk4#T`I&%Nz>mg%5xPAp=hj)`W8)pDVmYpjV0X^~n9h-1X zW-1P5y?0YT5koG#Z!-k2F#;qoLiBqXDie7_p5gJ|lbw>a=cmAC>v(0@?ven_hXx7w zi70vD1eaYn;rQJi?NsEO^yJ|}@+vY^rnF~yXTU5N8)oQHuF`;GbrnYF7mu%vD&GAav!D6!Fp-f)*^;BOn zbb29NS7ZQ-(Ra+QppT+_nry~vr(JU?T?ivPRkBSmjmw-Ohb3%Br)$wMBbhNb2Pa*h z5o>N(O#Es!lVAu4*-xRrdL}<{8aaH^5w8o*d%5bBwQ}CzZ>L&LGpm_pFv@N^?e;beDh&grgKG7~Dd)hzOY&Y}UPhC@33r3o78F z65;NY7iYt1E;?5OkGQSZ)6YkTz4~9UI*X4kP|xDzQA|HWNqmPFm;oNTnHuAw_8;cp ze`1sV?`e7yNt2^r8yU@Kzw@@tSiL5wlYw_hKiK^B=uY*whd|t%01!jWwcV&s6bhToE@ z)|`|)w~n)G(ywZrN&uwqI5~A|+T68X(qERJC7%AYyo@GLn$+_Wg|gakqR`+?7P8G9 ztMPjJIBxRSaw;>#2Ip*jhT2xeKSWtG*@`<%0bQU$SIY^HQpbB-v&{Ubtc~dv|KuEF zGPx;9HTxH3W$F~5jXvbIRaBf_2xJ3q{VZDK8*hC@;tY#zPf5)x_HKIDTIA_1kF;$k z0Akqja}BFKG_b1JcK@nn({WJBGLOp@cm7d#;BMVfV2LcmZ(gc9yb~xXN7MpurDpBB zPR2~hxWf&`yaqRtm>y9HLK|p+?Qt%#@bGk#llA~CZeaDeZPd7a?`NtPIBvR}7mS>y z#GR7ngsAnD4h8)=mn>Rp++u3~w^c(*X!HeVoCf=PKHG7i#QE#Zkf(C#-}CtQeFgi; zBXT#q(X#K8vzbK?($pmOO702a-1k;_wq3J?OiFa=RTsyo``3^eQ_?S=MY$oEyhA|R zH}&=h(!bc+w_b8x5J-Z_in#C)RNaQamC^L?wX|O9unp 
zHdP7u95MoZsv;hT6YVT|7h3!R>7<;tj*fLsj%DvFlV5-3&OMjIp#G}d=ur0{-+mk6 z?PE6RG=tb^p(%@2bx1t?jv!8~e+P7Y2w21k*VI+Gf^mVOeFbG-U*#(e;doJed$cV?SzX&JG1G*h`uOeoDz3wm?_{;HHikn--J*@Ji`f>_yB*rf z#}y*vGBtwubL~pmni&6z_HJ6n4W_5a`w;V5tE}W8% zQSo1$J68D;_*c4L^ysaMbvk<~ex#yzDckO>!Uu{m#6~&$tLYDcRlb=1?c=Xn@-VHh zul@vNhjJp5HaV6AyxNI{+%crZ)V$Jzd@>SX-b{b+LRB<{#{Oe=K*?&p?=PZ4QZdoq zsvT#a30jQ1U}657ab4DTsEYBCXUs`XF28s)fOv72jdF|We9oe)1UPNZ2oq6x(u%#X z9er^~s^jW8L5)GZRQ;dFKL55d|I?7@KkL0ul}T^Lkb%n%7T@)+W{Vo^Xq(1qJWuQ0 zIR*}anwVb`vP?~Yl3TKIHp4%-w=GYX!ph9_?wi!7zr9O5pjg*51HYEYJDw)%3B@*Q z0V=Hr>Y4u`)N??|4MT~2;yJU+f5WF7`JX)J&AtB$H-4)9-H4w)$7I>e9%%G!T6EUe$H-K(ai0}ROp9~fB`JBheArtl zI$l#E2k+0?e%g!6U~rIp5Gf0R&3;Sowy~7q%S3*!v76nxtmhn&)L$*w;X(A_vZJ2` zJyDa+mHX0;DwP}irs&Ca=BH2FUpyM#a(EXNGwf$LxQ{(5WZT~m%G+9fUkyF|)knja zrT!G>5}TMjSUqX2+-w@AyD$fTwfA#Si`x-ULOg8qPS^;oG4& zT}6`9aArR3o?+jtqvPro3lH;dHFPcmGZRMpG0X4|x!=li_0^R~_-v`^L_0XWk>rs< z_2ux7Wv|(jERws*o4*{tYxu2oX54EMcHwqaz*BhUx3-wECEr9cT{F;15wl-yF*H#K@$c&>ayI1T$5+}I{_UN~n6h2Wb zxm-SercG-4;7vKSb&y$uc$F8QxzVlywcm?@W-qHTBBQ$rOL@Vnz7}Mv5@J5*3a24@ z5urCumPd5x`#a!%Q??*_<~J@SVnht8CmMDaPpL>{iVvG%LVr)z+P3*z z%admDX*)!|&GY3^hWp= zr$wrbV<&qss+1~=IgRH#>#dg_>DBoyfEB$@W*hvqypOtMQIXc8 zE4REpPL`8~mFq;@rl|lBqL{Td;r3NLBIl;#nitJ5$*!iCTz+c9?>>B~ajpQW$CD(U z-^aVR`Nr|RpjG&!waMmkGOBKGAuabXgiVqatWQr)MVdWWu z)1!^H(|;xDTy&2dbVd`_g7ay~kT>p5Xpw1bcZ4$sb(33Q+vyL`xliJk%vCTBJ$5J4 z4y7wjv21O~`H$)`c#`5xHx`L{?HN*etE$V^sP@4T>+Lp@`$>pl zrPD}kAuDMI&g*=>>f;nKSYS*@m7~THzA9&PFVVRo-5e;=+bVjR^%0b#sOt%%63Z1cy7qn`U^(v#^)D`ctuZ37u3#iZ*Wp2+cIfe|$ z$`p_a9U{xgi6V39Y-68|&gfOaLhv=&LNERPy*UeyF#%;d##*Y~(b>sYviQC0YZDuR zCva_SpPKS&BmaioRcCa>s!w3>P9KVM17|ZSvIpKot=LW3u!51&2*ZSR1#fALF5_36 zPs|aMC%J`#6p+a3jlRMY-!1}qyLi7;4 ztC2w@a<--3*w}%vq%)UM@R^NBa`I=cios*zq&hZ7#Bh>fyM$x+S~OO!woF&f$0w!F zaQ9D4Z}bJI=jePbMs*|9G#xfZ7V3<&EguiEGO>BjPsDJPT=#?!nf62r-MJRG zOVD%aXBWbB6qJ`s!RRJ*v}rZ}a}_5sk(2lTnf=N7lt>ik5`YmRhtu9fAcHrh4E1%Y zT2DlCan1+hDpTfQ|FnPobfH~G8L)5*b7Y15nB~>w)*bjFppK-VxsRe-RDpM_?-rb0 
zNq*$D;X-dewP0L+f!dSUNl`*-`k&0gR7W17DpyUzO^8*V=I#jNk7`GpNOl?(Q;4YC zSJ$BUKL6_3@5WRpOM9T05p}iCpMd_&ONEKBRc#yQ564(51;TLZiy``m5@jN=2$n7H zrD8kPdVsQJ$Laggo?dr${6unxvn#<*E?N&hwQ(DwjQcB^ew;n?axibf5i41Bp?!DHa@AOpfo|*r!|z7l$Y2&d-r|r zJ*E+BQ!E|e696D`haGf+qyRz`|ICqF-@v62$#UlbG62u+w6qDMRC<6FV4RWw*|v~*g%=Qauea+k*x$-h+)z+_9%-&;jF6IirWO_;dqr5 zT&VDBdC#mu0b<2l5Y?T}EfoX5%a%%Ld)A^?l?{5~?n)$K&U0-d9CnvBbm*{D~Ekf3IJnuv#J&gg4y zBAt5zQS#sR#A~#{tbo+^>a9iGvV3BJ6A=4Spt!-h2Mr5Spse_pXB&Bj4>MoAuYlqa zP!2VHX5nmNxEH!<=0-xfzOHRs0+D|^{WhF}-fqv!xQ{)q{aCEj4da^dhJv{tzS9{a zb|e6Rx8wYMWax~ojj&_Xae)T;^pc;a|F>al?u-D@r=515r@xpyit;<3K#$@UBxr79 zUM1GLA4LPYj{)YVac^i*&MtOh7u?k-k_wr);nw8Hd<5BdPu(f{O2j_WS@$F?z77Ur z24MN*uH!FSg|t8&Km^`Ja@1mEev2;)R9yvJT_y9&1<8&Nv0z{bZLhDWFg6x-bhlEu zD}0BD?lfaoi(L8hzTaFezk~VR9+I8wr2PBvyR58o!vQo4N7C&v5~k7-$1(b3Zz5NF ztNYfg<;y9FWjGYr*?th$`tI@*nFqG6*K%WdDU^RuLg|DR4jb}&60VuAoM$wf6H1zK z!Y$5aya;)wiouz3P&i02g+lxKGt1_l>At_hqo}q0pWEg9PL9E%ckXfrJ>L2zs0T=s zJ+ZC#66X&u|2~yH1>-O7Aplq5N|2_%2RXODqEGPowbgd^lUfVED%RTL4lB5SL6*La zB_n_QrytGM-JWQ(--z=$&hWf%@~3BcXFuCzTStz6R$ZPq=6<{^|JL^P`Xl|Ee0`P} z_8EV9^x|&sEGq%lqSYHklxnx@m`oZ+0K6SySi3H_uXnaJxV9i;c46*9{MT?h(s{ij zaweYGPi$A)b%#5+z@ynyc0mYPzBd{fbbD1nnb^ZxHn>M>i*$N{G4=b>2=TNP)2Z*O zv1TGM%)D^94*%Bdq=2lLjJHLt(Z0}i4%)}J3PLB+d~ zrMC%zy2OfkiYW|AxA?RlTd~Sx`*eSuXA`|}a5f<|s2p_%$?11btCEIvK0 z%Wy0ZYz$jZ`F9`YUuswXx7%;#)tk<=lbtKWFM%0;c=}BX*J2IQrUq}yu7w7`JBbmL z60t5MgpkPZUj!S~P&dBYO^ArMh9n+wvLV%K!JeLIsK;gJ7b$^YiTgKLyUFF!YltPq zy@Je+uS2{|&popFdP+$^S`TA+T3$U}J88;8EFX|YU#!@DP+gATS$a35JuJ;5r0spP zGb`t}@goVbKm$$w#z*jTSSK;#d|VLvU}2(zbm-2GuKe4wQAa}9V*d*rs?4d^ubkDm zn!Dwhchb`nZVd#>8qO5Xo9!Vpa*90-t+bNP8+MQS1v~?67&FeNS}0g9FZb0VVD&I9 zl~_Lvo<2!SQO_|hsPkCs+un|poX7L{YGrZb>Dz7^n~440@{(k8cjL0Hs#f))2fTRb zTU0zDaC5k32CLC>ijI5ud3OGfH`!KKk6wb?O9SL>d@kf79r_XCHXlfY`0CQIr4*`U%X;v<51Wla5Z4M>gFfQPA~hL?jaJ3^O27y0x{L*Jq<&)Pg* zX+>;Yr8(8H+=5wln@PP2VX?s@GbhvQU2waVJAH?Lb!G%46!Jg<|8WaY3}+0X4Fqk3 zSxp(icO**Q31lY_O+|g>3%#DxcW1K5!t~HGrr4KHzVXgyDz!5F{o7addSyMyUaB+b 
z>P49(&1wJ5mmSImyt58c>}rN-#vTnMjEn`YDLseXx0Jj)qlbTbspc(Qj))0p%hrR} zC?7J`fN|rIfxmQ!IfLwVJ9Q3(-H(MmwUOP?lWD|jTRI;xF^I!LX5)-I{BUb zm6DV!NOl+x2fi`c?MTYswV|E&tDV-;x!GU_9z2z1%C<8k#pRy{bZAXsX~$22lijOH z#Ypj*EA z6%sVjazT@2cC*_X8mOP!gIavk;$v^x8kA>}6H?X#)i*jtVa;k!BIS6RqbM&gPnc#8 z_-1$7THH}ihi0?U(^_u^!L)lBG;p}oLu0Og)`nQ9;-339D^PPAyA5Gk}NniAd_l%!e=B`oBi#032&BABOf;f&u7KFqI2Fg5yp@g{T3Q;b@gQ;UW_!cp^#kEvdYd2X($ z&nZy$aMcb3pNrMo{7Q*YwRtLTOOI)*`Ku6BU;gq-wGUFlAcL5)3iQ5tBjcngMFS5N z;xsNet1HiyxB+}UG6xGXD26I@8tfGB%{@C`q967J4#r$9beC>ScS@a7A)aC?S7=_# z7hftsc^NdKDdvKlXZ&g*gaKM$aU$1tyx3r%hy={WB{YmtB1|FWpSg%<1gGa zULX0%mbjaiDzgsf)2dc0aZ3KWOdsKDyGM4vakF@%*JmQF^V!N!cOK)<&}3JAB$k&b zh_Bbsjndu*kD<3Z>HR;|sp6a7V*_CwgxSWw3oVEvsH^YC+_!8*g^>|3Fvm>mE*+B2 zv@$|Wdd=(W>7%Tz)rARe3zshjv4J}-`B@kl#&%ie`NFlh0^h^Mve)jthW9bRdkw}Y zv#VY%PtaG8>2@%WSiV|zm@#nKtXouFY5FNjU&mHu-_+qn1!w6Ly;UY@BUX5+6hb%j z12Y3o&)yz~r7|HKuYY5M!9}2+TI#W?MtM1IXCkPk*L#S2{3Y)OGyNV_h|VYP|hu6WV^ z3O=MfO3g2meT#dF3PqNb7VpStfQ~!I6(OK*cdi!bo2FO~4D>{!L^T+m?O6mn$3cUS zb6{YH%Aeme9apY_bsRi_$79uqrlLJ*>TX3E&%wp%@L+-Z7u07Y_ zeALG2%P;x35<&eA^w{eX8j%mY28g^3q8D*hqW}_Ro6mWjs^4z(GFBynm_7S<%}uPi zHAL3RNj!I!UP5fZZHFJJ=m2XGasWVC_cN!XJA9AIIu#zAG3yF@|K zMjU!j6dZO-rTws%`oTqTGJd|F6Ylo)FoE4Ake*|ex|t37O35#5_Qw)%1<@)#xR|v^ z5EDpXUgGX7hF@zK-9!qJhF&NSx40+6rK~Y@gV7@nCb>>#K{b;bJYVlW^^0D}ZjpgG zl3W&_kybdqEu_j7=J&fdl=0C@r<;|Ha@Ta;JZaB~#pvXh^#E1xSh;FL z6z?pQ*%DV@Om`2{AaW=*ba1gjGip)G5GVlW|8XqgD@*jaAZ^jqjy~f1_kRrc>XlHg8>9K1fS=`M_Ch&yg!<(W)6OCp$*onHmwX4n}7*Uy$I ze=79WU3trN{ zU7D&JQWhw)?qXUV*PqE+TA(nYAJO|cag$3Df+K+WL`uZ13(KKfS_od}L|AMWD&ve9 z64Myt<-{$%Sw}5sd|a0_j@Z?*pu03G!*67Wk`3_<>NNfG2IF~_HG_e##CT+3#Mi}9S#TQv1Bz{U93?-rdL;si529f;!@KtHaW!Q}pJSiBpe!jm z4{s%j(tD6WGYhyTelFn&FDzulg3D%ybi5bpiQ2x@cGxh%-Y`%K`VvMM304zMZs00T zu8C>>hi8udvpDJhZ#n&Q(Tc2j`?MBs2bGgt=Cqr=L8*U|a-C?>N4U&oS03}bJm0-j z!5TBH)3Y3~d%WMRtMWddKH#NE_SK2?e}wQY-rO7lxeV{-o7?7~$|_=8^73pb3^f=j z+=csT8IGj4PtlzJ0z3axOn({ua`Etdi>C)>Yo{XPV6rDdI85d?(WTw zO4^<}xJQ8sah38#7L&jqqcwJB@l&gd?WqROV{t`JnHUk)jT2@seESMuuZpl;|I6t= 
zc|N~uF3!%5$?bgxO1_lu3}{X%@|DTk_5Z2gHXh=O>pA7k=Uu=KnRuHH{8@b8Qe>Oo z=#=^XyPmx*XB{sKF7^)ZsZOo&%|<>-?GjqN{QBH>zxf;0Pc{9yW{Vd?yI=bE5|{t% zrOJH%x|c}e4w9@U3S@zru%^O$ITb_F#@2^%`h4``+$D1)(c+#a*QG}ixKYX3X;f1{FDeffs)K3HV8 Vd&Ccgdj=3sk|NJa28p(T4=>Gy<%FL-E0210X9y2`Ms|?Wbb9s-t>fw6^B(&= zlV{HsG^NFaRo(RVnu2|?%$u9<9}RQUxiuug88pMDj?BeB_ohKW?C}8?ohP` z7V@_r@FE=x5OCi(=;>01`?6di<;x z8!TP7G^(U&s=m%L5n3p1ukKD$SN}{#^4W&^hGmBZg=3h1LC^r};YE5hNL;r`kF&7} z=IE&wdaNd{jDIoP>h-J=Ve3Z>ce|{`9O%5TU|=^+)TieHS%L!k3I64QuPF0IWWlIh zt&u4kjQ0T(#7;rqX@7b}$MPo**Jjc166Kxb#OLy+;#;FrT9Hf(=EI~qx@_z@TnVX} zi`xhn-MSRHcBbA+w$RzUqGaa&lCw2e&MWg6^bX;7(AZuRR)M-&>yCo+wy1`>acSRD zsCx(9rPxVzJJUbL@*n2?DF)MUG`kaP5Iqc4=w$14=1A!am2{*|k^Nagt$zoJlC}D} zBl!LIPu4mN>$Tl*56jxbx9IzdAHqz25W9Z5BuX76BJB?lKBhIl~#!HoJU?E&c}H>;JU<2zWwgc%JB7m|h2~_qh{( z>BralBxhT54*m=%-c@Y%P16i%sOPKvg>6{DOfQYM2PPu|Pu(|bF|oQ;G}mUzF8eB0 z@vkhK5I}JEA(U)p`gbF4Z*1Q*GP|#qt8&EmR~Nv&8?@|7l7eE$nUx`($*g|Ol68%J zaBA{0XKC^J3r+f8NN=}T61f|QZu@2~Ry0jlUR64jjLowDF^(cyEzw|2eI*9_=65NN zB*EM>Ke8gs?gRr3M0TIfZ6mZKP9y{^9Q zaA?Hwib=5XZ%oGTxV>P#Qz>lQ6NP^iY~jsYA=0Ll4wCzN~*SJ#gaS9z2=L-5^8WQf-dq8&>mwAtl4N-<%QeMD^^s)J#B& zI_#~qtkXZ<#q!+@GqT$$UD2#034y&8*stM!7o67bb}Y=T_Jp``V9Qjt8w{>AK-F+G ze5Gr5L|IYwDP4>^5eH}+*>F1UR{jRxSCly^-6=iTvV&hIUAXNdu6dj@eP99j=Cnx= z^Kgi|Ct1(U#RIp8nsXZRLt0a6Tkqab1QZ{f(JXqCO>6a`#PNP=k!1$+Jj|5NOrGR7H#BS{MYb?T1>V z1x1{=Kh)>)mPXCHJM=d;emTE0w?(smKt$1}JQu>6 zbKc)>vMzls@05ygvVc5p8@Na+-J?7$IkSX^x{8}8^MIg&crg9bY?oW@~Y8Dw>I|LwcNFR^ud#}?U2SutCzLZ|c;KSG zlpL+HMaz#!JS+j}W@2|E7!%OG+;|+JJGZ_r&&0Y`6oed!+9(6W zY!rZ5y8Rn46h&~n*;x9SFcX6&h-IM_F~J)ZCQjV7ovxE#0mM_uzi^vr(r~k-i=)aCaUygSn43T?9`q@ zi0mcg=Mo8u=UR1pOM&#|L`)%XS**I+Gg&cn=1?$q2gA32d;UNCX)+Q?5F>+Gi%*yN zE?Z@X^JLPu#YeR~PyKwE&M*UvBL3LW6L*-d8d};VX@V`@*e%+)J63-u+HC%Zw_369b?`-XsO8F4R z`2Mu*kKXE~bXKj*<9{0jxTm>6J&>P5=MY1_xTT1xvE}`L^2PpA$`8Q-n>SR-npJE! 
z%XBUy8bSGg{+?68P~mm%NtLyKs57qGq;Ae*({>4-dEySh6aGyY@^6ILICMDjQb!;h zxgWbf&4HWLj!ZrV<*)o--_)RG+h{bkV58+b{dp3$kQ8QE!1PxP4l+!%LjCK-hWJpe zzqb^AQ2xYUufz{BEMWid$N#|&{~t7+Nwnu(tsLRpM_B(8Y1T)p)B?g^uY#9|RMOLG zh*Kx@nK?b>HJDzl<^kx}y+gny&8JJUtVyr2M3C|&D58MriO!!~ z#X)mkiYz(#juS{TS>XUvB9WU87os=1RstbX{vqWl@qZ4?JK!5DkRbxkVEp%o|F*$@ z`@#R+y|BghQlN<7NK!fG@kwek`wSot18bcQ2j_M2H+?wciQyY+aasu&Sqm=fv`%FD zX+S?Ru=9Sk)+nTK)UHC441K6HeWf3LijDeQj7J=RAPnRx@||ok=~uvBeS*1!Q}2UE zA`;`LpvOd-u6~=n-5cfkF9-&r@qfq4|DxXiC-a27W`+Va+gclotxtxV92fNpmAuuQ zDT{niS}XNfAT{IEWeZawmn50_kT!|SadlDfyaUMAMt(VP{aWk4THwWHC~WSlJ83-F z2W7Sa36{29;-GWGPW_Kv#_dsXe{`~% zmpM4KreE1ONR#Z}0a@5j?QEb^)k86s^P=9U*M{+uE5@5B1j_4@d8v>WmW%@re<>`D{HX5ojE5X8wi<_8@=+~vs|5E{n;k}r3Wht4qGn^az36g@(S9}xfBp+G z20wCICW64s@2@0U$vaxcD|V2!jcxh|Tda=9r@Qm)iu|FSmBl0ii<6W*SG2HUfPrwE1_=0;_Bbjnwo<*bl*Y(hiQY7yLoPl`XQZFPGpFeM>*S_(X{o%M)se(fGO0Yk_&7 z8uJ4w>M)5)D4Zpz|HWS#UiJ-z+E8AB(mKuPvaZ*KQ*eC}H1X_`ZqD2vV zPL;@F`Qca{bQ z1E`7hdIeEC-1pZv2;6`HuGDs+#zUg_uZf{V#c?$M%z)!B7)RufWNYKgI;#xO0XNeSbZF%~8} zizh()?LZo?`jj|f=4A&w3N{_e@p0Rhdw0 z)cnVAy-`jg0(f=jn=3_{G_*qFPh<*p?0n$9X|jZ&Krze!O>cRvgh*#)<#Vn>kqIaP zfj{b`rQbmJsD5koXj%>~mFQOh3(9V!?fBHL-p$_Hi5%PpprCoJWj2+-La&X3q;wNb zg$i8;0&_p~`|6cWd(VDR0z?&<-u#2R|At%2s!nMGbRe>c47JB++M~lcYo1(>Gz#q8 z5Xt{b!Bg<_y=yUk!)CSV$J864Lb*oj<}F8eD9XPX@RyUUt(4IGGi>#t&6~N~L(OB* zua62ii5}>AHP*4aOuNLHCiF48V}SrT$grTQw&LCJrc=;DrLE<{Oh(6Oh}-D?d~h!3kWEUxn*Q;XG;9pbx=K>`{A}_|6+F9PQ1# zFr7Do=CD@GmKIYI=D<{P@Ss_M>@d6mJGDO1DUglhFrjIFlz$Z{N;;ZERQWv4XxZzV z+~n7VvNFlINLtnBqI$fn7t*N+w}m~-7+-U`c2|WdQoBC2iqvqXgi%&3g8c;0wL6A! zvN%D`d12UaGiApPDXZ~=E+NKM0iFlO^*Y}Z>RtHFkcRbQvdH==({zz=XOEFd6sMl3!zz7-o_p!#)x<={i$&bTUiaUpl_Tc}|{3yv_@F zaM*cP_xE56o$@Bvq9Jxv>{38YihCPYS0~czX+1N#*9K-Xjx%j zYc%CNA7XojJCs(C^;h~=z$8BI^l(S(Ps5<2#BUS2XXSpw={VLV)d0`o81n?>@LZ& zL`+7~3KUN|@6wcgp`DwpBJa&MNk)iAnr=-Nc_x@P${IE?5XD844G@_woYAB+$~3#%KIX0wj6YgTX=u5#V1rU6MSecalo-o64p;6g^y z+4Vv79m`650AsQjGJf9*4|`;ct99JgXeBzM2|f64^z0kMBwb%keE(%-7EfB2_(bM? 
zpy0P%QQBkg=*p|k`ppyxYCv~sKZa1G5?2OH>W{XtbfmZX(!qqf8V|lNfTS`CBMnY; z4%aaB!YmC!+~%th2+3na<w-lZJR?GO2w-AIZd;--d?DgNdI5L zkX3Y+JvDTct;$0}cy4Ofaf9hrg5^d!RpO4FqHk}~v8Cy10xRmvxSI$JQmb7`=ifwn?s}IuO5&kV@{R5gyc|Ir6m4Tb z*;iN8@#cp533!c=%~j&<))r3(EQDG1309R2gTJ9G6sSH4&MtwV+t~`W0rJGO^Cw)*m%H; zQ15tJR#c77Cw&{k29Xf4;B;r1g)5EyTWckcE2KUTPWv@mfTe+H`6t?Uxe?t2x5o>x z2Fnq!57obESf-__KD0PZpEs*E`|Q=^2TC@$4Q82rpngNE5tH;|1W!_zF)dX;%d%0s zuv1V;Pn6?F=9=q=+!6n4cGR)6(WYF40Yi%*XYMvtm{3`>Mm^K_FEXKc+6ks0`*GUA zv)W<&6p_PF@l8%fDNS}>t_cB@0$yIu&c zqlr|hA8lyK7AS!bwB2?pZ2-N%4xp(Btdn5IU*moJsvuskp z-jGHwaFBFjHbtwR<{kERf3$hN2P)U>katL1yv65j>@1D1d!Tow~17B#8BuuF4 z4Cb9;{o+25nriTjQF2?ED`GJNnF8U5rG@>8)CC~}O-!P)u0XU4!gd)C#&pV&&V zVFm)=bDrpQ{UmC%L13>7A>ro>r&`vvwjbdTQHV-9;eC%Pg8iLpuF9OL14&}$SxlsJ zx{*!OjJbU%udD^A<@VvzjLsUCLP71=WT6yoi==oHe~ zx|0`51^fJX=g>nDDPH##vo3!zyG)XWC|sh=_M|J^Be35H6ms;t5qWvVrCd%E`h5KQ zCToE}1XYmx8UpnbX%WSXRWD{foS>i788)Qjn zw_qx)h=}er`xg({JTK}yn8p3j9J3WPLr`q% zmJ{_VBd#Z}rt1S|K!!7)t8jh)9TJnh4#Z2B&K=o8^+Z`XlEiL+5+6@e-l02nI24g> z#7k`+=|gy9EZ2P5Wei7v%2G_j@FCsIiz)c_^MitC8)VgOcgLV`q^{8s6I@Z|#{9*m zGwB+P|7GhljGqEPIGZe&r=#?cjV~U@pqoh!)q%Y^)qOzz-22rbUa63$Y~{z1SRIhO zMl>Ts@stuer1fSFz3@90S4QzXgG9ZOjV{q)>feL<)qBlTHT}Wp zLJ%gDWY(Nq_qyz@;4DXDBSLw(n*Z)q8!Gd{EYp$F~%79wR!5HS&f;q zf%#BWiOjYvEu65-{Q+oJxltKP&n!`cHp5Z;WF+K~`6S1@r>lEZ0ae_+P@?WKGN-vO zltbfE?HchqG}~-ZUHQ`jYj(}1$2`q!x~BKCUgf4sV(yfmrN3GSO1p+YzKZBV+4G?b zq>6Tta;se?!r^DK>7P0%5-&^|-o{*3k;}SirC?U9^>f|014)ZBdyBV8^S^p|n#l96^guL!x_>rL;!1t4_4#I&bNXIi9VQBVOX6tW= z>*!N!ti)?vH@7N9QE@JDEGmm{cJUX6;hT|fx}|jBVH~_OJ*Ek>JM3c~`j4s-t_@)F zdcG}b(|}tQ_Nbe9c9et}Hpup0tHRv>Wk7wo*Bh&yZLou`QBRe3ptuI#EdD=!rp@d_ z`1(X*>Fk?&?u~3SW+$Y-Dh&Vs(BlNdElr%F;Fe^CW(KsIdNF5pVysO6c`sg!gUo>jNPX`zQ&ZBK6i2;h!yV$;yzoagT7j@vv2K zeX7?gbnceYw=|v-JUw@B%yI>f=UU3+2k%lOI8q2F0y!oMMz*uY4H9K~3({lpNO(Yp z)1T5|h5gafzL1~)Y_XGJiE!p{eek>2C-ew!mcfTApCpXWE1FikyBH~1zV%rpCzmmtsnN%En^vi`q@o;hq6nmwdgNs zR-yqjyS6#}U3YtNdb#hbF>-w}8u@k-f*=XzJ=KopteQ1|vMokes!+My)0O 
zivi>au9sO+HTpbqPm6v%Zu$xK+sw(&!RmDTxTWMEzF86<`s<_{>KD{dt<8V%_)x#O zPDM-`@QHKBHtniW2VWo^8r{3`l#m z-1gLyY~Ow{q<`0P|Lase>h{k2c2+w*E&2>tI@$o`B=;*N;{^!ZA!ieR{bu7s)Vphc z1eIt%@<%6Krmrh*71vF~qJh~DRhgGdRI3n9@9clNj2TOW7=2;6#KIhFt94rv%D`c~ zO9|i4+R4NDNt3INu-pWVmd2mP$7#`eE)EyGE3xg3NeJ~{F!#p~Z}jP|WSN0Y@bIhN zdfJaAknU~(nH^}eYp?kY*sj)@o`2wfE^ksV_}BE00WAAk>)X?cMZDK4!q>YW=J80m z2dich9=O!OJH8W_2Lie1K`N}ilZm8&cCoi`wT)}7#6$r7a?-uwS3f(U-E?s2Dsbu_ zlsL(GKDk&OWW1ek&;8H-Q8Q|o6ttpgXK7{rZOb}32x7Y7+Q*A$O9~oM*yAho|9~BF zKafCdZ%2Y(BVn=-VC4gOo7!82=8>?+bU_}JAKUm#^=e$i5XGJ^^ZItiaZwg^0l>d2|CmoUuSm*L^uJ?uSjb)ZhiK4S!n1CbkqE)E^BW zpi!>h6|o3Hqoqt0<|^+#!utxD(6g8NO!^-E`Z&gX4D2yEPlP2|*_lUA+o5a>(q>)i zs9TzU!3-=}fGXk`ZCQ z*B$+5hJ#kE797L!=Zfy6N6uQ&3?KUXFU1}K7XzG&Jzi>g2;^q@pB=mkb(7}QF}%Ku z0_Oo(fC=tgTrQz1t6JJ;{;8sD{C0$a_vz}N*j*bs$}Q!|7^2;oQ5zQr=c=0ea?e?C zQW6rJ_qcOgW)C%g`JUDEnRzwd_sQ}u-Rp)4Ez9Kr9ZB2HJU~VJCrFw)W||J2`84SJ zEhGNXywVwa8*|+Eni{kR=dLkS_n{qWIY7UhUN?_8UGAG7 z%!CCgq`)P7O#83SNr7Nn&&_~7b!@ffVBp1% z1#{?8U#XJNCsjGO{t_6cu>G4Y!e($=!K>2R4H`ZH)D1S{?!*@^5gOS!`>FJ-1EhhF zb|gnhv-rf~k@EFW+c-5G|{26^$bPB!WH_@5W;_}50|4dMx&d8>Sp6X~R~~TZmePHKaRtB+kJ9aPeT6nEulsgN z_RqC?_kd1dps52pS%NP~eEl@ag`Y3ZKYy5yJFcUtSWkNbt7dVL`^?}so~+5Adrjj5 z==0)j<(Ch2B?)-qEI7%6C3`n}!5GT2t&InW@_5(gq`+B}>cTBvw`U*3qQdS@+qxIo=xpS8!9w_yj$l)8~=z@qw~ z^Z^LnUup(QBy^+bueQ1p9NqLif{)i&>uEyllCahkdD}=KyRe(5?IS*mSo(=ep-N+Micb-|Q+K_H z>Mf`TM%rZ9_?H_~!lZI4_OpGKRU=33UjJc!_5LZgA0J@&v(-P;?p&_x{PYWn&teV6 z$}ie_LB%QwlQMU7z2QrbeM~R_ig51k(zigIh+scCXBqb`rn@ac0Qkc7NQvPGGUS?E z%Wv>f%;4nJ|fg7k6l$ z>9ztRf1IDX>~I%N7;SMI(m}Zb0NKi8$>9cTG)^{$S@OvsE;YqoLewgC*8rdpn^&s4 z-kl%bY&u|O&|Yd;Qzi|Lf`7m0Tsq=BpPJAe^HI;0Xl}@Am@^Nyfkc)_=rk`y-)^i& zBL_5BGj;{|4 zsb@5pk?2P^i9fa+6Mp=9?{ zp`q9ET;UQTZ|~sf%>JcncGl?b=%>}a_|2x~yMCI&#Je!fp>fdBzG(r%b^T#Br0V=c-vFlSm{QRx>8~)5uI@tIV8_v5 zR1Pp328OdwkJIwIyOKEJR&gn%+n0+hB=zPzFiV$P9~@};!Ix^&DHiV7H|`@F4R>_;>xGlExzo; z5@EyPTGavg=W>=Q8wB38Z~GV^UJ+G0ztVmxB`$hRA1{{}YPf=d_J>#Dj1iuq)2e*fj(eEYk6jH6Pvn9JVIxyoszXs^(d^JmjLOplf 
zEFRyy+4Tc-SvkE)cmLrya`@`jtiwC-)xgC9$AJyUSH4~5_iyFFqKfOD>HR$i$kqAt z4R#>3nq<#PlGkl{g#MAqh#T`YuQhHac0Ilpds`g9kd3JhB6wKtqzaKI0=l;-AEt*5 zF__rfY*=bw`?#+=iI^`O3b&0)QJa8;z7_{ZVBB{Gt?p!%J`2jvVBlq*a2xWe@47K} zca6N?3HjwcmOdC;0CHkM?f0H%8UOJVU0dAr?yFml)Am1=Nt2QodWV{c1na_)j; zwJJ4X;K&nirQ)UcK76N0hV{mog8ElzoLv*&D2Y1^v+l|Hl+33~|qW<^u>=E3aHTon##x~NaR(Wn-U zQTfzb0w#G2IB(>G{AG#+8TRP(y{8=k^HDLV%ly1ulp}-60Xg=Bc6yOFlkfOQxl2)7KfSz$v1rwr2c*&~d%6W^OXsOxc9ktTQL3+Xnjvrmaf ztN;gjhIYob`Zo&e5A8uj3mLrxLV_dOiRcn-ArtyzjQrVc9>^;nO};{fH)0&P(xf;6 zcG}=wwso2gfQ#Ud$wkr=#%aNeS<_#T)%0BtC1F$9X6i;8*ym2#h-HP1GY4TH5=}ek zct14X18_WY#J!e+wL=n@ju#jM-<66%Q}oQ#b=d`Er~d|%;sLF!QORFfj}OaIXp`G{ z=*+SS6n8&Ez2Bs?ERFr$GJ~Ztr(CvH)Gm02-{=fP^w7;Xo3;AK)SR6&IN-|y>1))j zCTk`OL9TqA(G0zh^8Oqvwj~}_X;i4~=wx`F`?w#3EHCcedXPA|+$P8P88uMM!jp{Y ze|MMA2sS=*R=8?f-_dBSMVTv#{s> zfFqe-rS1ASD|6fLz9A+#YuV{{uHe_ov-)fn0cC#%G9dd2=B&MqKlaC>$8Vp$t6&## zC-{t&5h>`<^RiTEKk_><92HDsI?r;aIyA!D?+r+lTyf45z)vSY;xTeZ2;L;T8W8eyCh5E3@=cEKB|r)gNi6PRx-;_UHo#0qNRp(8mf}dc z-3Zulb!t)~2Trv3(QsDJkVIVhG!&iY&>8m)JcI`cak0}O3}y2H`4dSn#dh{&|Kp{c z{)jS=aE>}mBEU3$`I5-L@H(G;0Cb676RUrsbS2dA^5yXm(BtIFL3oqibO$o!3G#9f z;mY4uPA5bEZn@O5f{qtxpuWpwxmc)1(qzw3_tJLCb+-0oj!WioMiLj|vXv%ndv00O z9Ag__6G_LRKQ5aP?j8QptWSrk%$eP80_lkC(5ax6a;O+KhWsP z{Cwx;Q=yw$C=!twx@y>mfSBvIL}-!}BM#uI9hxp#Ry)#m<`$cUMQ&$62(oHWX*MHE zQHZj(jh5zgBUuNi=H(P;^q1*J-GwGM;P6uaJ6T@4OOL_DnwD~Rl7q3ko;clH?V?^2 zkiJxmzFffFD8w4{oo}zjcx)<7ulK#tNB9f^6*=RX0%9w_q>Q;mXSGS_r&nlB;xz|Z z7mn%o7a>?3Ee#Y5HM;u?UUH8>tVxnDY$InF*+ZGbcebhhd0#1yy6L|E8`d`w4V*0* zk;dWS;NC(;JOq(*>yH%@rhD&jA3Dlz5VwJ|Sp4}BeeXg5~cbxc|}UlRYPZ(!~noAg5-B=2U^aG%y!wmQ~r$So)+=t zYWk?da?j_89E87e2b9C!UGNKq zQrK0ao|Csw-iSFTYV=_WYWhUJj|EF*WE~LO`J9IWuF2jYgml6M?t0bO_7a>!-znIO zkC%MP{_y)^_eQ>okoX8uR9@fV&~`OT%QO`Fv0k^fBz4mA8=eps(Q)9)c};Pt?GDq; znt39MjBQ2Ewx9R8q3Os`-TLC<*}ZdD;$W=vWBv)`ZSAOyKaHC)bfgk7f1 zI7oZyFSoZULsqA}*&uKWFM2+=8i-ir!xr}FR_#K+oSVnzD>DH^bjhr 
z;H1jh$|rUQHw1+P3*xIYUA2C8^^=e<3Ybgs@7G_0-J%hI^Z?LKid|L*bW7xNHADP(#u1*GqZYNpOVkNX9lny4TCAjJ8+lnhSY;!9vI+^t#P=woswc3ICkNkE$9I zH{mmfC0oqzLB_&IKQQGwmX0R*n2q132`H?NuAPF13Ge|0zaNho^#fUXx%xe)x&VYK zH)oPxKOvWbiE01pCedmXHBHhT%7P*1`t|y;I_QuSs(jjmqYDH?eM{GpQYE~IWG8`p z!!?QAchMP^-vZkK?m0vK9%8K0CYLTYqgujW^E&)AY6mMFBhur(BZl6st4l9sgxfu* z)=dfV<+I1GxlTXv%KUi;xPHVKQ9?5O6^B}sm#c7N_f;|>sH(#Cu4Ar_Rkgs>w zC80|G_3EDz(S!&T#gA;?$u^WHkDN1Et(Z5 z1eb3BnVfd%s}IK6CW4NO&fdf1dX?{2F5nevG?`HziN^$2=`pE4EXlMGGb&Y*O~zQd zF0zIwY<_E{=!P`I_Tx#lULS?+Po8^fe>LYR{te`55xNt-pRlklyMH9^N4b6-P5KZh z6hKDUbCA+vk2~?AufW}+nS0G*r`j;6;wCg*eX=|} z-FFei7mr$O_V9Oq(y_};s6TwMjx`6lwdh*9KEy%WRBeuIKc3ZZ@A1!`h=x5OM%KQ+ z`GHpUKazh&qjC+hA2$}0M~Bx_0#Me|(eIy#no`4r=52RL)%#PJnFRBY#FeKpRe=x) zfogo-5mdErizNm%)u6cy$5=`-ILp(n(q-w(B{U<^eGs z?QXg8LB1t-+s_y<)FNZb1{1-d4I!V~bci@M)V6bRbhqy6cX^fG?T%}gYv*m>&Vf_$ z1)^v(qUeJ0o>lT0ZrRC@g~|M`S`>GrqB%^_VXKvVz}x*!KGqNpD4%&|wiJ96E$0Xm+&sP#|}iK&r+!=Dz*36 zuv#Kz0%;*OCCvV?j>=a=Y67VKM$q*`#$|sK!DB|FMbGen9cMjd96u7!b7|8KMb0l7F%Er)w0y$DU7FXbV8)CM!RVC% z1IKTOwckJpw?T|ihmkIj?k%`X=!Yu~cV_#{G|Z zLP7W4Z8(f$Mj3NQMqiP7UE&>iqB-3^LS#Qiwca8!$_W@h54Plr=itd+Odr4hSWlXA zw6Q(;vg`fZ=sx~Pnu^5nGDhc{YM1RO8TuxnI z;M-;Y!_9pb$m7g#!($eCyc7%!$lz9=Ejx{l+mn|A;o&cl%#8GF|W0z=t zlfA61jlfvNEfVd_8YiCrSy>wd1f|*uTZ6ub>&xqDJS zBn4&HaVv!xYEX9H+0(i=PU}?O=d?hLwXjL>U*u?>$A}pI5Lyln$g;F~V-w$=|I>iY z$T+J2n=|3vw;vr3_g;sze=1VhVnbjXEzLW2+vf)9TJy)Nr)Nb6%3tQ6yZft1Qyny} z!AK>>;-%=&58riw9hfbPqHSGPnI4`m>yzN01}SqD9FepVBG~(zBBs%;9UHYHB879;{km57`)Z^1=l6b*W zW_!MR#Mroaghnu;(As$XiFh>7Ce4j29%zR?Fu;^JJE#`kUXy3Q4wpbY;s~K)1<9@z z7sDxAqV2Z+DidSClf^@%FiL-3-Qr8(a#o+O2vb^L+&(q%F83~r>-6*0pUviw?5ugi z@nXUb5bY~TiLMxC^-9J44Y&f=R|c*frY>t(?iSb*U5Q!f}3WS@Y7cg9*&t*Gc=2MH^zkOICS5r)U36DR~ zz7onbt$$}6f^XApc>j`~lmU;h?eZrz!@Hi6lycWU9-VVc=P3v~kSqK@#Z7VwLkZEO zQmYvp0sbm{lxCSkus4xoM?je>-`U47I+DII!2v?{X4=wsAMa3hG{!*}?0dLcLW9%!PTQk*h!n{=VOBlQs+Sk=NkBq5lP%b?;& zo+nICu{3yP%$r~h0#XM+X=4Iy?3Roo&DneUVNXO=|C#Lj^)2vS#UbA+}4b@ 
zl;X4(6Rs_h@ZOFVHAPeC9Hqrpaw+2b`NntR8{}2V!-1qcPHEfVTCA5E4W|$6%)$VYQR~bA-j~)_C4fa?0p>fEN!YWE*6w zhJjwu4ZU?fM0&k>TM?tq&2(+Bsd$@*?m2&kFWZP8ZwnIco?!7sHHZ_4!!gjV`>Q^21-!CI@^aXU1e zB|zHvjmzZB(Hb}MEr`6AlLnx6Qa0yIZ#+On1(NSPZbZ_mtFO?f!U(%x)Y+04_TuE@ zeLLywhbP^7?m$j#H3^D%!I^s(VFM&13GGobP?hD$+3bpx?uJ>!RTWBnU|Kj{qzu$=6n?&y-e51<8rpui2sALhXOAy)^Z8RI z5MU(DgJ|+JUREw6+94p$2Kox)o(@*G0cx=Z1)Ei(B7R2;0<0BH%Tv{*u&1e5UK=bU zUlwkE?vYWuLeBDPJ*gXT`Fi)6cRzh6q$F`7QzBVfd|xm|w>M;MC5WBvFMP&>`uU_a z+jg;VhFf}hiultL**`0O|K~!^mzYmcqgehjZa1~wx&%To^kW4uIcnzAd?EQdo9XdJEq z*jDx=M1|>Bx@g(Q?yt-3>8|Sl>uM(`VUCcZWW9zL2jXCRn`FHp{=N!gSXcz!HxZbS zp;Iv|5CpMxznPS#yW*+vT+REPK&r||bTVKB8uq`(dwpe2`@MN|qZ6aydk5ZSu*&-;1=0Oj@4Yn^Zl#jbR*`vx?uM-vIO&d^_@-s_{&l6>Z zasd(P)y?tB+I~ol3!bWj#4Mt)b-iNkj8aQ@$h8J4{9Dx* zwQ}IxPdJy;(1gHF$1c#-F)(5-06G%>90kXKX7pL^g9%7uZUeSDHRcvxJ7-+P zB!+f~b$s}86oljFIv*yA;zb*>V*asbiEoR;>y24UMds=(ZLuJ9P zNx1kO!Oom_gIm|5_2RjxU>Q>7+(<^8z^k>Fq!}KP*y#f656*0!Kz;{ho0g*F`ImS2 zS-QO#X%}p5Dg#cm^lIaJnJ@QwP{c2&m%jJ%U~Q&S+RrQyzK*o#EO!RFN*HUnmAcxb zNnYQ^?~&g2Rn(Y$w*(>Wh}yi1H9s&=W@vaNw`x{HxVU=75!KB`Th-N4P5~fATVJ;} zza3=djH-cd2$Q?OsF<=(2r&8QXDvOvhjQ?xEDC3 zfwJjgD5rESB&(>MW>uasGztox6@IZ#PO}05-ATjP*Z=C+pAo47uD_U}5 zLS{M)JJM5o)36-(1#3?af)YhfL5tXL?%L@sSLiooxpU8dX&J(S8qPh1$`~*>hjZTBbPA9D$p1Lq}EF! 
ziEntuieI1Q`%9R=)@H)&8(XL}Kx4ay`f;yfNqUH8(Qx6ikmfypV|30Rc&`5_**4L- z(pf_^vx*v8L-Z5Lp4+QBa)+iY4&^bCoQ;`?2uFLT+q~}dkm%P5jW{?uQY@R#?};b* zTYz^?#O?U6VGH4qOv?;1+y~6VubCI(Z8!gey|)aDI{Ny4Rk}sG6%>$0hVDihLFrby zly2z;5$O&Gkd~Bgqy!wg2kGu+cs6?O^W6XEyglc|xvu-Am)8vQi@o;RYpw6+J5riI z#?ag!d_wRk1S^KpS9?x4&N7Qoyd=`>oQ+M0HHYI+2`L_X6KDO?q&aqM-@OV_N#_xs zrJmaBI&ND1bRqD}N5!9g{6&KBb_W3;`?dy(MugRO!}cz#yeMfOkwsD|QKtG;kvrY0um>Xo>i6ISbOExjQYBj$tYqxaIZU9T%Gg(N z8Kf&|6ImFjpqH~DZo5CoiZre8=?UXn>ZP@P|+;e*auJN~-$hPf_(!_e{jG_-x6pv#4z6%FUsARBcdO=4H;oA@rfNwPg=!A|FBd- zV)l1spLg4jYA-6nqFaD~!VWTu06D0%ov$qfVxiJ^R%X|X$0VANpEKafduesXk*#ll z(*}6n=i9Ba!M~{6{Mkf4$T2+5_ACi7@LFHQ`FNIhj<0S(UT~F7prgC&kq-vED!}+$ zH-5`n+WczNAJ#zL#=Pmt_Q_vCGz2DjF{G41yRFO8*Ga7N^WdnR{}a+lS!Xdi9SBoI z>y`vlS7Laa?qvd6ZvN1L(Ed5CqU43RdqJiz(pVr-sut~k@TO;dQUw=JQM~=Kx>k4v zZ!RXDG(I%UjhZ89Wz_ZysJDJvrc( zG1g|YzlT)egnf)T_Ye_=anttDAT*-;>5_G#W>U-~4$Fg`bOU(xT+}laqtH>D3QxDF zQxjeeb0?&@EFAyW9ip6+;=IE)A4Uv}YnF73R@;W-q}OUui38#5sL*&~Ru?4O;YT}P z;TG(r>E$Sg6gVCD@0Ggj7EyvDB08`jlBN|DOH0G3TeQwdTs=>hrs3T5;^rA_%F{pF zdhLvUS!5bEn4gx_@hQ@}Aa!)HeuO7s$b?0-`c~LDZM@{A?_~Wr@ruHD2RhRNeid)y z!!vsy{B{TIG?&{YZnGdVVs?|@8|Ip}*?k>7L1f#9#pE)^e6%|Y?H$-16=8XHM)XUl zJ^Sr95z<-02DvyoCbEhyrolH6{C`mB@f)7ld11?Pcx@uO90nRKZsj{TOD3awyM(98 zVbo??5OO)|C=$|V&Pca}6FQry2B!L*c3C(E3RjxjsKKFSoESA>brw!~V^sT|x`TS8(Voh^n9+EU&uu+69U#ZLj zGzFFCPl2(CgOJ;D3dRbi04RzWDU?w}bZhWE>NG=_C$V83)lsh{suAVEKcxy68zJT> zdUhq7&yDj%GT%YrN_Hp4W{Zk?u+BGdg6mi|qV-Xg}WRNi%&D zdkbA>t_gh~2hMOl1{*RgsnPV6z|2wSn+j0}#^nY3CyWSi9{Kay6yJ_6>a63(m5Zf~ z@p5Xnxu>ie#8exc)b55Zn(V&2FO|<*Qc)||BXvVr=>w?e^nPBD{nS6FM ztx?00Loe`L#4(%2Qe zHp2_f#X%H7+hFZt8?!VO z!X{t5m8RJriaR?ol{xzy7dO}XG-*E|_l7ju*Cbf0GlqBUOr}_Lj=E>dKBap@CV7q%FDtE$Tur{RFBEV>@^faXcy@EpPJ&VW1#*^c<@sB8Fq9s&( zAIvGM`xEnnhY`!(`VGGi=1{sqdIIf?rjcqPZc$GxV(hX8f2&I}qH_>IZ+6?YYNy{{ zXAmDr@v=FmmLCsUp_NNHy#x!wi0?93Er}r|o0GmRFo}{&oT4~!6o&=fI?B06Cl@IR zIUT>jZ#O&6B0$<`%b2C!kErcUS55q>eQJC3YZ(ef-8$b9W~W?>+ua`sr`sHg$)A&> zONcH`Iu}#u4YLc<9KjhoDyA+sR<{ 
z7Z@miHA$!y?v_NfYq-I|yp38ju)DRX9Ir1h0OZG4uG1aZg6It!h=P)R4uZ2i#SpR- zW1@^J&R1?p@D}!c)8!3B`Uq8B#S{NelwAbG3i<#0kk0v$8$y z?-IWohx`_xC1~+frcJ;@!|%*xVCE}kj%8Y%C75OXE}xbNCU7zhVIK+3zL9j=liwns zcxa5L0^e*8sBI@sb*@=g(TrTGes+=TE(2@5a?B-;KTUY%AEu z5R>!Ib2!AX1<`nmsX$YL`7*0zUr*-Mdd#jf%nR{da>gmrGyKxFIn{$XpgnT)ROsje zTRpn`sVtoc9FL=S01U$+nw!=upfBjyNKzn%d&ZpVN_3>y8#c{$)s>omMztD=6I&!V z-1y9Tti9lHl$>gh6P~?~zs8_df3PNv&TzKoEP`+Ay~MF)DXu40aE_w|)!t%n%z;R6 z5d#-UC2@3vnDvlwHTUs% zu51d-tI#&v50|Ph7rCv_eIG;ee1GE@s&3DJ91j9}um?oBn4Z{=2mOtb{gE-g32!;! zF{OgI@35m&uD)f@02n1e&1w3*EnewUvK1?_j%pT!yT~1!C9IQ!qmv_SU^?wn0^2;z zHES1hlC(ZDd4%2bsDh=C!p+Yv%bqj{)grk2?@t*(wkmuTq1m|{HzgwMfXSi{9Ol)CDG6*cq`y8qDcfaBNZ8*&$?UFq?2{qyn z>i6Va<_#$WN1p9yEhzYGCIlp6(`ycFhUJD5=BI!)*OepV!Lcnp7{ETR9CW)=f)sQ% zBek50+UNS^KBJ}C3!I9nQ;(ejajjT!+lS0|d&?)#_tCR+>^>YqoAp*c19CSKr;?5! z=@?x)U1hAqKjF;iraxvx-9=7;5|^V0i6jbC9*nssKRf*JQF=HC9XA6tif~01!cqXy z9EUu#2?T}ebuFRivp=wAZto^=sy@)B$bg>3_W&#eFAm{g)3;XZ1&koekwc$dLaNu3 z_aj{KbZ0p!3(wKdZT8lqxpXL*UF^MF5o8eX)tE|@uTzzy@S;O<4kDRuDaG9KN3mmZ zZ+;}BK6R5UA)k}%@&bvrRiK|VkY`_23cN$`r_)_Nj1I^-!+7)MuzwXcRv%vUX@)+;=@&ho3jfHCmqJyJ*nS$;LeiK0O1Tc~ zq2QSPaHDW7N!2=HyXEor!38GIuX*~rY^K<4gpvQ^th*U>ToD&d%1E+SnF4hd2%q8S z!s<6fA@)~EzO=r!u&+G%=V|RBlFcjbrp#Rshe@h>6Dg!G+YcA;GMe%e&Rx1Ffk1dq zZ38}PLUKjI*|N%TAFUr~?Ngqu${HFX zhL7I7d##_U>j6^dXr|F>VpkSY<+i=|4;MdmL9eY548L9}g0le|*QSF>Q_cZ4 z_Mst4OnGvuWn-1pdAz*G2b#W>cJm`8Yw}PBLljL#ukZ8&Z#PFGUjqFAzLd2_of5${ zG8Pzy06pEF8B`kGNs9~XzDm&F_7!pu0k;wqfSWPk2Wd^4k44nWjg3wC$_09m+fuE_ zOT`}0M4YpTaV-O(mLms_SIyGk$CLJ_TAz56q-cyjADwg2^5BZ(xnztH*{LMEZ$zAO z%YO07xLacc=W=H#xpUuI7koz6EJMMPmr2Bue$Y(T5P#oGd~0U}%y*!eW}1?d`qTDp zKe99ZS}E-cvG|&rJ1_{@KM6^BXn4c8@|EF?%Kgx$Il1p1yheIL(d2|ENlyIk{M<4q zI_IUuAhdi>`oXtmn_a+nKxe}zW+Jm;vw?)LzrQ?%*%2mphb-Xb&bQhi!8O37vZr9M z>ej8#4+n1)mUmXR!QTmPI_u>$O+s_~Uf?L!H5n9D%c#XWKfn1z`$Y@$&+!>rs%~JV zO>|LN6$ah`Bl$B1dmC!L39FuF0=zJ7F#n+XjZHLgPYhOG zV-sld);KH}C_dm$^MIIsriGo)9sU9-j%hcB)4kNMgfOuRgBhFdRsVsv-}zl1SCqZP zqj}h)%#CwEl_Y0oe^P(o(-G_qlzEPcSTgqm0H*j{jB+%%eBCv3R^JL2`u;Ehr9S8s 
zG$wD4Yz?Yo_?X}y_q3l!7t{8^MlTt0S zqW$kdYNi8qtGffuRkNTH+=ucA$Q5{+kFMYoq0#Oj-aL+6j*Q#c%pUb7{^e%*LDg5EQ^cO7PejNpMQFFA+8jP^FX|h2ES76 zmos@XCB#~Pu+j3(2G|Eoj51YKC?!9A0~2FQNQa-mr$L4DvqJlrQZ3@lBD&=& z1f871?BNjx&XE^q0JN2!!^GgLGge(~mv(32F&22&P#b*Ze(MFEqR2FNqy+lHQbFp? zMME_)f=ro3f8U!G=Wa)Q$#JAB+@7v`a5H}_#@7H}c^1bTEFSb&m@bE7NmA%3Of{u_ zXr#t>He(Sg=O==t?Khe7pfDE99Z*|-NQkzT^Kl9 z0tXLnVfTLj1TFS2j6nC~%jRZa9BY|X3)HO@#97p}FPS!Q$w$fI!#aV~THCc+ z|B^7$>l>xEY8ov@8Dre_V2CPqVBRToibUR6zx;02g?nnPl?w#GEmI~Ibo+aZ;k`t0 zl-N)|mHXR1jW35)3U9s#nDVi7Dz~>|ne&Z77RP}?c&@z%P2kz}*!@hpNi58`{m|rg z{+1Nq&%E`l%@;wL;vTb;Qs>2O3=anG&zz2y^a`5FJ@=3~2c4wmxwT-=$dxi3f*GS< z=LKE5xFYV`_1f3mZ_I{^&;pt>OOxj3F9(oTSCA>ZG;`VFe*w`t7?Nm}^>|9&F;V0M zJ-~`^dnB}J=5<44KohA$X_!J9U4~%$dfTy==^lIxUepgBde#wnzke9(TO9269%6i5 z`%TlMN}2m+d@5HIvZ2Q9{oVXrxy2!XtRmyN#5KO@$`h1Lv=JPs{a1CRFH(XjW_OQl zN<5)bX>C9x9SaI7gGGAX~jVkhU|Wg(ACx2g#BHR+a6 z<0nlYabSs(P_w$dT1nrs*2oTZ_(l|IVY(Fg-@NWV; z62-h<@tG4EF$Q*)_(iQA3pe$w<{Hz``$CrD$RqrE3GHx-w>LZbp1|>nEg=a^-SMK& z%h9?-wG>+7eqzi%isvG3TI+ShU42HgM=nDa>9dL#I0Y)j<(bE)>$@`WP!^r}A*$_r z+eX_MqmE_5?x&Rg@`Na0H(2=b#uKiz-YNRY(9Y51Tf1Qd&*-;3JohpeAx9dFZJmLZ z-Dn~-v}hf*T}~os*Dp3q%=ZVpQLZ&O)QvVQo{DIBKwYd=$#o281aHQPQ8M93`T48_7=MPomd)3J$9mex6r*3p z;C;Cph-=|F&Y7u@B41F5GgJl4QzptJ85UyMc>WW1Z&6o7bB(!v6z0MnujlOVLV1%n zH+gmQEA102HK?}mXLCQBu}}%W^puL;Sg(>?$*4V6x}vK86G`0FI_Jaw1@#?Lb9F7R z#mM*aZ?UwX4bXQ#E-U;~a4s8Ahqg@|gOfhvQ_QuP0RvPqJRmPmX_#Z=0F5}4lOVWB zrka}KRnu*6Mr{4e8y0e$1R6>LUsQ3~M9I%s0AOR2Y+An_ji?!gx!VG7i4|Uz~@#_7=|481~j(c3fI1$a@n8|~l%(9Xfr}up!iA{Mfxc!LXz0Sl(1Zr`6K z+1;`K<35)wOBg$}mz`0<#|%uX+D@EgjuoC&lg6)_Q}do4z5lw<4FBx1^5XzVC4P1< z4`a5iVq~pZr&O_a{OT1C6pGPe0aDPW>UUZDdKN%V6bc>5$0gQ(%EEQAMgE8@-A+@^ z{Z50$E_{hXozTm16fat``OUTlq!C@RWdjQHb0U)tvURdlVBDKyH(CIU_O`ey9?TKV zg^mdFADgc#<~zR(SnY`W=3N_wYaf&x+(iVYT&)f)$5uqHmLE%GpefQlRQBG#`Q(Oi zze7W>Gqu`<83_mzkK7dUSI~O=p9$6A{hjrk+fDTf7wph5LDYTU%4hBdX@RtPb6+pW zDGl8$V%i{Xr-bfmvt;i}MMMz*jK`bk+`--lAr~k%gcKN)i&klTvwWO*X%iQK?oK^o zF+d{kQ(=GHWtiOut_qCsIYnLoq3NLMebJNCdt7Q6mO$PPRGkO&Qh;62R@((mdhsr6 
z^(jkRhihpnzN9}!HX<25H!6b*w;*2uf?mhRkHYGaddEWIg;E0w*FzIaNaiHOl=q!vg z46DBmJL!3SO~I@_V!3hB3RzvD&iDz`@)JVtb$yP{hais%i+>aP9Q^&#a0l0&KbwNy z89O*C_Dl?199O?M4QcNNn@LcWa)II0Tin17f6K+IB^vf?nObbFm><({f!kKZe5DFs zlB24;x92Oajek;*)y&wl7K%$FE@_2#N#Q^3o43#1&O*;Ey6&P$jYr8m{VaAYRNR0x z2>Slp;D>7f3oz>#x4xv}J?6$6@D><)+aQD*a=Tk_G>LSCSBpJUcBiKE^@TZH`=KDQ z?zXOtI<0Rbj4a4u!&=^nm)ly5hpHQ!stsF$#cojv zNm6MH^qpJaRc1Lh+2eZMJiRICu@J~Mr&+h1k<~*(&UN+6b;hUOT*B1JXvQ(vxkdq99~iSYpm8$7waDVQD$kRnQ{LcNqk6SC2hOcL_)&NulPY?3tLxiqs#+{0@DSH%6FvOhz z6$-#tpQC+Cw{|qnfna!b3sY21!H5<=N`|2&1^wazU+GUZohjo@BCi!Om;!#UaR9u+ zNDmL56NPV6@2F2%P&xWIN{7&1?U198XOWScKGKqJm}Z?XBc}JWLj3eT>gzrL?8z$p z)HY_Im06&0bVtgjg?(&=oJ=+ghypp|S3YJI!p|QI$01R8n+$>jd$=b1@p(#o?L7;0__^R;ZaibS(Kj_s~bfMeTbLfYo)UQ-PaSvj& z;f)J5LI!0^#J&Hs;$emNau97w@MEMN!o4vjwTRsFd8gMQB#eVkkN}rOdp3^yW=ffP z^8?`&<5@QH7Q0m<#A3b>|Jwx+6~#*zEUK+%T2+bxf2pT$=cIR5tv$5R%&NDGp%wj` zrzp>0laBobuIH&Cloae+Mkt1c!}H`WM(7b z7V1B`;12OXyU82#T*O{A!eBK2h(%B^n#|`-S+`J2Lry@2SlX~kfxnaMbb@hDJ(kYE zP7{GV3l?)wx7OL8i+nG=?P_tZJNcr#VsvtA5Jy~X471deY|Br-+t56pl53`RWd2&{ zFd$p)TH7KY3kr$Oomg9gdxIHKYpP>2dAB4~#-r^*J+_C+RE=-1L7^$<>kCB`l$85F zG864_8%fwiobq06^zJ>AFhI|-%yRMp22)+f@jeZ|cF)cw8GF>eNQ%CJSnWm%G+m@n zn@$)EQ`cG73)7~teXY5qXs~4_?@^1EB+5e09uAx$64S>W)cT!O>TdK#%WL?{IOhan zrW-75AWdXT=|*FCEtraP)~&aoKh`F&__Q{d@^?V(m2<|*1%Kv7Rl5uavP{KIuyGgv zAR0qFkmAIMazwP`WUb(_$~X~ulk%W5$l{PS0->kv4``7+iE^R@S6vT?0{h6wO^=B8 zJ0V8e_EhRd8?1D{@Z-u1_iWX%aF6~_0d3ARftoF7w&q%!Vtvot)gEbL8ombLQ}of= zlaHuAtzP=H^!0h>?Hy;w?QH)7&E==eok*uaoxJ*)(7(8LuR>~7Cx2_FKNhJSl40I%eEZW=Us>?V5=bhZ% z6lx^f-H$MeK)ft6!UxXR8M>$%n?Ss|Kt=RRZ*#OxExpB6Vrtm3OSH>S@-fps@&P91 zfF2|Y2>p2V##u!f(*7I|zMCsC2JgfhWXE~0*SEmD4M>sLq9`P7GVUcjNEZtcwO6%o zDDA=Ky!LCahUc87lS}VNMvT+*g^7~3Ga^&9TEBOGOriRiWouYCuKXhd^uDzAc!8R( zx0ZlM#3>-SJ~-;}Xp0_p6b%lf-#6sL3tVo}F~}G1qkk0k_Pe&1;k66d5QRAtH<-@$ z1gg)r^1YR>NBdP*V5r-kk^fE+Xmdw405GGaxsJPC029**BxRo*IVSTdy?46 zBUF5z;~z>eHqth(&#RMofH#SoCSBvkO}5VNzNBh{Ym6rdD%&k|*J2ebDLyyPc9vk@ 
zCZZ#a>63^s-)nUOgB?SNMJb*@=Y^s)1V;-wZMlRTUYy^}WrTg8l2m&ESDK$tDDAuOe6OHsxhL?Rl|x>@H@3ixuT6otM3VaxrG z5(rHV@6xLWSJ|W&Z<{wfr88`d4#6N{uhYwXWxwxAbg6?skiyt~DVcTdz&{Dse5gOb zur@2XNC@C&WW5M7f#4jPBzHkdPiHtWU!TlhJI@S9&d~j$;>FG!EnFyh-Rz=KC#Mr` zK~7$@Cs%X<00)QN2f7_YoKdb;z+w!2Lf6+z9_S-6GC24JVrEFTM?6G|H0c~r1Lz!@ zXHMrao*_I7C!*)u1?sD*AswH(^2aFHENw3``R)a->D8VT9sSwAf39eJCob~1ItemY z?AXC;_i8TCB;iR|)VJ%U#0W7BJ1*2ssI3_osze9IcD>E?H@U_Opg!;pA#esrR_1$C z3Jo%nSvIiNG@#+1&XrG=@8oDPnx7OJp0u*xb?Lt)I{<@iFp{V)lemSKfsGF{Zd1GC zJULISgh7a4oLvxoq9+t}s$06Dw&1IipWp)EyM)w*_4mBEOEhYF#;HbdYg z9K-_Cs`B3V$S>;Do%R2&g!H!Z2PlF18gPrcqQ6Tb$po;H;s)EDSI7>e{qB6&(59Ka zFe!pqxwm)XSCaNTOT{0`9V=MW{f7DX(LR#adAB+0*RUUninnsw;b3V7M(|b%JM;va z=tFR@rJ@kvd8H5;*# zWJKpoHSSDOttIb^z^4J~K0)q2@a7C-*C9vBH4Y%v&#I$dwVYqkBmcBoSI_| zDCGbc{8-!gprj>l#hLsqwI4`M0L*O>OlC?O-Up;9?e}u;cbjiIl_CZn_^haehjcW* z6%e#U88VFqzJX148Hs{{FMuH!X*;|K8eP!}T0p)>E0n(Dt%?9`Rir>B_cZRDDdqYV z?Q+oL6d9l@G(xY&V|BnD-2x{HBoFqy2dm|I;&mPrCr$jfED?U!dl(8qo}5bHOz4Li z;%gZ|zWKnBZ^R6LPk1cbg{Iz|ss4;m@lObG?iiH?nM=E)>bN9neRmK5ta=}zO1R1d zp;;K6uUcOYQH<)AZNPa*QAD*}T)f8k;Sb)kSeRGdXM4gVKUBANWrpN|u%vn`#_^L zfs!x}Iug@W$;3%6v^TD9k(Cssq17t^&m#AO7eV zogtUu@y!>ii9JckB>HtIF^n1=$bGpe^%8m?XffCKxrSgr%l6P17-;roK@#^QMTRBz zVk5j_UH1-20u*4ri=W?Xi5{F)f8VtQ!Yq@sZie_Q${Uqfe-!Y(p{eDL!1KgSw zp0yYf97ms5e1s@mWZa5UIgONQjTU!DLLN&s{|>Y_=_Ga!uBP$fVbi3C+Ub|W`s#)V zn`dC;l3XBYiQ+3i`8C7s=nngkRTia1WU|p4i&eZ}u*=^O82H~xHsb1f6`Vqg>VfWk zf<_=U6HgIInbtrryXJgznzbC?=^l^2?`q<6hrBUtnA=+2zD0zrJjnnu(#T>f6~R+9 zN5tm!c~FLVI08SHq+iEN=iCxS!FLDbisdL47-lE1xQ5yAA@m-dV)nWey$4+Uh+&oz zcAkFEYWairv|76i&h5^daY`7^@85w|W;ckpGTr9+)GIEwHg|1LbW(W{K;yQom*>2l) z@=vo@z!7Q(f33@a;@duuzZX@x&Oy?wDvh?mS&FflqM=x{inLargsr?gqEf$xr`n-} z$L!T)o#amU@)DI9ek0#$tC#6;b|uz?*MeqUc~%o^H}geYVdu-yHwz)i z$EjcQ--N}u;tlQO>?bCm`-@%H>*Lv^^HI^l;L5}gplG??47HNtjb*y1@HUEu#{ODPX_y7N zb*4>ZfPDlC%0ESP^6A3$m*pRD&y3G+4wV~Ccjl3seg4^>0VGhr@M-X8fEtNt#=67T zZPwn%mL^l2;te!eT(y~P0A6s%1Y;L07@o?tj``-Jxf<7QsPh{E zV2xP`i_X{{<4$MJbB+r(!RW0{aEWFxH 
z?!yj*1EsAMOYEIuW`0r=;*xOv{@nx^*xz7iq4|ARL)VE_M3P!cMjT`fpk)H5OF)fZ zpB>M#-8jjL1WGi|PIdl$V#Xr|95Sfiv#G{|xLTGko*?xDtWeFFbZ6J%%w1N=-G$i~ zAL&F-mUjRU=^wKVSjz&uT$T4H233GH1Er(JiXSbuqpsWa7iazbe36OUG?ng4zwZE@ zfPt3t$x~^+r3E_66;J;2X53?^gc`sF-~GG+HSS-WEpD5HciwJt)vqkb&qo1`4`eCl zm8k03?$3RX27qkUv}mV<6-nCEyf0JJ*_K~8LoUHHD)=MI9nKTiaB0pq08EG%4mTdY z>jA5X2IneTk1PKC=fEA6P`Jz$?Q@(x(%<{dE`2A22c_=>gB;F*2Lk}|3P#xP(2BuB zgp~VW;cO_r@uK?@bO!=y%w7h!@>)BfaQ05r41Iid%FB~@Ja~|QeUpd99{P{i27dG7 zxWQ!o3NSP3E=UReP@3fP5%NgcEGJ ze+zg!6!LLbDn6t?=GLAA5ZPaGa6IVh7dg?NDV5Y0JI>w_ycQ^~$aQx>4ZX#1L_6GO z+xqvFq>9Q4Bi4^QbREm?Imwl5#APSIOMp&63N@ed{C9xH;AtRt#w2KVqhELc?7cv@ ztxp3cm-oFVb+F8y=_~z&PexwPYET^E( zdmp@Y0l-wz16&QoF4PY(t|L1JXNGTFhuF?YJrA-wXq1(j?;rl{*yucef%BU^F-swke$~QPJpmfk^isXO11&7idgP~dF^amf5kO1Zt2$}5@T%rX#SXvvu zSN-XU2nH!xSA(oigN@7&U-y?i3lpb=^Um_X(&*m>+*9^l3OTw=OiVcVsTr)T>h8NX z4A5Fl_lF$q^J*H@@;QPl9j{-SiYS5;3W%mfc+3$V-5f-OKW{*P4P)eFdM^$l2lIb|3Ce<{Y%ELv1qXA0Wu=Ys~pfTBME!N+5kHtDE*3#eAord?uO^V2UTi*U{Qe2p?Yuajh$;%qmZ zeH$Ah^I5nC{QGD$T}q173J2h2F@=a4vlPQezAcSMpt&StUbSDUPWun?2fbCIJDa4E zwQ_89DRY%nl{mtig`>5Sed)m+S50J0+T#T|){tJy<$ujh*TzokIMvT~SuiZ7i139W*6mR?x zTr6oZ)osi_Z_ERr#*5X!ZqeA!M**lwykOv2*%G%*@R#`%M?AUQA!zwTk#&IWCOm@# z0bXn{&jI5~%a{B2ROn&%4&J^2%r?MO>=()oC>*%F?mQE!8DP`j?Z61sZ|7uB2qBK@5o zAUR59P7I5^lKKYj;I7~68=PA`OUDqKwJJ0+7oaJ;Wi_fcm2w}zfK@}1=PHIyqlM0% zJjSl%_{-><76`4?v#q?KF;oF0UB4@KQa!(A#l8P)53Ec=Ma?B_;9c~#@A-5r+|Cgn zpO}aqL)b98I{T(99+hoc#OX9c__dSzEk9=_P{PhivFQ8uTu!@sNr|e*xXwqlAdc;A2>0e;r8rnB=2OfBvDScWZ9 z>ng_-vBq9JjB@A3`Sfe~IC>ob9OuLR0CZNQIM6c9ssiDnjW++Gf#L+y)Ggt^ULPPh z%j24Y)t1=Xy031EfPTvztFBF<8(>d|&Z59))!|qMl*xAP! 
z2pk0(GqL&Je7n+!a0uDB^@5#&CjeTE2g^49byI+=$roYM?)_Q@?#2;O^&i?ks(^p{ z{xoBM`Knqr9tT_YO2pH6nHNc7BMzL0N-G5dgfCZDeMUvw+x{!H% zCFt7P=yQ%+=zU0 z{`>`HWA-)EOJ)jKoY9+L5FQTkuPWR@$Zbw5*uVVNqV-OB+@WFNaIYsWS>y8JE|s~+tkk+T{O`AdCG)ciY#*_}6ysXJa=nVpau|4)Yx$~KAECdx z4_bNv%_n&}`{LMAnDhZ`@-Odl^M;^L@=Vc$OM>YxlllPihdYny3+LaH?`{qwSDOg# zss%1XaSITQ(uo+RPwKYBQLp(lF`@4lqc?Kn0A_ELVTR@Z`S}-FXL=j8nR_+Ds&$vAHE|`F(Er=tz9Y|qYgq;%UyAKtH2!ZM zEB`;V0)7_PE5?I<5pQ&&ScN?$5j(luf>CJo`K~?pi64OFz$Ky5B@Ant_xO*Dg8u^^ zy=4D9dIJo+7n^c#5ON=YHkCH^<{~F zW(9ZQ90~kt;QI4FmiPaAPxycL&j0B$A4Iu1HG@T`nq8lz76sqg)RHUWOnbSG)!jfj z#>Oe2z^pOI6hKC&z2<;v^F|R?ea?@&uKUY$vwj@o?aO}DFI2w^%GftKd4Z4I3)^`i z=%kZU5@i%tCGeyRvTHQa8ZYo}kkN+3Cjs+Z9z8&Soz6YzJE{kpYWVcy^(U(V)g1!F zC%3&dFKmC<>3lG|E=jSZ@7atHX3-IUUPevr2`qb-(oScJP;G<@Z<8=}ALR~bS$sx` zKT|FU+=1Ch+=`aK2&3L5Y5(6=pb{{*gN<>Ox8K{lRm>g7#U4C6x0Z!I&-$8WAgjHN z{fujh5*q&Wg=;*b9W^OHG^KC#!_M^M@R4>k#wF_+D!3FX3|E_0&_3m>+6@$A0OvM7 zSTPkCN$L|M{TPNzJlbGkB8_ABu^y;Jd`1AS>_jx7$`$-&d^ef2LKR;3@3S#KP_s5n z|GnMg`6hrNnp(iW*S?G#OEzoLOXJxS|ZRu#$Ec<(*WF+=lBt!F_UMKv^67%EW`ne(H(jRmR(GXoqE%O@0W`sWO{ zkCY?}G-zQOOsaS+_>R%}-)qEjnrAQ=o~N94s9U8oph`-39?z1z>_ApMBjJ`ju{mJi z;5CF)Z(M@;Pdnz1D8^J3&>+(;qeqINy@So2+2czRUMueUMNCF`^rnR@W}iKC6iseN z+*#^ZW2xldK;Z2dN&(4^|1Gm&7cmKAM<`Ksm9|sQ&Aqx#JZ}e1;N;*f+q8%&l;@4g z#oHk_$M$&?3RQ99OGqR0&0Smut$RAd!Ya#vC3jhq$8wSh9Lx8qI&KFLZt5)*-__3l ze0)o8@zL%vUCqzYqiR**+~966Xno-1%VHf`RyT=!k*W=y3LCj1(dvhW?gpOO3jt$2 zRg-p;k*uaZ6}Nncvc<8gEa{4C8o$=-6^8gDwn=~q+5h>oPq1w*CW4J{v)xc%UkUc) zw~QJeSe~qit2~Vr&z?0;5gXA9w`_D*tZ14NvWn<@rEUMzS%v6)NqbPNk2@+ya^=Va zXY&M>{!3heFYRX*t_+po%r2uIelLUr&73j0Us%Q$xL7B>u?he)^&Ju1O@?LZM2sO%SDh!~E&d>&oEl|UK&hg&Jryzlo zPdVx%r5ozW2Q{xl*s2ctxXeQb-;z6orJ6Qt&{L?q;~d+BnzQASR05y9^08nuX_6^Z zzBFV<;<7o}8t&BQ2D9sLFEyXq{QNEh#!qA-^*zk4bJaJwNt{&ZA+`<1{tQNTK|3u% zV1&f3iSvegcXAOpb8)bm(Pmh_skh9sPXf+o!krs0hm@A8_xdTSBdjvX$!kyr%2zIr zt*Z*&oQ$DR(aX|c#-xEz)NDa&=&Q}HuNCt(YcK`tGRL`7yzV-UxF`nc(F~e(xY;^3 z2wNcx`9B;oFZll>(8S7L_euz{>a96IXI_tD0~+n*pqe2^0&drD@9wY&qzs@qHOLKZ 
zEZJvx)y}VwUjGo1TVCIF3adJ+gg_wBFjF!lcs+Av~2kDo|Mog}!LlR70YYdVh0Mz$Lyb^grv{Fvao*dQUz?lre652i-V&{h(V4E_ zj)b@r;0ryZA}-r*&C$<(TPuRB^3YE6wZrGh74aB`v=70bNiOI1iH~`bNaPeTdi~x} z0l(mDD%7Hw_^4-7`R9juc9B8IX}Xqoqn4e-x}< z+wHZW$0>J)sLwx?{Y$S_{OL06aF4vEu@E3yag+0b!a zc^5~y6HKfmCWU&?s9bceHaia32sYq6Q4@VEbfV%T5YVxx>c`PadAd+doc_y67ame< zqA_#642>%->8a1+0{id)r7r4>Y+boXhM!>}73cQ*_qd-eff7D-k+i$J)89y23uQpb zru%yKKO(8z*b$%whkUQL3}|NdNoKRBhbwFcPVuSwZL+*rQfmO0%!>c$-p7?Y`$WBmQ!PeegWnf4r(O_ znMufTxlq@j&kZ^QCip@CNYgH(0L(~%0sH==Anos`{?s#+K_c?Y%K8?r5z>v@$Txu^ z##mxJ94dt441#IDH9()ssz)Q~TB^a+9bp+%G)8=~zf?&>%ia$zKH zp=S}GYoAp9eKJz(;dI1W60nW62n#^Aef(&tB2cO72$Gs(dJrs2jLHXGdPWXJ|gVh(;0&-#aQ!BwQ z??JTr0w`WY8lH^C-m!$6{q0;{P-p=K*E?z@Ny<2n_aPvnt(r8Mw0o9NlSIVx*n^gMO4{`6CV!l_muV-1J`!sR>ggBOPqUtD8+%A) zRQMU5=x1UJm<@inCe~TUA!$y+o(uC(pIWY`+e=Rrv^@6ZHlUogS(KU~jeI=e(P*-y}~obO4#xcHBK<@vV^wPk0Dz#}cyUR!IPFyC+)jQJOew8y@#2S|t(8xG7JvRjQixSJng7(R> z#rOwh8(Qk2vf%lLdu`#w6&SCR(gQ>WwPg@II(|W`n)4svjL$huzzK1Io9GxbE+oPN^2eO@D=U{=)$T!+i zICc}Wz1@sGntPqg_4K&ybLLS9!`RiHPCy03Mf)gJrIA`c(t?~L;nI%b8hOUt3M6@N zcsDv{R7N1ga{qB2P%m0*R|ur2*O#T9ZonZZX!MRfE+wu!cc^`&^k=3h7}&*HY($}J z;#$mcj=FS< zJ@3g+ZFd8P`D`pxxMxy;Q%5Pm&d|D_1$lYal)VLg5VS(9a9pT)2(ofh7XoXCo z9wX$)xh-Ohmx>rvYCtUGc&k>j8U4S?`_8Z?x~^RnD+oMF(@+)ZA_$>31?jzaLXnct zrME;xLBW7D>4aWF6KPVTRFPgnFCry$gb+I9jORU{`hMU4_nhlG^JfBcWios1Rql1K z*?SEFxQd@Do|St=#VU{YN4}l+kS_o?IlPkJKa>%FTQ|F)o6!2a)r6ITEOT>|=Ka%Vt2Uci5BDMnNfNA;1Aay8@Cus|x@pge(&Y?yeS-1<@ zv^NUV%5&}8BOXX57QW{D2F!Q+!%N&-eXBxM&JrUmEchGqMMp zJr@2jVwb#PosxN z0^Knyo$q($2RZA@Ul{cm;wP!7TxwnPB(i|b*~nJw!ndB8mO>l6)XMAIvi7h35ZDaI zN`AD#*yvi;>d-%AOAZ}F8VN|iA_8YJzK}NgKlyZvq%1`-u)m~Gk>W*A4&7e#Pr<7> z)bvzEl**iSEKPYmo8)AsmrZw$P3^`aJVmW;01rp$q~4%SBl|xbhp7|-XrUbxEO3KH z@Pb@_oegRPnp2SuECFArVhu+zje&a$w7zjvP!6{t!qfjHr}DEHrd;0e2`$MGliQ3; z{F=osgN@qX1%bFQ;s|81&&3!XN2fp*KdO<$@}c#^qwe0|xFY{bRAtU4YJJT5)+;6x zl53SRZmu8Ca|0VPf1&DVSI+U6OujT^gkSIj{ouO?^sE2T@>NDW7r;^9WCcHP#)`l* z2rH|=YIqiY0PjqAHIJ(ld~IWp+WTE7lGBKJq)3%{NAYC@ 
z0oZ3}#ZGtm^4_Itc7vxlb&u!->#s^@8YB+qC1C4bs@yvp`t$L3$_8(|DtIkJeg}-a z^qUz;$l8haJD*hTuP{)%^rAN!!OzfsIcr5i1u?fpvY7n8+q{>KrV({3-7d)oB^3Q! zbt|X86-lS-VbjXftAf_+uW7#Qc59Ar`A}9S1`&Svp1!?#EioFXZ~EaAtvl3*KJmPV z%hXHCqlKA8-VP$WF7wTL^ySYux#H#zY#)V*W*%c* zd5#L*g2b;BbCosA1NH=d(^EVz5# z^jgnPkm`L{sJY)kF!@@fi*>fu+1PXIEBzwHa5y8Cs7dSI-x34heFSB=pp)N8jl;n) zho1S7mMDapdqZ0V zy->g!fPE`e4^w8OJzkOQKpU5ygxT~ngc@2(yuN#7=SOrX?lYBll~GQYYOCTXA*I>K zF>ct}=AD{700sBR_L_0zeUgFZkxgc)5~+IWo!#3kG0>}Y2}58wz)$}ssx1PwIx{`n5fPyn{cK6MK5tE$ z<>4?5FKSdV6ZydH|?Kw01aL%v!1cX1~eQuIK+3lDLj{ z?4<|#?pX+-(*~S2%SRX|15cBRLK0Q}t-U(D$5^kZL12OK@>Q6m_ln$>`o5@N-m|}Q zXv;*12k<_<&Jyz&E+PjIcSO5DJkuZY^>WKea;ymQwMZ%m9G2Sw640mm_lx+6=yFO; zv{sa?Sl-O%gu>l_8=nZ9@?YHtw{(hEU?Z^LPU(de#5#ZjJ3d%b8wMSQ4 z*RsmH$$GySm==k}-b2h)v}z@}99Y~;cC^ux z$0*Wsuq9$h&iRV0LxaBZa1-qgFh=MS5jDWnNISA(0A%sJHYfo56KY;Qs5F#CkD|HS zx600FN0=6xrz4*)V@@kyg3l*fx7fB+2ZoV_v^m#Wo$R4+6jc;>AEB<2=1&6!k(~yY ziAzB``8SRbLoea9&g8{h;#?10HriI`UaI(?vRTE_max3&q^vaApRNM(B6#Mk4cWbS zR2z}D$cZi2q#F&KPQ)+DG)J}nDy``3smd?(*o2$cbc_4kPyYt!qLHGnp*FqU<5z$n z8GcC=fQ>&r4uX4_lyU@W;kv#(uk2ehOl6`wNsk|<^%#>}i>oeUOxlQAD;U*QkdW(C z6Aml9`{p`B-iPyOzY-qy?jDq==d6L~>IR5y73yvhzz;+Pa}X9y@g(;4b;;?KK-|Gw zd#WH`l@fP(s+GVfvd?Js)}ctPXP>EOo=snzd@OG&xzU+FblGO*c$Gz^pwM;`V1q7e zGDvw|jMYkL);7{DF8lQ9j_T7>+m>Nx0zv*Xk@v;rcXi3aigTPwuK)s4Gbj0I3g95Q z6^DOlraJiydx@#fyyS_ilU(L^zdzk}y|gTN@f@L%FL}cAG%4wI8Z}+mB_U7EijoZGd81Rc2h^IVb>DVi z|K96QIONLT4$y~%U(A9QXkYEAkgY*T?HglU`98g@ll6;Qa zMv?zly$fkid^v>@K|U3aXr_}d`BfXS=5o@CBPlV^>ns=^VA2IxntGG_ACjMlM^vIO z%$f^5KnIyvzjp-LL4K^hGX&le78{>i?|-!({Pb>`9CkAzN2^&@m=rPoPK_j}W#o~9 zQ*;@hbQR;nWkLUR!)R*QX|uL9M|u2*AMeP!or&g-RwPeMPTz3lo#zd9{4(SeyzoRa zgyNBsmYa>Lb??{*d3#O$awx+mdwsE&&V~D}A2b2CNXGrgl_4*s0})8v=Q^Qr;;mZV2shmdXw=AiOuCxavAYlhNGVZe(6o9+;@!vQS5k9}MjmHUzN zp@?~ag5bMn>T*_5>pTTJMB?6N3vak1asGh$NcD%hAiD^5VToO&RwQpf@!62}Me6d1 zv2vaJMq1GujUR*P{%Q3_pvEg)4D@70@5S^S69So;Ql&hWy1Uy`qv{2-JKtq>C7%sj z`8*Vq&c5BSxBRZ?Q^VYB*P622P+s}B?h;etkfpe0Uz7BiVQeV3iJNrNz#Z(8-Sk(- 
z$ehf<^Cz`@D}ADdT#3ZM;i&Po_>_|hkcj(WfA8x0GUp;kS;+*=nd)@Vnx^4B^3Ru* zNj}IA8vUqpy;t_8xfM%q-#qVwF#dXvTs;ya6{B^tpP_LUa<*?y9CTfc2q}-)zNS8+ zyI3Sehflssgbn(BYRgUf){C$8^s|F4<+(bcy;x4i%#E7`powUC5JXub>vqGB)_{6e z&8f|jbZ~R+%G}sX>(J)*#f2Pi^i0SqFb09{=NL0Ie32^#<$@8^=#BO?90KqU8IzdbU; zz)%Lsd}ZZxQ#EhF!>J}(nImPw&Z%L|WjnU^qtkQ4eLN|r6R9TR8~cj(@`a_DZ?lpk z^P6XiRdI!E`hUqJH>t)B`>J#8$^*KDSusAnq?}At?>%V=A&*nLkDgvb{ZcM6=x(>~ zd!uW8^6fqpIJgk>WGX?gdZ5_Qbo6FV7tK8o{ivgMOlTn%E1E-{`DQ%>V}=njkb*i) zpC*u{@6z-Tss>8zE0CQyKn#})gA5G{)n@zIH7@aJR@bf>zVC|$!}Hg`)YoRFZVV$r zw}0rTF*=)Y=rL;aPAM7MjZoyIJANV=yP>Jw74M;Ona2mhlKBNis$U7W+DTqYu$%i> zRhl4jc-&yS@oCR6ZEZ{gQV%T+)1J`^2?P1_*CWLBwDSfvtMmdI%cyzP=yjJIx`#J) z+_s*o^1iwYJtwRLdd?fuev_|f4~$n$uoq769VP~2EC@N}4bvXO^MN2YA73r0DS@LC z1r+Xd&Q%o!Z%u4w*e}v-X=C!3*a9r!m$~B0F28_`C>85NGykk-esfZs6FYdj>i}l*ISxZA`7q4ip^sf-y;3iipw%K6H zG=OP59z5q14Sw&$#pKZBa2AXJi~02_>JP`Z&yLiAs|b-|`|5-D4o)Tb@<-Mow^@vn zlN$VOK11N;UHOOGW$Z|g=xx<&xc6mLhOQ0-u}xUjYh}5`ZfhRFBo+Y+LsS&ra?aOdt@wMZ}%Wt#4u#p$>OzoaZ^w{(R4a9_DlE-C5*Tc-8#hR^OAgH zpmi7dZP-hm4*w>sJ!4N?WghD~TWIQahRr;fQ9X2XG3gSzBDKkjEek8$Y7*WE+3b8`wM-YO|W(l;e#U5zBgS84W&157-K=vQ6jvrKaQ=sDH^&&i#jL`nrUi zZLJ7A+wqr2#zP$^twQQ-^q3cztd{I+MN8w`Kd?x-Ubgx0iyj>0UNMXqZ`=0T@wmL5 zWP1_e6#a-4uuJ|cQzSI=Az&klrslH@kyk2oM_s1g(&mj)G5O(9!!K);kBBJ(t@hXxcY& z#)=KyWiA$Yt`UB7-mwN_vga0|D&{6CoVgeNPdHB=^QAjI0yKR4<_`kBSzt{%wt%_N zH?p}*HROKIz*`>yqM^u`;r6F1nOF?`k@+~;Ypv?YQcYR2C(h4~(f;4{25JJ`d9E>3 z#!ik`R1Xw-m{z+9Y&xy0iGM1tE-nP=Ru>%_iT(1eUTV(2&*L6)aM-J-T8IkU;Ei+J z+zj9m8ev9(yPI1!e*m7D)Ub3o9^vHfa$b^{lgZ|cee)^F83E_dBLS4cp_n<6Ey_xv0169;o&)grr1I6qlCbhWJFE0 zEDQS5ff)otC~a@)u1h5D2LX~U>=<)d!0cS6~ek>ERj?~r{FHl*f$ zN2m?e#5;jRJ3Mg;lxRPgsM#X}l^p&N2q);(D;^j4W$wumR!2U!9kecVBaM~kL!_d0 zv3y!9FIRIC`GTP4wn}NcgYj#}3Z!|tGv`_c-mD@PJ+u?GiF~t%(UFmvkvS{4blaHF zmB*R3BSpi5T);R&a?X$4h29@Gc{JGHFl447<#9SP{`J$L2vnW42lL{%J+Db@J#Ze8 zu57cV)P%FPsBlwHki%{*fUBK~uf&6yOmQP;8fJ}FrJkeT5xRr@b0*XDJYw~0fyLXp zke%Rr$i)q&+zUc@`#gbp!DKpg+@|M3Jf|HSIr6Yb&K)0fH|#|i?ds=tz@=R!4Ra2u 
zvRX5XV&^C##?K~D#5H$PcEqkeE_w$}F8gniC!A{Isoik0Wa@IL>F>@G(M|rRc}scX zqh(IsXDj}|15aC;3@xg<x@l?(L{H}Al&kf#_XOQdwc5*E4iVf=< zWxVUjgzlxNLk@E6+GI%b!#*=ip_8{|<z`8lOHu=J5_JZS$*igHfD~e?7lMMT3&)AMmW7;F=gC+{{R!HgTtf}(l z;%DQ)>9GCf4-Rt#`kEKxg>b|nr3Of5pG^vi?D8W`ziVAqR6x9^4?rAcoOH#$uTF>K zowZl(or9BFrd5+6eV0s3;Om;QZb2oG#q7)&1kC<0%P4~15Y^q@Dt2$zS5AH75p&H#Y=UzTorsio|F~fWu zmr1V6ny!_A77Pn4Fd$7k2j4142W{jDSK@S<&Xl}#Q$htulO_tu$~ysl9WL=(eXW9# z7|rnU_1aw}!=b>bDL)O$a=FyRA%iId?kdl}^^*x5ikwUIIA-Zt1rQimA{D!ot~{Un z0Hd)MhhLY#LHml!QmUn zfX%>LsKNktBoLBaH?P7*Hcuwch^Z-LjPAr81_$hg7yuE8YknKL4;eD(c`l^voGO)~ ze1dfpsTz4T_OgA*D5z=B{z_`u&S-3CUfbN1XJfELkbKBMgE5H#4v1@eR4X{~AOgbC zuqPU<2TB*7D}!&Sz;y@ta~WlB6ij_uDSi-<2o8@O4N59J>P;4BM6Wk3r}kGzm{+#;eH4==3cVTrlIPzQ_MwT_q4=pvHcnlpCktu=DbRtL8=RVH&t{YdV^$4x; zF%sgyyuw=r^;miyU^%F+;Hax6&UHxA2&!_N$Z0+;qjRvTb+UQoQg0fV2`zm$XjbVC z?t0(KhwUc)yd@2g$wwFdG)@><@4I$xL15C2o#DDGiMS$7*22vnP7@HwQn>fGm*+P= z#e_nOLdwql)2Ru04fOb&5`-q6V!4E`c)*sH-d7~i)x$=8hgFmCkjW6!KHKa=D8X+- z!I^)eV`WW5=)699Mf9+1pa2zAMLZi@51BGDm%F|3;^X2CAMJYnHGS9$^&e>L3r%)t z(p~6NgIbHP>G$x_2=01}>?oQ}&oZlXpGzgxA3-ss5YRdut0Cobr9V0?bGKdO(Fmt& z>$Pz*R7_ipyxm7?j|kuFFnMiqbK$~A&Mlr0k6o(ZkF+xtM}fJ0MIy36AA2f}tu65L z#^l3u&oRYnxrb|~ObTaDQv$Y>EO_g~Job<4dO41>Rv{sV(A1^(O@zU87@`VjTI|+? 
z7V&vmo!Ue3s=ZIX1281q!MCBnRf6+lMq)*@#9pNZp$|N)Gl`8{<4kH7w*^9nxZ^CN zxOoHtgm_ zGR~eKomj402g=^yuxK#B{0uQk{#);&s_Ou(YmRYu+*w6dKgO-TGA1v;I3;yB#Fd)K ztk%wpr_R=Un}5Zzm#X(nF(^;DBluz0X#Y$Uyo6qjjw20dars*PB;bnStWNL#?93$N zcBGhQ<<+P8(o+E)UZNNu!l4R)xAVFL&E=9fHtT-8`a6#0LXs8J-P7?KlDR4NbXsVR zNl7WXG)h{#(pgDH^nKU;VOOh@`Vk=O)Std%2-{gDq|nV%?FU*)R^|NVK`~%MBay@= z0T11w`u9x>(>@tlXE4I9e_8#}=ceX`s%biWg~jRV>KozxQZg~N)RTp(^oWYB*Ydny z{v@TStg|7;y42KeO|vyG8ha_Bs3WqGAx0+6qwu_AARO)23u3plRcb?mRnU@x)TPCn zUYi^WyxV`@;eCm_)bt!S$I7qXew_IL83==pwD4+-^&eZwhv7CkI*9Px4Q{8CpvL_| ziax0=YBk=f-`~G1udP?{N5Y)pS;#e&QcLnKC50dX#l zln~7^@Kf${8Mk}7yQdloF)QcqUj(#1mq>#oIhQ=p+S&?my&tsxcdiQj<4;z9F2nfu z6#Pdn7=8QL_Ds@L1=kg?;(U!C@q3QXz5lpjAH!202wA7(aT=yQk5~9F3t+%h~~Ypg#7Jkp5I{PA=C?|Xt4S7A=?Bh z@(HG4H>1itPG|eFeMU_sE_5h#z&2=8ltIMtIZZod&_T~{ojrdG7gAdmz=|+k^Y=0Q zsdEov>JKc$Af@pqG@5^rITHjDJlfDdKlN~-58fmr=`-k?*FuF^n~$6Yr2C69`d1=& z{?hDQpgJG-keapgySU;f19AE-^TLg92Zv#VJ4Bl`=ry!o-SjFHJ( zTtR;P4RC(=V2V%~XxnM!>teZA&^8~%ORRznq5i!?EmW)Y?G@d&Ai6=`4r@Sahz5?` z@uq%F#q2P?D0$b8%gBYs^zVXeQj}G1rLmiENBc+SP$wyz4k#bgod(va-Ejx@(`eMF zN%zgHu7u&nb3%tlm`87cZ?eWcaA8z|-I^GWhM~FQxH}XC#i=J0osWhZGle>`^8?3G zQS!FdrFL( zRtKs|>BJk8Bi7i2v^d_tx_w;@e~y#S*gB^2R1gdQiTrE9L=O2`U$_c*2 z`d@tCWhUyL?=<(o$A<9!2I4q*`hLI*;>?YMt<10PhU{_NaCeqsLzEHjtiujmK^@{l5(Y!k)olEzUO8n3K1dAki`|MA%AQD7zC|q zK4dnYA3SFg*`ekTEqIoCcPP80RH_BqcHOY}XWQ9_$Pn3A3TNIv-$SOqBnbN_)YK>F z)5U;4@c*{7&mlURAYg7_6VuP`3mb4!hT-Lcr`p`k#vByzXC0%pLK`zacbp9dnxnkK zm|VTMR4>M~>Evc{)h{h~jeMHfq@W0Uw)pLnh{eI7(Qe4}9j`3??))p^cU8DHNZbh! 
zZbP$eSx3lv?t`-GvyIHx*LZ;pcRszUnlG<_AJl`W?lU&`Q{I*1(?V*(O=2BlF+|r1Kfz8NTQ}G`;UCme#LS`9PV3?2mJh zTYvH%!!v)Fs;DJokZW^Lx!c0^o&|1xy}V( z<}`~vm>C&$+xu^MaejQY^UCe4E5r(LA#Z=X5MQm;Uy0LJd+eKuL3CI2iDH0ka)b=@|lVLeU>`;y&*AcE4o9-%w1uYcP3kKfSs6PmrKgu^C*h6NEZsl@->q zpCXwV!SC5;WfOnyo?2L&N2nNaE>;|^ zn;El6Y3<{Hb_XSx=3ZS4r(sMN>ab~g;tn?Pbh(^9!KQ;?D7J2jnvJoP*|WX<_8?4mE}^7 zw;gu3*3wibeJJzyER-Bi7(?74S#Ep9A_5HG()Mf(E9+5C9uI;GAo+I=bvB++cm%y3jtPxbzJPln=KKhW z#TZ)wj~T?&un=gGIams%tAyfrqbh=@cv6DOTnQ^tfIYq<#gQJY#xJjG-Bu=95+}be z{+H8EVue0wR{=&Lgde$g+?Z~;l{?XxAKG%rXdhcd`Em{7x9VX(+_|rqMS5@BsWUJZ z?O$U&N_X61e^QJ8lDc&IjndT#U&idYDP>zHE}VSWboA(vM;{OGL*G!ztNyzry7?uo`ZhBJC(t*LlZ{5_JL{N3C z(AUihC<%O9@Q3El^CeZPEeR#=TmhV_fr#UQX}Pm?!z--GBWt-~aHgPwwpt^zq7ygp zYM}*RDIDD8x)Igbg!NlRB`1|qOIj@N{=y&2or#*fPetR7M}={;b%1x59uz<;^*_{7 z1$~oU(g}yCgC;{`KtL_*!NvroBCMs~;mXu)UTqv`W}+z(>clC}&?vd5QA3r~9qB;8 z3Hur{U*`dFq4Zqa)~=HrsQ)z%Hbc+!?o)Nwh5YP+`3E>{A0LU89`E$m2oFwc>DLF= zh=VZJ8|sT;a(*6d@}lPA&-|P=wi7Lmd|j7g8#BuSTIe-e`4q_r`ECVP{QB$C3Bz?T zyE+}z+Ld_{tl~QM)ll1JMJI7jL3*10$O%$V#pnHFuO>e-G7{S96UNFu*FC4kGAjL# zNVd@Iia=LYmcb|sjSSeX;==<7de=gQCz#W$5DBj)Zw)&UW6Ua}2o=(!@_j!0^0&u5 z*I)d5Jm=5#{7K6GFJ9P80+Z%`+wSUgr96O(FUa`bk!HAc99VOF%9tJYfc-(Jz>&Py zf=-qzcpQHJp!320j7RxnL)iSIW&CtHl4MTnV?SK7K=|mk+7h<2WBNfOS{PQiXHtX! 
z7cxnYtsKG1?7=Y`jqkJqoaHB1nZn_XeMKeAO^Ko_%0%C<#2)Fb^osIFcQO7^9oOZG z9OO%yj-Axj&~nb!cuU`}2{Ul+tXYKgI&)Wo__3VbtxUbgBtt((7|GC$x7#Tj&&~#h zxin3j^(LyCx;puQ_`zQ}FSB=46~CGJ^;(VK+dls(2@S>Jjf8+SD+eXI7fC{j7nr>y zT9)b6oyBe(y5*~?YJAcKg-$qpUNKX4gico}USOT;E)~=!6k0JbD#K`aM`^1R30BJq zOgp5cN(jNqL2klvqkhkXq@Nq~_I7A!{$q3xUA@%V6L{B~ z4@bk1%LS`rz2X>9sD{J*AvSoBe@PT$cZ)a75ofrS)WeZ#H3e*n=iqP)6eET)DH!DQ zNS!^J>`sL^%hD&$pUl8h+IkM9Jnv*O4J&x{;U^D-hrSHl8NAxV-+hQZ?73hz`7wLm$?IzyMXxQBXJs-hpp3eBfxAujH4zaa@Es0+6 zHP?!EtL1Yr$go0M0TSM~(6w|jGGm9un<7$@@Bt$tBL~8QxP7>*ppau0Mgitp!L~E{ zI`Zj;>(9&o2iJM^gw&yt&|6nQ^%-6ja~^R26z;I-2%aUAmyk%jVl zNBw2lBj&?gEdwIXK=AafeYi=f@Nr)UA)s$Oap?;12W(vEpBWj3(YjSDIfTPxrxL%h zRv!!ll!xGQu}|Vb`&p2I9g7!%zCJ%T@eSQh=>8LkurfzR6CwS*kRm zra&OBo0S(MldbLv$O|?E;pSnkp_#R5q3&X-`t7&e5*95O`d=f4xaS*l%&0I{D zTOeI5;b4W8`8_un_jr)^=5=VM7XcA;TFM41vnoLPchP=JKJQ@8J9^*#bG!};)PDF# zp~#_j!a)g84LaV@vE3NBwxG}mw~2O`H+V#{XfA< zRRfWV9pV1^L9oay_2sd%XYl#nLo9!#3!oYW;J0GEWrA<5jl(8VA&vIk{F^@Ts%zwO zErK{hGW{dl$bZdCk-CGNM=IpB7}Epx^~(LD7ta{)BKxnmq(T^fw&NW`t!C_C*}56P zCwB~9V$C@Z0_GOH&Wkxdf%(l(s?iVr?B)lbD=9)%^p^S8<0jQ4a^W|)Z7b&i-QQQWL5H?Pk^e=zpCRT2l^rD-6NS#aBqAYh0|2l zMPt_+<0$gNU+LtIZ0={4$N=w7=p^hru^5_O?56xp*PUa1cRzi{HcYP8+Ryvh!IIX( zwpS_f*ssz8NJG+LKA383jV7SRc=>6yv-bZ*} zj2YK)9&l#fxW;9E;#{(pL}1bdu>Mi@h0oy>J{Sqz6DgSymcJwR`CIgw>`abc02x0s zN%PiTB`?x~>G-LugRziPV7ZLs0+CVh+`pwFpO+`Pw(n)Ay;s*}bh#oU%ww{|bObFy~0)sm2;jM?TphIP#L_s_^SeZhrGbsTU#$ z6^EYITW+h*z$mUb^$=+v1)ip9MlB~_mk@h%M7L*dx;lps2l=n8V4Bgax;>W$i`h}e zNK_o^SrgkTYv$OszulYQ0xBnN+=eHMss@KT@^aJ9CdO^69+H`tzqs&rPme@vLp(v8 z{ww3BM0Tj9)$foRd_~skd&_htro=06Ws7hTfXFcJvNvZUjnVioX4P6FeXf-97Lz{^ z02z?L1iOvpBhJi?-k0<|CUsKpr$ma*TKl_%7<#x1>86O(Tp6KkU*l?K2hNnkettEJ387J zT!b;dmTBreJu<*3=sSu?OEY{6YLY275Adt#L*0MxqR2uH*Y~mXO9|-NHUjH%>!&`+ ziN@4Um%XyPj(}_(?pPj7w%qtP=Es-if&9+a1}SW?%<1P7AXhM`iFpnmEk1oK%1E*a-^gwM;7Ex8{k$7ZQe>=Ra*L!IB#KWiU zEe)d!0S}}-^7W=Y!=G2Gr%&ex3U~~(*r8%f0l$b0CiINnH%~$z?-_ZBd&QktFsN)u zt6S@le=+v(`qMr)6V>59F35-(U(>wWM>r9k?oo(v#`ADH)GI1FGDDX2k?M!5D}?kk 
z`?P88PpgvAzL`UB)F#sMC(;_>)@jEB1_`yGmw=M~m-C3Z%=FrS(6_*Lw+%Vtby}{4 z{Q4us(xv$YvKX)5dJVjyhFv)8BC#6k%#1GAmmo=wLo~^( zt5XlFA!Vi*{p}6;jnn#>eQbvx%~{zdlvVKoe2iTfpM+x}5wMi{zC7;u?Q&VMXx^&5 zHgJg{w_8VQX=Q65|6O*dgCx6B0X2P`8ns|`s>WrVUZ1Mrs1g4Fd#&y7VcFW3u2pP? zQe7h)dWM;@5&>R_q`%p%u7Cz*hBI&zcY9dEroytdNh;aYo7-A-^ynamk6t+exKV>X zBR62ga(|MGe*iuqFyH6Y(hQhGf^T)swS6dLFJ(d9xF8eYA9FSqwFJ(cCYiFMgXNVqCUXuaxs;!56?2 zQQsQb!!|LqrD43~4~lgZbOijIi!e6)rAV}~IOOce@~M27Vm2DWS~|SF!=-18^CIeW?xSrHdHGdS?4(VYKdD49wAt}XiH&!*xZ z(%BPjDl`Y0&FGz1(>A~F2-hr4(QDoV)9tg7mIRe!%Jz$w-4FolxLE1Y`H6kIDzd#p z1m*2wR-ZLpVuw5}0H`dKg9pM#RL4;~DkoEawTS2oKH4nJP{=a(^^0}wNXik2$?6@+ zTu?Qi0WziCh}blN+^GFfgN^Hf)AW9T3QHwcW|-}c^nRvVn*#FAY#WUx9f5&TDeR_! zpG&?yZCID@eS-I@KzOm z>hc!A>N=l3^m36N(z=&eI=A{;8{r*ezSW)%PrCyhGJgJinqn#yHl#5! z<}0wR)wRs(m`bnQv-$+6_@S)dtUQSWGkL{?BBYgDlor1M5egQeBHEac+Qqw2-*Ut2 ztnJM0uBpi=v+4NaWT5yBHL=koK#foFH6q+Rs#J*{$E?-`Fkf{V?BFQFb}bSJ;SFt{ zi(6@tMS$dzCan)jvyh4{m^n%TE1K$k_sN&qILV5+C$h+@eors^mvUKfExKx?TcH=Z7f7B@8Ts;E5 zBb|dOy~AdJ@RK=Iq#qZ(t5aw;-~cI=PLs>bJvQ5b8*ns^0~}VFtCvfR>fq-Kf4~oA zD}yp?w3mxP(VHp@^YO*g9ZRFzmZz2{5hQbnt*Z`p#POzyw~&YPvy&vruMy=k#g}E6 zBVd<_diA(kjlS}Mhg6p6k>)iQ`)}?n(QZ0^kHg_0bUrpu17GQ zUokVDHydBnl~G}t2(P6ni|&;ulU@ugo-<=Bi_5x|y@B4Qnp&hK`%}h~{0AV}j-@mP zeLdM#H&lIpZ;Q|??ZAS0BOv$HSK>qQVD<*;`j~UVyqu$7)Z1xtwfr5 zoFaEaeG2=if|#9syDf@lqjyLQ4Dm@mU7t$ZTcKck|I^*D-II+}($$4>8K@5!hJy*@ zNnwF0R>N!2w7BvR(sPcH9jnX^x26F)-P6D7S_$^yt$lq|W;9paD50af#3`P+NrHi~ zh+^<-0(WD=VE7|adOK543*d587{#A_li~CBqe+F81RwsTeph{pf6mUl&YMTAlau?y zlQu^IH^7@u#k@t{_;Gx{+rMviGyu@_fP9G$aDM5)0FyGvZU+$k0J;hG(=2}KI6&jhG|F*5YsCDQ+#jS6Fg^Gm7kfPIKS;h=Ce`b1qIZ7fs0ep{qU5&7*6T@dfVWCnsu_D=FGa|p?MM@7; zdmK&eruY3{Nc80bjBC6apc=j31V5Mksgg2XWefIfV(1f%is6j2)BKdHq3FMiOf-qgr$J%0A z>~AD@&cWdK7|uPn`{Tj>U%%kBF;T^J?vY?}dOADw+^<>}aC|sW0{uGEGAok+r_Vot zQYCO01>Ph7b&iP2IS-sHw}Da(aNL)UxeT11w*jm+&?7g6aYl+Z;w0Tn_IAP6W(QR%(+P!l={ML=nx zx6mxq5IPAE0(1Gj-`< zFP`avKs4t-AS$8rbijY$sUERFL*=fgrVJ_{;9LdXoU>QbR04sjVi=FDX@U0_-o7w# 
z2Z1jAIQylF7rev)0)eMrK2tLAwjj*XCz-FBFCPXjrrnEw?^^7xwZWP8(DnNlCY2lG z_vL7M<;54QGrQ}fyy$5dd;7#{7OdOCXlaO(v(f1_jiy|qdZqF1Wohnnr<*OyMdh#J z^S_m@`703cdg(Hoa=$ZnGf1ZesSp2L1SX<*+xEEv74YIN3)NW@YjqB|5|Co>Wfq_T zU3ji|*4(}E-yWPd_`>eEakDa|Hh|;iQZ?1mD_6q;1V0v49lY*v0R(an4+4qr9Dgt* zcctz>{&Px|#6c@di&sk+DW9n z))aktIg-9MK~0_Nzs|c%d#-{DS|@sZaO7|yNc|=I&N7dz6SF1Qn%NccLiilW>e9oa z8^E+*h*47m4R71|^FR}H?hY@|fUbNAI&1zvJt*>>G~hAaOK(chG512g@$bIrw?5VZ zB@#aq9bO6$80jG{n^WaywrI^)QUsaZm(}u^)b~qu(eD2j>PveOOK__-ib1NX_svO9 zS!*{>ylK*cJy#deQc=Tjl@V&?xUC%g;2|ADmLj9Wl2&AYIof-CDZYqOH~ypEnV}O$ zO!O1tnIT)vZt;SytVl9HK3VG=A=+zaR8};5bUERUpx$rp9^s4HvM=L)xBWb&?7bv^ z=hK?3^Bl#kCZJa;^f`!i>m;k(k&sVRfua<7u4xR$=dg^@n~EB`0s@H{oj)J+v1g#~ zd_}fWh1YVu6ylLhd zGlP_Xob~f+JXhrCiCF#}Cb{h1pERbux%U`T8-F$X7xk}AlPUGS(B)rRvc?l!7E$Rm z317cj=hIFNO&3#PHZwM=Nm?=G6S7h+K2ymFnc_DsOEDB2pUk=F21>x|Oi=k*~O*h?9qQ(#0U(^O1MBfDd!M}{ZaM`mi2V|b0C z+Fe^w(A!83hpq~RBpj|BQiN;YIevf|>#+z^fvh$Lm4lw?hob24CkwOB{k9cLTLO^% zj8*B@I-B49Qe3(}69@ZMLtR({=C_qkQoEjAB&E4uJUbM6o2Uwe^mRoO=G}Jb&Zm=8&yQ!GJZ}&Xj|8{EH zoz?E`(*4VmJTO4#xBu1|`6do>zT8rula6)b=%h$YbXuEr5Pu9W>3vV=lDuc_qtO;r zn@usQ%!UezdE05%P~Lj!-?->Mj!g{imSLvzm}F?YCc){rQftQ?``rJI_M!C2KslYC ztECfWewP{zx#}^HF^A(E$Q^c~OhhBYu&**d^T&E0>O?|i{ZF>3K|#sDs+oBw`osQk zc579(gy7C&V$bY?I3N0+^}VEQEaZyCPLNXQ67EUdRN(PXUvdj`MUAr&Td0*)Ak;}< z$T~(0y=|DzRf4a=jVFHG4F@zBv_)IL(Ngu z{G&T=y9+js@5Zxi0Zc*Bku|WV zx2$3WKEg1z3&}d&((y`+vB3NY*oa>A#DxtT+!jqs@M{C(nZ2oG#;3&{SXHNL%Y?>4 z?~RO{j0*QE8)ce_T6G(6R4BEq$>;H^nA>j57+h!Q9m~J!70AAQdTE06ap8{)^9%0D zN01xkd#!i6X+UFL-N^aVwvfck{a~`U!A}dLcsDtUO7>U|E^uR~Rq=z?2l^{(dJ+kZ z7iX7olr(zkFB$>2R>QNRBUPit=yn1u4?LgM|u)NQU4H?O ziTCBrZujyfr65-et!X=%d9W{D zC&H+tq(S2?otxpOZ%#ucwESY@P^JaZD80 zDc;1kE^HwSxwy1*H%ErTMunc=)X!3M+MF8Lit73=dx8odNJ%ygqHD~vgHG1bqkT$? 
zfRn#73E&@fZmqN^$v5=MEJs_HI&U(bYSooV@LfC6t|NwFRw?L1LwFJ^e~TsM+c!=; zXE56*b^kyv;fE;9{MG!xh+pcuXr`QPOv?z1 zFni{lzPXpgiK5NzjfNcmoee4U1zHUT3#2(MSIWHyE#`NTZ{FB_9A`lZI0)z>GUxIg zWjvmWayeC*7s@I-cZs6iRBwFGURdzf*S=B71OnLsN!#0!DI=xRP0Lbg5~2~@Debx3 zRqaj_1y(dfoLu!wS>KJ}LaAZgM~~>93yIo$lm&I2J3i(TY$rPkXd+0emfFJ9H~Gs_ zD3uwa-UlVGnjAG>uI!#Nqi01AJiRj3&(nlVJe#K%0;=fpI%368ZB(X}|X0Dn00jWq`+ca_3cdl}un6Ns=@g!|$^cWaE;4POCsq#q4e z4!P|K|H58!KX6XpSAWj9y*52O_)um<+ecTch}B5i#PTOfU^=>h=;E-+ym@Oara~F* z`YP(O4z%9EA~4E)P$Jt|Zjy!eN($=>#O2jJZZNvVTP}aBam|YdlUZBZKpDHuB6<~d zsmSj;b*84pg~YWZ(%{8&js>pryJR0#AH9CX(s1aIr+eWUrLHVrjZ(CcXimGybXc? zM!%c=AB^#T11kPo#PPp!kp0eovc^KvW1P*V#Ppb8WX?U12nsG~PW)w>I0ZND$-M^x z^#Nh5@25UDi@%qB$ZU0i7=(I3&yQ>nLqXD?BV7ehkO<9x&hSei#H$Dso0vr-6F~KG zakox~^kKv{=hF2y1Z#09(9^B1{|++&kVO2Er_MY3%~;g_MacX@WjyPf!i0S$ldw~v zo?Zr;ySPSrVTC6nJ2m3ax88TlDquC@;$>b|QrnLC2YPw?uwC~n3F9LVC>q1B@RP`Z zxn71q4@VR_;d6A9QYR>=_cMU99zEurgq4ica=D^v5OB|lg3)WAo3y+>whu`PMnZpz zRzSlc(7(gfrsiMWglB1pM?;T+kk=6Z&mOm3knwBe$j{B}(JrzhgBTzAqKWV}6>9t~ z&|)YE1i;^D}PpL1KbHKm@+bV)eX>?p(=b!YWTSl_>l5r93Ak zKl*T$C?5+?JN}uKP3e94V9 zF)s;3QJA^%w2*cxYHP#^E_^jzAHa&Oi_ZYJ|Lu1^s6kZ`1gf9W8#4ra7<2_NE%5U~ ztns4tbLGVBkbWYLdC=D3_b0b)10GmkU?9<~1->$UkJvgsy6^_TELW(g?EV2sH?hBt zql`Daeet-^)bB?a9i6gJY<&7(%m{xyi^Ix3pPRXzm#a`z0syeQ&Yu^%10zs8n~Nvt z!s}g@WM`-Rut?m-cj1BLB?fIM-}D=Cy`Fuzq+=BA%03W>-Pu}k| zg3g9`E=F9zRX6{b_6q}CjvvVhobdxl5mbvxy$0uGrq`%iv!G^%*DNR@D#YpE*8#{D zF%I<43v`lSj#k3GYwl!tUsF~lhUescfQkG<98r7_Rxh#hL? z^Y2|=zAQ(W=vj5IHI=!&m4Yw=+d>e?*hAu;i#;|=5_V~FoVBwyPYPS2zT1ug7Dn0Z z>0_~HIzVrFXT24m3g@*Rdxd}PG(refm3%W36#iK?#!7pJ{^*@$5K|#%RhC=8kKniI5?1(N5(PH097p}Hm8^vn4qNG^HFDmPAx;?m6iSlF? 
z!vPz>`Lp@u71t;<@3@PrI4cGWtO%E14@L;SI>W&LU~T^&7yIne7;JiV$wIIndto+I z;*(T*0D`{<^*w6VGp}6mKZ6353gc%^5MPAb+n?jY!Vw8(4*IaKOqZBUZRmkI2dGuZ z=7;JjU1ckLKmTeiWJsA1iqa-u)2l2Bw=D`7dGpoYKuJg;#cngQW8vC z+t4Tlp#B%Pse@IQ-kKM}4`u10j*c0&_O{Q|zrIOMXuNd(@OBIl!f#f;%D5rF_0V}ukD&>85V$9ZE>*DNT7-_ zG6guN@x))BuD8f@yR(914xu|8>l+C5+;mI^al$mX(=a8IRhVo3BURh2z{=ApeLm>RchR=?hgWGBgXIT)w*~a}4eZm) zW#W_v6{KGGzH(ppn#0_ORExEEXN{o`JzKy&86QHey&c4k`}%q=DO8E=?J?st@vqkP z>x3f{gfuMYE$rPmH!G`ye=!Px?9}$%)8Q&aQ)WbUl)eG4#toMB3PP;{{7)Wy^(u#p zt&aurT5CRWZ_WMl1IS6z3)xOtY(!0H7Ore&1NzlRS^X zZ<5+STo1&&G(2bbImn8W9^XiS2ENF2Sc^*dvgu-)N!ZK!Jv)*d70oL$ZLlx9j{09tb?Q{;sZYMT|G^#gRDo7Sw33Ljy|`R{?0&;$JjoAv5XJ zM48#UbWtrZnPZh)h`m&ww&x%}H+ppgduOD6lbKA}CVuwR9`O?)aCA?L9HMZ;ayNDg z_Kt6(`Ihx;x`Uvkze7X=CB6AM9?GI}dWKueIyui@V>z#DceG)AMv^61j9>}WevsGd zkps2FaJ+`?$FCO{w))KjVvgqLW#)djDP=Cx|E=IFO!b3y|GI=*#`^uC>Z7F&rp-dw zi#`1TE-eb5jHK7ESSCI>!?Z{slQSi7%D(_}Yy)WvdC7F2NU9j0^zxE~0A*69XnjI9 z9R7<$$h4?H1qS4fldA5W6Bo}WNv($FUk`lCtZ9Jyc7F~N2(OVpKKHGBEQD&2T^l{& zA%YpiueN9s*U@&OWqVVtgEMggS`wtmzZpjG{@sWQj4oU7xAJ#8An-N*prb+piZ}+9 z%t(|BwQ?*NIH($}-H&OC+D_XJo?!NTGCdEKbq4B?LwcYWkAZ7h*uMiMPKah)K!7mY ztpemfcQ`umB>CzA?uK_Si@sKnsE^Eg`v;JHUoMtzhj0FsLS#tQkHmqBgyrkg-zxZaZQfrJ&#g(xB7&lQRpI4jpArgO z%0-Zt-t!FPb$IDOueyjN=AMdTmcOFXFK@|hNV&*WXTe=LY`uT}ZhtQWA+W(dpYr=o zstfpRTLFC&ws{jg03|y&5wtU<94(K2(3wgzKhhj?0xE*vfe0~Kd1^{gB-86RS%A}Q zj1c_Qh+f8y&7HaAfkD2yP9CW$L8EI&GOD#dCSr_IWcX1nBv&GR@6f^XWs;W6wn1ioz|I;7xdC8MDIikSnwCdra>SN;B z-Sx=SGMt^j>Vb`#zsxVr~)xlXJ7; zThh||`OH!Kg7z@|V!%f&-o1@_rg)Nu2E{wZHzT_oT6GIk z>(RfKanc|Q+}zO*b#JjepAnCVTk@S7T_nwC&8}~fKc^;4*LjVGCrIW~>bxqx=t1fg z;tfe2E}FI$KRd=%$+<8O?8>BpW0vzN9_QQvdS<@sd)kvTbnGfLd&mB1y%9^r$#D;x z2SzXu0$?7X@X3sSkU1!>L>j!Ko58wp&VWF{poR$rHt9ujzN@w*A~!R60}c-0=b5$~A^3QC-DvvyJ$@DY zV3Fy|eEE(siv#W#EXw0Ykz-I5+saBOZK6=yH#F!hJ(yL@>2Oj&Gq z3%vObX9pLDy2^>T<#D=#{hl;@X>JH_duw?2eQV6$n%ym_QIh3Eae33v8_5knO}Rk3NYG!_mH7?QvEU#(F2juo_KCc6%{kOUHa5`!6zt#tB27b`UbmU zZ_bojc`3uPMZ3>72|QQP^MZi&&lQQ!3|vowZ*(||de23ntRUyFom=0c?~kOHIOr4i 
z4SccEG1RseJk0wT9@}_pk7X>hgrjq_Ns%p>;or8ErMcPB7MMkLgUzNngCW0h8!0y= zSwtUBM=36_c=EBfl%r-`G;A-@`F7(ikH2d<6P|m+0S*`vIfTF91PR%k)i0i+Lz)vJ zT_JL5g(}QXnBELC<$ZYN%iGmmKv?Bmfy4BXzdW>`LpBY8prRQ}8%Y+s&abF~cCN@BpwQ(?!lk>IZv5&d z1KI}g4y7JT56f`E(IdHktZ^4L9}ZYBdzqICtapC?I2+7 z;91CL7`!y-k%<^@qZ=}Y;bR=zKJ%q;D^*f$!s-nVvmU%hp8#*+h+Gl*6=}{Je!%LH z=qpyR``vzrww(RGKsot@5%b@N751ndw%KKY+!gi* zYp;r`ydm7ojWL_-@fZ)zBlE*AJ2~}iA=FOsG#>Mga#1{^jlRsVTZTdVYhTA>V?Gm= zuAZEh0$iNs#Lmc)gbkKa->@L|ZpYM27W5mH7=y zYGp}R7~Y)WG}1?OCJ=QCQNnZ z(`K0LhxVy0TS%q=wUruWg(VKd#K}U}irN^9y1S#VJ`Fg+$w6*+ME&VX%$t48Uw&G$ zVaPJ*$eKQXPp7w2ULi5%w&9cNglc(00P?8d!WQQw0V?hzGL=4RV_(gEg%>85V2;&BHbZhFh0>{fk^ zK0KK#FDLCC+urtlv39Ef{}gR`c4MOcWZx>9#&QfⅈGu*u(g~eVz=rG>{s&^mT9a zn$&8xU)227%sui;QvN^;_e{^7eVq&h2gO1^AgeusVJT+c4uEF6t6b z7Z7vZdHMzlwm5Lv|DuBr4StDwE5D9SIvkY=>QzIq`Ci92kkkVZ@R>|)!5()ND!q8u zIv;iwEWdfGKzLEGI+g7a1%cOI{9@m-P=Y@SlkEI0%|>5i)8d&Y*NiF0^I)tUe?Bot z(A7k3j8w_Aw9al+Fq^0L#CT8yV}|_lHY0=Ka;F{&CwcRK72x=4orc2qL+TmxNHxhl z0k^X4<7e2v-kez(#U+%v1yaZGx0$V`6$JRQPFx~8L@Ho<-Uw?%%Lo+nFWKIQs@o)v z0%1OJ!&x^g>sU3u>>ayC%LNaS)RviCE2R$_8r?bXKln=XE7bS)K9=E;(Sq^~@WC{k zw)O9o%gZjZIw5qGEEYd`x2PS9>ug@36&46AYFFIzd2q#;;8WqN#1GIkVl)vk3>PhS zv+j$K9t1iPA7$(;OhqTBXO{R-YId1q>drjJ$iG!XD%LX5Ty- z+vbE#ij!XSUdj);EzF!AJMg(ts>Q9Q(JyWOZjx3qB4w+b)Mu28q_cHC9gTY3Y(*QH zN_|NG_hc{N@@iYwLn&V!p1pR|M$Ejo73{nSj9s9$d5&f{qCIU8Qbdh$%3qVEYnRIl z4&r|8`=XRFk~`S9{>L77kL75Ki0kw5#{qwqY_rLIZ{SPuhCa_jmo*#J_L?l}N}Dnj zV)u0-oG~8Sl%O>-LJ6VXg2R?9}U!kf$EmljQM{W$gS45ULk9{jCs` z?wE4s549`O2rKfC+T$>4bmMkJkFo&s@ej>j<aV&UmV|XZoQGjt}chp^xwa%LUwhaC0(g5qN3byo8!REB$pl7TN zExISUucBNmUl~(MsR`*=?~0-ybPS01^;ZBlAx_=c~$p*7Naq_P}ysy@4mt!bGgFt zkNadxO2HV@lXOPiMAUes=vHKQs?HM|N2LTHo0ju0j&PEIi3cvMmBdngQ8VY0;P~mY zY;E6x0xrws5TOG54^CSryCZy9^HfXk;}_ns zKWJZ%J$|sMV;xXE?KQjUUg#qJg9W8<9_^-Ck0Y-82k@i+foAT_vYlS>kU zr}TK1kJCOg4vJ@!h4+It>k3H{p*@UTnGCXCt{PzDFKVdpz>|L~%h`!e;>!^XbnjHZ zLyX3HiW06^)y8=h*^!a-G$ZhV;=Zf}n_9_%M$bZAPJY5Ul*=)Hj)iS$&z-T^WoPb> zh4afLc@2{hpQ<9{bo46Yw0s&iywZ1zYYTRwsM+FidnP5JCbL=B%pQEwwe3-U(j@4b 
zp=&Mbx}l3`PFWk9K^yRl#uuk3slKJvZ8I4vaYF3rOhi2avrFT0smvt=^00BkPOYi6 z6~NaZpi}DOk~2+>vMVA7r;LJSi{+ain#cJnsBSjC;B)fj6m&hg#?|EX*zPJX|J5OR zho>K$6h_U}pZsE&H%i~#roJZGQF=ieeqeAX>xJdAlBD~0)^ZU;s56G%K;_mQox6cq z<$@CSw;Md@SCp`(_Z`p&`f&j513W6_Pk3Pu$kY!?AOVt#&`(4@O1=ogLpgp)sTbrdWW>`kPQ!1B zTuzLVS@&8VyuUr|uG+lzm#0q1Arp1#Ye|T)kWp~j*Ev;5ry1iH*ecqVaAHmVGN%TA zpij9&?y$N&R-`!^@qTp;FNHc5#C#vk6pf!mkk^LUpPC@cEoFE-9r2K+~|J?h?V=qk{qgCOy=$@9O5Z zuS~#VNRKmLE=ZM?qoiTf%ABd)VihB(z!|f{japcf2R{ipNCJP(wQ_!1qS>5uQ*|-w z$YoShX*O6r~JEYLv=B>)n&Gtcd>@nDwQ&Zs7bg6W)#7tr=#)b)r5Vv8=(nb5kus+V-i6d1@KxQpQv$N8R$>tO&sxoi?RqGun{7 zzIOrwH&$u_J>A@;FezvansNzvURQ`iP~DN&9chDBgG#P>#>Q}c{o(7OO8MEKjfv`l z9)o}!vXZ!Fkq#%euguvycllxCIJmcv3}&Y7f1e%vZZnA&LRA+Gp*^y0LThS@WwEQ= zPa;SUrgJ)tQVKV=y)*wNZiI`mOcP{rjxhw6^X8Erm`Ep_1fhVaPkXwDED`eQ+LfZL zZ>Z1yomK-=KmAKCXh+}I)e#eBn1)%5!gZOIX%*Lca`&&f6vbwVT-G0w_}Ie;UKpn| zG&J(=&2z;Nn`U^>Ye#u*f)fFyolo`=%f%)ih96*c7xJ;1hY{R6s|XH#UGDM^(*htz ztL+XW@JGkku};NhrOnb~w)z{5d@!C>>C)t#J)vRHQI$lNkuW0z=nvU&B(bFVVQrY7 z$O`w-5)MYxu9yf@G%--hI{saX8wjH!t=jO|tLxL_r^>EF;zR$~L|Hb*6bgiEsItcO zgjYaxR}Kf>CVjaT5V#oCQwOfqH_@?eDK2`iyQs1M==@c*HzZ0}$HeWS@ms17bgYr+ zZ>N#-(oWh3+GBaMsA8eDQ%CEC3LvR_>)1*--JZ4iP4#t-Py1^w-KG;LoqHIvsXi~q z&5(ODE=PAaheIEw@B4F2P9hxLb$2B!+IfpFhf#yZQd0KQ($8_wUtkGa1Rix(af0K)myn9lr zMt>9UHFm$Gz+gA*v}zOkcT*ya9`=;w{q1d^p+2gJ9izC#u(oekY0uqc$!Cx8n8O?z zwonQ;d6;WXik5D{wO$XQoyD@K!xa2>-rR`IFQzLj?RPUwckX|l6Z~l)TfS6#R+lSU z9mRQc9Lw3LAd#NA5i0sLK*GaXACc~qV<4NS^v*Pa?c>w~^v)l4a%7=)*oN?$CCAaE z%$SEWOLpV62gZ~mq6~SrQ0pip|IWGVtKv&5+@rkB`(WDNkjT2y?o@Q7BGu0~60Q3Q z`}eGQ{*dJY90#4vtTI2*={jE*fqi(TO^&KtQ|3-QdR=Xhk)5E&YH*GJbi!VH>ePEx z<9w0?i_x+wDR<{(e_uYkuS91?G&#_hC8q~8wxOxYJVa0NkK9m#y5Bx38%@_cOlb^s z!kq3*jabV(TjcT0*UuXIefYD^OJXuH@a+tmqNU;#yCBi>*C#TViYv1G!q|PMvagWO z2K*z3I=|1=-$?nSh}8f;YlfBG@jvbJmL!xk>tU(D37sUUQ~MeeZO*^2wY}7Ta<eVzNe*h%sWVE1B5|B^vFQ6_Y8 zQG-^ZM97fcO(OwmC7dLBUq5?4=4s8uKuLj7DArxl>*Usht1&!}oERpe3N9}^V9(Af zv6GVfG{T6D2m<5dz8y7Eyq^X`ZW`#{+fH;{k$ZO8dS=4l4>dC9xhm?nsZ1`xL1A>S 
zfL=>x?BfHOD0@C#Xp&Pq>sYjKY^fb@xw$NcN7E@2Y&`UEjSk>j4urX9nMCeYMmzUj z(=j;ekjsFosDt&(DX%Jub^0fR+_X2@csjhh??K9;j^>(b*GC$LT`sJ>zl<%)BRKeN zOZV;BHeP`7lwe@ZbCX!n6aO)WoDDNyZC1VE5N6eN-_rxLqM)qun+()rV5Mb~?oz`m z>C7#eb2jn|k-wJ07zhG)OK{;`YyA9C&XXReYiM=RBmvPC?i;UQY%`@+*I>xz4=6(yfStBmIwATDEmKrwu{h{y#dZ12Z9gf%K~$>K>Pk?H3b@ z%yRu+b>Y|+jizgRI2%cS1*He>#7(SP#B5r&RM;7g8uN1%Zm=&(9;MdiPoa_a)7mJp zHTf7pGnK?wfdgw^Trss3e;Lic!!ZU_H+Twv*nTnykP(=RCN^a2&(`bDRt)dDK-F@E z_rAmkni%{lG7R6tsu`e%M@v>!E#gse%ls|;(8P*d_Z%BL&zPmtmWbIBNpk%o%u z?P0N5{`KMRImh$!qPYp7hdS~ZeHzrkLjwPyIkD+DYaGz6Sl6If4|nBnBo)(MWhwe) zEUv$jUm*EKThl6qTYbKS?MXJ(%e*DGeV>V}-}Oz*1BI@1zD4<7p8;~R@vmwPl z+l|+RJs9uv4u)?{k&UJ*j?_n}U;D517P{}@m@6*Ognh^xw)&mf?2}A6QJViLvFT|X*A4l65>>OLXGl%LTeup4ewfj+r&hH;C4?xrtvd_$?p$NV#8 z*!)bMGBS@bbjHP?aNly$&NNDn03+n44Ba|OzIhzbgM3Peb{h^_=ml7ZTRm3*0#L-+ z2+alF{V zZ|^Th4lqq|ZkFas%rAM;$r>=thOPZNQz3CdoV@#H&lzn4xX(R4JK=_TYYH%QTJsG? zf?oeE)>>MFC{E#2|_W2{V7TiMlx%rpGjXx1gvF+a|}h& z^sx|NbnlWbyz{*PN<==UcO-{DKPEjGZtTe^9&v)0w1~hU#SNd3J-zX+5A(ykPDdbGW54u-u@#k zKWw{Z>9M#dA9!BjyHck~Oye1oXq~eX3P|DRG&^tIxoZB!59>LFSHqP}@YGIckMy^k z_%$sab;**$2v+7^N!1dfe1e>P1N!N|c^@5ro?qYe37AH2`K1wK3&FhKG7Q;z4;{nh z5Sq!J6CT4yFW1*It)~-A^7a*bPQ7#;v0uJ>T0#>S3k&!AB<{HXK}!G5NxqFxh^^(3 zRv<19uS&ddZHCeg(`25;^IPp- zpO*q;aqZxjsfPF;j+P|s8y5)eAms5b5|QzES1s1FPoC_Pp#?GL@I`CJd6L+* z@g(;&!(|g}^2?EwH^T(2$eC0NKFV3r0#;&p}h)KM8QueAp zrhKQvv|F~l=8+(UbMNSIm%45T>z>(Q&!TqIcU{h0G~fgRJJIq=Pb4y>lOHwBV7|I3v^HBRv8Zr1J%K2v?dKHcxLcU)iJL^C(L0z6BfeAb z4IN|5>#;}2e~>*i1_Q|gh@LCU`TOLZW@Lb6K_PtCxT%oqsVHK@M6OOmfG`Qq(3<|j zI3n<3(EeVJpM~_WEy+wbz=Hcx2D`Rw2}(`5#qjwjE9KY1d&(|rq)}XcK|yUdMVGgWO?Rev*4{8U6QcfKu)!by7E%dc^IgYg$T-G_ znk1-Bg8JAs?VFmPVgr^b&S!gABt3+{;WlBaO2yu#{P1a93Q1nvd1rM+4710SGTV$E zm51S~&vvtY0<4x|oErb{tG&>*j*yfMS;1zpQzpL)7(c5Y0r>1OKq2SkJO3Hrv?2HN z(DaJ)Y+`K!AQ7tmHVbeZbOOPhjtRUq%!#MR`##0qh)%GQFNG3N>D{L7G=Gm^i?xmK zcfx6xnBkX)N+O=(BsysV&?6+5m8LIru*=LxqkgA~ZO-Z0GF!Ar95I5@v@JieF0+r> z`gm@R-}CGZ+gYhFzUWX$6ZBD6a`0=Th+t1|b?}S}5E)d}#ey#E6Pc&xMhLYzPZf~Y 
z(;QbX9G<_<3~=IJu*m(O4+|$4t0Y1phHn6rGPSzIjpyLJgrEh*Y8@&Yon~V`asi0Iiotm)C$cW8CTQ?sM@xy+lXJ9wTBvXgm15&~ zIhE|}sEui2;>U=O@vO9lv5B_f&P@(rvuA*Sm-p7*%ErcS(1J!2qC}{&Cue3$FKYfn z$|d|xP|&h$P|{9FLV1Af$gs0fg#fC3W?d{g>#l;VhwSJ@-8bx!K=uQ=`(t6S5+L7y ze4tjUzKcNe@{JwJ+V(73kf2R)A|Ohjn@vJ*U&RS zLOx#NynU zU8!XJ=sOOm;8as<5!AiExZ(B7Z*0Rp1OqZWm_$Es&>MZWj!Rg^I0YfW{+chy>%r|a zLZ*Luubr2GF%3c;E&~+spmRme_!<#r1_@iM0M`QB0=e0J?994`sVVqIBmF3n_&ks* z)_{+rNhMVgW|FbEm}Y5aaH9K9sjXn`&mf{Gxk`Vf$x9hqoi;{7qS61wb>8<@ zd|R8(2chOI-s`kEls2g?C=r9aM?Z~{gP1#^pe^I0Lu|sG%JC0#h&G_s93mwhMOV8$v+gZr0Is_%w0Gx^!v!$`l~da!CVdUXH>6@u@}y=jp{+FW zxq78B3vflsExwh%h-s^pCvCrhb4$t*6Tk|cj>7lKNr3tQDj5N|{90Su`16rWXJgd2 z?)q`8e>R5^)iN8q9*r4*tcG4+{I9|PH=Yk*sX+%y(%I@JK|OM8GZH*`hl+#QvW$Lkek zW((PNT7WOnM zcZj7V1du8JkFFWcXY5WS%2z!+ORfEZTpG`Hqt>!16Ohn!WW5Hg{Qtv41AtNf^DKf` zZ%RmqXLD6Ix~8VhLSfqOj6Q1FRS3(N9(e;Ken3hA5PCgJ8lC_;iR)oyYUX=xfGnxu zZ&ZtCpCz-o$z|c>Kwr8rf)2dyj0njTlZ4>!MfK#QtyHiL`;wg+g%tQk#tm7YIi>=7 zhQa@A%R9{u{Zsmt?~vKuYdCs9gz=+?u_hOjR)O*pEAy&HcLQ)mNLH`|V4)*$F{=jZ zmIt_n!Lwxm0XX|8(#7d2K>lMk0Zp(@_=|VKFdu9(d;~m6h!5X>)NaGQw_dSYJ93h- zDw@x}_ubI&?a2vjpJ^)kI~O$*JaivPjQdE8d(9E~LbZ7`7QegrRcFlJ`T;Wu&OVsD<5KVYxhYQ%el!%qRl{Cn;1 zTsz8?>R~fey)qPW~=riSk9L* z(@p?o3{%IG(6(-%9ywh%S6(N_UBtPq+%PZ(XlLQ6#mX%Mp$x~#7Ax_<<21-40DMdB z#;LS|SE-_i!e<<1KPR^uNgu^`yE0s+VJ-fZV}RQXS;!7wAFi}sx!e6%MEHDWn>rvtPcgbw7lu{(FjBH}{0=(?P>t6wXrC@s5m{c$qO&EMNgk_G zC{0>pAMT4d2;;r9zqXt3bZV8d^15Ss-JP5pi$#3O!)+4pU3kP20Y$I1cj_~|-P;?Bvux7Oo*NFCdndrcwF zxjg;EN{sI|60U%}8--!Ax^*T>w93$f)>)kC*;>3fRN4Z|8f#Am5d}F8$0Dk$fO6tv z${JwysXI?ZPy)fIO~68L(a=Sd5D*y%QU*eqfFjMIL{X~L1PBPB6WR=@^pMbt z7!^nep(MZrfqS?!YrXGU@0a&}J|F&P{nt5r?fvZM`DqfJte|9Y>2kT?o>R|MSKIlw zTN-`6fZthFZ_)62L^zg<|FCqeERV#HyFF7#ODK@t`w2UD54%4heL~^LOua~4+}m?$ zcJ;+Ob&liyu$FFkgc;eTu|ShAs@VoqW6(D>o*$NDUJ|jJ1f2q~3ppXA4-`u4=Jf5} z?omZ_I|nel=GMCgjx9o$Vxt&sSbxM@dRT{D1f_?n^Lc9rugmIeB0S4Ef|-|T9vocu ztIHbQKvU^6W;<^KgqCPVc`JI#zjEL+9y7bf!)~@Shy%aB3&k8aUmdaCW6R#Gj#F@Q zdL`B5U!go$GhK?^^Og85PjT)M(&?v78zF)dEaYa%Bl}5RiI4OmFG3iG{bKgX2vv~C 
zS*(XW)GN`pZk`kp$87nm4f1(!*`pJBVU+-C!dSDwz+mB4#Zga`9Mkro-JTERcROJK z3QF=KNwUeG4WIGqk8G7+H~5;noAte8r90J<@vofCL2C_Rg}Gmw;Vt;4pHZqfq*-!^ zW^t6I=MR+DGjz-axO&CL`Y8+8?bMW%xwX z2c2=wkT{2V5|EnwmpBmH7{96Eje0qv1@&X$~W2;g;5gi_S@7ji0dj1ASz%} zUd~zxH%c9;{Vitx!#B#1Sp>kQK(B7eYJ@z!us8(w)tiIJ;2?z(xQExl*YnOhQ;5YQ?vAg;8Zi zAkV>50r?TE1S2QcV2_iLkjIWyzqIRf9*BE$uP$x*MJ9PWxxPLF_C z56*~lABPl{XCm50ME7JHcAm-E?YzY|Ew;=HB=GloE=S!ZPnj@+yle~+LNUA?S)n1? z#7S^I)3mVj8(8C0riA4*G!JAX6fGgGbhIJNB!G**Z6*|>{oXOwujdb;jwZtQq1GEX z=1I!f@d5VzAcd9jJ3eywZ}F0;cCPM-x&1Pv6iv3e{A>@<3gIS0B4)MyB2FpN9)=(I z)L{d8u8_m}EY{4|4yr#qmH(#F&E*7`b>J;y2uCtAy@H!LBUGw$^T7V5oOtCr`d+B_ zht!T!qJ|}KGZ#dYDQCu30nQ1+DWsz`Ym!Yo&*n^xEen`CZL9h_J1yt(rXf&~pN$Ma zpgTv8gUXexAW;?Tg51Hm#%b&WC(j;(myhgXJ@v!qly|;V^mZB`J^<*Q-gqNJ%pU_G z9k$w=%84$`gmKZ7vMm*!(H7u>hLWa|MiBy5!eP8!N5hI1#h%E{*mxDZl;xy&EB5hO z^Rlv)3-uKmFp{aOi_>xI!Y7`5-XX_{{mp1{@LZ z`hx1q?cBV6x%3Hber07>a`)DL}yI=F21f@WR3C4DniyU6Q`Fhcn~M z@#!+UiJA~t%)V+2{>4J`pd7C2uUWWU=}A4lC)MS5KD@RN3H3veKN3|oj13Vk zIGSvMyVr)TxcR!>g8S((E8$b%-|er=hQ8AZifv2}Qib+F?1Et5;o;tMak0i zZ|=OI(N)Dh=lHxueP(&KTm)aNc)L*;m~UW*kEV!zPOEoxA|JY(=7RVS>)pg;T^3o{1v__^0S|wYghZ+7u zt;QOwYs_&XWv+@==?2?v{QP{Q##pVcTh!aRtGNVf+`%Vg;5_8ghAAqa8Ws}Phj=$; zh};3iX2};6pof=nC^1rT0d2O zaz72#&+qE3^_2o-O$m$DB6ns&(cX7(d1VUxd<6av-;(p$M=&OKx|-P< z@*~lcm4OW7zYBhq;^m;tc-e3pzglpHaw4Is0f;DwtRb{tkJM020N>9S87~t}5z*L!BueTyq>hT|GrJVTb&~6eD{#Oi<0xwM%>TAPJh*l% z+{1O}SlcWXB8xuC4Y6NDNF5`4mj=6e3UX(e_X-3WMglSpnPn;1lNXy? zSBV1SvS-6D+UL>hFSh{^4zW51m8jxN*-ydSVDq4F>A~*;rJY!Yec!xr`Hn6O{mYT} zF7aFFvYMZN9DLSopI4xG-G$UG3c^4ZYy8&-x!}10GZGP;JKTNlFUBaB6+svFs|)I5 zek{~b9pSYM2z7V1n1?@b+h|0~HcTHt%|tt^<7Ta&BJ! 
z&MkyD%eEMdNj2r0Ul$(aIsE0}M%#}7C?~1#$P0NlIA;6(^-T6jS7Nn1MLf->H$LW= zM6q_w;3%`;hlf(KDIzt>O4jvd*vJ}nZbiyNVJbcpg?$aRU9i-ke`IYlHcE1ps|b3@ zdzD!j9(#_ZPbV+r^nYa>nM^{J$APDby&FcL9}k734PSJtdG}riAr;s0{+6x``MWpL zfwwT$j+Rd=Htj#m){!D?c-rKZfUFlB#uAe&$^utoc6&eciK(gZN59aZLJR%o+Vad5 z-ZVuLpcF_oMbET0_Em1WThvIR;!cz6KzKP|)O9zsPHKAK`Bx{yY*{1cs*I1F!^=tO z{`CT*y;eke{M`^BCpTF&J4g9dM@+ zjs<9h#Dt7SDtxI2qso{35^YF#-kQKhtk4>Fzw5VQr}x^%(8e7IuAQAXoWDK1y44oB zX1*=yl|48Y8NzO`3rl{OEj4sr59_(4wO!51>?>tu#^__nR53GPG`#9D_bVCh9h_^Q zdL_)a=XIF+m7K2%zoFXbUaSM_26hQ(*tE6sD_ye+Y6~=bDtQJA2hcP32Fghzfy@VT zg6j2&KawK^K$>P;7XjJ4zkp6YG1EMkqPmO`7u;Kz;sxWS_DC*wPc$9=ljd+^Cd&Q{ zDRP_;ieURq4?4R%N%%#}7*h2h&&K0X3y&smB7B4Xlt^k8jEw5LKeTlI3>v^VL(L{l`BAgmEsJj?|AH^x zrd}6_M+(&D5@1#uT|JzVjmps<V;QIyPci)Py{u>O(?4{cxjzUx&#$+%Xz)BVg2sTlnM1sQ(2hpF?p83`;IoxK;O ztI)u(Mw+fUjTAH&Kln*4Vc5JsDZJp;nYKoN4&bidf*3&)RYciZTR$Cje{fI<1DOZ3 z4Q|fQJGPmCWdha1QKY)3TadCO%sx2B(`$jt?rB%iU-% zAcqeDN;Nyg&j0YGOIp|bINp=aRz$&_Ka1E^XLxux`v)tzWL?Yf;BP44Y0qV<@?rE% ztLCv0J~jZEy)&(9#S-pK)nPmLpJl&h6aq`pU8z?(Zys8qZ6;e!`R#l`HFpRP zAm`$%7ksF-k!rY&6GgCPlvejmA_J!HS6|}yT9Yj*Rp>`i^cy&ik^rfzeSj?8x3VDh zKwbs2_`e_AdhNlyn+~?{bUGXtZ{}-2%70;fh7gmy&m{fWy9qLki|Z2K_{CXsQKeL~ zLZ8*4U5_%m=Er0y(*86q#8 zLMQHSQu@6%%ce3-ZbvPhIymEkMRsT%S)(se(0v7{(Fp}7oLr$$i3^(Y|9t8H1z@zg bZx6dwj(Ge|H_ZZgUC>Pv%j=a!?lJ!c<|gv( literal 19755 zcmeIaXH=70*Df5(mW>7WmTf^PHb83JpwfxTMx-k!NDT@Rkt#hv2q=mQQlvy`M4CvI z&>;zmN(m%LC{hBrA)y2kAS5CEi_iOh?-*x{cbq@xd7kr~@tq$T_qa2Xd);fUx#nE+ zTGzFnU$-?su=m(r003~n;@V&K0KhH|0I;+5*FEBIfESG)iZ44t?ai+MYVmT*;+x$* zmu)Ts03Xu!@$dX1zLyNX<{SzDNPYhK*^#X-bsPY&?y~sn^35oZHPV6laHq?Pd`3Dw zF>$xdrrA*XuV*eDs;s=ku{T$((>(lnNAmsANnhDyoiX3=OE(I}#tcd#?wkKbJ#o1F z*B8vDH>XbQ%xg)oKbevC+ld9kGwJDxiRqneIL(9LPKaZ1Iv1xGrViAfU zbn_97d&I2mk`9#JA$~jFjA=VoeC;w(S=a>tTsoZ?E55s*m@X~8y!>rb{F^twUojEi zoWIg50RY5b{Z-t3z|gNJ6~&ihC;ylK^f^JIC9L}r`TcR)Wb`!2c-MEd72X(1eQ3$v z5Ab9r#=f4bQS5~90k^Xk;Y8DinPlm>BLIM0O=7HPfUQAX$1{E5_gy7KJCg{b&g#rG z5G5T8>rn{x+f}qxHq+n438BP1hi6TeB z+}UaX07~q~=Dl8c{mSE$IeSx%xU-(oChpqb3I36*5PJO3I$^ 
zsS+&_NmT&Az^R2@{_)Z<7XBNF4GTAlW%>4`J?3>{7W)is_&_WrHSkwJd`X7*g-!;x zdsR4dLENW09mNxPkRWkm_U6G(=Hr%>q#Xb~1No?6E8YD!59VA{I!Df1JpWv{-UL3T z>y;YT7_kghQS{d?jE|ghXQ$0Xact(R`?cE@wl{ICKTFlR`D_k#~yHTG@~fh02Jc-KO}_>vpF3Y zZH%%oQn>Q9o1)ob7P}xjmA#j^(2xM<PTGaN2d? zP^z&+K4npB7if#!!c$%XVxy1MwlQ%fPQs5BU%O$lJyy-lRn_zPwAP?48PR+u37%T; zU*SkEQ<`^4@~w&jJN;p>MUszXMEE(Y>z3pl!s3?YP|J`$fY&7l??EzeMPBO ziY7(Py|Wb_TlE{kvrl|zVTp+TEa;g5Wl!_3#l}oAz`xmq03!#HX<2d66cW>B& zvGaIUdVp@fHX*u~t10g2IC^Qk#Z3dLxngxtTaekDcEpHN{dks*v5R}iKLU6Ye#K<< zw4{l?PVV&_olGnfp_@V&p=kIl2Wu89(=D!1t={oiR)nKi6*>?JobS~;EkZ`9x1?0u z{B2=>6#j~joQOM!l!L8)?dYjfxrU7XRQ%%uaEJD+!y^w2hRMXj`o4DDS4LwhvI4U{a zd<-(?>pcw;t{q89N-ih_(rdEE`%6rcXAwFZvXPjI*hmc9Sdn*yow~d^Q}7XZkRl%N z97yKcS)_O>9Q&11pO%FpNQU`F^B3VUylrN|4%>I{O}WWH!x*D!gw7X9M<|>V>n$g{ zSgq6N`@ju9Q(*zkb3y>J)18Y{F5&mKi&mEx)9_h1b6xgt`-$I~YjgLYIEf&W+Y{+zqnpkS zu|4yOGY2$het2Bjv=a$OlQfcg+i`x7W5e2Bw|p4MDJXwKat8yQynCa%PWz=K!x?Lq zGoiE?=#;)-JTbEaka}1=a5@6NLccmum+dv!=gK4rRAM0lyCIX5*p4wsr$jGfD%?AX zMuP6~nF~=Yy*q0!2-6eB`PG5L9(FOBL^l^z4fW!>o7UcLlFv--mv3PF?3K)O+-tK5 z6N~)9FqYAd=7Ld=Gv4=Nm&f1ql6ROM?8F(^dU~8WQL%Yu{DWN{t22&AGAsnxwx7A9 zJ0r}PQjQZDZBAgfxAD26M6SK;#&Cr{I7!iobguK|PC%n&ZxZaxoj@0zrQJirBL*jZ za-}fCoT>9l8a<`;avZSlp>t=5j*o87=+^vwXVQpJLaP)81%21=HYpW~t}FF;Y-gs; zj2(D+6%1$DnqAYBKa@jeKhH0(LqTGH(AL)O%iCu(Tr2y01}hLAmkRW9KHS!L;*bvH zxMqzb0FbKu)79Crv>FRl zx*nb0AQzIyks|b~!@ra@C~NM0s^+({j=h!?gNx!riM&0xN7U$KWSr89)xM&Q*TkOg zjb>@mq&R8F87IcxAAqsxH+NgFoY5K_4?PeX-Ah>Ggy5PsUq`U8w2jyn%34?8sP-(A z&r-xS#@$Y`bCSooNBk%j?D9Tl1>U2tj`0uOx;fd&?NBtHk>cWtr-ebpvR|$RqMfqF z4lK(Q(?WLw=I)+>}RID}pSsy*Hm`5JI?50rF>KL6l$}pfl zJX5!bpWw82e{_4SD7hpl$WRvG3Tp5II@R@ra3Z@UE$H-ZLJTZ2-o}Yt4UU~s>Sp65 z0QU>irC;ik6W8udKMV+0Ht?ZQ<3&~U&WLIKnMNtPOQY)G#^R00owsX05&|TW)$71U z&#>ZbZWJZj-pxk{Xgxjh-`BVyiDg_iX zg%2l%n@6@8Pip_bnBYKmGIU>}t8PWn=tpfhX=VQIy@-Xl!e`IB@kH<>gP@#JIU38ZVu+SgHCde{K# zb!bFnPXgMpwyC!b2`dFJ^2SXNn@Jo^$7oB1iA zHU?-sN-v+-R*Z{Uycn!B^i^A)_61SpDQpc*iWwqoX|)d^8QXQsPgz&h8dq?3A9D{bWVGuvIf=ei^mLoyL&j{dJfz)O 
zn_lGTRNSYV`uRv5=5iStRCI8i`WtBBC1N%#xai>bVawt~N@AV7 zOmz$TvJJC$IX63l1rU$1MGYoN!v2SY31XwVgZ}u45!8q86idOIx?I=_szN9W%-@#n z0CZ)XSm-thSwyqb@=vQ&Q}#o4`szEl87Lm#yKN>P#zO+xW_D>jR44g`tCYUZjG`Up zv~e>yZgA_NQi{@>Y+uVEksYuz%BXgFMw9rO1MLQrqOMsbLeBi*V?^#a7|f98)NWoo zXLDT>7t-6wW`oH!lAySGThEsegK71K0cv;wa}-Q*zRK8|NT1@59x{O#_C^(Pz7u=u z>P*#?`^!#L0xAiAADNI?v8-wDz5u19&Mb(w=wM?LD~~JIXI9U2mbGkIj6YCW zqM+L1+7eN{QQDYkZLNgLloncjZO#UNV5(Mj_FfIBs56ItO@?Fe+HUVM2eY*`i$tch z4A^1n&g%eAqUplAK&;8K=mrH_AWz5Y)dLUuB4 zejlM%f{uqTK3?_@dKLD`iBcTsyaO=QATCpvpC(v-3M1w7GqO-68NYEib>r?D?Oz4? zYBtcUWW_7se4YPUMasLhe6n-vgt%Zl_ZOXe?xQlZR=V*rohvRQ^)vr(8ToIn_WxI^ zT)p+VoWP^++`_xvUDG-`e9o;3B@~XjL)VU&BRtmYzkQIt5!4@sD2Uk{o6n-j0=iZt zIe?CJKX?dp38u*7FEyAW|KvUwEmAw2b^zvjgC`|Dj|7T_oxZ}cVRWY+xcJL!2mkem z$Ny*p35Dt51Kln#W`cw{JdUhc0-R=M(++(X&4iBDl(^D5Bg4ijRZ@zeeYvEndVO0E zx%C^Wxp{Ct^+ugEzW&L$$m1`yj75z#J(c1IVU6o=CC1Sh>=b;=_VW@i#lNjp^U3{G+H~~vpWGrGUJ<=c zFPug1{f|dvrPDoSJcG_g?wDJeZn;o|8++qQkxlkNSa>_mP`+j&vx{;x+oCaGZ2xT$oHL9{PQQR295X7|Z`N ziSItvZ8xE6Mg8$s%MYPBKCYQ9GvQ?Q#auz(Q#g7AF4&$E9M_7v&&CfTLMRtR3x7&1 zo=$Uuvr9TeT3SDfJ+z>upJK6UIAum7a!ftn=McnI-_e-$x;xGJ$PGmhV&2o!c=b;8 zUsC1v>QkWmI7TN%QL1^xmyp01n!q?~Bg=9O(nw>`^lX&floU(S-VVKVXa0_-PN|#4 z_n4nh4tuoy(`ktTaHx3dop}8Q`KD7*ZN(mMv|4Ja?y6vd9-K+n3|Mu8d$7L_K~bdb_iy^ z{am_~AUbLN8}nc+C$j|9vB}z?X1w7GW<&5%QP@_xP@<#w+tM(@miCl_n9Yvs0=L}k z%<7-M1)j;!)`L7;ppUjp>nQArhe~oZG$oMaIW-C2@o6skn=2>^_wg~oXw#@}Q3Rch z6cp-o*ag)BXF^5WL7%w$Fw4Ks|A2Z!vvv)J#5YTowM_@+501wD4UY}!#Kips!${j- z*1DwSoK+n{SSI#Q%Z&srL2c1n|3EgHdWGC6I`$4f`jcpp&b1VSp!4_`I_Ng7ydnG> z=Dt=(aEz>3G=dzvx6sc6(=L?#=M5k23P=r)r)33^ku;#Pfgp zz)z2NzL>a%twSRgfr_X~WFfc(S6 z0Om{SMr~u=S?KJNxM(>~+g;)2T7)7>*1vPRr$8EMV|P27)7KboGuNUiWVzC>$Z5(}$=3&O0A zJjP+M^447R`-(d*ZE~owkLl6q;8wGKlEsQweM2B>Ak@f-yU{1=|I-H@$7nkOj$pM; z7cOp}T?sevMfPISb4(|sxl?XgA+4)bnOU<>xZ5=*teOhQeK%8H|B%rl3YAK(&|Ktx z7qU9A+Tol-%#ER;51|7N%?MbB7~6uniKGsj5&`9Fu`>=_e+_-tS$iO0AE)MQ4+tvAbEV5F^054=X7%qV5@t0LWD-X|qkniiQndYc zgQug3E9$LgC*PWE)!~rGt~z^8Oc3Xxj@A?Lr@AYD?UnwXuCX4geB4X*w-^3$M3dta 
z_fho>j!!&vIqX7vajq;pTLH2PrQxt~Czh~kpRa>jU zrh;F1A=TedwSr5IDOVw;FXu-pUQ@cos$4Z>UUch!=}Q*8?Qg(^vDvfQbX2+EdD^t9 zm(EKVCv{UK7rV;Rh?8~d@3D4@b7HuTfN7}yBC}f4x}daML2%;2rJC1^_LUkH4FV7L z6HeWiI>cp>Zlgw$1y&cgM2Am!I4F$&r)OEp=Euu$X04Fg(|vbyY&bP|-lF98>dOXt zR=BGo;SdK5hl|moMJQUU#Y(?rm6f-Fh*wNrwQACa3Fce;7bnIW2!#=UZ#BHZ)>-E* zLQ}~+TpQnZgcmEsqxcOemQW;Nzr0~jb%Sv~rVz_{S#VQ<4Nq3Ffx``Ry@%-$Vmr#) za7iZi#6n*@PkTlf6o>)N_ibm^DBHxqU3(jSVj=nc)XGcOI4)Z1_}DVJLS;cj_QM$t zd9dTAKeuk{ch;*QeV@+sP~{6vf9oe_VkHq7hw=|p0{0mq`zF#h+l=hhZ^JJhHv@a% zt{zwVOhGMw)VuITHZmgJH*Td391~nZ@Y$S5IX&O6hbkU7{PNcR9fW&&9>{o27yn=G zk9*y)kC{$j^_#OE0b|bXW`)t8K1dJ_G7+v6)rP>QV^aoM_t?&hM1wh;EV?1Hf)$eU z$HmGGt2_41bqzwxYxwOVc4?vQ-q3i*87Har&6dubp3-BVHO4hsG`cro6Gu#<1*^Ed zn?n1Ej89GAxoqEo)s7_IVGmMCc*?8yv%U0fb_Iq-cRmpeyar+3)0g$;sI{}k)_>c?2Q>qoGWUu;7~ zzS{Vkx9{;)O+yQ=HzTTX4N|Ac_t0|oCJ6Qz9TyBm8(BKfAOyfEzh-2+5k71{olaOh z-6JCi(dRk21RM(D4b_l-%U8a0n(P4Ubnfr!6~1oJ-q9$RpQBxCa_`x0rWt!`ReEBU zVVtp1rdVbMG0Meg7p=-L|8%1Od2aX+0(DAjLcwPQ62QuC5co_CI<#ew3j|uk8!V1+ zlS-#IwQt_ZbB4sMmDYHDy!k6)vz5O`N{~uv#ahGBg(GBEqnD6hv>0i)0!cUT&ul=G zdCCITkr-o=vkDX}($$4CI{EiXHLQhsO2n*X-1$+WhwTGWSdZ;uQ<`C9G$hpJ(V_gP zO4`*Ve)ULo*ZFKM!@I)r${Ws=rV@e9cR1GgxDmtPN5g)iyDEtX3Nzpouauu zj}Bj?Ej_e>l<1%J(6gs9xU`W3u0&iODzfnhA^viwH1;dkjnalW?4 za(jy3AjU0sxXieYG$9y?pUtugw-3O>=kMS&>kj#DqK-5Ovex`3oy+P#F)_!ji!a!H zF{fxI7(KC?z??$X?8Ew-XFgCmasxR2)5limMasmR%}M-A2w%%qwW;XlHYla=bX}IB zDsE--^weKP<0t)wKRWa`BZY!0X-;gGcC9~U`ORWN?DCwpnuJqR z!Hu(l=+JO98KI-9>eY~fn;=<-#_8Z1t@ByoRj#Hs)OesoTrTq3Z6FS&F!2>tQ(hTN zvOGoArq)ZX`%bo7LPS(VPjcf}Z)dr$Z^k7s-De^QC(MSbQga6#K3#3@XfXB`+!JX) z9*#kF-gl?{C3=05w{~jEk5z0ix2St^-t{WX{uB|(QH6MM+Y;`^ z*~N zapvVfQpjE1+V&r{Wzt&>gA_)FK=YUT^0f^f7e54hHOLLvsJVh-*YF>4*Et!A%!1{< zn4|@!+pN?6fbMYeDiu`kklX~wC1gw`ew_e|vkD+x+vM({6s*`MHFpZ10Qlg4C$}^~; z$-;-9)T&QKE$Y3M`+l#(`it{IJf_jVFGA0bs-0SJF#c&jk$d|D+36E)D8fN8W}pLI z?#BJ*QxI9tFWAaIm*yWz{tg}f;7g%93H>`hg?kVw`n(`jAC$n(U>Dfh6(64K0RH+N zVfoRw2mj&zgh{0lp6*zA^BJ@yv;ENR8%{aOiPA4muWZ7{$o4URl9-ZcKE*bV)+xkS zMKotX)nNbFa7+aH9huX<)+*JphM(YrgZ?3Lh 
zb>AL8q-W-{?McT9O+aMJ()UlVD5P*+TS{ktdSIGA^$Tmo+ z)Rb_K>LJ#wC$*?vX>LWjRy*VJE-x;att!u%%YR*&8ylDQLX{2_H3P?bz8xtHkE)uP zY$Ush_GG`9Mr+p?)w(`}OfF8?KGjmc1_`ysDcwMbAXBfsNkkIJ%EXW7RywTGa|x9? zPIYaUDqMZWW!@@WaBcc&VC#L88HWlepO+7y;P=`_^jM=8EFxo}(a(xXSC6tm^&P0I znOQr}8U`5K3CweQ+ze6UmYVkj@BCOf7pm@h$*=6f3!U5iy~=)jPl&^-*8oBSuGb!_ z8YgAw)fbt=+!K2OUZ@V=fj4q^9y4B()oF(v7d+^k4r`kUxg=KH=PuPBp+mM;IzO-`2IvzgMNHEG{3d z#cRbtH71d?_K?__IA2Q5R<24q+b&|ubzwC7w<@E)mIu6x3wHYs>jSQx%=s-xaS6H? z?O4;^(!&J=*GYSfKYSXa&{lITgYwa=jiuR?g2Jzk+UuS9Ua_n0rrf~$KS_U7-k>{; z@%yq{!c7}Xg~iB`AJ;#@qX%uw9=Ka%o42I~_EvjL9C>)ApLMg5wb&wheNmKOTl?lV zx!vWjyv}mBm;9$}LjTA|_>2^rYcI8U`y?Wrd@)uK-U>eRCL3~#m1HywXceE3n^g z8BE|hk#H_+Yr&g}Y-ZDC#xQhP7n z=zj^p`;$Uv@gJ$3XoHq=6H{=cvZEUPv`-bn*CHzzIa3^3o!*5Sw+VAn#TpC7_Ukj6 zdV>y{r!}5Y?dh7Jg?feGa5Q23bhvhtqK}s<;OJr8DnnkkF!b(=2C% zU$u+=v;MTZ6#b?sa&;dvHcW_`2$#4}HvdX6RI2^4cKVen6+0440YKk5-JHz!)5l#b zP)=<-WY<<^106vhYEhno8`lvPhz9rbC=Bpq7Y>%}UvAEL}*!}+<7dtPx4Aj5IzC(qIQJ{k?{FUQrM7cc|7hG0k*&_Yy;M0&3&G!$$jf;{M{^WF2*f{wr zAP?Lf&ZTu?R+~1<##f*QPR@uBU|%oHT4iuiyWCymicv9iM+Z_Tzs{%0d(?I~Q*I~* zm+!(1$M`g(%p%sg)zg87E4Q7@hSLi;YCaty0ddiNnoqzYmdz@hiLs}HC!{RZv-qv( zpEauRq){IJ7|jg#Wb2HZH+Cuj`)jfuIkC|4iI6!T14Re)Z-*56?}Rp^vvic!E3Oz% z&AeUV^;A4kE1V11ygaE9ZXOfPuqSJ0mUh+#T}I%NS(}Lm%)e$B=(nbDTkg#)*Uq-Q zmq`d$ucWtz@w9#kcM#Y)!QT4+Ob(S7`lMsP9zf7m=7SptA0@@jX=5xTd}ZzQWsnplkMnwe$`MKz`pjr!_0KAB;C@5a?fJSdX}w>VRN@|HBv10{hU)2Vs=t%uvf9cG6v znC6b%EU*q0IYmA=%)Zi`6jFY+& zr%ehQc)7`6B7)hBwpO})qlIs1zaljfvV6tmX*TRi+rWAK)GE82*zsBY(B+VT48LMT zfD$fb#gcpa+wbg#m2?w&SY65KnRZp-czDl8u_xs ziJM}V=Barg=x9pe}K7`C8?SN5g>3fR$NS12-XV zxY0WEQH8a6;Er30iID9@WaN*5b&sUl0atJ5nkpzIw5VSGTmx-|V*@?{F|<|c@+b=M zhB`YDo3rg*<%u*wG%gDtT>eWJ_swbYCo#jLzs+}LCYw6u4x6}0Ri4GYGj)fhvBEi` z+6ghiVbkvrBriuxi@;zEM{K-_~rjdw8 z*{6aaR(qr#!98^8XBAu5Keyj;Yx9#4vCu2;m23Y+^xp6%F>xS=qMUMW-`#Mt31x8@ zKt4ez*72_^ZIK5;9K$3pFW;;KyD|*Lwa&*wuMXWXL12}ZY^DGQ@2;%p;mi6LW!|H{ zq3=))%SkVmY!Gz0*9%G8AoTj*8+ya}(O+{>#mUp^jlgk-Z0LZIxR^dYn+?f=$|z9C2QGERsvip8HK;Ww}C~R*0kpOK+A{7oM5Aj(%xpF!j3sqwoB4 
zVfd8k4-H1{i|!CWeKohk{&GwHde*L2!!BysuS`R7+JrTEBmI>*9KyEpOC5V#I~NKe zX4sbeWhi%3Zau3(h5Vp@9Mj2KsB<4oh;7bpe_NSl=B$NNa2pjPc<6;Dvu;}w_fB0Y zk=S6f*ixgBR$anvj_!w)TMPm3sPX>lCz+L1zwi0;=JSHtoRC+3ShdJkxg&coQdFnd zE~HZrMiJj@9SV<>+eCWQ0lT?y5q+_vc=VmCCT6Lju$qN$daAk z>@L7^66{(P4U+R7KXUUztUF%;d+XIfVwK)y-6Oyx>%@u7VdxJ}4*wsF+qCsqY<-qE?Brr7 zt}@1NQ<}8#n==*-)hKIR;|SxpGtF6w1E0~Q4nzu(%lOt8DLV7L+VXkGcjLSvZGwR& z^}+pIb12LuNxtmu33_1!wY&kgALc~)lX$bPgI`n8)M?@?sAwqhxC(_+(2n>JDI3$- zQg->llbi0QHm*8oa(u{enqj^YRqw}%?8o{`YM=uB`+bbUxpndew`SCac(98~P>CO_ zxnB;QC+o^jj~z4!jc5ZErGV&_Eey=={00G}9n4x~NGrrbVkJ>ZbK{5h;d&#> z^Mm|xMm}8w_h(CAL0CHbvpKCXN6qJ#rz16xO<^@kqsm})(GWFDo-;b6LHz2J16BKj z8lH`Xdf+jOh7LtimFiZ+&n>Hr+|+?c3W(?;64Jh2pH;$zwK|Pt?~!B(*2*yB{PPpK z{hI?k)M`nJPhrCVE*ejFhr5p+8fG?L2B)fo+x%Sr$~Xsfh9?BcBzv3Avevofg>b00NH>%*H~YXRPDhBLx-(PZy#uplzEf9>35-D3CJStGu260y zhpF4>UsBXuoypzZbDr54(rST=`JJJ#I7YoYW@69USPH6RMkxotCBw@G@T^xdlHs@C zmXp9po_ zSFjr)C%2%ZfBo#HFvXDJBBkB0_tb?tQgg4oiP9J5Q{a2~IvniHhiZzf2y2;AJ59E7 z*Q~ys!o}tdfL-8ZLsYB^%t!w$u08)Sr6CTL%EcMapT)r(j}F9=(M;L46m>>ncpl_@ zm6?l&I&Q9rrwpXO%MXbAyL{Q^43#eLl9t;t;FaTq*3i*pvcTn)MLn1SpD{-1#vlJM zd(FocyaY;0MZWspWPfuu8~#&Ipg`%ZYDp-DmAn(_`YEdNLeO?c82)fo!vwwzSkJ7{S86j#@UdV; zUNv29Wf?j4&4*Hx;x`%bDnyf!!JGz;H;}xAo5wR}+5XZn^SkXRw0DOpku;0Z8C1pj zH2vY>?Rs=L$hbLw%sZwpH#W5u>L;P!UR`w$btpM2joDD(h_cP6=S(qBARQ`oXpsGa{_WfVZ$G=d!_Rl82$o@4FG4n~`r`7c*XA z4qD$be_6zPxfoWrvtFy;{bB-z3mAI$lVi3q+d&ho{0GMgBJ)pH_5HIg^Z$j{`wyie z;F9SKU%ZttC+^EP_|HtcZEhfV1Z_j(=iRhg_pAPUO2&P>{5!2QBh)n(} zLvb9vw<~Zt>L=IrpP<+LKW=1WXemhwx7<>mc=y8B?7}GI|DL|Ldwp^4h*F-KtwD0<4;< zcE_=NI#hzghTQ&1LfHS~4EwKA1pm7X;o0M*pmU5A$Gg8Z_jn2p-!kuj19%DI6z-oh z-g)nQJCC*&e?vi$(;Kn={iQDEp-O93514PpChX9iX=Pxni5jv_P&lRnIDb||Xhm#b zMQpsDvhgMfDOhD~X&l1~KZ5kaWFf}hZG-A`P2`|KeyN8e)I0i!GZu182bM|s00NR@ zB97l?aH6Nq&Ij}*gN=?A#&KD}Ld=5FRCgEVgjiplPtGOCy|gu`j2VXV=gmvT(X}xR ziMx9jje}HL|FDa_t9x#471fcTNt`(%*4S^L3KWwh+ zN7`<#h>2}RxUcou70o1K>NMEb>fIwI2=;J7nkY)Qeh)KR`;*H#)uAjQ)_`#exH(g^mv#KLO|IC0#*f&=2&YAbM 
z9Q*`@TOEjhja`M1G7n^u5qqDV+7Vd3|g`c|EN_B(DkWob0*hyOUzo}fwHJ6CDl%3Y!L<%Hi<{);qxs^%PJExr!k z*V9=8ICx(~5KYW@469smdFw4ZrY+BzRTOQoN`i(-yPj_K$djA21-=QJKoPmP=DPjw zV|2axP7soA*KIiS?SOw@l(^9Dl~?FLU;62+zzpE0^m9_w(b6>&?i5j*We!r$@%o)2 zc6Fo=IqFW?o`ZdoX;(R3TYI)oyFhnN`Dn$dlHMB5*u#sgp1A7c-(M}qP+dDz)JK92 zX$MKfiq;h)cDWvITR8o(ZTB?PyRH06aYXk4QkUEcPW$OXNOJox{SEfMH(S=D#!yey zA=*H4;O^XJxRB72l+YwzQp67;bk@U%3MzJC3Ig28vY!)QpN|2~Ti>>ph~#}yrc(FwVZ5i#^{` zM^8*&tC-dQd!NV9fiU|&Z_LaF)q>O1(=zR7!apyo-9I%In44p5S|0FCN}Aeba#J;v z%!SR4i@nk6V8NFpfb4mI@*E0Cwy*a)R-GedkJqW%;sk(>YtAntxHGlQqOwh?4&HFx{hW` za#Fd?Da617uj>m@0yCHM)uveD*Kp=W{Yro3Zgw=1e?)Yb7o_!z1!$KU?D@X>2(hT4 z9v(Zli`13lYg2<2@x&esXXui;v|@z}(j_1H9g#$(qZl4Z99yWr&Smr`8tBz}IghtM ztRX|CoNIdSWF`~?Od8zp<@W__^{*|jh~>6aENzeDS)L~ed=)GGco77TIr?4aADuY^7)!YOj?s$b zRt9=n&2+riK}fbT17?PwPZ;36gWBEv+!r*q-|vblZHt!wRB@pa>`2-H*7~y4ve$W3 zNl`Y4rjFE^p15}~9v9`BaRLnPztOoCP}OlB@H$_dj#q)=Wtwii(%@rW%OjleSaLfG zb889NC)5?)i;#%Ve~T@c;ot*3FAJtBl;Z13N?Ju+u-K@QUx>6Xn3T9*Y((uFx7;=} z$gfcBaNMiIO1%|LR;XWGg*cP25Qzf@_s7`=1ZQ%{f$$mBieG}vy@RXCu}%h;EO0y-PUPQm-^_57Ca3Pt;BbHGc01U;CR%)*R71E5tSXDzMzsPQ{Ch>=hwdn`XzYN$ zg~K+q5jk;`1onx$v;PPFr;Z!rG^?hXLI8-V<5) zp}!w1uI`B_#uQE)?2J0IzYe8+a>|CJc1C_3!VlLI70xd2Ih=(%ZCUkdJ5BWA_&5@z#!4;p;Q(2)pJp@yOT^w)c}2ThiE#-0Lmd*Cb+@0gQ;yzMi(lh+TyS+pM+m z-wJTYhI2hLa54?Ef=|FOpuh7tS+47^pDqVN&&!L4qQJuXlKKkoO&xBC$pMKN`G>xT z?7>RaWSH#!*;>rx_>NAKT^P08F>c7ScCRnDmYkwSX~e0&w0ztCezErBnpu~!WBlckpUD@#64~XnkT;i@f<6}p6S~e$ z8tY#Cd zYwGOmdJ>$0We3k@OG>gU6IBOEKFcj?Q6Kk3s3~nmjz*q%t(<+7sdOS-B$;uK%M4(8 zY`h2r+}8a$udz=90~}{Xu|V-K+9f}N))L_BA1Bh%VoPHf@dfhoTTjt^5`P92`6!v>H_u{jz6F<;IqYP*1=oM0&DQM=cG?H z4^Ba+J8|W}J1mGJOj!&R{H%R(=T#{FDM- zWa$%=St^5RD6s#t-rxLb*Yy8eY-T*(NR){es6lRXE8|O^{ZAFE;yV4`aq!Z=tIYa$ z9wzm_w@?3&m)rrw%hTi7_qU4JVRiskv5!gU`RFlJyBI3?&(pN>sQ+6I+WM#2=FWMp bXvf(?U#`U2=OD4O0xZmI|Ej&>`RM-vK7}V@ diff --git a/doc/design/mkldnn/image/overview.png b/doc/design/mkldnn/image/overview.png index 84b455c28230703599a2529f014cfbb222138fef..1d81b5a4b5db687c06b92f88648f9895711fdef4 100644 GIT 
binary patch literal 16329 zcmch;Wl&t*yY>meT?35;hY*5$APqF`fg}XCMuS73Ay{ySU_lxQ!9oHAcWB(*U4y$z z)9^OWbI#OE)zq2!*Ss|!y1Le`-o5s{_LASau6u>QQCA?qrN%`;K_O65l+{8(K}|z` zUSeY+uiPfN;UT|JU9}WIC?!L*JIDgMm9&~P3QAcl-km81vW(-TsOyS?LfG;6K~3T$ zq(MQEG*Xh4e&=Cykb##-+v$0vqGtnPJy6zT3}TTAEAXI>!T`{sVY-kdM7BEhVze=L z7(8PGkkW&C$_ANf$<{hh$y%K3IfZA4T~Wa$jKU^owJFI_Wu5a9=ER^L5ND;HP1RV* z;r&NA&17}G)o05B2*mv!=DPb%LqpBY&F$*y=(c%xhYxz|1j1&NRX3+dD^00X!e-os z@`1Z#RS+GLp^4%L*1IV?*jO~Jb%MV#JjYi3 z@F7{h!M#|6P1svQ3@4fxG=a5AEu*Mv&Ze5O!ls_S?FyXlVWXYQ#f`4(Cr%w2QlB0j zU7ae@U7sq}-|Dnl&{UAi3r5~XM)S;jB-VsD`P(;&>AAUGHnp_Bl295h@-+W-C}0x^ zMsHWdjN=<60by%q@oqD$ZkX|s5*2nRk*7I*1#=6M(Y#>AVGR1I0r>PC%cEulv(ea8 zl=bkr3f*XsBtz=T%)S6)5b;I$jueMKUqt;7Ls5&?+_54yW5+fY0k1z!Fk5sy2u)k5 zu9x-%Vp_BKBfbg`u1IZ9+h~_6sok*TBvWa5J+pNWuMrJx>$ozr zwoVu7-vZOn_;6#!23dZM%HUeP~sK2t2T@Z@&mk@K!mo$QjKipXl9-pnQ z#}bJqzvOy#yZtkWxGFS=4Qs0DGR^DeZ$cB}p7)6y%%O{;;MxG$bqR3Ph8jKYPD&t) zuS6VnCpt6!g2mq*30W-QH1bF z3GYznR&KQ}-1H$B55W(s@To!Jo=>~n(P+5gX?*iD8K}JL^*dpiI%xv!1Q=8B@e)$l z{Qb}RmeQQF;+!#@pnLR|t=TV8B~sE-3~QUHIcG@J{jl88d0%`NugtZ#TFLLYF5Z=c zpt+{}hXjkDSo^l+BGmDALkYgW?usTOMO)wuwf`YB#IycUrM{<+!x8vUtRXRa;C?pf z&^9CvAC|haKHNyAj8d%nPEaBF;Asr=ZyDRHHC_3UM^MQb^>Cbe;f!YLuZwT6*A3iE zzA+rHp1-xgTM-`0Jh-gi;})^`(}a65rclXQtPbaTr$v;t?vu`*cWd@gkhQcXQMO)s zBts-;d#f~O9JpTFUp0LTrd*cn>x|_ve~tAG`kd{2Hn-{O(zEPn33ecyg?2GlXFfvj zZRukmrF>yb8pb|NB@+F|_>*R(wmGs_q1zeMjcpCOZr#$>it?iGEqSyzxetk97~hT2 zIG(c$P@+Im*$`D#i$=A&y0@O^Hom{ zn7+xWMwb8G{3y$iVB+`6_kz>-n~x`a_jiutEgx8>=>(znTOizJcXqRK+*^I#-dG!W z{KdH2@%j$VS#q4E;xAp@aLUWNNKU?oHx;_oy9NyG_Y6Q`$zVT0RGk7$<|>^hU%-Z% z(PC_7N?e9e{m`Z$+%n#Y)=a@t!50EdYvOaKOI$VAZgvTms6Gw|4ZoQ>LBtH0Ou;yB zh&51nK;mp5pl7C9&YeZD9X*Fe-9Y{LxPz5?4LEzXm*>^E!JP zJE-c9@59CFw*l+YufsML(6f2mxKX=fH0c!0UnqbCg6s^MUg_<^^w*z=9#4L#sr9N7 zuXO6Ot=ot`%u~t98+4MMUA&3vDaEG{Xs&&+b^xa9$WkdykpifcmfNHd=lSzR)1V1_ zCMYfj4uh(8Ig%@Nt&}iJbc}gj7}U*9Shco?A`&WL2Sk$lXPGfE0jMJTjnqyt-#iH$ 
zy$}~bk!(ydI(cc668;pI8B^fHN;BM>aPv@^Ci)YX(f%1rjZUF!_aQoae(*nzak(~(C#EQ5PD(6`O?quliM1SV%%Z}hxqo5UD}JQeNi}p2R~D;5 zEF&^R0!~sAZXRsIiLyM}wvZ3dVdMPu>DAKSS@x5_XbYEY$!^x}6_})95Z3?i@&eECj{h)_AC$G=renbW#d`z7?C`z6eH}ZGD z2(o;@^Og4tPI}`)beg zwU>3YV9sQBH55)0PC)O?yP}>M>el|{`c2vWBE%=gxf)99GURW@m-g|^p%_YSqbOzV zNm=+4_p^2a?B!LG?}QH3Yd&))`xQW|j#f95dHk!?SD3CZt%(l)iVioO_O%JiB8T&P zCVcNTV3`$8t!QmUbS$mW>%EtL)BPCox^ON^JW3@A!Z!|{fy)fS5?H1ig@Nu(o+q7} zg?2N-?Lj9fmqe>;W-LGxZWfnF%Ag|OQ>830LlJl1BsD-Lzvr`btlGqP$F|zRH>I`1 zU%@(J^rBOh?KCURm(BxcxeKl>rEv`Q_zJ;aoV4h-ilU7`fT~%71j=M<=qeDV8w{m% zV6vt;2$PItQD)AjVo{>x**gw(rL1&5G5VvhDj zDx*(%GMzvlQH0dB(K`+}}AiStAK<)!;0$<^1_FA!R#lO$0vlNe}4QSkC&C-z{`XnxLd zzdJ$0XFcr8D5>j;=)0O<)* zT4M_WlezjY5ZuufmU1eyI1ldx%^<-5-*raj%qx@V7p zovmu9+!WYpdMR3eq+J5`G>Ck>aqDp&iz4O8SVM51W%)Hb;H$^CZdIY(g|qWi{7$}_ znTHD2Jg#<;b0E;$b`x`6*~{CdQB0oU@(;`j*MZ7L5}m*!(z>_1!KL&@;#=u$k%Jzw zKCh4OubQ{cz9Fuz>V9&u2nwn0Nn_nKx{fks#kzaE-T577O0bO3a^(H&W0`Dtr${kc z9LiAF6rH45yfkb4_j$(D)NG5m=ZuxmR+s%#_CwSC^7GYgi!pui$~wVZ++X1zIld)&SJrb_X_ zNk#oOUrud`kP3QX?_>XAt90-3VYvf_>_XENdrX@XDtzoeuAAH=CZ1hN>^I*(Q#6`- z-h0xMJr|Km$8x`V)(j7ZoOWL&h+^(t`fO(-2M2GWw*r+3>^gljqP%H66Y)dMbg?qo{m5Gu(U`39Qtk> z)lY2MD?HnA)f88>mgg`i0uX!OT%Wg#V#%j{|NQOCE%&t1UNml5oSXvv!QP6T@r1P5 z;#uf3>ZBo}DB2L`A=c#UVLHC-zg*u|ef4Pg%vXJx-T{<#e7otI_XW;2>xk(67XGN1 zE!8w{z)7bGE>3AK8_qf@@i0a3?4Z5{JYs}X+H-@is^cs^wJ3gB@2?IA&qjC6Y+5eo zmpaVZI2YG_Fj(zJMTj9u3Bm89K>sEaxMK)g`V%WKb}KzWi0r4Ghx>KB2`QDsKnzA~ z1&I!^0Y$6SRsox%CZFc>d?>C1=h6!$IzY076)Gd?ruOP&&w^Nm+U3n;#ZZK@*p;k4 z)GyMaHmlOPJ>scv-9-zU521dR$B$u>Z^_J*1NGBJXJ&PFtxq1r2z_u=Ze^1{`#0;% zOp5?^ie{>JRAK_98+5UXv1nJ$M5l;YXo8)*{ro21YvOFJ3s-U!&1(=G&P!U)x$9@4 z{Q!gHmUXFa|JFUfq_nC<4maqc1f=BOzThnE22oH!YbEe!Y?TU?j|lR5GM4EBC^R9j zTlaO@90)KKtVOH#Q!GW`FY zaGKDl)+@^}OP@nm49Y}^k1X)!spM_k@EHTJ>84Q36*uSWS6*SwzF;sS$^2zPX)UHb z74GM^4zz8)Qr1-dtl?ZCX;hwUxHeH4qN7_%0niS`9V|^Ka&#qb#1YX@DpOyN$)G*e zsGU?kxt#m}r)75V6A0jI|Go~5UDhNLlAzNVzz`wLp^Fsy8Xxr|8fEZ8Cn8{;!}QZT zj&m;$+O{D0Z>^V`ZPx4y@>+Mj@(j9Yd(7GNMa4GE8hkEsBNNYU4hhiO`wi{CeevV- 
zxTPh2x{HoJ(BSandeEtSu(04ZJ%uDEBOXDDqZaS423@j)XRxRX^Y;YUdqKxFt~`aZ z?I~PMD-#3>Bs%0Pluig~nmVXB5N~3j>QVT}StUvhU9tX{Yemwr^@*fPuapNMb)}ss z8$ai1zHBIOFomh-*;?e{x=&Y7gId6mjlkf3YLpYS<&*YQ#lc6jhX|?fbFxY4qH<%4 zHHajf63IOge9nqm>~eNfKb=?b#bp}L?qVbL8AGo!wa7gWM}KQGaPI(;y481d&7)zjM! zTVwziBRmTTecm)2G5{E^pDY^o(yh$##6B0nhO-6|ewvaJuQpLi(I}In zy;0EGBL-{OPnC_Fd{jL6jbpDo+ zQOrxPN#`gRBxLN88~ zaJ`c!@>#+>8J?~BmSs5_yGB3IhT06CCxqj|*00WEUkB6krz1>66zu*adcBL1=(5{o z*5e+TpWplZ`Lh9JS?M2~aAqtq*qNK3H&jpv-P_%D5l?rPf2W5}Sxa4u_xDL>@q2qf zF$0Ui!d*8Yq-&*ZE0(Jc+NqOQH}<7`;l}J;iWr-0-HH~tGLV?*tC@$VZR?Zig@t`R zeSJfHeJE7eJ`FktVICP9TT4qzGmw!9q!D*N6P0I9qYe=lca`n{GfgtwHQd*V2Zk~X z5@>%+EFYh4-t7F!4h#N??)Am>BpJ5vs;Q40Zt0Q>o(03>1_put923~+4*c@Mo89ar zSSGJ?GIFi-@#O_-XS5f*j=qq@5$mqfn*ALJ)7g4s&y-#0d~D&0I`;s1PbRO5B)kB6 zq0Yc|_tCRs6Tau|s#RuYEV6w2jxkH{%Nif)<;nj)%T1}lmIEn# zz(;M&QI)9tBZw^!jh{qAW-WN6oQQ@|5yr{VAH399D(+Xr2x5-(>`ZYr^6#Ij@s{1@E8ezNR z8KsRTgKC?6y(?u8(ma9H2%#U5VxFgEcr>)kt~V)q;q6{pEC3@;#X zI&xUJoP$Hq1k}!%aZojcjo;74|JYd}<`m!C3pkv}EA2*$zfHL$E5zP4i` zkAEQ;79MSuf+7{g7B!witYn95Cm=5>qk;i36&vb|EQ)JRr-D#82>c}ACj-B#jb(&@ zn7}Zo>`}9noU!hb4Y3(yfqHh`UjJl!Zw(lnnXt^v(yg$J*1#rYhB`I@nO|@r|1wk5 zHhIGrJ+S2%pMc0Io>_QP884XaA`6w6CcbV6FqpFP>C-3KyvOFE`TgO~rCv+m-6gRf z02Z6@Tu`{DEH1ik1hAMwqlN@rp8(9SBSA0A;PK4V@!`=pgh)a8@nQiDdQ3ZnI+_DA%o9JW>I6vi&J{+kJvehr5$l2$hX?_^idTJGx-0n z5JuA9>Brx_=suMRjIB7jgMBe|DJpF~kb7(&~QfI_Bf{~*o}tPhR@od%#{qfPa8#qjQj z22s7e1(JrEy0?0z+?d7qQeKXA9o z)j#*vQ(xSYOy;A{R_mina(;I&-Veah=BP;TFMCC_8)$sXI(R~JtH!6$=9TxZ%n(c; z6LPvL_=>(T7L4j zwP0^_meFS%8u|ViceCGidvGy(xYP+t*gwzv>#96IW*4T?SeoY`R=FC@aAci8$*E?)GX%fA3FfGk#Q| z;rPMJ==8`Gf@}`(uM+I@F1++$VYkvC=3mN7@)&`ObAm&WxNXqntoytu6(h?#?j9zue1kh&l-)Q#7-+6~iPJv%56x{(NfZKZK8 z#+O}W@cEo@Y4UO^qNHP{`%P=LJ$VS6C;SR%a5Rs@PG1{6-0v?+Tq14Ywt<@jKqy+v zzSzW+c}KY9>0+AOlEI0IA*06~h6@>7{TXPn%dRPfs#6E0S7+p zM8BBb$rXcdsg0c|zyu->E9j7ZZo>>Y$YY}d?Bo6DcWl1P5y`1)@3!rqy74{@8sy6t z#f}e`tK$>C{n|_AblWQ6!yUw!O~TrGY!Vh8yaKPGQUz!=cv*CjEYD`R0(eZQ0KF!q zg^^T>m#@pv@eTN<)q4zKw}JEEDHfP2^viX(y1t0lYMI+cskIVp4?8=;btNXj=YBvR 
z!TnR~O{R~jx!Zxvk*sYfrTj}1<{=hhO)@;T4~cj=uVa{V`mdjBPl%9GYmy%Jom@a$ zOnL@iKh5%dk8ki6z~oEc#Tmw$;tzN9Qa|(LyFPFuf~~U&3Dy4bb+ zxoMdvNY~?f!=0xiYRO5Q5Ys^Cq?w8DRS`Or@f&9MBpc@4&QhM7B%ufi+dJKL2eWhL zY?45;x!hfK@aAmF>e|EI$GYLvF1wlM`%?wqZJIRyXkm=DP8 zdHZ{CsErrud_Nn>>0`u=C@bLRF<|$$+cWh_Aptukp!i;DRr@eLvoJGFVC-8R{Q#HK z&Y?ynrH;d+&lulzmF>hY!nhj-e!Bc((~)Re&+iu{wlG6(j4yygsU_;1ZICi&caoOv zn-XWztfq!_FEGUR$$WLm_)5|^2>!=__yv^Tfr)*W5-0lSi_J9C&pgt$=s~a$R7=6s z(bpe`2c;eQ7=!_NaV-tMg&UHM^y-E6ex>sofrlT0Ou~wJ+TJ;|(3|4;4O@|gU(+X8 zqWW8OndW&P^`5TjnVlm&zF-t@3F2V^j=Zpk5CGmG6Fw)yOUJuxKNgvEG;ruBxJh~e zGms(+n7-A4k`Iy@1J325Tr%#2`@*9qO>>Kh)`ha?d5uc6#$w1ycw#WRS=7wO=IoAX zzta77TSSXhCT77)hfk=40g(*yO$lL5z?lJ+3g$>s>c2j}XL1#}EvX zQD)M#Zfj%XUGI*CJj!`Up^yC$qI!dH$Qqvs3JZU>f6gi@=VNmB)tpIY1IdXl#l^*% zNHitBkXKQO)IieXIU12*04mPsxQ-MxAK88koyq^+FhFV^_XoDqB^Ne`+Y5*G5Q2NJ zKGr*=&Din0L!J{GHwQrT1<9Iye62W-6o!V4W3%2BZDgm&#*ait|3{5)7mbW4sKe}X z6C@x)z1*oFTnK9aNctmSCMf)Z?Gd?ttk4DO?4CuP+2m4*5E|;)@fI9L4jWQ+0r_Xp1SrQqk~tk zUNz1z0R286(rMZ}wrmHQ2tq?LK-%2H$@TPaFHpg*O*2~aJPQ?2#vmGHVrL#W!dw5Z zt$r^bM)Oe5vv%R!4laXT+efRRe-mwCW;L{fiKrc<8>c7GEDjF|2BvO@EiI4rLNg2 z#yH$hAc~!`+bqY6ymU;b@ex1esCEQRh?Ayi_9zN{j*edU2HJL!XVGBy zv2Hy($V43II}F5F-F})z!&@vMTxxGR9{>GAYs1v1^l}5b=pa56HMOZQ*PM5<(=NF- zFK?zBHX0RVNv>5g${9k{-YA70gIJHPk1T0iq&tqI!(&bhyqxc)yWRTGkg;iZpdh{> zl_7q_+s>n5?eJ;Zcf%ZJ!1;G^%Laeiy>nXu#2gWj&9>3`)Xz0t|Mva`r2in+!{7UE zc6x#C>h%Gzdf4DfiH&DZp7CD~x2CbeGjZ zhsoV42Wazh|7+HcW5(=4W~azTM8DrfNnW?FN1^S*AiF#E53YZc*)azSwc_p`etFz7a@6n zVCD0y+f1zVxI6jA;+7*lOe$ladGEl>H<<#hVYP4k&K#4o-!WlL)?0!9boWf_V7T0H zhzPh0TXFm$1T}5&R>)wq>mCxB$FwJe+m~*|{v==#bVGND>pEF)95wmN>vk#&%S<13 zF5|$!+`obUaq>E-w$J8f0J_}Bb$bTtN9@7irO>9qiGUMn4~K>=B_4{RdDrRL!*BES z7t0o{=o_O2#us82ft_}1Iudx`&<4V7c6mK1n#1}XZ&>W6Hd8?}kD*i41Fg$ap+fC& zmV1z+O8W)r+eVJbdrhxC`JH5tRELyX4f6)=1#0Yyo}&2K9YFvw6L>&Ee3d{>}m znxoryVXO1|eiPCC9M*o8&b7EP(McMOBciUMTto1g=kC)7F-`!-g2k5KhW111?eaCk z#jz)XiaUyqk8Vzbrlk3V-S>4@icc=#yn@T67|yNmOIsiR?9NgO=2+nH*!|1-V)9$C 
zi%Nwf{YAzZ(Xw03!?n$Wq0kV$CXJ%?E^)xYz;;`e?5q0)zpK6WhCVG5LB7nR#+tEd zi!T3LUh1NP<`K2{u3qEo{Dw{F6;XFENO9a#?b>3Fx%@r^0g;F%|B z*umYf!Rq{Q^H*zec$m}ZKMzHPMi=_oUF$m59SX^vSK4EKVsFSm+sZ3#mLwgikS6c_8YEfv>+oC_UhenRy57a+zqEb@G zv+9bSY@>PKZ~R$acMbeJ<>m%298)SItv$Q9LfYWoyDN)6_lM=~%L=-Rzq)-J;6oMO z|6OO`^$q4E(66IM9>k2vJJWC|dc^g>tJg<~j?ResWXiccI5NciZB~Veb6OI;`>l7W zj*lnU$LQda2usH8KFm9ABuv`>caJ=yQepaapNW(dalDqd*?q~VmQ6|u%6Pu&>5U?z zZC32WWoT65nZn=5Ao+K)cO{;+jLm4OOp5$E=&7j5s@hj8v$*nDWK%pl&>-cy z@60xkpSxi`l>;&Jn+jCF6z$K*$FN%;PAnY|;5gZOar$J{q?*b(rLvZg4;RYf)Jmmr z+eqFKjh#CWOp}Teun;*y(AZ^tn7hArUtMK4Iu)xp+oxjQHHB(BibdrjpCvX(p1dq=Xy+>Ra2*o&Sq-jwd+$))3%e%dB0;%>N^{D+QtWN;BXt!p-@*)Eb$Fybz!-hWV^7vPu)MkBD0dn#}=DLw@##9P`Yuf z!AyVIh#HC-Y?hn{ojDx)x*w(!;1dn!9uLTdRsEjZr+fX7Ah^yMG^=rr3K@0BGW?|$ z^_3fTsepC~eRb_UZZ8=-Wj-$wnF_G!&PnshtU|o8YiJQPueL2!Ge|NHx4h|uf1kL~ zK5Pz~vlz$+{@8fZ)YMc`P*8BxvE!bRk>N1;>%GH9Us6d^)1up_Pqif_+>kB{SeP;% za%p3yFmO+?b#QP;Qc6j2@t_)1cu+0l)nrv=Wo>>w+w{oDN>|LHM9gT|f5KV+ljRr! zSVFHuEa@GaKSs?qh% zd(Fa6*G*J^1()tOV7L62OeJxBCU8*LXbdthLt55@Qh5xoehmLJ>q;8)2F4)_{BWIf z_=D5&>d%|%1`ng1esLA|w2CTNI}awPUtF;|6*vn!;3MpUPII3*#8=i>#-RTir36JG zt$LZBS$6u(Gr2$9C!R)+fA@c^cn@nY&`(dSj9noMSCxL8On+2;cOWW@di4ati&zf4%;@=FGe-TDRX$d=5=(IC5O51WGUEYz^8<)ZrvI^i1)^0Tfa_3(zFRT~}{B=%F=oG|vT@Tv(QXfXAW5{AAt7|CH81Hcz zRXEDNb+o1gKYBgwmpQ{u+W*1aEA|c}*>Cx(*QmjMuV%Oh6FgYIK8(2fIUzOUXeK^& zspSu8#_O+G-uDxY;P2aUL`E|MzuQ4GC|d1!GMDhluC02mtRTZQJ=VwEn1g2OgSuZ? zdgW2akX-}I9`hVoXt`;|zOlqJo2|Smq2l4}bnr(X96UeoRAWn;&ReN>l*mv_L(Jsh z)LB-+;&9~pc!v>+%GQPU*IT`TczgxqeSgo{Xj)TmvPEZE+t2fg`|S$i_6WFd=_pWT z@|?H2X7MbSQfumArxYcvgG5O!Kyt1eGw_3xiOS zb*7w>oZ&l>2fB;d=X;4@$-^_VH&CkrpMV4+|AsfrSQ<3Od6`+v? 
zHcXQ*NRc#qec6Ra$sQsx_)ImQ<&dr zeFp1uee={-tyfcWZo%)8jo~>U-QAL@v>CB$|G>Vs!BP6PRVC4-x<+Hs)9$$qN z^op*Mn@t3orC!GI&W*1gk6V(8s5Fxe3mU)^AO9k>3#w}){Gxs(irI&d4YyJty{^k~ z)^GQRWc&ix#yLa?{-nK+An<^`dwU=fc93ji8?TR>U2$+MKG)BJ5yjp8=}7WsjG~*G z1VjE!mrd5|j;%1TC?c~N>e&(;?2(EH38y~st?H#Ym*e~Cm)y=14zv#$4W;v&Q%&Vn z7fAs}tPDL2JZ<*ftp<|ZAX8?{un9kaJXX;t@#N9txqq<`F4R1%ZGi*>0c=w;QyfzO z6ny{1_hwOTk&>JM4a-bZmMsIM|{m5ZwEQ)sDKye;l zS{nU=`1q#IaMDQc0wKa(5Dw1YyUVclUjx25i*m;_`S_iBnf2v2I%rSmB?o@Moy|7 zKR(<^|)*KZlDF_O2zB#C>Nq$W#NM~ z?2L}}gnsuEOUYjL+0q5ZCQQ@_{}^}$%KwuPgUWG8A!7&FAW>Pp1OEjXa4t|VlP-9} z>3mWlh9yVySn|Jwd%=}-tUIw@MNGKFO~q_b^pYEFrrgebh&Lyg|})fsk? zd5;xSoZhL`b3^+r*kd5LS~Y8GH(7FDXUlbtJa#z+o3EVjm#3-?*6{dO*=9xS5NvgC zyzM!;EJ|V4L@P?{ku@@rlKOj~--j@1a1(~Ju4UgxK*u0i)X*yZ@*m_l^%@8f@E9WBaZNe--IUi0_BS4E;)E94o_kCe*BOqi2tze%BDg7B!D~h&Mpr`7_*XGDmRoRl)-p0WzE-m36$qldNTl!HK z|D`3P>GAP#$d`IUE2{$is*g$fbuM|fj*bvII=YhIzxy+sXL){%F>&Hw)Zwk0seTTT zn&8y$yIoMVoLVE89AM)5g90v&B9laRjHcXYnEH!>0m2S1?7{D#rW$ zqjIZ1kyCa!Jk>u`uh$CoT7#G~XN-;RD2}pp1@f;O)346o5V_59hA}q3Cgs^HKjU0l z_O?VQiXU}Ic?6^Sakw4dH^|z`l9Hy8=2D$2uap4RNt)$K=e&0UzzfV8O=}MV{YHYF zua-2&n@n>aDNUX=VYO|QKHoE(yp0@-ajjGPj*;;}-q1G1sl(LI^JM<)$~dk+a);R4 z%a~9s>t@W3N z&#Q8_3$J+|zCQ@{|*9HPMM-B%!^9^GBP|*Y7qQKc+?*~j2|)^hA#WX z2WRJA-=6>W)s6CQhFtOZG#lE^2GC^cDTR9;@V5jEb;=E$7g z5`03VRj7nLVT(}wadb_4ME1M@#>smB>#wLCbGo@SP5j!sq>c*@2#{&W7bNDOH2(|{ z0%zNQ8>3KoUztJ*f0}7~qQt6gTz1AMAwiN`agxj zd>D6A`x;{y!>`aqXkU*&kmn&5QZ`1Hod+1kjh*$_h2Yl&^JY*fnwhOy#y>123R(OS z<_{+~$$|D;9XUi!B7PT9s6Krs-zC>i0!t;s)llJ82Lr;eC`KzSrCOMj&$%Qll1x&# zg?~$GMq$GKptAlzw0v-6rD7ne#3`$2Sw=JqiSl)nlET;)jnRo!_Q~@aA)ci1vWAM* zm;~(4F#~k4eLWjEHv=m1LLeXvB_gUZt%#W4B`_1I4e1G!CVJrEZY)8GaNipmE3S_(ts6)O2Fm{U#(^ z(<0Taa`raNyo{%Px7y_dJS;4OG!wHP1Yn2A-5uvd?*-XZZ}q_(&fdB!_B3D%XI}l4 z#;m#+f~EX1LE3<}f!0@P^~qnB%E&J%O4^Uo7!2k5H8U9+Wdv0--jL|nR_96$dpaZ# zDr^xCL$mtnH@KX&{f*mrV_uh=Vmqj^h9oW9B0y^UG@DD<1|?gdNTMqcQT2I8X30CO z?C%ea&x>wVw={v^5;Dsz=j49j!pNHMU8^$;GXs6d3_zs(i#Yu|g#+ZlK=_;Dt?P%~ 
zJnkZr3!z5`*BT4!v@k#2)>5rxM5eIbLO4Q=)ZsQk36B=7FJT*kJdDwDQreu4f3R9B z{k*im=aM)}V66)HWC&}?+y-vv=Boad+jrj|@Sg8y!%mc zV#PSm5vXWM1|!k1ZA-z|Od~7OAK6iQGp;0Qxa_;sFBc)18)q+_x(6_<-^RC_LmqSu z`P@g%e)1aF!!wU~a$Q<^`Oc#7h;`KYwJojNXA)Sa{OcH676@%1!S177i)3$#?_&Zr zHweWKE}CoQ2fol)7CHTnMz2i`Yp*i_4Ggr4Dj{qzQ*m(rC1D~A_utQjJrEH3OHXQd zRk4EgEnJ*hLi{0;A<4e=OXY}gMz(!sCh5lDQJXsnIf>dXKz8UF*lA!8k@TrOenNu@ zDY?@s@h1kF{#<7FyApY*ce=81Aeegp#6i4$;mt_{s2yLoE^&MrJF-q)F$%p+{KTac zK~;Es_f9P3OXF#m7m^Llf2z=pyOh$azw$ z5u|zWCP@%9w319BPrwAMIB*L3p1g~pu_#R$&JbQzHp{2NWH^KpW&0={umwAa3$WPm zJ*}hvrLulKnA7`mMooGSn{Vej`BlDio2a@3A1RS3OYaTXLA(RK6{izv*5`QK69m>z zUmc+3FhFI&CuL?i!*%m&vETZT!r^w(V2NcJQbNG?$4hExQ%0Mc3`w;-Hjd>!rR3vC z52i2+**RZ_BhK@5Zhc{qp;C^d_12t$Fsg|58*_o8czGtJl)YoJGsuM}Z3BtUv%0@6 z3+j18s5D7XuM%3oBCku`RjV<4*w1Bq2}qQE5T&Y7cWn7Eo2dpW+NV|_RW`7{v~D?& z^qt#EYN_~Wrh+`@)Pj^xwp^pW+{6!t@m$YsVJ;MEhV1m!;)UQ=hdQ@4k=G~G_SjSC ziAL^EdEL_Mfb(^>>Djkj?{o>FhMjcm^W~@eoWoozOs3+dE!v48ss86#YNL6QVp%9P zTG}s%+MG@FcrH-7aUhAE_^|(CWvvci%J z8tGpPPUx)I?4dQ zS}xUQy); z#+aP zH@g)RSb3Q~sN$LOM&0@3=`yPVY?b}D0?%@JsvaHzdicl8iMfyHn9%;+hA4Gqu-i13 z2d|z$>0#-SD8$-IwCV2G-Vx24p}Cfxo*|OHimX-l!bmHNHusQe9W9z0ZfzOTrm$YI~`&MznL3w2WK^&p}-N9kzZ`trz8gAfRHR z5~c8iv{d=%9uv&oVUy82F^4{UXRJo1^T)S{al>nO_Dbj`g^Bv<``Bx?DHr)QgM@xB|S&1xS2lQ*CgLBo- zR`D*B;GZNpvj+=H&rT3pcF&ttWIm+NM?1U6v@>L`;|R(a(htWCj5RYurr~L)q~rdK zQogAf3MXT~{V$I^h1WFTp~jMomJC|E!qUw4<$dD|6Qo7$@aShhWN&hzorJ08iewf!*=E2r()X3xaC`xkbvLzssfd2)E C1fC;QDBo)@`=L3Q0u{!vLVhba)sIsst3Jtr6T4!!RjixPXAWh z$m7ZtrmjB+MbbT{J6Emgz9K4ys3Ngr0c=sCwFeoJbREB zs`pFpS&|qF56he^P^Id3YL*qz9 zyP-6r7J9dsLnOa3j-?p&St?ys7Ag5mSQ zFCQ@!gG0KzyFIrjzIm<>q-6+cv(qjfCJ&W>VF}AS8>%U66!gEBIzqPge|I`qS@8-+ z3W@J#!_|OPqF3WJs(G|;#^?2^`wI@SVRH?ioyl%216(!2WkhN4Kz8yY1g(`1vg*-O zz5b;eUqQyK47wt)bmgwIk0)?Lx`ovdM@rb(E5}IEbPXpM7TgDopRja~1^|*= z!1>PKG`Gq*yc7s`-i(lIQwZ>VU7)n%*Z0W%`zT)7Yx6N4>pef}TZ7xi?#7to$A;J) z2&tSsSAuAgwZRJ_0^yPeJ1<(}*In9FxDexC6tQ`PW-;3d8a;IwxL(R1#eb#gZ4>gVWmXJu~6%mlS6A)p_4 
z5Q#gOQ9`Eyalc!3j*teB^Cj9Q+s{Ynb4}vcP^*HDosJM(i%_J*rD(aY-Z1c3A{C7j z%-R$?BL!?f-5RRFX-X^~)cRp}qwUnp3hfq{+76-?9P|TzU(b{jb~P!HKC;WSA-2R? zy82EQHy)WFw!a-eMG{`w?bMv$JzN9*ypfWD5+u^euj%!TJKd+fc;bQ5V%(W)gBU}J z1y=?J-s-=uyVy1$zF1rbqEx|fjlX^0p=vWv!i|I<`bW3!I%gV~Ud&mm`t3DW9gz*% z=gK%Ausd{Y1)h2sJ!o=2by{(~c5*tU&ERxrJP}=7OPkP{`fjfYok^tvIovNiTSx(d zikPMKgaWg54PtqCoT)pa+Mm)q4OZv){E{YsGJnl33s!agdgcqrX|MOfq_39>rB8_m zHB#NxM;);f3KzQI%?jA5aHKEtSnc0P6FB8Kva{Inn?)UB5OmGdeeBx-y4hkOJQ&ln zi0>t{o#9I5W#6XL@~#KF6xFSWBUVAa=C+Tcu`AJME#i>MwhfmSSqSaid1p9xKZR0P z^$zp89_d937ut8&?{Fo1S2XU`OQyLI$J@@LxxXzUFSO^K=JgNnwOJzH&l~v4 zEO=FOqV7)e-%g%P=9!N4l_+;#WZ*ZZvVFk&q$>#O1SzFsl3=+gT8utN^lOCF?p_sT zZgo6OsTC)i@e#CK6$8lvcEr>&XgWh3QXTG71vNT1xX~~}D@4!|8Z5t(g7&YY^7B36l=VugbwCLQDuN`Vxnf~WuJSv%Hz=~35Cz} z=>)>eicl2rFdH;rRj?I0{P6qBa#W$dtTDZ8^!{v?5ZdV$b+_WzMSBZ%@g1Aw8V=he zD-%RwvOU+T`>$ViYM6-UYSrA!N}s3_aJSmzAz}994q82gRqVZ=KDBaZr{e4YIAM$x zwBYa!TO&pG*Xiv~ObI`Q&-N8m+gJ@20|OByf#=Q9@6{=-IM$?uWu0WFip9DmCr*x^ z@UW-M4V-nUa)b(TOmY!2jBZ?T>h?-`;EJ9qf;jePH><9JB>{?}tpKjy)znjSjX9G) zD>3kiQ%m=mFuDm_%N&j&R;LJ{b;e)M^i8JR!qWsK%tTUvkj#h>#13_=dn*HVU@#p9 z#oWfJRT+`{IJ$>krb2f}+ghpawpJS)_Bo(aHKObO-Nc!f&v%fOt%12S!~VnY96x+6 zu%#>?fTl%H9vJxZL1@v8*hzkVrpGUP-LtACFbUVgIx2BWHe7Nb+;-1E3Au=gismN> z8WgPb#uTy%>gRRWfYR62){cXluzR`%usl1cnQt}evQg8$kN?02k z(Of2zeC-3MzZSFRBtLvkco>T z=L?+Qw{qG1Elc-QzIg(_%O-%%)Aarql%WbpWHbWT|vou3u6YK5bxX72~v3D=6&{$n7;_k`^% zQ>gr5w;pa5csaE5J52uLqoAsi!$Y)N_q6=Bo%uk5FYO+w-785uvBbkau;f-yD{y^K zHOlbTIpr}?#6fpwVP%a(`V zDiQN2J96%;=JKbe^pgiHukQ5oKp57|%7E(Spp*JIR;toSc|3D9gSG$8QT1g}cHh_>0k1DFxjK1oXQ|X`(tu>|7!&6#Nu8>iW#L7NFj+H_->O z>NLbubEm)GbwBYjlKQ)Q5RUAfa4ikI?w% z2+o3)bl)llJs^1&Iy|Bteeu{^!~NBrqTwUO?PEJ&pfW_Wm^bXOgcm@EO*)ElABjp7 z3TqIC^o2}46bXlat*Sd-y*%6KJ!Ht9hbd*Z>Nk8^OSzO3u}hUsegOVnDRL_Sz-#O* z?!Od7pqr|xD9t>vy3^6fupL_vs@`5+Xk#%INK0th_s-q2A^c3;SQ2P-8}wxA=;gB z#_H-s{Ve6QXdom@KSMt5=5+K>a!Shh^}W_6=`;Zftt4t@%JJ;zA;cl;PHEkJNSlU>h^a~k;t}3i0H;S;* z$()dA?fq7;c@~8Y0$*(TVd+xINnqF=>wk3|>zP1R7g|gU8(ZZNSln1;0eSDt#Ch$_ 
zr+MxEO8V0r!EVqjxL3#jkt^;-{Wlr8jCGoEhKn;SR+kT!4*nB!cLgj5f+Hk6aEEJ? zWN+_DB;byBs)h?ZHiq)A-xAAvlp>ZtIXOA>Dn1Ve1!S*%_r@+}2)Q(@C0DzmTQh_j zMI-LLC@U%B|ELknBW&92BPkeGb9SVw%Mvf6kOOBiqw_o6>4W7sK`qy}CyR?6)%9t> z^fXytZh#%!=n}l(eVTre4jh3n0Ip?>vEF z({ZN7&)a*c#pPb`QzHeX2>>AHj=H=xto}9cMB^J=I_1}d>TaizllqvrEO5qFQDjS zdL6lKu5#t%koy!$#!So*gtA~PH2Ha%C)MjOw^Z$1vtF}MDjxB&$MC@_RjBvbPxpSs z)kR)Tt*_N#G0JzMMei*LC4|(rv^05p#%x00uWz1B;7}?d4nErx?g*zxhSUVmg%`YV zE7VqEcf2sguU5_FMkzRc!p|bmaU&CG->Utf)TtzPzTh_1S! z&)YVXu+<<-X|r;`RFVJgw<87m30uM1pstDXtIs}BYeqi$MK<{UYOT751A2bQ#ppxs zBHbaiV6c2qlCur>srGY0Gv_!*viY9(bV(+Ke}y3BwXL zA8cfLk<;kF_Sfgz-ovZCDFobWJ{uqp-k9d=N&D$!gy!w!KIk1oAdIa@c$DGCs5pIY z7ppXK_b(b=-dLypsUFMMf1JVU8U6(LU7vxo!BzGYN=>lpf4Fm} z77G_woblB%1}21P@=@Biwf|!!XW|A`IW4bza%t4b6cJ`_Ynd-2TeJqH1jijToOtcK zf3R>_$sRNM_Co5DGh(_J`2|QDk*D-2fl?+wm#J|pXW|)i2%g!c3+ul-dk*Ko%i$RrUC4-hUZG`LB75x}3tmz<_5KEWQ9{LTD>nYo}K7+WFZ@ zCQwRPop=HI0~PCl@-V1A)8|*qNRqk^TZV2JiLM{F89*dh2~$^&r3rnlwKl|BYAWtjFgWpaN^5@Y`U@^0AX+e23m93w1~KB#FOhhD$sUMOQH9>-%D zPkuFP2l7?>EX-3AyU--(t(yRD(J9_ zIgsEgsPS!;gpeuc84$**`69*Q%UuWS(A-pxk3V01^(kIa1>-))B% z9Wp`OHvLDeJu@i3T z``S&oWdlhJ-*bfJTDgzF<%2iqe)*8CVL;It3~ssfKzaE|5&KM2jv z?@mqHe7`ye%piB{zPtKB@J_#s?>d(-I2Gwcbbi5?J}4t~!_(m`>?sAO zX)2Zms-x=WNm{qKnD!+&25u1v=?+4)-5cyeXNeRIrz3?xJtSeqZ|rMIn_SzvGjTy1}h);lJ|*cfy=StRgg)PCR^Uix^PqSEJT77Oco}dfJjjy~a*Tf)Mnx z4CV+OcxlFEY3w;I>3-X5qewe(Pc&&si}W<>1y}prRCoXiL4%cy##*+V>(6SfZOjf| zr#2psVqUKHIoY;@TDvvXsFP)SQD@^!KMrXtL&Ntgg<(b}Pyq2dJ=JSzWq6bme z_ZU9|yoAYpKRs-1BgwhLB8FFp=<@}_&kNb9whSON?Zd4#A&J%4}HgYfL@y8GTr&E^i`V*h)SGJe!o;ncM_mw)VLSCG&y|Fg4j$r@9-m? 
z!h=+GL@!xHRyLEY9$R0b*hvFs?lm{woS}Q~929jT{xIj>g(6La11*1_v984)8x5t$ z%ujsDEzy**NF-^Vz@Fd1UYlcMF_+K>1wHJ`V>p)l4A5!;-=j3*n4zn0Zc8d$KOL(z zxNpdPyGDpHSv~PwW^eu}bdMI3 zEOYylb=hAVg%oT}=G4dSe#A-``x_3fW;5QyV4qBVQ*!=Rc!r5xq%&s7$e##T} zR8CU05sPuGxu(OFYXDTvcuQC1OMx&mWaF#Th+!fE@fmUR)GuRJhAv+jAY$7N$QDTW5lw%2lj@Sk7TuqH@>@4wDEHwQ5ATucrDF3iLR@K-_N#vvzDJ6K30;Y8jCRy3 zPb0_nlll4io`iYWBKUv@ckofOz-e>=^T>pQH`Qj&D{m*%r6+?HF zNiBe+XJYX>XnHMh&Sf}0%6nx*rp^|6GBJ&?G&WDM%cX0d4|2LuFEk%; zM)Eqd|B}p@g586MPSi88PP5#7Xvn^Hk~+1r&_?*>IjzFHdI^~OA)DJw5nT|~XEpHl z#-qmz`a;0Tl;U+J8`JEVxyPpfZ{Elc?f&)krt>Zy#fd3D*ME2|-OL)3kyQB1Q8wS# z^8SMNr+e3aX?>LupUHxEcj`#^8J@dLqM+Z*>a$*b30m8Ld?K?rMN=>a5JYdh^J(=+P71 zPQMCQ)UKsU|8T7{2xG4QwA?T8mS0pkWEeBz%DL^^dm47`uYg1ZO8+8~2N4|nd`{2y z*V`@oyDUl1HB}`}nV+5MRw;OUMMW(A9QC zUV?e5so9ZJRJgXT$KSf~Nup{E{Ez8-zb3r|Ywr_gKubs3EZonO1XEcz`_m!F{D&-Z zU9Re}8{AjjCfI{2MMjSSK`FLfc$#3tL@&zm1Z}>3X-3i5!>S6R8z@7<9=Xrmmpjj- zND8O-TsxIQ)&}(WIm_V%(-L2XAGY2&4&9%1SK(u=0%}vkt)reBiC?*X;=gOpz=KPKF_De-A^9JC-cJC*P z*}|uH@KG4J8UXDaFx_IsY+8Mp#@X|8knlUHLo!S#8I1h3XsVKq_==%a#4;6P1Kw5}}_rY{NZupSoy$E+6*)rOWBotN6WKC?SbdG9!Rj z>1VQh%(pC*bzrFqpN(>UW$_)IGx^sZRY8=7iE8g1^OJF0+!ET=Q~?VJ1g?2chv5th z?zSmKF_Zt|-{J2zoFSvWaGPaa-e6-R$7n-pZ1-a{3rW*0j5Xrho<@|VKCOt5cdSJJ zJH?=C5w~uc3ND6;O;mjUU^}aK2~|hp%~6Sq&+*0?muyt@<@1{BorD~V_d%*3wYhas zcKrcrc0Qg1N500~lF_gF@c_a_l=^^DkM#G}OT}tg;_K(+l9HS})|VhSez4Lz9>FF! 
zw!HjjQCQzQ@Vq#)uiEUbjOP3L*&u_KZf~ibS)=jbDY{bAgrBEJOBWWI>+9=dKYlz= ze0OZG=(Y1w(S21r9f>rks;Y8${kkW)vpxd!gfhO`Z1kG5Qp@v|&Q*ypp|8gmNGgZo z0}_8?PR>sLfsGM4#Q#htTw`JX4YOyRZV)bSp(<8Ss+~WM@)9}%-3$1uy79Cfl{X{onoKQ4r>ZzLE6Qi( z)|6q@%E=cc)2d@{>vNaxSe#W3-E-zAe-@6v40*GU)K{-%ujRXrb_EvpHlr{?6~o#$PX+_3RnJI0*jjTPQ#oXx zm5v*0zT-aTMjnjVzd-gnJ6E$xt%m~ z^j}_Ev{8B~HjKh%Vc#U4iH!E7a@Mj^LZEW}yoqn>V<>XE$e8Xjm4?9H5Dl4L5{IB- z%5DX-N4%e+8eVB6nN9~j_7!qL3`-dy@nMRDCD{E6v65T)g;GDI^yWQy!)nJ2+5#%M zS)G=LURy5C%ki!2VZOEYjedVA9%fa6-CT*V9sm1V0toAdjZUSZwm!9;^ERWgG(SS< zs$&F#<o51q zn7kZc(0qDnr%DRPRS=&{(!C|TL}hc^W=Zi(zq}cgqV}Ub$sBcO;P{6-U0xANNF)a6 zg|ms-TG6SL95g#qjP7h<#Z*tksl9{cAwJPW z9k&nWW!I_3!&3{G?zij}n%?$8es6y1#o3qKQ8_4qLW?+4#9f+va+7q9a9~DPibFOk zh)xk;wRqt+WTUL+SWMeD{FPj^{Q80ydau^clDV<%?72TPUNWKcgU}b;+KAUHuiWLG zJo&?N6tBHWidDj}P#4ETNv9qJyInPB|0SDxwf2lf^J~RcR{#`Th*;75CPJ~jE=>+1 zqTAm%YuSF`2T*AJ=ESk_$bb6m-X*~wWInFpXn8y2LHX1(;hq}!RI=7i$eUB!dp0aO z55SZfvv;ebFtV=3b;s(AHmLz$2kqS1)!}cmpX+nv?|zPd!bcHHr%lJZ-FIxp6Fq{7 zHjCFuCdaO(B=;I*|CL-1Dxt_JXUF80*29f!|) z6<9$I(q_Efre5$g^gxK+G8t(%(RX4*AUY2i2E-MKIAy46&nU1%3P;3~t@v_gb9*Y} z(qDkrB-t z&00A=(5_lmmuXB-V<8jY#GmhV>{stf-pBC2!Gkvi!qd{ylqZJCdW2brM$dA`C% zD?Kw)Psw|4;17a-+T**7FbSSvb(I?3TWA{3Qt=xj5d69SV^5C(p`5L2ApcRy^B2#b zk2QE~jHOB2{Zo^SjdfDQE%aZMoA2rhz|wC#;pc;jh=}Bc&@vZXUYu=cKqL7CKmDJD z1kZ!z?u;8^ruDIZg8N?=4sdwQKPd*!c>?tx)mm)vYbyO{*5tL;6<28F$W;NRc<<==vZt6|mQFaCPP+z0$fo1T&0V#i! ANdN!< From 7480291c627b7a63cfb39b84ebc6b3d01813793b Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Fri, 1 Dec 2017 15:12:31 +0800 Subject: [PATCH 152/275] Add version and commit information in capi config.h and use unofficial glog for Android API < 21. (#6113) * Automatically configure the version and commit information in capi. * Use the unofficial glog repository for building for Android (API < 21). 
--- CMakeLists.txt | 3 --- cmake/external/glog.cmake | 13 +++++++++++-- paddle/capi/CMakeLists.txt | 10 ++++++++++ paddle/capi/config.h.in | 3 +++ paddle/testing/CMakeLists.txt | 6 ++++-- 5 files changed, 28 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e76512166f..f7824b1066 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,9 +67,6 @@ if(ANDROID OR IOS) if(ANDROID) if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16") - elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") - # TODO: support glog for Android api 16 ~ 19 in the future - message(WARNING "Using the unofficial git repository instead") endif() endif() diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 08bdc1e162..0c6b3aafcb 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -26,12 +26,21 @@ ENDIF(WIN32) INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR}) +IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21") + # Using the unofficial glog for Android API < 21 + SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git") + SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8") +ELSE() + SET(GLOG_REPOSITORY "https://github.com/google/glog.git") + SET(GLOG_TAG "v0.3.5") +ENDIF() + ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags - GIT_REPOSITORY "https://github.com/google/glog.git" - GIT_TAG v0.3.5 + GIT_REPOSITORY ${GLOG_REPOSITORY} + GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt index d267b14657..ebb083c5a4 100644 --- a/paddle/capi/CMakeLists.txt +++ b/paddle/capi/CMakeLists.txt @@ -4,6 +4,16 @@ else () set(PADDLE_FLOAT_TYPE float) endif() +execute_process( + COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1 + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR} + OUTPUT_VARIABLE 
PADDLE_GIT_COMMIT + RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT + ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) +if(NOT PADDLE_GIT_COMMIT) + set(PADDLE_GIT_COMMIT "no commit information") +endif() + # config.h used for C-API. It will store Paddle building configuration as a # header. Make user just include PaddleCAPI.h then can get building # configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their diff --git a/paddle/capi/config.h.in b/paddle/capi/config.h.in index d205307588..0ddbd8c753 100644 --- a/paddle/capi/config.h.in +++ b/paddle/capi/config.h.in @@ -3,6 +3,9 @@ typedef @PADDLE_FLOAT_TYPE@ paddle_real; +#define __PADDLE_VERSION__ "@PADDLE_VERSION@" +#define __PADDLE_COMMIT__ "@PADDLE_GIT_COMMIT@" + // Since we only support linux and macos in compile, always use clang or // gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below. #define PD_API __attribute__((visibility("default"))) diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 2275c950ba..8132742749 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -5,6 +5,8 @@ if(WITH_TESTING) add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies}) add_library(paddle_test_util STATIC TestUtil.cpp) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) - add_library(paddle_gtest_main STATIC paddle_gtest_main.cc) - add_dependencies(paddle_gtest_main paddle_memory gtest gflags) + if(NOT MOBILE_INFERENCE) + add_library(paddle_gtest_main STATIC paddle_gtest_main.cc) + add_dependencies(paddle_gtest_main paddle_memory gtest gflags) + endif() endif() From 57dc8de934b4bbb8be06090436dfec7d4e788fa1 Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Fri, 1 Dec 2017 15:26:41 +0800 Subject: [PATCH 153/275] Fix the linking error for iOS simulator (architecture x86_64). 
(#6081) --- paddle/math/CMakeLists.txt | 2 -- paddle/math/SIMDFunctions.h | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index 86bb270a43..922fb51722 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -26,8 +26,6 @@ else() endif() if(MOBILE_INFERENCE) - list(REMOVE_ITEM MATH_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp) # Remove sparse list(REMOVE_ITEM MATH_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h index 439f11b79d..76909720f6 100644 --- a/paddle/math/SIMDFunctions.h +++ b/paddle/math/SIMDFunctions.h @@ -116,9 +116,11 @@ inline bool vec_check(size_t len) { } namespace internal { +#ifdef __SSE3__ void addToImpl(float* a, const float* b, size_t len); void batchAddToImpl(float* a, const float* b[], int batch, size_t len); void colMaxImpl(float* result, const float* data, int dim, int numSamples); +#endif #ifdef __AVX__ void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len); void decayL1AvxImpl( From 813bbf40a1a5a2583354d4bd516c469e137c1668 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Fri, 1 Dec 2017 15:42:05 +0800 Subject: [PATCH 154/275] disable test_recurrent_op (#6153) --- python/paddle/v2/fluid/tests/test_recurrent_op.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py index 36e0c84c0b..694ff0d8dd 100644 --- a/python/paddle/v2/fluid/tests/test_recurrent_op.py +++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py @@ -454,4 +454,6 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): if __name__ == '__main__': + # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 + exit(0) unittest.main() From aabe1db111625519bd7f85d7100a3ab7747f1e12 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 1 Dec 2017 16:12:29 +0800 Subject: [PATCH 155/275] 
Feature/simple gan for api (#6149) * Expose sigmoid_cross_entropy_with_logits Also, change the `labels` to `label` for api consistency * Very simple GAN based on pure FC layers --- python/paddle/v2/fluid/tests/demo/fc_gan.py | 157 ++++++++++++++++++++ 1 file changed, 157 insertions(+) create mode 100644 python/paddle/v2/fluid/tests/demo/fc_gan.py diff --git a/python/paddle/v2/fluid/tests/demo/fc_gan.py b/python/paddle/v2/fluid/tests/demo/fc_gan.py new file mode 100644 index 0000000000..cae959593e --- /dev/null +++ b/python/paddle/v2/fluid/tests/demo/fc_gan.py @@ -0,0 +1,157 @@ +import errno +import math +import os + +import matplotlib +import numpy + +import paddle.v2 as paddle +import paddle.v2.fluid as fluid + +matplotlib.use('Agg') +import matplotlib.pyplot as plt +import matplotlib.gridspec as gridspec + +NOISE_SIZE = 100 +NUM_PASS = 1000 +NUM_REAL_IMGS_IN_BATCH = 121 +NUM_TRAIN_TIMES_OF_DG = 3 +LEARNING_RATE = 2e-5 + + +def D(x): + hidden = fluid.layers.fc(input=x, + size=200, + act='relu', + param_attr='D.w1', + bias_attr='D.b1') + logits = fluid.layers.fc(input=hidden, + size=1, + act=None, + param_attr='D.w2', + bias_attr='D.b2') + return logits + + +def G(x): + hidden = fluid.layers.fc(input=x, + size=200, + act='relu', + param_attr='G.w1', + bias_attr='G.b1') + img = fluid.layers.fc(input=hidden, + size=28 * 28, + act='tanh', + param_attr='G.w2', + bias_attr='G.b2') + return img + + +def plot(gen_data): + gen_data.resize(gen_data.shape[0], 28, 28) + n = int(math.ceil(math.sqrt(gen_data.shape[0]))) + fig = plt.figure(figsize=(n, n)) + gs = gridspec.GridSpec(n, n) + gs.update(wspace=0.05, hspace=0.05) + + for i, sample in enumerate(gen_data): + ax = plt.subplot(gs[i]) + plt.axis('off') + ax.set_xticklabels([]) + ax.set_yticklabels([]) + ax.set_aspect('equal') + plt.imshow(sample.reshape(28, 28), cmap='Greys_r') + + return fig + + +def main(): + try: + os.makedirs("./out") + except OSError as e: + if e.errno != errno.EEXIST: + raise + + startup_program = 
fluid.Program() + d_program = fluid.Program() + dg_program = fluid.Program() + + with fluid.program_guard(d_program, startup_program): + img = fluid.layers.data(name='img', shape=[784], dtype='float32') + d_loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=D(img), + label=fluid.layers.data( + name='label', shape=[1], dtype='float32')) + d_loss = fluid.layers.mean(x=d_loss) + + with fluid.program_guard(dg_program, startup_program): + noise = fluid.layers.data( + name='noise', shape=[NOISE_SIZE], dtype='float32') + g_img = G(x=noise) + g_program = dg_program.clone() + dg_loss = fluid.layers.sigmoid_cross_entropy_with_logits( + x=D(g_img), + label=fluid.layers.fill_constant_batch_size_like( + input=noise, dtype='float32', shape=[-1, 1], value=1.0)) + dg_loss = fluid.layers.mean(x=dg_loss) + + opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE) + + opt.minimize(loss=d_loss, startup_program=startup_program) + opt.minimize( + loss=dg_loss, + startup_program=startup_program, + parameter_list=[ + p.name for p in g_program.global_block().all_parameters() + ]) + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(startup_program) + + num_true = NUM_REAL_IMGS_IN_BATCH + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=60000), + batch_size=num_true) + + for pass_id in range(NUM_PASS): + for batch_id, data in enumerate(train_reader()): + num_true = len(data) + n = numpy.random.uniform( + low=-1.0, high=1.0, + size=[num_true * NOISE_SIZE]).astype('float32').reshape( + [num_true, NOISE_SIZE]) + generated_img = exe.run(g_program, + feed={'noise': n}, + fetch_list={g_img})[0] + real_data = numpy.array(map(lambda x: x[0], data)).astype('float32') + real_data = real_data.reshape(num_true, 784) + total_data = numpy.concatenate([real_data, generated_img]) + total_label = numpy.concatenate([ + numpy.ones( + shape=[real_data.shape[0], 1], dtype='float32'), + numpy.zeros( + shape=[real_data.shape[0], 1], dtype='float32') + ]) + 
d_loss_np = exe.run(d_program, + feed={'img': total_data, + 'label': total_label}, + fetch_list={d_loss})[0] + for _ in xrange(NUM_TRAIN_TIMES_OF_DG): + n = numpy.random.uniform( + low=-1.0, high=1.0, + size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape( + [2 * num_true, NOISE_SIZE, 1, 1]) + dg_loss_np = exe.run(dg_program, + feed={'noise': n}, + fetch_list={dg_loss})[0] + print("Pass ID={0}, Batch ID={1}, D-Loss={2}, DG-Loss={3}".format( + pass_id, batch_id, d_loss_np, dg_loss_np)) + # generate image each batch + fig = plot(generated_img) + plt.savefig( + 'out/{0}.png'.format(str(pass_id).zfill(3)), bbox_inches='tight') + plt.close(fig) + + +if __name__ == '__main__': + main() From dda277ba6c386d63e052fe50a8e21d8dd2df579d Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 1 Dec 2017 17:50:54 +0800 Subject: [PATCH 156/275] update build.sh --- paddle/scripts/docker/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 502637c881..fbd0b6b078 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -36,6 +36,7 @@ function cmake_gen() { ${PYTHON_FLAGS} -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU:-OFF} + -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} -DWITH_GOLANG=${WITH_GOLANG:-ON} @@ -57,6 +58,7 @@ EOF ${PYTHON_FLAGS} \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ + -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_GOLANG=${WITH_GOLANG:-ON} \ From e50f35706a2d64b2724bff483e0f203ed4882c28 Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 1 Dec 2017 18:19:22 +0800 Subject: [PATCH 157/275] code refine (#6164) --- paddle/platform/enforce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index 97338a4ce6..5abd4d4a34 100644 --- a/paddle/platform/enforce.h +++ 
b/paddle/platform/enforce.h @@ -244,7 +244,7 @@ inline void throw_on_error(T e) { #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ do { \ - if (!UNLIKELY((__VAL0)__CMP(__VAL1))) { \ + if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) { \ PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \ " %s\n%s", \ #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ From d066b07f144589ef72fe05faa8ca0f91889fefda Mon Sep 17 00:00:00 2001 From: QI JUN Date: Fri, 1 Dec 2017 18:21:05 +0800 Subject: [PATCH 158/275] change GPU memory allocating policy (#6159) * change GPU memory allocating policy * fix potential overflow bug --- paddle/platform/gpu_info.cc | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc index 36b216d872..63a3351708 100644 --- a/paddle/platform/gpu_info.cc +++ b/paddle/platform/gpu_info.cc @@ -75,15 +75,19 @@ size_t GpuMaxChunkSize() { GpuMemoryUsage(available, total); // Reserving the rest memory for page tables, etc. - size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total; + size_t reserving = 0.05 * total; // If available less than minimum chunk size, no usable memory exists. - available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(); + available = + std::max(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(), + reserving) - + reserving; - // If available less than reserving, no usable memory exists. 
- size_t usable = std::max(available, reserving) - reserving; + size_t allocating = FLAGS_fraction_of_gpu_memory_to_use * total; - return usable; + PADDLE_ENFORCE_LT(allocating, available); + + return allocating; } void GpuMemcpyAsync(void *dst, const void *src, size_t count, From 7d9ff4081e3a0816cab4119dc146c73c576b71cd Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Dec 2017 18:16:31 +0800 Subject: [PATCH 159/275] narrow pictures --- doc/design/mkldnn/README.MD | 20 ++++++++++---------- doc/design/mkldnn/image/engine.png | Bin 17102 -> 13586 bytes doc/design/mkldnn/image/gradients.png | Bin 31247 -> 22890 bytes doc/design/mkldnn/image/layers.png | Bin 14414 -> 11646 bytes doc/design/mkldnn/image/matrix.png | Bin 22085 -> 18407 bytes doc/design/mkldnn/image/overview.png | Bin 16329 -> 10766 bytes 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/doc/design/mkldnn/README.MD b/doc/design/mkldnn/README.MD index 287ee620e1..61d453de24 100644 --- a/doc/design/mkldnn/README.MD +++ b/doc/design/mkldnn/README.MD @@ -5,7 +5,7 @@ 充分展现英特尔平台的优势,有效提升PaddlePaddle在英特尔架构上的性能。