From 5030681c36e9e9497f3c45cdbd451c8739bdba1f Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 8 Mar 2018 20:41:31 +0800 Subject: [PATCH 001/314] add MKL for fluid static and shared library --- cmake/external/mklml.cmake | 2 +- cmake/inference_lib.cmake | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index 739a910c7c..f24cb2d11b 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}") SET(MKLML_DST_DIR "mklml") SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR}) -SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER}) +SET(MKLML_ROOT ${MKLML_INSTALL_DIR}) SET(MKLML_INC_DIR ${MKLML_ROOT}/include) SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 6b2237b858..fb81498fd6 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND) SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include DSTS ${dst_dir} ${dst_dir} ) +else() + set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml") + copy(mklml_lib + SRCS ${MKLML_LIB_DIR} ${MKLML_INC_DIR} + DSTS ${dst_dir} ${dst_dir} + ) endif() # paddle fluid module From bc0cfb2283633b65669be1d8f7a7f2040d6726f2 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Thu, 8 Mar 2018 20:42:16 +0800 Subject: [PATCH 002/314] remove PADDLE_USE_ATLAS --- paddle/fluid/operators/math/math_function.h | 7 ------- paddle/math/MathFunctions.cpp | 15 ++++----------- paddle/math/MathFunctions.h | 2 +- 3 files changed, 5 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h index 47e2386d05..cdbc7bfb37 100644 --- a/paddle/fluid/operators/math/math_function.h +++ b/paddle/fluid/operators/math/math_function.h @@ -19,13 +19,6 @@ limitations under the License. */ #include #endif -#ifdef PADDLE_USE_ATLAS -extern "C" { -#include -#include -} -#endif - #ifdef PADDLE_USE_OPENBLAS #include #include diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index b2ff4bc323..de404cad89 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr; } __name; // struct DynLoad__##__name #endif -#ifdef PADDLE_USE_ATLAS - #define PADDLE_SGETRF clapack_sgetrf - #define PADDLE_DGETRF clapack_dgetrf - #define PADDLE_SGETRI clapack_sgetri - #define PADDLE_DGETRI clapack_dgetri -#else - #define PADDLE_SGETRF LAPACKE_sgetrf - #define PADDLE_DGETRF LAPACKE_dgetrf - #define PADDLE_SGETRI LAPACKE_sgetri - #define PADDLE_DGETRI LAPACKE_dgetri -#endif +#define PADDLE_SGETRF LAPACKE_sgetrf +#define PADDLE_DGETRF LAPACKE_dgetrf +#define PADDLE_SGETRI LAPACKE_sgetri +#define PADDLE_DGETRI LAPACKE_dgetri #define LAPACK_ROUTINE_EACH(__macro) \ __macro(PADDLE_SGETRF) \ diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index f4cf6bd6c2..f3d8b1a39e 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -21,7 +21,7 @@ limitations under the License. */ #include #endif -#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB) +#if defined(PADDLE_USE_VECLIB) extern "C" { #include #include From d3d16f76f583ca3f46a13e62f6f670acdcccbb5c Mon Sep 17 00:00:00 2001 From: ying Date: Wed, 7 Mar 2018 09:39:53 +0800 Subject: [PATCH 003/314] enhance reshape operator. --- paddle/fluid/operators/reshape_op.cc | 97 ++++++++++++------- paddle/fluid/operators/reshape_op.h | 48 ++++++++- .../paddle/fluid/tests/unittests/op_test.py | 8 +- .../unittests/test_mine_hard_examples_op.py | 0 .../fluid/tests/unittests/test_reshape_op.py | 56 +++++++---- .../tests/unittests/test_target_assign_op.py | 0 6 files changed, 150 insertions(+), 59 deletions(-) mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py mode change 100755 => 100644 python/paddle/fluid/tests/unittests/test_target_assign_op.py diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 3580932356..c47df73405 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -31,48 +31,69 @@ class ReshapeOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of ReshapeOp should not be null."); - auto shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(shape.size() > 0, "Attr(shape) shouldn't be empty."); + const std::vector &shape = ctx->Attrs().Get>("shape"); + + PADDLE_ENFORCE_EQ(shape.empty(), ctx->HasInput("Shape"), + "The shape information can only be set by Attr(shape) or " + "by Input(Shape). Attr(shape) and Input(Shape) cannot be " + "set at the same time."); + auto x_dims = ctx->GetInputDim("X"); - std::vector neg_dims_idx; - // set some dimension to -1 if it is unknown - const int unknown_size = -1; - for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE(shape[i] > 0 || shape[i] == unknown_size, - "Each dimension of Attr(shape) must be positive or %d.", - unknown_size); - if (shape[i] == unknown_size) { - neg_dims_idx.push_back(i); - PADDLE_ENFORCE(neg_dims_idx.size() <= 1, - "Only one dimension of Attr(shape) can be unknown."); - } - } + if (ctx->HasInput("Shape")) { + auto shape_dims = ctx->GetInputDim("Shape"); - int64_t capacity = - std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); - int64_t in_size = framework::product(x_dims); - if (neg_dims_idx.size() == 1) { - // dim infer - shape[neg_dims_idx[0]] = in_size / (-capacity); - // recalculate capacity - capacity = shape[neg_dims_idx[0]] * (-capacity); + PADDLE_ENFORCE(shape_dims.size() == 2UL && shape_dims[0] == 1UL, + "The Input(Label) should be a 2-D tensor with the 1st " + "dimensions fixed to 1 (a row vector)."); + + // The actual output shape will be set at runtime, here temporially the + // the shape of output the same as the shape of input. + ctx->SetOutputDim("Out", x_dims); + } else { + std::vector output_shape; + ValidateShape(shape, framework::product(x_dims), output_shape); + + auto out_dims = framework::make_ddim(output_shape); + ctx->SetOutputDim("Out", out_dims); } - // capacity check - PADDLE_ENFORCE(capacity == in_size, - "The size of Input(X) mismatches with Attr(shape)."); - // resize output - std::vector shape_int64(shape.size(), 0); - std::transform(shape.begin(), shape.end(), shape_int64.begin(), - [](int a) { return static_cast(a); }); - auto out_dims = framework::make_ddim(shape_int64); - ctx->SetOutputDim("Out", out_dims); + if (shape[0] == x_dims[0]) { - // Only pass LoD when the first dimension is equal between - // output and input. + // Only pass LoD when the first dimension of output and input are the + // same. ctx->ShareLoD("X", /*->*/ "Out"); } } + + private: + void ValidateShape(const std::vector &shape, const int64_t in_size, + std::vector &output_shape) const { + std::vector neg_dims_idx; + const int unknown_index = -1; // only one dimension canbe set to -1, whose + // size will be automatically infered. + + for (size_t i = 0; i < shape.size(); ++i) { + PADDLE_ENFORCE(shape[i] > 1 || shape[i] == unknown_index, + "Each input dimension of Attr(shape) must be positive, or " + "only one input dimension can be -1."); + if (shape[i] == unknown_index) neg_dims_idx.push_back(i); + } + PADDLE_ENFORCE_LE( + neg_dims_idx.size(), 1, + "Only one input dimension of Attr(shape) may be unknown."); + + int64_t inferred_dim = 0; + if (neg_dims_idx.size()) { + int64_t capacity = std::accumulate(shape.begin(), shape.end(), 1, + std::multiplies()); + inferred_dim = in_size / (-capacity); + } + + output_shape.resize(shape.size(), 0); + std::transform(shape.begin(), shape.end(), output_shape.begin(), + [](int a) { return static_cast(a); }); + if (neg_dims_idx.size()) output_shape[neg_dims_idx[0]] = inferred_dim; + } }; class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { @@ -80,10 +101,12 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of reshape operator."); + AddInput("Shape", "a 1-D tensor that provides the shape information.") + .AsDispensable(); AddOutput("Out", "The output tensor of reshape operator."); AddAttr>("shape", - "(vector) " - "Target shape of reshape operator."); + "(vector) Target shape of reshape operator.") + .SetDefault(std::vector()); AddComment(R"DOC( Reshape Operator. @@ -96,7 +119,7 @@ and target shape = [1, 4], the reshape operator will transform the tensor X into a 2-D tensor: [[1, 2, 3, 4]] One dimension in the target shape can be set -1, representing that its -size is unknown. In this case, the real dimension will be infered from +size is unknown. In this case, the real dimension will be infered from the original shape of Input(X) and other dimensions in the target shape. )DOC"); } diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 1357bce4b7..fc0885c149 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -26,11 +26,57 @@ class ReshapeKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const { auto* out = ctx.Output("Out"); auto* in = ctx.Input("X"); - auto out_dims = out->dims(); + + auto* shape = ctx.Input("Shape"); + framework::DDim out_dims; + if (shape) { + std::vector output_shape; + ValidateShape(*shape, framework::product(in->dims()), output_shape); + + for (auto d : output_shape) std::cout << d << " "; + std::cout << std::endl; + + out_dims = framework::make_ddim(output_shape); + } else { + out_dims = out->dims(); + } + out->mutable_data(ctx.GetPlace()); framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); out->Resize(out_dims); } + + private: + void ValidateShape(const framework::Tensor& shape, const int64_t in_size, + std::vector& output_shape) const { + std::vector neg_dims_idx; + const int unknown_index = -1; // only one dimension canbe set to -1, whose + // size will be automatically infered. + + const int64_t dimension = shape.dims()[1]; + std::cout << "dimension =" << dimension << std::endl; + const T* shape_data = shape.data(); + + for (int64_t i = 0; i < dimension; ++i) { + PADDLE_ENFORCE(shape_data[i] > 1 || shape_data[i] == unknown_index, + "Each input dimension of Attr(shape) must be positive, or " + "only one input dimension can be -1."); + if (shape_data[i] == unknown_index) neg_dims_idx.push_back(i); + } + PADDLE_ENFORCE_LE( + neg_dims_idx.size(), 1, + "Only one input dimension of Attr(shape) can be unknown."); + + int64_t capacity = 1; + output_shape.resize(dimension, 0); + for (int64_t i = 0; i < dimension; ++i) { + capacity *= shape_data[i]; + output_shape[i] = static_cast(shape_data[i]); + } + + if (neg_dims_idx.size()) + output_shape[neg_dims_idx[0]] = in_size / (-capacity); + } }; template diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index f7e02595ec..26835336ad 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -334,7 +334,7 @@ class OpTest(unittest.TestCase): np.allclose( actual_t, expect_t, atol=atol), "Output (" + out_name + ") has diff at " + str(place) + - str(actual_t) + str(expect_t)) + str(actual_t) + "\n" + str(expect_t)) if isinstance(expect, tuple): self.assertListEqual(actual.lod(), expect[1], "Output (" + out_name + @@ -546,6 +546,6 @@ class OpTest(unittest.TestCase): fetch_list = [g for p, g in param_grad_list] executor = Executor(place) - return map( - np.array, - executor.run(prog, feed_dict, fetch_list, return_numpy=False)) + return map(np.array, + executor.run(prog, feed_dict, fetch_list, + return_numpy=False)) diff --git a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py old mode 100755 new mode 100644 diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 6d1aa549d5..ae1cca0c3e 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -14,29 +14,51 @@ import unittest import numpy as np -from op_test import OpTest - +import pdb -class TestReshapeOp(OpTest): - def setUp(self): - self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [10 * 20]} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} +from op_test import OpTest - def test_check_output(self): - self.check_output() +# class TestReshapeOp1(OpTest): +# def setUp(self): +# ori_shape = (2, 25) +# new_shape = [5, 10] +# +# self.op_type = "reshape" +# self.inputs = {"X": np.random.random(ori_shape).astype("float32")} +# self.attrs = {"shape": new_shape} +# self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} +# +# def test_check_output(self): +# self.check_output() +# +# def test_check_grad(self): +# self.check_grad(["X"], "Out") - def test_check_grad(self): - self.check_grad(["X"], "Out") +# class TestReshapeOpDimInfer1(OpTest): +# def setUp(self): +# self.op_type = "reshape" +# self.inputs = {"X": np.random.random((5, 10)).astype("float32")} +# self.attrs = {"shape": [5, -1, 5]} +# self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} +# +# def test_check_output(self): +# self.check_output() +# +# def test_check_grad(self): +# self.check_grad(["X"], "Out") -class TestReshapeOpDimInfer(OpTest): +class TestReshapeOp2(OpTest): def setUp(self): + ori_shape = (2, 25) + new_shape = ([5, 10], ) + self.op_type = "reshape" - self.inputs = {'X': np.random.random((10, 20)).astype("float32")} - self.attrs = {'shape': [4, -1, 5]} - self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} + self.inputs = { + "X": np.random.random(ori_shape).astype("float32"), + "Shape": np.array(new_shape) + } + self.outputs = {"Out": self.inputs["X"].reshape(new_shape[0])} def test_check_output(self): self.check_output() @@ -45,5 +67,5 @@ class TestReshapeOpDimInfer(OpTest): self.check_grad(["X"], "Out") -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_target_assign_op.py b/python/paddle/fluid/tests/unittests/test_target_assign_op.py old mode 100755 new mode 100644 From 1d4dfc096666fd2c482969a44b188faa4362f064 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Mon, 12 Mar 2018 10:28:22 +0800 Subject: [PATCH 004/314] fix bugs. --- paddle/fluid/operators/reshape_op.cc | 39 ++++++++++++++----- paddle/fluid/operators/reshape_op.h | 14 ++++--- .../fluid/tests/unittests/test_reshape_op.py | 33 +++++++++++++++- 3 files changed, 69 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index c47df73405..2ad49437a9 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -32,7 +32,6 @@ class ReshapeOp : public framework::OperatorWithKernel { "Output(Out) of ReshapeOp should not be null."); const std::vector &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE_EQ(shape.empty(), ctx->HasInput("Shape"), "The shape information can only be set by Attr(shape) or " "by Input(Shape). Attr(shape) and Input(Shape) cannot be " @@ -41,27 +40,29 @@ class ReshapeOp : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); if (ctx->HasInput("Shape")) { + // The shape information in given by Input(Shape). auto shape_dims = ctx->GetInputDim("Shape"); PADDLE_ENFORCE(shape_dims.size() == 2UL && shape_dims[0] == 1UL, "The Input(Label) should be a 2-D tensor with the 1st " "dimensions fixed to 1 (a row vector)."); - // The actual output shape will be set at runtime, here temporially the + // The actual output shape will be set at runtime, here temporially set // the shape of output the same as the shape of input. ctx->SetOutputDim("Out", x_dims); } else { + // The shape information in given by Attr(shape). std::vector output_shape; ValidateShape(shape, framework::product(x_dims), output_shape); auto out_dims = framework::make_ddim(output_shape); ctx->SetOutputDim("Out", out_dims); - } - if (shape[0] == x_dims[0]) { - // Only pass LoD when the first dimension of output and input are the - // same. - ctx->ShareLoD("X", /*->*/ "Out"); + if (shape[0] == x_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", /*->*/ "Out"); + } } } @@ -94,6 +95,14 @@ class ReshapeOp : public framework::OperatorWithKernel { [](int a) { return static_cast(a); }); if (neg_dims_idx.size()) output_shape[neg_dims_idx[0]] = inferred_dim; } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { @@ -101,11 +110,13 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of reshape operator."); - AddInput("Shape", "a 1-D tensor that provides the shape information.") + AddInput( + "Shape", + "Tensor, a 1-D tensor that provides the shape information.") .AsDispensable(); AddOutput("Out", "The output tensor of reshape operator."); - AddAttr>("shape", - "(vector) Target shape of reshape operator.") + AddAttr>( + "shape", "(std::vector) Target shape of reshape operator.") .SetDefault(std::vector()); AddComment(R"DOC( Reshape Operator. @@ -139,6 +150,14 @@ class ReshapeGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) shouldn't be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index fc0885c149..0c97dc639f 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -33,9 +33,6 @@ class ReshapeKernel : public framework::OpKernel { std::vector output_shape; ValidateShape(*shape, framework::product(in->dims()), output_shape); - for (auto d : output_shape) std::cout << d << " "; - std::cout << std::endl; - out_dims = framework::make_ddim(output_shape); } else { out_dims = out->dims(); @@ -85,11 +82,18 @@ class ReshapeGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const { auto* d_out = ctx.Input(framework::GradVarName("Out")); auto* d_x = ctx.Output(framework::GradVarName("X")); + d_x->mutable_data(ctx.GetPlace()); + bool inplace = ctx.Attr("inplace"); auto in_dims = d_x->dims(); - framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); - d_x->Resize(in_dims); + if (!inplace) { + framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); + d_x->Resize(in_dims); + } else { + d_x->ShareDataWith(*d_out); + d_x->Resize(in_dims); + } } }; } // namespace operators diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index ae1cca0c3e..dc96aed8db 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -33,7 +33,8 @@ from op_test import OpTest # # def test_check_grad(self): # self.check_grad(["X"], "Out") - +# +# # class TestReshapeOpDimInfer1(OpTest): # def setUp(self): # self.op_type = "reshape" @@ -56,7 +57,8 @@ class TestReshapeOp2(OpTest): self.op_type = "reshape" self.inputs = { "X": np.random.random(ori_shape).astype("float32"), - "Shape": np.array(new_shape) + "Shape": np.array( + new_shape, dtype="int64") } self.outputs = {"Out": self.inputs["X"].reshape(new_shape[0])} @@ -67,5 +69,32 @@ class TestReshapeOp2(OpTest): self.check_grad(["X"], "Out") +# class TestReshapeOpInplace(OpTest): +# def setUp(self): +# self.op_type = "reshape" +# self.inputs = {'X': np.random.random((10, 20)).astype("float32")} +# self.attrs = {'shape': [10 * 20], 'inplace': True} +# self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} +# +# def test_check_output(self): +# self.check_output() +# +# def test_check_grad(self): +# self.check_grad(["X"], "Out") +# +# +# class TestReshapeOpDimInferInplace(OpTest): +# def setUp(self): +# self.op_type = "reshape" +# self.inputs = {'X': np.random.random((10, 20)).astype("float32")} +# self.attrs = {'shape': [4, -1, 5], 'inplace': True} +# self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} +# +# def test_check_output(self): +# self.check_output() +# +# def test_check_grad(self): +# self.check_grad(["X"], "Out") + if __name__ == "__main__": unittest.main() From cf081851453a42bb6c7ea707b4f998e208d0e2a1 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Mon, 12 Mar 2018 13:05:47 +0800 Subject: [PATCH 005/314] fix bugs and complete codes. --- paddle/fluid/operators/reshape_op.cc | 94 +++++------ paddle/fluid/operators/reshape_op.h | 61 +++---- python/paddle/fluid/layers/detection.py | 17 +- python/paddle/fluid/layers/nn.py | 56 +++++++ python/paddle/fluid/layers/ops.py | 1 - .../fluid/tests/unittests/test_reshape_op.py | 158 ++++++++++-------- 6 files changed, 220 insertions(+), 167 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index b094e649c3..c0d08cc690 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -25,39 +25,28 @@ class ReshapeOp : public framework::OperatorWithKernel { : OperatorWithKernel(type, inputs, outputs, attrs) {} void InferShape(framework::InferShapeContext *ctx) const override { - // input check PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of ReshapeOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of ReshapeOp should not be null."); const std::vector &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE_EQ(shape.empty(), ctx->HasInput("Shape"), - "The shape information can only be set by Attr(shape) or " - "by Input(Shape). Attr(shape) and Input(Shape) cannot be " - "set at the same time."); + PADDLE_ENFORCE(!shape.empty(), + "The shape information must be set by Attr(shape)."); + std::vector output_shape; auto x_dims = ctx->GetInputDim("X"); + bool need_copy_dim = ValidateShape(shape, x_dims, output_shape); - if (ctx->HasInput("Shape")) { - // The shape information in given by Input(Shape). - auto shape_dims = ctx->GetInputDim("Shape"); - - PADDLE_ENFORCE(shape_dims.size() == 2UL && shape_dims[0] == 1UL, - "The Input(Label) should be a 2-D tensor with the 1st " - "dimensions fixed to 1 (a row vector)."); - - // The actual output shape will be set at runtime, here temporially set - // the shape of output the same as the shape of input. + if (need_copy_dim) { + // Some dimensions can only be determined during runtime. Here temporarily + // set output tensor's shape the same as that of the input tensor. ctx->SetOutputDim("Out", x_dims); } else { - // The shape information in given by Attr(shape). - std::vector output_shape; - ValidateShape(shape, framework::product(x_dims), output_shape); - - auto out_dims = framework::make_ddim(output_shape); - ctx->SetOutputDim("Out", out_dims); + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + // FIXME(caoying): When shape of the output tensor is determined during + // runtime, LoD information of X will not passed to the output. if (shape[0] == x_dims[0]) { // Only pass LoD when the first dimension of output and Input(X) // are the same. @@ -67,41 +56,51 @@ class ReshapeOp : public framework::OperatorWithKernel { } private: - void ValidateShape(const std::vector &shape, const int64_t in_size, + bool ValidateShape(const std::vector &shape, + const framework::DDim &input_dim, std::vector &output_shape) const { - std::vector neg_dims_idx; - const int unknown_index = -1; // only one dimension canbe set to -1, whose - // size will be automatically infered. + // only one dimension canbe set to -1, whose size will be automatically + // infered. + const int64_t unknown_index = -1; + const auto in_size = framework::product(input_dim); + const auto x_rank = input_dim.size(); + bool need_dim_copy = false; + std::vector neg_dims_idx; for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE(shape[i] > 1 || shape[i] == unknown_index, + PADDLE_ENFORCE(shape[i] >= 0 || shape[i] == unknown_index, "Each input dimension of Attr(shape) must be positive, or " "only one input dimension can be -1."); - if (shape[i] == unknown_index) neg_dims_idx.push_back(i); + if (shape[i] == unknown_index) { + neg_dims_idx.push_back(i); + } else if (shape[i] == 0) { + PADDLE_ENFORCE_LT( + i, x_rank, + "Only dimension less than rank of Input(X) can be set to 0."); + need_dim_copy = true; + } } PADDLE_ENFORCE_LE( neg_dims_idx.size(), 1, "Only one input dimension of Attr(shape) may be unknown."); + output_shape.resize(shape.size(), 0); + std::transform(shape.begin(), shape.end(), output_shape.begin(), + [](int a) { return static_cast(a); }); + + // some dimension can only be determinted during runtime. + if (need_dim_copy) return need_dim_copy; + int64_t inferred_dim = 0; if (neg_dims_idx.size()) { int64_t capacity = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); inferred_dim = in_size / (-capacity); + PADDLE_ENFORCE_EQ(inferred_dim * (-capacity), in_size, + "Invalid shape is given."); + output_shape[neg_dims_idx[0]] = inferred_dim; } - - output_shape.resize(shape.size(), 0); - std::transform(shape.begin(), shape.end(), output_shape.begin(), - [](int a) { return static_cast(a); }); - if (neg_dims_idx.size()) output_shape[neg_dims_idx[0]] = inferred_dim; - } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return false; } }; @@ -110,14 +109,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The input tensor of reshape operator."); - AddInput( - "Shape", - "Tensor, a 1-D tensor that provides the shape information.") - .AsDispensable(); AddOutput("Out", "The output tensor of reshape operator."); AddAttr>( - "shape", "(std::vector) Target shape of reshape operator.") - .SetDefault(std::vector()); + "shape", "(std::vector) Target shape of reshape operator."); AddAttr("inplace", "Change the source tensor's shape without copy memory.") .SetDefault(true); @@ -153,14 +147,6 @@ class ReshapeGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) shouldn't be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } - - protected: - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); - } }; } // namespace operators diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 23fbf1655c..9dbc5cec6b 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -27,17 +27,8 @@ class ReshapeKernel : public framework::OpKernel { auto* out = ctx.Output("Out"); auto* in = ctx.Input("X"); - auto* shape = ctx.Input("Shape"); - framework::DDim out_dims; - if (shape) { - std::vector output_shape; - ValidateShape(*shape, framework::product(in->dims()), output_shape); - - out_dims = framework::make_ddim(output_shape); - } else { - out_dims = out->dims(); - } - + auto out_dims = + ValidateShape(ctx.Attr>("shape"), in->dims()); bool inplace = ctx.Attr("inplace"); if (!inplace) { out->mutable_data(ctx.GetPlace()); @@ -50,35 +41,31 @@ class ReshapeKernel : public framework::OpKernel { } private: - void ValidateShape(const framework::Tensor& shape, const int64_t in_size, - std::vector& output_shape) const { - std::vector neg_dims_idx; - const int unknown_index = -1; // only one dimension canbe set to -1, whose - // size will be automatically infered. - - const int64_t dimension = shape.dims()[1]; - std::cout << "dimension =" << dimension << std::endl; - const T* shape_data = shape.data(); - - for (int64_t i = 0; i < dimension; ++i) { - PADDLE_ENFORCE(shape_data[i] > 1 || shape_data[i] == unknown_index, - "Each input dimension of Attr(shape) must be positive, or " - "only one input dimension can be -1."); - if (shape_data[i] == unknown_index) neg_dims_idx.push_back(i); - } - PADDLE_ENFORCE_LE( - neg_dims_idx.size(), 1, - "Only one input dimension of Attr(shape) can be unknown."); - + framework::DDim ValidateShape(const std::vector shape_attr, + const framework::DDim& in_dims) const { + const int64_t in_size = framework::product(in_dims); + // only one dimension canbe set to -1, whose size will be automatically + // infered. + const int64_t unknown_index = -1; + + std::vector output_shape(shape_attr.size(), 0); int64_t capacity = 1; - output_shape.resize(dimension, 0); - for (int64_t i = 0; i < dimension; ++i) { - capacity *= shape_data[i]; - output_shape[i] = static_cast(shape_data[i]); + int neg_dim_idx = -1; + for (size_t i = 0; i < shape_attr.size(); ++i) { + if (shape_attr[i] == unknown_index) neg_dim_idx = i; + capacity *= (shape_attr[i] ? shape_attr[i] : in_dims[i]); + output_shape[i] = + (shape_attr[i] ? static_cast(shape_attr[i]) : in_dims[i]); } - if (neg_dims_idx.size()) - output_shape[neg_dims_idx[0]] = in_size / (-capacity); + if (neg_dim_idx != -1) { + output_shape[neg_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ(output_shape[neg_dim_idx] * capacity, -in_size, + "Invalid shape is given."); + } else { + PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); + } + return framework::make_ddim(output_shape); } }; diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 2bf7cf21ca..d326c5651f 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -19,7 +19,6 @@ from layer_function_generator import generate_layer_fn from layer_function_generator import autodoc from ..layer_helper import LayerHelper import tensor -import ops import nn import math @@ -58,7 +57,7 @@ def detection_output(loc, This operation is to get the detection results by performing following two steps: - + 1. Decode input bounding box predictions according to the prior boxes. 2. Get the final detection results by applying multi-class non maximum suppression (NMS). @@ -458,7 +457,7 @@ def ssd_loss(location, num, num_prior, num_class = confidence.shape def __reshape_to_2d(var): - return ops.reshape(x=var, shape=[-1, var.shape[-1]]) + return nn.reshape(x=var, shape=[-1, var.shape[-1]]) # 1. Find matched boundding box by prior box. # 1.1 Compute IOU similarity between ground-truth boxes and prior boxes. @@ -469,7 +468,7 @@ def ssd_loss(location, # 2. Compute confidence for mining hard examples # 2.1. Get the target label based on matched indices - gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, )) + gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, )) target_label, _ = target_assign( gt_label, matched_indices, mismatch_value=background_label) # 2.2. Compute confidence loss. @@ -480,7 +479,7 @@ def ssd_loss(location, conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) # 3. Mining hard examples - conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior)) + conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior)) neg_indices = helper.create_tmp_variable(dtype='int32') dtype = matched_indices.dtype updated_matched_indices = helper.create_tmp_variable(dtype=dtype) @@ -548,7 +547,7 @@ def ssd_loss(location, # 5.3 Compute overall weighted loss. loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss # reshape to [N, Np], N is the batch size and Np is the prior box number. - loss = ops.reshape(x=loss, shape=[-1, num_prior]) + loss = nn.reshape(x=loss, shape=[-1, num_prior]) loss = nn.reduce_sum(loss, dim=1, keep_dim=True) if normalize: normalizer = nn.reduce_sum(target_loc_weight) @@ -696,7 +695,7 @@ def multi_box_head(inputs, new_shape = [ -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)]) ] - out = ops.reshape(x=input, shape=new_shape) + out = nn.reshape(x=input, shape=new_shape) return out def _is_list_or_tuple_(data): @@ -793,7 +792,7 @@ def multi_box_head(inputs, mbox_loc.shape[0], mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4 ] - mbox_loc_flatten = ops.reshape(mbox_loc, shape=new_shape) + mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape) mbox_locs.append(mbox_loc_flatten) # get conf_loc @@ -809,7 +808,7 @@ def multi_box_head(inputs, conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] * conf_loc.shape[3] / num_classes, num_classes ] - conf_loc_flatten = ops.reshape(conf_loc, shape=new_shape) + conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape) mbox_confs.append(conf_loc_flatten) if len(box_results) == 1: diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 10b0405f47..67a6fd8084 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -70,6 +70,7 @@ __all__ = [ 'smooth_l1', 'one_hot', 'autoincreased_step_counter', + 'reshape', ] @@ -3184,6 +3185,8 @@ def one_hot(input, depth): The one-hot tensor or LodTensor, same as input. Examples: + .. code-block:: python + X is a LoDTensor: X.lod = [[0, 1, 4]] X.shape = [4, 1] @@ -3236,3 +3239,56 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): counter.stop_gradient = True return counter + + +def reshape(x, shape, act=None, inplace=True, name=None): + """ + Gives a new shape to Tensor without changing its data. + This layer takes a tensor as input and the attribute shape specifying the + new shape. The shape attribute must be specified. At most one dimension of + the new shape can be -1. In this case, the value is inferred from the size + of the tensor and the remaining dimensions. A dimension could also be 0, + in which case the actual dimension value is going to be copied from the + input tensor. + + Args: + input(variable): The input tensor. + shape(list): The new shape. At most one dimension of the new shape can + be -1. + act (str): The non-linear activation to be applied to output variable. + inplace(bool): If this flag is set true, a new output tensor is created + whose data is copied from input x, otherwise the output + shares data with input without copying. + + Returns(variable): The output tensor. + + Examples: + .. code-block:: python + + Given a 2-D tensor X with shape [2 x 2], and the new shape: [1, 4]. + The reshape layer will change tensor X into a 2-D tensor with + shape [1 x 4] with its data unchanged. + + Given a 3-D tensor x with shape [2, 3, 4] and the new shape: [3, -1]. + The reshape layer will change tensor X into a 2-D tensor with shape: + [3 x 8] with its data unchanged. + + Given a 3-D tensor x with shape [2, 3, 8] and the new shape: + [-1, 0, 2, 2]. The reshape layer will change tensor X into a 4-D tensor + with shape [4, 3, 2, 2] with its data unchanged. + + """ + + if not (isinstance(shape, list) or isinstance(shape, tuple)): + raise ValueError("Input shape must be a python lsit or tuple.") + + helper = LayerHelper("reshape", **locals()) + reshaped = helper.create_tmp_variable(dtype=x.dtype) + helper.append_op( + type="reshape", + inputs={"X": x}, + attrs={"shape": shape, + "inplace": inplace}, + outputs={"Out": reshaped}) + + return helper.append_activation(reshaped) diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 0b88b63962..20dd1b4752 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -47,7 +47,6 @@ __activations__ = [ __all__ = [ 'mean', 'mul', - 'reshape', 'scale', 'sigmoid_cross_entropy_with_logits', 'elementwise_add', diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index dc96aed8db..1a54427ab5 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -14,53 +14,88 @@ import unittest import numpy as np -import pdb from op_test import OpTest -# class TestReshapeOp1(OpTest): -# def setUp(self): -# ori_shape = (2, 25) -# new_shape = [5, 10] -# -# self.op_type = "reshape" -# self.inputs = {"X": np.random.random(ori_shape).astype("float32")} -# self.attrs = {"shape": new_shape} -# self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} -# -# def test_check_output(self): -# self.check_output() -# -# def test_check_grad(self): -# self.check_grad(["X"], "Out") -# -# -# class TestReshapeOpDimInfer1(OpTest): -# def setUp(self): -# self.op_type = "reshape" -# self.inputs = {"X": np.random.random((5, 10)).astype("float32")} -# self.attrs = {"shape": [5, -1, 5]} -# self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} -# -# def test_check_output(self): -# self.check_output() -# -# def test_check_grad(self): -# self.check_grad(["X"], "Out") - - -class TestReshapeOp2(OpTest): + +class TestReshapeOp(OpTest): + def setUp(self): + ori_shape = (2, 25) + new_shape = (5, 10) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpDimInfer1(OpTest): + def setUp(self): + ori_shape = (5, 10) + new_shape = (5, -1, 5) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(self.attrs["shape"])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpDimInfer2(OpTest): + def setUp(self): + ori_shape = (2, 2, 6) + new_shape = (2, 0, 3, -1) + infered_shape = (2, 2, 3, -1) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape, "inplace": False} + self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpInplace(OpTest): def setUp(self): ori_shape = (2, 25) - new_shape = ([5, 10], ) + new_shape = (5, 10) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + + +class TestReshapeOpDimInferInplace1(OpTest): + def setUp(self): + ori_shape = (5, 10) + new_shape = (5, -1, 5) self.op_type = "reshape" - self.inputs = { - "X": np.random.random(ori_shape).astype("float32"), - "Shape": np.array( - new_shape, dtype="int64") - } - self.outputs = {"Out": self.inputs["X"].reshape(new_shape[0])} + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(new_shape)} def test_check_output(self): self.check_output() @@ -69,32 +104,23 @@ class TestReshapeOp2(OpTest): self.check_grad(["X"], "Out") -# class TestReshapeOpInplace(OpTest): -# def setUp(self): -# self.op_type = "reshape" -# self.inputs = {'X': np.random.random((10, 20)).astype("float32")} -# self.attrs = {'shape': [10 * 20], 'inplace': True} -# self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} -# -# def test_check_output(self): -# self.check_output() -# -# def test_check_grad(self): -# self.check_grad(["X"], "Out") -# -# -# class TestReshapeOpDimInferInplace(OpTest): -# def setUp(self): -# self.op_type = "reshape" -# self.inputs = {'X': np.random.random((10, 20)).astype("float32")} -# self.attrs = {'shape': [4, -1, 5], 'inplace': True} -# self.outputs = {'Out': self.inputs['X'].reshape(self.attrs['shape'])} -# -# def test_check_output(self): -# self.check_output() -# -# def test_check_grad(self): -# self.check_grad(["X"], "Out") +class TestReshapeOpDimInferInplace2(OpTest): + def setUp(self): + ori_shape = (2, 2, 6) + new_shape = (2, 0, 3, -1) + infered_shape = (2, 2, 3, -1) + + self.op_type = "reshape" + self.inputs = {"X": np.random.random(ori_shape).astype("float32")} + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(infered_shape)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Out") + if __name__ == "__main__": unittest.main() From e42b8f8a11c344173c6d276fbdfdef1f13c17d19 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 13 Mar 2018 16:03:26 +0800 Subject: [PATCH 006/314] fix mklml install path --- cmake/external/mklml.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake index f24cb2d11b..df3f0c7f0c 100644 --- a/cmake/external/mklml.cmake +++ b/cmake/external/mklml.cmake @@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt "PROJECT(MKLML)\n" "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY ${MKLML_VER}\n" + "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n" " DESTINATION ${MKLML_DST_DIR})\n") ExternalProject_Add( From 0621c327f1d0dd272ab7248c50e9afa8ae0fc0c0 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 13 Mar 2018 23:52:35 +0000 Subject: [PATCH 007/314] init commit --- doc/design/parallel_executor.md | 52 ++++++++++++++++++ paddle/fluid/framework/CMakeLists.txt | 2 + paddle/fluid/framework/executor.cc | 13 +++++ paddle/fluid/framework/executor.h | 1 + paddle/fluid/framework/parallel_executor.cc | 19 +++++++ paddle/fluid/framework/parallel_executor.h | 61 +++++++++++++++++++++ 6 files changed, 148 insertions(+) create mode 100644 doc/design/parallel_executor.md create mode 100644 paddle/fluid/framework/parallel_executor.cc create mode 100644 paddle/fluid/framework/parallel_executor.h diff --git a/doc/design/parallel_executor.md b/doc/design/parallel_executor.md new file mode 100644 index 0000000000..567eede1bd --- /dev/null +++ b/doc/design/parallel_executor.md @@ -0,0 +1,52 @@ +# ParallelExecutor Design Doc + +## Introduction + +We introduce `ParallelExecutor` to run multi-GPU training in PaddlePaddle Fluid. It supports +1. keeping a copy of the parameters on each GPU +1. allreduce on a separate stream allowing computation and communication overlap + +An example of switching single GPU training to multiple GPUs: +```python +cost = your_neural_network() +opt = fluid.optimizer.SGDOptimizer() +opt.minimize(avg_cost) + +# change Executor -> ParallelExecutor +exe = fluid.ParallelExecutor(gpu_list=[0, 1]) + +for iter in xranges(iter_num): + exe.run() +``` + +## Design + +In the constructor, a list of parameter, whose gradients need to be allreduced, is given. + +During the runtime, `ParallelExecutor` starts `#gpu` threads to run each `Executor`. For every +operator run on each GPU, it will automatically sync with different streams when necessary. + +```c++ +// if op's input is params' grad: + // sync with allreduce stream + // e.g. sgd should wait for allreduce to be finished +SyncMultipleStreams(op); + +op->Run(*local_scope, place_); + +// if op's output is params' grad: +// sync with computation stream +// e.g. allreduce shoudl wait for fc_grad to be finished. +SyncMultipleStreams(op); +``` + + +## API + +The `ParallelExecutor.run` has similar interface as `Executor.run`. Besides +1. Scope: we don't expose `scope` in `ParallelExecutor.run` since `ParallelExecutor` has its +own scope to maintain NCCL. +1. Feed: we don't expose `feed` in the API either, because the whole point of implementing +parallel_executor is the speed. The input for NN should be implemented in an reader OP. +1. Fetch: we return the fetched value on all GPUs as a list. (e.g. `exe.run(..., fetch=loss)` +with return `[loss_on_gpu0, loss_on_gpu1]`) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 15e5574ecf..934bb43ffe 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -86,6 +86,8 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope + framework_proto backward glog lod_rank_table feed_fetch_method executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 5cae38b2a8..6ee3f18dd4 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -305,10 +305,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } // if (create_vars) for (auto& op : ctx->ops_) { + // TODO(ty): + // e.g. sgd should wait for allreduce to be finished + // if op's input is params' grad: + // sync with allreduce stream + // SyncMultipleStreams(op); + VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); + // TODO(ty): + // e.g. allreduce shoudl wait for fc_grad to be finished. + // if op's output is params' grad: + // sync with computation stream + // apply allreduce on allreduce stream + // SyncMultipleStreams(op); + if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " << memory::memory_usage(place_); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 28ce331515..8d8a7cf4db 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -47,6 +47,7 @@ class Executor { const std::string& feed_holder_name = "feed", const std::string& fetch_holder_name = "fetch"); + private: static ExecutorPrepareContext* Prepare(const ProgramDesc& program, int block_id); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc new file mode 100644 index 0000000000..e9f213ae2c --- /dev/null +++ b/paddle/fluid/framework/parallel_executor.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/parallel_executor.h" + +namespace paddle { +namespace framework {} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h new file mode 100644 index 0000000000..47e0005e58 --- /dev/null +++ b/paddle/fluid/framework/parallel_executor.h @@ -0,0 +1,61 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" + +#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { + +struct AllReduceCallBack { + void operator()(framework::OperatorBase* op); + + std::unordered_set param_grad_names_; + platform::DeviceContext dev_ctx; +}; + +class ParallelExecutor { + explicit ParallelExecutor(const std::vector& places, + const std::unordered_set& params); + + /* @Brief + * Runtime evaluation of the given ProgramDesc under certain Scope + * + * @param + * ProgramDesc + * Scope + */ + void Run(const ProgramDesc& prog, Scope* scope, int block_id, + bool create_local_scope = true, bool create_vars = true); + + private: + std::vector exes_; + std::vector scopes_; + AllReduceCallBack all_reduce_callbacks_; + std::unordered_set params_; // where to initilize it? + platform::Communicator nccl_com_; +}; + +} // namespace framework +} // namespace paddle From e67325cdaf8ce85342dab45b06dbc286c77a5555 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 14 Mar 2018 00:11:32 +0000 Subject: [PATCH 008/314] update readme --- doc/design/parallel_executor.md | 42 +++++++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/doc/design/parallel_executor.md b/doc/design/parallel_executor.md index 567eede1bd..78ef74f159 100644 --- a/doc/design/parallel_executor.md +++ b/doc/design/parallel_executor.md @@ -30,23 +30,45 @@ operator run on each GPU, it will automatically sync with different streams when // if op's input is params' grad: // sync with allreduce stream // e.g. sgd should wait for allreduce to be finished -SyncMultipleStreams(op); +CallBack->BeforeOp(op); op->Run(*local_scope, place_); // if op's output is params' grad: // sync with computation stream // e.g. allreduce shoudl wait for fc_grad to be finished. -SyncMultipleStreams(op); +CallBack->AfterOp(op); ``` +And the `Callback` object can be implemented as the following -## API +```c++ +struct AllReduceCallBack { + void BeforeOp(framework::OperatorBase* op); + void AfterOp(framework::OperatorBase* op); + + std::unordered_set reduced_param_grad_names; + std::unordered_set param_grad_names_; + + platform::DeviceContext* computation_dev_ctx; // computation device context + platform::DeviceContext* communication_dev_ctx; // communication device context -The `ParallelExecutor.run` has similar interface as `Executor.run`. Besides -1. Scope: we don't expose `scope` in `ParallelExecutor.run` since `ParallelExecutor` has its -own scope to maintain NCCL. -1. Feed: we don't expose `feed` in the API either, because the whole point of implementing -parallel_executor is the speed. The input for NN should be implemented in an reader OP. -1. Fetch: we return the fetched value on all GPUs as a list. (e.g. `exe.run(..., fetch=loss)` -with return `[loss_on_gpu0, loss_on_gpu1]`) + framework::Scope* scope; + platform::NCCL::Communicator* nccl_com; +}; + +AllReduceCallBack::BeforeOp(framework::OperatorBase* op) { + if (op->Input() in reduced_param_grad_names) { + communication_dev_ctx->Wait(); + reduced_param_grad_names.erase(op->Input()) + } +} + +AllReduceCallBack::AfterOp(framework::OperatorBase* op) { + if (op->Output() in param_grad_names) { + computation_dev_ctx->Wait(); + reduced_param_grad_names.insert(op->Output()); + ncclAllreduce(scope, op->Output(), communication_dev_ctx); + } +} +``` From 8f061e43b71b398d37aebc3576e2c2f21d5fae73 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Wed, 14 Mar 2018 00:16:11 +0000 Subject: [PATCH 009/314] delete param name --- paddle/fluid/framework/parallel_executor.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 47e0005e58..f67b926694 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -52,8 +52,7 @@ class ParallelExecutor { private: std::vector exes_; std::vector scopes_; - AllReduceCallBack all_reduce_callbacks_; - std::unordered_set params_; // where to initilize it? + std::vector all_reduce_callbacks_; platform::Communicator nccl_com_; }; From baef1124fb4cc8876a0119af34ca1500df682f9d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 14 Mar 2018 21:13:29 +0800 Subject: [PATCH 010/314] ParallelExecutor And dependency engine --- paddle/fluid/framework/parallel_executor.cc | 338 +++++++++++++++++- paddle/fluid/framework/parallel_executor.h | 45 +-- paddle/fluid/platform/place.h | 11 + paddle/fluid/pybind/CMakeLists.txt | 1 + paddle/fluid/pybind/pybind.cc | 14 + .../tests/unittests/test_parallel_executor.py | 47 +++ 6 files changed, 433 insertions(+), 23 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_executor.py diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e9f213ae2c..7488458743 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,7 +13,343 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" +#include "lod_tensor.h" +#include "op_registry.h" namespace paddle { -namespace framework {} // namespace framework +namespace framework { + +struct OpHandle; + +struct VarHandle { + size_t version_; + std::string name_; + platform::Place place_; + + OpHandle *generated_op_; + std::vector deps_ops_; +}; + +struct OpHandle { + std::vector inputs_; + std::vector outputs_; + platform::DeviceContext *dev_ctx_; + + std::string DebugString() { + std::stringstream ss; + ss << "("; + for (auto *var : inputs_) { + ss << var->name_ << ":" << var->place_ << ", "; + } + ss << ") --> ("; + for (auto *var : outputs_) { + ss << var->name_ << ":" << var->place_ << ", "; + } + ss << ")\n"; + return ss.str(); + } + + virtual ~OpHandle() {} +}; + +struct ComputationOpHandle : public OpHandle { + std::unique_ptr op_; + + explicit ComputationOpHandle(const OpDesc &op_desc) + : op_(framework::OpRegistry::CreateOp(op_desc)) {} +}; + +struct ScaleLossGradOpHandle : public OpHandle {}; + +struct NCCLAllReduceOpHandle : public OpHandle {}; + +class ParallelExecutorPrivate { + public: + std::unordered_map + local_scopes_; + std::unordered_map + dev_ctxs_; + platform::Place main_place_; + + std::unordered_map>, + platform::PlaceHash> + vars_; + std::vector> ops_; +}; + +// TODO(yy): Move this function somewhere +ncclDataType_t ToNCCLDataType(std::type_index type) { + // FIXME!! + return ncclFloat; +} + +ParallelExecutor::ParallelExecutor( + const std::vector &places, + const std::unordered_set ¶ms, + const ProgramDesc &startup_program, const ProgramDesc &main_program, + const std::string &loss_var_name, Scope *scope) + : member_(new ParallelExecutorPrivate()) { + // Step 1. RunStartupProgram and Bcast the params to devs. + Executor exe(places[0]); + exe.Run(startup_program, scope, 0); + // Create local scopes + for (auto &place : places) { + member_->local_scopes_[place] = &scope->NewScope(); + } + member_->main_place_ = places[0]; + + // Bcast Parameters to all GPUs + if (platform::is_gpu_place(member_->main_place_)) { // Is CUDA + // BCastParamsToGPUs(startup_program); + } + // Startup Program has been run. All local scopes has correct parameters. + + // Step 2. Convert main_program to SSA form and dependency graph. Also, insert + // ncclOp + ConstructDependencyGraph(params, main_program, loss_var_name); +} + +void ParallelExecutor::ConstructDependencyGraph( + const std::unordered_set ¶ms, + const ProgramDesc &main_program, const std::string &loss_var_name) const { + std::unordered_set grads; + for (auto &each_param : params) { + grads.insert(each_param + "@GRAD"); + } + + bool is_forwarding = true; + for (auto *op : main_program.Block(0).AllOps()) { + bool change_forward = false; + + if (!is_forwarding) { + // FIXME(yy): Do not hard code like this + if (op->OutputArgumentNames().size() == 1 && + op->OutputArgumentNames()[0] == loss_var_name + "@GRAD") { + continue; // Drop fill 1. for backward coeff; + } + } + + for (auto &pair : member_->local_scopes_) { + member_->ops_.emplace_back(new ComputationOpHandle(*op)); + auto *op_handle = member_->ops_.back().get(); + + auto var_names = op->InputArgumentNames(); + + for (auto &each_var_name : var_names) { + auto &place = pair.first; + VarHandle *var = GetVarHandle(each_var_name, place); + op_handle->inputs_.emplace_back(var); + var->deps_ops_.emplace_back(op_handle); + } + var_names = op->OutputArgumentNames(); + + for (auto &each_var_name : var_names) { + auto &place = pair.first; + GenerateVar(op_handle, each_var_name, place); + } + + if (is_forwarding) { + if (var_names.size() == 1 && var_names[0] == loss_var_name) { + // Insert ScaleCost OpHandle + member_->ops_.emplace_back(new ScaleLossGradOpHandle()); + + op_handle = member_->ops_.back().get(); + auto &place = pair.first; + VarHandle *loss = GetVarHandle(loss_var_name, place); + loss->deps_ops_.emplace_back(op_handle); + op_handle->inputs_.emplace_back(loss); + GenerateVar(op_handle, loss_var_name + "@GRAD", place); + change_forward = true; + LOG(INFO) << "Scale Loss " << op_handle->DebugString(); + } + } + } + + if (change_forward) { + is_forwarding = false; + } + + if (!is_forwarding) { + auto var_names = op->OutputArgumentNames(); + for (auto &og : var_names) { + if (grads.count(og) != 0) { // is param grad + // Insert NCCL AllReduce Op + member_->ops_.emplace_back(new NCCLAllReduceOpHandle()); + auto *op_handle = member_->ops_.back().get(); + + for (auto &pair : member_->local_scopes_) { + auto &place = pair.first; + auto &vars = member_->vars_[place][og]; + + if (vars.empty()) { // This device has no data. continue. + continue; + } + auto *prev_grad = &vars[vars.size() - 1]; + op_handle->inputs_.emplace_back(prev_grad); + prev_grad->deps_ops_.emplace_back(op_handle); + auto &var = vars[vars.size()]; + var.place_ = place; + var.generated_op_ = op_handle; + var.name_ = og; + var.version_ = vars.size() - 1; + op_handle->outputs_.emplace_back(&var); + } + } + } + } + } +} + +void ParallelExecutor::GenerateVar(OpHandle *op_handle, + const std::string &each_var_name, + const platform::Place &place) const { + auto &vars = member_->vars_[place][each_var_name]; + size_t version = vars.size(); + auto &var = vars[version]; + var.version_ = version; + var.generated_op_ = op_handle; + var.name_ = each_var_name; + var.place_ = place; + op_handle->outputs_.emplace_back(&var); +} + +VarHandle *ParallelExecutor::GetVarHandle(const std::string &each_var_name, + const platform::Place &place) const { + auto &var_holders = member_->vars_[place]; + auto &var_holder = var_holders[each_var_name]; + VarHandle *var = nullptr; + if (var_holder.empty()) { + auto &init_var = var_holder[0]; + init_var.place_ = place; + init_var.name_ = each_var_name; + init_var.generated_op_ = nullptr; + init_var.version_ = 0; + var = &init_var; + } else { + var = &var_holder.rbegin()->second; + } + return var; +} + +void ParallelExecutor::BCastParamsToGPUs( + const ProgramDesc &startup_program) const { + auto *main_scope = member_->local_scopes_[member_->main_place_]; + for (auto *var_desc : startup_program.Block(0).AllVars()) { + if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { + auto &main_tensor = + main_scope->FindVar(var_desc->Name())->Get(); + + ncclDataType_t data_type = ToNCCLDataType(main_tensor.type()); + auto &dims = main_tensor.dims(); + size_t numel = main_tensor.numel(); + std::vector> mems; + mems.emplace_back( + const_cast(main_tensor.data()), + new platform::CUDADeviceContext( + boost::get(member_->main_place_))); + + for (auto &pair : member_->local_scopes_) { + if (pair.first == member_->main_place_) { + continue; + } + + auto local_scope = pair.second; + auto *t = local_scope->Var(var_desc->Name())->GetMutable(); + t->Resize(dims); + mems.emplace_back(t->mutable_data(pair.first, main_tensor.type()), + new platform::CUDADeviceContext( + boost::get(pair.first))); + } + + // TODO(yy): Invoke ncclBCast here. mems, numel, data_type. The mems[0] + // is the src, rests are dests. + + (void)(data_type); + (void)(numel); + + // Free Communication Ctx + for (auto &pair : mems) { + // Release Communication Ctx + + // FIXME: Store CUDA DevCtx to member. Since NCCL All Reduce will use + // this + delete pair.second; + } + } + } +} + +std::vector ParallelExecutor::Run( + const std::vector &fetch_tensors) { + // Version --> VarHandle + std::unordered_set pending_vars; + std::unordered_map pending_ops; + + for (auto &place_pair : member_->vars_) { + for (auto &name_pair : place_pair.second) { + for (auto &version_pair : name_pair.second) { + pending_vars.insert(&version_pair.second); + } + } + } + + for (auto &op : member_->ops_) { + pending_ops.insert({op.get(), op->inputs_.size()}); + } + + std::unordered_set complete_op; + + size_t num_op = pending_ops.size(); + + while (complete_op.size() != num_op) { + std::vector to_remove; + for (auto &var : pending_vars) { + if (var->generated_op_ == nullptr || + complete_op.count(var->generated_op_) != 0) { + to_remove.push_back(var); + } + } + for (auto *var : to_remove) { + pending_vars.erase(var); + } + + std::vector to_run; + for (auto *var : to_remove) { + for (auto *op : var->deps_ops_) { + if (var->name_ == "mean_0.tmp_0@GRAD") { + LOG(INFO) << op->DebugString(); + } + auto &num = pending_ops[op]; + --num; + if (num == 0) { + to_run.emplace_back(op); + } + } + } + + for (auto *op : to_run) { + pending_ops.erase(op); + complete_op.insert(op); + } + + if (to_run.empty()) break; + + // TODO(yy): Use thead pool to run OpHandle. Operators in ToRun can be + // paralleled. We can also use another schedule method. Just a demo here. + + std::stringstream ss; + ss << "\n"; + for (auto *op : to_run) { + ss << op->DebugString() << "\n"; + } + ss << std::endl; + LOG(INFO) << ss.str(); + } + + PADDLE_ENFORCE_EQ(complete_op.size(), num_op); + return std::vector(); +} +} // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index f67b926694..ec80f89f0e 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -28,32 +28,33 @@ limitations under the License. */ namespace paddle { namespace framework { -struct AllReduceCallBack { - void operator()(framework::OperatorBase* op); - - std::unordered_set param_grad_names_; - platform::DeviceContext dev_ctx; -}; - +class ParallelExecutorPrivate; +class VarHandle; +class OpHandle; class ParallelExecutor { + public: explicit ParallelExecutor(const std::vector& places, - const std::unordered_set& params); - - /* @Brief - * Runtime evaluation of the given ProgramDesc under certain Scope - * - * @param - * ProgramDesc - * Scope - */ - void Run(const ProgramDesc& prog, Scope* scope, int block_id, - bool create_local_scope = true, bool create_vars = true); + const std::unordered_set& params, + const ProgramDesc& startup_program, + const ProgramDesc& main_program, + const std::string& loss_var_name, Scope* scope); + + std::vector Run(const std::vector& fetch_tensors); private: - std::vector exes_; - std::vector scopes_; - std::vector all_reduce_callbacks_; - platform::Communicator nccl_com_; + ParallelExecutorPrivate* member_; + + void BCastParamsToGPUs(const ProgramDesc& startup_program) const; + + VarHandle* GetVarHandle(const std::string& each_var_name, + const platform::Place& place) const; + + void GenerateVar(OpHandle* op_handle, const std::string& each_var_name, + const platform::Place& place) const; + + void ConstructDependencyGraph(const std::unordered_set& params, + const ProgramDesc& main_program, + const std::string& loss_var_name) const; }; } // namespace framework diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 501bddfc6e..633251eb47 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -65,6 +65,17 @@ bool is_cpu_place(const Place &); bool places_are_same_class(const Place &, const Place &); bool is_same_place(const Place &, const Place &); +struct PlaceHash { + std::size_t operator()(const Place &p) const { + std::hash ihash; + size_t dev_id = 0; + if (is_gpu_place(p)) { + dev_id = boost::get(p).device; + } + return ihash(dev_id << 2 | p.which()); + } +}; + std::ostream &operator<<(std::ostream &, const Place &); template diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 8942b5c943..ecf9e47884 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -2,6 +2,7 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + parallel_executor ${GLOB_OP_LIB}) if(NOT APPLE AND NOT ANDROID) target_link_libraries(paddle_pybind rt) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index d2e883cacc..8b752c4efb 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/parallel_executor.h" #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" @@ -488,6 +489,19 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("reset_profiler", platform::ResetProfiler); + py::class_(m, "ParallelExecutor") + .def( + "__init__", + [](ParallelExecutor &self, const std::vector &places, + const std::unordered_set ¶ms, + const ProgramDesc &startup_program, + const ProgramDesc &main_program, const std::string &loss_var_name, + Scope *scope) { + new (&self) ParallelExecutor(places, params, startup_program, + main_program, loss_var_name, scope); + }) + .def("run", [](ParallelExecutor &self) { self.Run({}); }); + BindRecordIOWriter(m); return m.ptr(); } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py new file mode 100644 index 0000000000..2b41b2c9b4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -0,0 +1,47 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle.fluid as fluid + + +class ParallelExecutor(unittest.TestCase): + def test_main(self): + main = fluid.Program() + startup = fluid.Program() + + with fluid.program_guard(main, startup): + reader = fluid.layers.open_recordio_file( + filename='tmp', + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + img, label = fluid.layers.read_file(reader) + hidden = fluid.layers.fc(img, size=200, act='tanh') + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + adam = fluid.optimizer.Adam() + adam.minimize(loss) + act_places = [] + for each in [fluid.CUDAPlace(0), fluid.CUDAPlace(1)]: + p = fluid.core.Place() + p.set_place(each) + act_places.append(p) + + exe = fluid.core.ParallelExecutor( + act_places, + set([p.name for p in main.global_block().iter_parameters()]), + startup.desc, main.desc, loss.name, fluid.global_scope()) + exe.run() From 692a0f7425064f5e44179be6daf49062d50ffc2a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 14 Mar 2018 21:17:42 +0800 Subject: [PATCH 011/314] Better name --- paddle/fluid/framework/parallel_executor.cc | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7488458743..46fb15f580 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -27,7 +27,8 @@ struct VarHandle { platform::Place place_; OpHandle *generated_op_; - std::vector deps_ops_; + + std::vector pending_ops_; }; struct OpHandle { @@ -141,7 +142,7 @@ void ParallelExecutor::ConstructDependencyGraph( auto &place = pair.first; VarHandle *var = GetVarHandle(each_var_name, place); op_handle->inputs_.emplace_back(var); - var->deps_ops_.emplace_back(op_handle); + var->pending_ops_.emplace_back(op_handle); } var_names = op->OutputArgumentNames(); @@ -158,7 +159,7 @@ void ParallelExecutor::ConstructDependencyGraph( op_handle = member_->ops_.back().get(); auto &place = pair.first; VarHandle *loss = GetVarHandle(loss_var_name, place); - loss->deps_ops_.emplace_back(op_handle); + loss->pending_ops_.emplace_back(op_handle); op_handle->inputs_.emplace_back(loss); GenerateVar(op_handle, loss_var_name + "@GRAD", place); change_forward = true; @@ -188,7 +189,7 @@ void ParallelExecutor::ConstructDependencyGraph( } auto *prev_grad = &vars[vars.size() - 1]; op_handle->inputs_.emplace_back(prev_grad); - prev_grad->deps_ops_.emplace_back(op_handle); + prev_grad->pending_ops_.emplace_back(op_handle); auto &var = vars[vars.size()]; var.place_ = place; var.generated_op_ = op_handle; @@ -317,7 +318,7 @@ std::vector ParallelExecutor::Run( std::vector to_run; for (auto *var : to_remove) { - for (auto *op : var->deps_ops_) { + for (auto *op : var->pending_ops_) { if (var->name_ == "mean_0.tmp_0@GRAD") { LOG(INFO) << op->DebugString(); } From ae88fdefb7deff02a83ca5fe4eb8d4b17b2173e0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 15 Mar 2018 14:51:01 +0800 Subject: [PATCH 012/314] Use thread pool --- paddle/fluid/framework/parallel_executor.cc | 77 +++++++++++---------- paddle/fluid/framework/threadpool.h | 4 +- 2 files changed, 41 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 46fb15f580..dd726f1fab 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include "lod_tensor.h" #include "op_registry.h" +#include "threadpool.h" namespace paddle { namespace framework { @@ -34,7 +35,6 @@ struct VarHandle { struct OpHandle { std::vector inputs_; std::vector outputs_; - platform::DeviceContext *dev_ctx_; std::string DebugString() { std::stringstream ss; @@ -66,6 +66,9 @@ struct NCCLAllReduceOpHandle : public OpHandle {}; class ParallelExecutorPrivate { public: + explicit ParallelExecutorPrivate(size_t num_threads = 12) + : pool_(num_threads) {} + std::unordered_map local_scopes_; std::unordered_map vars_; std::vector> ops_; + + ThreadPool pool_; }; // TODO(yy): Move this function somewhere @@ -285,13 +290,15 @@ void ParallelExecutor::BCastParamsToGPUs( std::vector ParallelExecutor::Run( const std::vector &fetch_tensors) { // Version --> VarHandle - std::unordered_set pending_vars; + + std::unordered_map pending_vars; std::unordered_map pending_ops; for (auto &place_pair : member_->vars_) { for (auto &name_pair : place_pair.second) { for (auto &version_pair : name_pair.second) { - pending_vars.insert(&version_pair.second); + pending_vars[&version_pair.second] = + version_pair.second.generated_op_ == nullptr; } } } @@ -300,56 +307,50 @@ std::vector ParallelExecutor::Run( pending_ops.insert({op.get(), op->inputs_.size()}); } - std::unordered_set complete_op; - - size_t num_op = pending_ops.size(); - - while (complete_op.size() != num_op) { - std::vector to_remove; - for (auto &var : pending_vars) { - if (var->generated_op_ == nullptr || - complete_op.count(var->generated_op_) != 0) { - to_remove.push_back(var); + while (!pending_ops.empty()) { + VarHandle *ready_var = nullptr; + for (auto &pair : pending_vars) { + if (pair.second) { + ready_var = pair.first; } } - for (auto *var : to_remove) { - pending_vars.erase(var); + + if (ready_var == nullptr) { + member_->pool_.Wait(); // Wait thread pool; + continue; } + pending_vars.erase(ready_var); + std::vector to_run; - for (auto *var : to_remove) { - for (auto *op : var->pending_ops_) { - if (var->name_ == "mean_0.tmp_0@GRAD") { - LOG(INFO) << op->DebugString(); - } - auto &num = pending_ops[op]; - --num; - if (num == 0) { - to_run.emplace_back(op); - } + + for (auto *op : ready_var->pending_ops_) { + auto &deps = pending_ops[op]; + --deps; + if (deps == 0) { + to_run.emplace_back(op); } } for (auto *op : to_run) { pending_ops.erase(op); - complete_op.insert(op); - } - if (to_run.empty()) break; + std::vector ready_buffer; + for (auto *var : op->outputs_) { + ready_buffer.emplace_back(&pending_vars[var]); + } - // TODO(yy): Use thead pool to run OpHandle. Operators in ToRun can be - // paralleled. We can also use another schedule method. Just a demo here. + auto op_run = [ready_buffer, op] { + // TODO(yy) Check Previous Op has same dev ctx. + LOG(INFO) << "Run " << op->DebugString(); + for (auto *ready : ready_buffer) { + *ready = true; + } + }; - std::stringstream ss; - ss << "\n"; - for (auto *op : to_run) { - ss << op->DebugString() << "\n"; + member_->pool_.Run(op_run); } - ss << std::endl; - LOG(INFO) << ss.str(); } - - PADDLE_ENFORCE_EQ(complete_op.size(), num_op); return std::vector(); } } // namespace framework diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index df51fb24a5..f9dce7105e 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -32,6 +32,8 @@ namespace framework { // number of threads. class ThreadPool { public: + explicit ThreadPool(int num_threads); + using Task = std::packaged_task()>; // Returns the singleton of ThreadPool. @@ -103,8 +105,6 @@ class ThreadPool { DISABLE_COPY_AND_ASSIGN(ThreadPool); - explicit ThreadPool(int num_threads); - // If the task queue is empty and avaialbe is equal to the number of // threads, means that all tasks are completed. Note: this function // is not thread-safe. Returns true if all tasks are completed. From 22bb262a75d2b6ed71b9828ae0cfa4a621967c8a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 15 Mar 2018 14:51:38 +0800 Subject: [PATCH 013/314] Remove out of date design --- doc/design/parallel_executor.md | 74 --------------------------------- 1 file changed, 74 deletions(-) delete mode 100644 doc/design/parallel_executor.md diff --git a/doc/design/parallel_executor.md b/doc/design/parallel_executor.md deleted file mode 100644 index 78ef74f159..0000000000 --- a/doc/design/parallel_executor.md +++ /dev/null @@ -1,74 +0,0 @@ -# ParallelExecutor Design Doc - -## Introduction - -We introduce `ParallelExecutor` to run multi-GPU training in PaddlePaddle Fluid. It supports -1. keeping a copy of the parameters on each GPU -1. allreduce on a separate stream allowing computation and communication overlap - -An example of switching single GPU training to multiple GPUs: -```python -cost = your_neural_network() -opt = fluid.optimizer.SGDOptimizer() -opt.minimize(avg_cost) - -# change Executor -> ParallelExecutor -exe = fluid.ParallelExecutor(gpu_list=[0, 1]) - -for iter in xranges(iter_num): - exe.run() -``` - -## Design - -In the constructor, a list of parameter, whose gradients need to be allreduced, is given. - -During the runtime, `ParallelExecutor` starts `#gpu` threads to run each `Executor`. For every -operator run on each GPU, it will automatically sync with different streams when necessary. - -```c++ -// if op's input is params' grad: - // sync with allreduce stream - // e.g. sgd should wait for allreduce to be finished -CallBack->BeforeOp(op); - -op->Run(*local_scope, place_); - -// if op's output is params' grad: -// sync with computation stream -// e.g. allreduce shoudl wait for fc_grad to be finished. -CallBack->AfterOp(op); -``` - -And the `Callback` object can be implemented as the following - -```c++ -struct AllReduceCallBack { - void BeforeOp(framework::OperatorBase* op); - void AfterOp(framework::OperatorBase* op); - - std::unordered_set reduced_param_grad_names; - std::unordered_set param_grad_names_; - - platform::DeviceContext* computation_dev_ctx; // computation device context - platform::DeviceContext* communication_dev_ctx; // communication device context - - framework::Scope* scope; - platform::NCCL::Communicator* nccl_com; -}; - -AllReduceCallBack::BeforeOp(framework::OperatorBase* op) { - if (op->Input() in reduced_param_grad_names) { - communication_dev_ctx->Wait(); - reduced_param_grad_names.erase(op->Input()) - } -} - -AllReduceCallBack::AfterOp(framework::OperatorBase* op) { - if (op->Output() in param_grad_names) { - computation_dev_ctx->Wait(); - reduced_param_grad_names.insert(op->Output()); - ncclAllreduce(scope, op->Output(), communication_dev_ctx); - } -} -``` From 35744e7b36f3c7202080feeabc0d8f207839b2e1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 15 Mar 2018 16:30:16 +0800 Subject: [PATCH 014/314] Polish code --- paddle/fluid/framework/parallel_executor.cc | 100 ++++++++++++++---- paddle/fluid/framework/parallel_executor.h | 2 + .../tests/unittests/test_parallel_executor.py | 2 +- 3 files changed, 82 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dd726f1fab..7af5cc075c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -20,6 +20,12 @@ limitations under the License. */ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_CUDA + +// FIXME: CHECK the return value of x; +#define NCCL_INVOKE(x) x +#endif + struct OpHandle; struct VarHandle { @@ -71,9 +77,51 @@ class ParallelExecutorPrivate { std::unordered_map local_scopes_; - std::unordered_map - dev_ctxs_; + +#ifdef PADDLE_WITH_CUDA + struct NCCLContext { + std::unique_ptr ctx_; + ncclComm_t comm; + + explicit NCCLContext(int dev_id) { + ctx_.reset(new platform::CUDADeviceContext(platform::CUDAPlace(dev_id))); + } + + cudaStream_t stream() const { return ctx_->stream(); } + + int device_id() const { + return boost::get(ctx_->GetPlace()).device; + } + + static void InitNCCLContext(std::map &contexts) { + std::vector comms; + std::vector devs; + comms.resize(contexts.size()); + devs.reserve(contexts.size()); + + for (auto &ctx : contexts) { + devs.push_back(ctx.first); + } + + NCCL_INVOKE(platform::dynload::ncclCommInitAll( + &comms[0], static_cast(contexts.size()), &devs[0])); + + int i = 0; + for (auto &ctx : contexts) { + ctx.second.comm = comms[i++]; + } + } + }; + + std::map communication_streams_; + + NCCLContext &GetNCCLCtx(platform::Place p) { + int dev_id = boost::get(p).device; + return communication_streams_.at(dev_id); + } + +#endif + platform::Place main_place_; std::unordered_mapmain_place_ = places[0]; // Bcast Parameters to all GPUs - if (platform::is_gpu_place(member_->main_place_)) { // Is CUDA - // BCastParamsToGPUs(startup_program); + if (platform::is_gpu_place(member_->main_place_) && + member_->local_scopes_.size() != 1) { // Is CUDA + BuildNCCLCommunicator(); + BCastParamsToGPUs(startup_program); } // Startup Program has been run. All local scopes has correct parameters. @@ -241,20 +291,20 @@ VarHandle *ParallelExecutor::GetVarHandle(const std::string &each_var_name, void ParallelExecutor::BCastParamsToGPUs( const ProgramDesc &startup_program) const { +#ifdef PADDLE_WITH_CUDA auto *main_scope = member_->local_scopes_[member_->main_place_]; + for (auto *var_desc : startup_program.Block(0).AllVars()) { if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { auto &main_tensor = main_scope->FindVar(var_desc->Name())->Get(); - ncclDataType_t data_type = ToNCCLDataType(main_tensor.type()); auto &dims = main_tensor.dims(); size_t numel = main_tensor.numel(); - std::vector> mems; - mems.emplace_back( - const_cast(main_tensor.data()), - new platform::CUDADeviceContext( - boost::get(member_->main_place_))); + std::vector> + mems; + mems.emplace_back(const_cast(main_tensor.data()), + &member_->GetNCCLCtx(member_->main_place_)); for (auto &pair : member_->local_scopes_) { if (pair.first == member_->main_place_) { @@ -265,8 +315,7 @@ void ParallelExecutor::BCastParamsToGPUs( auto *t = local_scope->Var(var_desc->Name())->GetMutable(); t->Resize(dims); mems.emplace_back(t->mutable_data(pair.first, main_tensor.type()), - new platform::CUDADeviceContext( - boost::get(pair.first))); + &member_->GetNCCLCtx(member_->main_place_)); } // TODO(yy): Invoke ncclBCast here. mems, numel, data_type. The mems[0] @@ -274,17 +323,26 @@ void ParallelExecutor::BCastParamsToGPUs( (void)(data_type); (void)(numel); + } + } +#else + PADDLE_THROW("Not compiled with CUDA"); +#endif +} - // Free Communication Ctx - for (auto &pair : mems) { - // Release Communication Ctx +void ParallelExecutor::BuildNCCLCommunicator() const { +#ifdef PADDLE_WITH_CUDA + for (auto &place_pair : member_->local_scopes_) { + auto place = place_pair.first; + int dev_id = boost::get(place).device; - // FIXME: Store CUDA DevCtx to member. Since NCCL All Reduce will use - // this - delete pair.second; - } - } + member_->communication_streams_.emplace( + dev_id, ParallelExecutorPrivate::NCCLContext(dev_id)); } + + ParallelExecutorPrivate::NCCLContext::InitNCCLContext( + member_->communication_streams_); +#endif } std::vector ParallelExecutor::Run( diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ec80f89f0e..805b7e5aa9 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -55,6 +55,8 @@ class ParallelExecutor { void ConstructDependencyGraph(const std::unordered_set& params, const ProgramDesc& main_program, const std::string& loss_var_name) const; + + void BuildNCCLCommunicator() const; }; } // namespace framework diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 2b41b2c9b4..65b43448a4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -35,7 +35,7 @@ class ParallelExecutor(unittest.TestCase): adam = fluid.optimizer.Adam() adam.minimize(loss) act_places = [] - for each in [fluid.CUDAPlace(0), fluid.CUDAPlace(1)]: + for each in [fluid.CUDAPlace(0)]: p = fluid.core.Place() p.set_place(each) act_places.append(p) From 193c0a7e4333ca7e403089ef1f9e66c79d56c68a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 15 Mar 2018 17:27:42 +0800 Subject: [PATCH 015/314] Handle var hazard --- paddle/fluid/framework/parallel_executor.cc | 137 +++++++++++++++++--- 1 file changed, 121 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7af5cc075c..e98fedb68d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -28,42 +28,79 @@ namespace framework { struct OpHandle; -struct VarHandle { +struct VarHandleBase { + virtual ~VarHandleBase() {} + virtual std::string DebugString() const = 0; + + OpHandle *generated_op_; + std::vector pending_ops_; +}; + +struct VarHandle : public VarHandleBase { + std::string DebugString() const override { + std::stringstream ss; + ss << name_ << ":" << place_; + return ss.str(); + } + size_t version_; std::string name_; platform::Place place_; +}; - OpHandle *generated_op_; - - std::vector pending_ops_; +struct DependencyVarHandle : public VarHandleBase { + std::string DebugString() const override { return "Deps var"; } }; struct OpHandle { - std::vector inputs_; - std::vector outputs_; + std::vector inputs_; + std::vector outputs_; + std::unordered_map + dev_ctx_; std::string DebugString() { std::stringstream ss; ss << "("; for (auto *var : inputs_) { - ss << var->name_ << ":" << var->place_ << ", "; + ss << var->DebugString() << ", "; } ss << ") --> ("; for (auto *var : outputs_) { - ss << var->name_ << ":" << var->place_ << ", "; + ss << var->DebugString() << ", "; } ss << ")\n"; return ss.str(); } virtual ~OpHandle() {} + + virtual void Run() {} + virtual void Wait() {} }; struct ComputationOpHandle : public OpHandle { std::unique_ptr op_; + Scope *scope_; + platform::Place place_; - explicit ComputationOpHandle(const OpDesc &op_desc) - : op_(framework::OpRegistry::CreateOp(op_desc)) {} + explicit ComputationOpHandle(const OpDesc &op_desc, Scope *scope, + platform::Place place) + : op_(framework::OpRegistry::CreateOp(op_desc)), + scope_(scope), + place_(place) {} + + void Run() override { + // Wait other op if necessary + auto *cur_ctx = dev_ctx_[place_]; + for (auto *in : inputs_) { + if (in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx) { + in->generated_op_->Wait(); + } + } + + op_->Run(*scope_, place_); + } }; struct ScaleLossGradOpHandle : public OpHandle {}; @@ -122,12 +159,27 @@ class ParallelExecutorPrivate { #endif + platform::DeviceContext *CommunicationDevCtx(const platform::Place &place) { + if (platform::is_cpu_place(place) || local_scopes_.size() == 1) { + return const_cast( + platform::DeviceContextPool::Instance().Get(place)); + } else { +#ifdef PADDLE_WITH_CUDA + return GetNCCLCtx(place).ctx_.get(); +#else + PADDLE_THROW("Not compiled with CUDA") +#endif + } + } + platform::Place main_place_; std::unordered_map>, platform::PlaceHash> vars_; + std::unordered_set> dep_vars_; + std::vector> ops_; ThreadPool pool_; @@ -170,7 +222,7 @@ ParallelExecutor::ParallelExecutor( void ParallelExecutor::ConstructDependencyGraph( const std::unordered_set ¶ms, const ProgramDesc &main_program, const std::string &loss_var_name) const { - std::unordered_set grads; + std::unordered_set grads; for (auto &each_param : params) { grads.insert(each_param + "@GRAD"); } @@ -188,8 +240,11 @@ void ParallelExecutor::ConstructDependencyGraph( } for (auto &pair : member_->local_scopes_) { - member_->ops_.emplace_back(new ComputationOpHandle(*op)); + member_->ops_.emplace_back( + new ComputationOpHandle(*op, pair.second, pair.first)); auto *op_handle = member_->ops_.back().get(); + op_handle->dev_ctx_[pair.first] = const_cast( + platform::DeviceContextPool::Instance().Get(pair.first)); auto var_names = op->InputArgumentNames(); @@ -210,8 +265,11 @@ void ParallelExecutor::ConstructDependencyGraph( if (var_names.size() == 1 && var_names[0] == loss_var_name) { // Insert ScaleCost OpHandle member_->ops_.emplace_back(new ScaleLossGradOpHandle()); - op_handle = member_->ops_.back().get(); + + op_handle->dev_ctx_[pair.first] = + member_->CommunicationDevCtx(pair.first); + auto &place = pair.first; VarHandle *loss = GetVarHandle(loss_var_name, place); loss->pending_ops_.emplace_back(op_handle); @@ -251,11 +309,54 @@ void ParallelExecutor::ConstructDependencyGraph( var.name_ = og; var.version_ = vars.size() - 1; op_handle->outputs_.emplace_back(&var); + + for (auto &pair : member_->local_scopes_) { + op_handle->dev_ctx_[pair.first] = + member_->CommunicationDevCtx(pair.first); + } } } } } } + + /** + * Dependency graph has been constructed. However, there are still data + * harzaeds need to be handled. + * + * We only handle write after read(WAR), since it should not have a write + * after write in program. If there are write after write operators, we need + * prune them. + * + * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + */ + + for (auto &place_pair : member_->vars_) { + for (auto &name_pair : place_pair.second) { + if (name_pair.second.size() <= 1) { + return; + } + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + auto *write_op = it_new->second.generated_op_; + auto &read_ops = it_old->second.pending_ops_; + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + + auto *dep_var = new DependencyVarHandle(); + dep_var->generated_op_ = read_op; + read_op->outputs_.emplace_back(dep_var); + + dep_var->pending_ops_.emplace_back(write_op); + write_op->inputs_.emplace_back(dep_var); + member_->dep_vars_.emplace(dep_var); + } + } + } + } } void ParallelExecutor::GenerateVar(OpHandle *op_handle, @@ -349,7 +450,7 @@ std::vector ParallelExecutor::Run( const std::vector &fetch_tensors) { // Version --> VarHandle - std::unordered_map pending_vars; + std::unordered_map pending_vars; std::unordered_map pending_ops; for (auto &place_pair : member_->vars_) { @@ -361,12 +462,16 @@ std::vector ParallelExecutor::Run( } } + for (auto &var : member_->dep_vars_) { + pending_vars[var.get()] = var->generated_op_ == nullptr; + } + for (auto &op : member_->ops_) { pending_ops.insert({op.get(), op->inputs_.size()}); } while (!pending_ops.empty()) { - VarHandle *ready_var = nullptr; + VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { if (pair.second) { ready_var = pair.first; @@ -400,7 +505,7 @@ std::vector ParallelExecutor::Run( auto op_run = [ready_buffer, op] { // TODO(yy) Check Previous Op has same dev ctx. - LOG(INFO) << "Run " << op->DebugString(); + op->Run(); for (auto *ready : ready_buffer) { *ready = true; } From d84ddcf1239d6a7a6a7c24ebe9668d39e8bb55e6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 15 Mar 2018 17:43:23 +0800 Subject: [PATCH 016/314] Stash --- paddle/fluid/framework/executor.cc | 8 ++++---- paddle/fluid/framework/executor.h | 2 ++ paddle/fluid/framework/parallel_executor.cc | 9 ++++----- .../reader/create_recordio_file_reader_op.cc | 4 +++- .../tests/unittests/test_parallel_executor.py | 19 ++++++++++++++++++- 5 files changed, 31 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 6ee3f18dd4..b250378b9f 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -45,7 +45,7 @@ struct ExecutorPrepareContext { Executor::Executor(const platform::Place& place) : place_(place) {} -static void CreateTensor(Variable* var, proto::VarType::Type var_type) { +void InitializeVariable(Variable* var, proto::VarType::Type var_type) { if (var_type == proto::VarType::LOD_TENSOR) { var->GetMutable(); } else if (var_type == proto::VarType::SELECTED_ROWS) { @@ -284,12 +284,12 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, if (var->Persistable()) { auto* ptr = scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); + InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() << " global, which pointer is " << ptr; } else { auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); + InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() << " locally, which pointer is " << ptr; } @@ -297,7 +297,7 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } else { for (auto& var : block.AllVars()) { auto* ptr = local_scope->Var(var->Name()); - CreateTensor(ptr, var->GetType()); + InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create variable " << var->Name() << ", which pointer is " << ptr; } diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 8d8a7cf4db..e020a6e738 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -59,5 +59,7 @@ class Executor { const platform::Place place_; }; +extern void InitializeVariable(Variable* var, proto::VarType::Type var_type); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e98fedb68d..97ffe01bec 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -84,14 +84,14 @@ struct ComputationOpHandle : public OpHandle { Scope *scope_; platform::Place place_; - explicit ComputationOpHandle(const OpDesc &op_desc, Scope *scope, - platform::Place place) + explicit ComputationOpHandle(const OpDesc &op_desc, platform::Place place) : op_(framework::OpRegistry::CreateOp(op_desc)), - scope_(scope), + scope_(nullptr), place_(place) {} void Run() override { // Wait other op if necessary + LOG(INFO) << DebugString(); auto *cur_ctx = dev_ctx_[place_]; for (auto *in : inputs_) { if (in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx) { @@ -240,8 +240,7 @@ void ParallelExecutor::ConstructDependencyGraph( } for (auto &pair : member_->local_scopes_) { - member_->ops_.emplace_back( - new ComputationOpHandle(*op, pair.second, pair.first)); + member_->ops_.emplace_back(new ComputationOpHandle(*op, pair.first)); auto *op_handle = member_->ops_.back().get(); op_handle->dev_ctx_[pair.first] = const_cast( platform::DeviceContextPool::Instance().Get(pair.first)); diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index c3eb247bbe..0126ff7271 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -25,7 +25,9 @@ class RecordIOFileReader : public framework::FileReader { : FileReader(shapes), scanner_(filename), dev_ctx_(*platform::DeviceContextPool::Instance().Get( - platform::CPUPlace())) {} + platform::CPUPlace())) { + LOG(INFO) << "Creating file reader" << filename; + } void ReadNext(std::vector* out) override { *out = framework::ReadFromRecordIO(scanner_, dev_ctx_); diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 65b43448a4..3604fdb285 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -14,16 +14,33 @@ import unittest import paddle.fluid as fluid +import paddle.v2 as paddle +import paddle.v2.dataset.mnist as mnist class ParallelExecutor(unittest.TestCase): + def setUp(self): + # Convert mnist to recordio file + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(mnist.train(), batch_size=32) + feeder = fluid.DataFeeder( + feed_list=[ # order is image and label + fluid.layers.data( + name='image', shape=[784]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + './mnist.recordio', reader, feeder) + def test_main(self): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): reader = fluid.layers.open_recordio_file( - filename='tmp', + filename='./mnist.recordio', shapes=[[-1, 784], [-1, 1]], lod_levels=[0, 0], dtypes=['float32', 'int64']) From 6f0dfd89a4265e3aec08beb693ad7e342c10696b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 14:33:36 +0800 Subject: [PATCH 017/314] Single GPU ParallelExecutor complete --- CMakeLists.txt | 1 + cmake/external/threadpool.cmake | 30 ++++ paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/parallel_executor.cc | 165 ++++++++++++++++---- paddle/fluid/framework/parallel_executor.h | 4 + paddle/fluid/operators/read_op.cc | 5 +- 6 files changed, 173 insertions(+), 34 deletions(-) create mode 100644 cmake/external/threadpool.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index c86889c05c..502213bf29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -146,6 +146,7 @@ include(external/cares) include(external/grpc) include(external/snappy) # download snappy include(external/snappystream) +include(external/threadpool) include(cudnn) # set cudnn libraries, must before configure include(cupti) diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake new file mode 100644 index 0000000000..0159815fed --- /dev/null +++ b/cmake/external/threadpool.cmake @@ -0,0 +1,30 @@ +INCLUDE(ExternalProject) + +SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool) +SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool) +INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR}) + +ExternalProject_Add( + extern_threadpool + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/progschj/ThreadPool.git" + GIT_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040 + PREFIX ${THREADPOOL_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";") + add_library(simple_threadpool STATIC ${dummyfile}) +else() + add_library(simple_threadpool INTERFACE) +endif() + +add_dependencies(simple_threadpool extern_threadpool) + +LIST(APPEND external_project_dependencies simple_threadpool) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 934bb43ffe..4fd66c77ac 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -87,7 +87,7 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope - framework_proto backward glog lod_rank_table feed_fetch_method executor) + framework_proto backward glog lod_rank_table feed_fetch_method executor simple_threadpool) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 97ffe01bec..930be7fab3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" +#include "ThreadPool.h" +#include "executor.h" #include "lod_tensor.h" #include "op_registry.h" -#include "threadpool.h" namespace paddle { namespace framework { @@ -49,7 +50,7 @@ struct VarHandle : public VarHandleBase { }; struct DependencyVarHandle : public VarHandleBase { - std::string DebugString() const override { return "Deps var"; } + std::string DebugString() const override { return "Dependency Variable"; } }; struct OpHandle { @@ -75,7 +76,7 @@ struct OpHandle { virtual ~OpHandle() {} - virtual void Run() {} + virtual void Run() { PADDLE_THROW("Not implemented"); } virtual void Wait() {} }; @@ -84,14 +85,15 @@ struct ComputationOpHandle : public OpHandle { Scope *scope_; platform::Place place_; - explicit ComputationOpHandle(const OpDesc &op_desc, platform::Place place) + explicit ComputationOpHandle(const OpDesc &op_desc, Scope *scope, + platform::Place place) : op_(framework::OpRegistry::CreateOp(op_desc)), - scope_(nullptr), + scope_(scope), place_(place) {} void Run() override { // Wait other op if necessary - LOG(INFO) << DebugString(); + LOG(INFO) << "Run " << this << " " << DebugString(); auto *cur_ctx = dev_ctx_[place_]; for (auto *in : inputs_) { if (in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx) { @@ -100,12 +102,49 @@ struct ComputationOpHandle : public OpHandle { } op_->Run(*scope_, place_); + LOG(INFO) << "Done " << this; } }; -struct ScaleLossGradOpHandle : public OpHandle {}; +struct ScaleLossGradOpHandle : public OpHandle { + float coeff_; + Scope *scope_; + platform::Place place_; + + explicit ScaleLossGradOpHandle(size_t num_dev, Scope *scope, + platform::Place place) + : coeff_(static_cast(1.0 / num_dev)), + scope_(scope), + place_(place) {} + + void Run() override { + LOG(INFO) << "Run Scale Loss Grad"; + + std::string var_name = static_cast(this->outputs_[0])->name_; -struct NCCLAllReduceOpHandle : public OpHandle {}; + float *tmp = scope_->FindVar(var_name) + ->GetMutable() + ->mutable_data(make_ddim({1}), place_); + + if (platform::is_cpu_place(place_)) { + *tmp = coeff_; + } else { + memory::Copy( + boost::get(place_), tmp, platform::CPUPlace(), + &coeff_, sizeof(float), + static_cast(this->dev_ctx_[place_]) + ->stream()); + } + } +}; + +struct NCCLAllReduceOpHandle : public OpHandle { + void Run() override { + if (this->inputs_.size() == 1) { + return; // No need to all reduce when GPU count = 1; + } + } +}; class ParallelExecutorPrivate { public: @@ -182,7 +221,10 @@ class ParallelExecutorPrivate { std::vector> ops_; + // Use a simpler thread pool, might be faster. ThreadPool pool_; + + std::unique_ptr exception_; }; // TODO(yy): Move this function somewhere @@ -217,6 +259,19 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp ConstructDependencyGraph(params, main_program, loss_var_name); + + // Step 3. Create vars in each scope; + for (auto &pair : member_->local_scopes_) { + auto *scope = pair.second; + + for (auto *var : main_program.Block(0).AllVars()) { + if (scope->FindVar(var->Name()) != nullptr) { + continue; + } + + InitializeVariable(scope->Var(var->Name()), var->GetType()); + } + } } void ParallelExecutor::ConstructDependencyGraph( @@ -240,7 +295,8 @@ void ParallelExecutor::ConstructDependencyGraph( } for (auto &pair : member_->local_scopes_) { - member_->ops_.emplace_back(new ComputationOpHandle(*op, pair.first)); + member_->ops_.emplace_back( + new ComputationOpHandle(*op, pair.second, pair.first)); auto *op_handle = member_->ops_.back().get(); op_handle->dev_ctx_[pair.first] = const_cast( platform::DeviceContextPool::Instance().Get(pair.first)); @@ -263,16 +319,20 @@ void ParallelExecutor::ConstructDependencyGraph( if (is_forwarding) { if (var_names.size() == 1 && var_names[0] == loss_var_name) { // Insert ScaleCost OpHandle - member_->ops_.emplace_back(new ScaleLossGradOpHandle()); + member_->ops_.emplace_back(new ScaleLossGradOpHandle( + this->member_->local_scopes_.size(), pair.second, pair.first)); op_handle = member_->ops_.back().get(); op_handle->dev_ctx_[pair.first] = member_->CommunicationDevCtx(pair.first); auto &place = pair.first; - VarHandle *loss = GetVarHandle(loss_var_name, place); - loss->pending_ops_.emplace_back(op_handle); - op_handle->inputs_.emplace_back(loss); + // FIXME: Currently ScaleLossGradOp only use device_count as scale + // factor. So it does not depend on any other operators. + // VarHandle *loss = GetVarHandle(loss_var_name, place); + // loss->pending_ops_.emplace_back(op_handle); + // op_handle->inputs_.emplace_back(loss); + GenerateVar(op_handle, loss_var_name + "@GRAD", place); change_forward = true; LOG(INFO) << "Scale Loss " << op_handle->DebugString(); @@ -341,11 +401,25 @@ void ParallelExecutor::ConstructDependencyGraph( for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { auto *write_op = it_new->second.generated_op_; auto &read_ops = it_old->second.pending_ops_; + auto *ex_write_op = it_old->second.generated_op_; + + if (ex_write_op == nullptr) { // Nobody write this var. + continue; + } + + LOG(INFO) << "Link " << it_new->second.DebugString() << " From " + << it_old->second.version_ << " To " + << it_new->second.version_; for (auto *read_op : read_ops) { // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. + continue; + } auto *dep_var = new DependencyVarHandle(); + dep_var->generated_op_ = read_op; read_op->outputs_.emplace_back(dep_var); @@ -448,7 +522,7 @@ void ParallelExecutor::BuildNCCLCommunicator() const { std::vector ParallelExecutor::Run( const std::vector &fetch_tensors) { // Version --> VarHandle - + member_->exception_.reset(); std::unordered_map pending_vars; std::unordered_map pending_ops; @@ -465,8 +539,18 @@ std::vector ParallelExecutor::Run( pending_vars[var.get()] = var->generated_op_ == nullptr; } + std::vector to_run; + for (auto &op : member_->ops_) { - pending_ops.insert({op.get(), op->inputs_.size()}); + if (op->inputs_.empty()) { // Special case, Op has no input. + to_run.emplace_back(op.get()); + } else { + pending_ops.insert({op.get(), op->inputs_.size()}); + } + } + + for (auto *op : to_run) { + RunOp(pending_vars, op); } while (!pending_ops.empty()) { @@ -478,13 +562,19 @@ std::vector ParallelExecutor::Run( } if (ready_var == nullptr) { - member_->pool_.Wait(); // Wait thread pool; + // FIXME use conditional var instead of busy wait. + + if (member_->exception_) { + throw * member_->exception_; + } + + std::this_thread::yield(); continue; } pending_vars.erase(ready_var); - std::vector to_run; + to_run.clear(); for (auto *op : ready_var->pending_ops_) { auto &deps = pending_ops[op]; @@ -496,24 +586,35 @@ std::vector ParallelExecutor::Run( for (auto *op : to_run) { pending_ops.erase(op); - - std::vector ready_buffer; - for (auto *var : op->outputs_) { - ready_buffer.emplace_back(&pending_vars[var]); - } - - auto op_run = [ready_buffer, op] { - // TODO(yy) Check Previous Op has same dev ctx. - op->Run(); - for (auto *ready : ready_buffer) { - *ready = true; - } - }; - - member_->pool_.Run(op_run); + RunOp(pending_vars, op); } } return std::vector(); } + +void ParallelExecutor::RunOp( + std::unordered_map &pending_vars, + OpHandle *op) const { + std::vector ready_buffer; + for (auto *var : op->outputs_) { + ready_buffer.emplace_back(&pending_vars[var]); + } + + auto op_run = [ready_buffer, op, this] { + try { + // TODO(yy) Check Previous Op has same dev ctx. + op->Run(); + for (auto *ready : ready_buffer) { + *ready = true; + } + } catch (platform::EnforceNotMet ex) { + member_->exception_.reset(new platform::EnforceNotMet(ex)); + } catch (...) { + LOG(FATAL) << "Unknown exception catched"; + } + }; + + member_->pool_.enqueue(op_run); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 805b7e5aa9..1e4c5c48f2 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -31,6 +31,7 @@ namespace framework { class ParallelExecutorPrivate; class VarHandle; class OpHandle; +class VarHandleBase; class ParallelExecutor { public: explicit ParallelExecutor(const std::vector& places, @@ -57,6 +58,9 @@ class ParallelExecutor { const std::string& loss_var_name) const; void BuildNCCLCommunicator() const; + + void RunOp(std::unordered_map& pending_vars, + OpHandle* op) const; }; } // namespace framework diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 2a5605e0d3..2925b8a85d 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/operators/detail/safe_ref.h" namespace paddle { namespace operators { @@ -59,7 +60,9 @@ class ReadOp : public framework::OperatorBase { void RunImpl(const framework::Scope& scope, const platform::Place& dev_place) const override { framework::ReaderHolder* reader = - scope.FindVar(Input("Reader"))->GetMutable(); + detail::Ref(scope.FindVar(Input("Reader")), + "Cannot find reader variable %s", Input("Reader")) + .GetMutable(); std::vector out_arg_names = Outputs("Out"); std::vector ins; reader->ReadNext(&ins); From 8c9cd369dc2280ec9c212586b804de9c10adb600 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 14:47:56 +0800 Subject: [PATCH 018/314] Polish code style --- paddle/fluid/framework/parallel_executor.cc | 22 ++++++++++++--------- paddle/fluid/framework/parallel_executor.h | 2 ++ 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 930be7fab3..40de26bdd0 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -379,17 +379,21 @@ void ParallelExecutor::ConstructDependencyGraph( } } - /** - * Dependency graph has been constructed. However, there are still data - * harzaeds need to be handled. - * - * We only handle write after read(WAR), since it should not have a write - * after write in program. If there are write after write operators, we need - * prune them. - * - * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + /* + Dependency graph has been constructed. However, there are still data + harzaeds need to be handled. */ + PolishGraphToSupportDataHarzaeds(); +} +/** + * We only handle write after read(WAR), since it should not have a write + * after write in program. If there are write after write operators, we need + * prune them. + * + * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + */ +void ParallelExecutor::PolishGraphToSupportDataHarzaeds() const { for (auto &place_pair : member_->vars_) { for (auto &name_pair : place_pair.second) { if (name_pair.second.size() <= 1) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 1e4c5c48f2..30416563f8 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -61,6 +61,8 @@ class ParallelExecutor { void RunOp(std::unordered_map& pending_vars, OpHandle* op) const; + + void PolishGraphToSupportDataHarzaeds() const; }; } // namespace framework From 8b397d16024f1d5a985e0cbc6c88c6560d7e7661 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 14:48:17 +0800 Subject: [PATCH 019/314] Make recordio file reader thread-safe by default --- .../reader/create_recordio_file_reader_op.cc | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index 0126ff7271..986e1b7a21 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -18,6 +18,7 @@ namespace paddle { namespace operators { namespace reader { +template class RecordIOFileReader : public framework::FileReader { public: RecordIOFileReader(const std::string& filename, @@ -26,11 +27,19 @@ class RecordIOFileReader : public framework::FileReader { scanner_(filename), dev_ctx_(*platform::DeviceContextPool::Instance().Get( platform::CPUPlace())) { + if (ThreadSafe) { + mutex_.reset(new std::mutex()); + } LOG(INFO) << "Creating file reader" << filename; } void ReadNext(std::vector* out) override { - *out = framework::ReadFromRecordIO(scanner_, dev_ctx_); + if (ThreadSafe) { + std::lock_guard guard(*mutex_); + *out = framework::ReadFromRecordIO(scanner_, dev_ctx_); + } else { + *out = framework::ReadFromRecordIO(scanner_, dev_ctx_); + } } bool HasNext() const override { return scanner_.HasNext(); } @@ -38,6 +47,7 @@ class RecordIOFileReader : public framework::FileReader { void ReInit() override { scanner_.Reset(); } private: + std::unique_ptr mutex_; recordio::Scanner scanner_; const platform::DeviceContext& dev_ctx_; }; @@ -61,7 +71,7 @@ class CreateRecordIOReaderOp : public framework::OperatorBase { auto* out = scope.FindVar(Output("Out")) ->template GetMutable(); - out->Reset(new RecordIOFileReader(filename, shapes)); + out->Reset(new RecordIOFileReader(filename, shapes)); } }; From 0ef9edf566a2206c8fa8b209d4b5610f1a4f067e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 15:21:13 +0800 Subject: [PATCH 020/314] Stash --- paddle/fluid/framework/parallel_executor.cc | 43 +++++++++++-------- .../tests/unittests/test_parallel_executor.py | 2 +- 2 files changed, 27 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 40de26bdd0..25b31f8636 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -229,8 +229,15 @@ class ParallelExecutorPrivate { // TODO(yy): Move this function somewhere ncclDataType_t ToNCCLDataType(std::type_index type) { - // FIXME!! - return ncclFloat; + if (type == typeid(float)) { // NOLINT + return ncclFloat; + } else if (type == typeid(double)) { // NOLINT + return ncclDouble; + } else if (type == typeid(int)) { // NOLINT + return ncclInt; + } else { + PADDLE_THROW("Not supported"); + } } ParallelExecutor::ParallelExecutor( @@ -479,30 +486,32 @@ void ParallelExecutor::BCastParamsToGPUs( ncclDataType_t data_type = ToNCCLDataType(main_tensor.type()); auto &dims = main_tensor.dims(); size_t numel = main_tensor.numel(); - std::vector> - mems; - mems.emplace_back(const_cast(main_tensor.data()), - &member_->GetNCCLCtx(member_->main_place_)); - for (auto &pair : member_->local_scopes_) { - if (pair.first == member_->main_place_) { - continue; - } + platform::dynload::ncclGroupStart(); + for (auto &pair : member_->local_scopes_) { auto local_scope = pair.second; auto *t = local_scope->Var(var_desc->Name())->GetMutable(); t->Resize(dims); - mems.emplace_back(t->mutable_data(pair.first, main_tensor.type()), - &member_->GetNCCLCtx(member_->main_place_)); + auto &nccl_ctx = member_->GetNCCLCtx(pair.first); + platform::dynload::ncclBcast( + t->mutable_data(pair.first, main_tensor.type()), numel, data_type, + 0, nccl_ctx.comm, nccl_ctx.stream()); } + platform::dynload::ncclGroupEnd(); + } + } - // TODO(yy): Invoke ncclBCast here. mems, numel, data_type. The mems[0] - // is the src, rests are dests. + for (auto &pair : member_->local_scopes_) { + member_->GetNCCLCtx(pair.first).ctx_->Wait(); - (void)(data_type); - (void)(numel); - } + auto &b = pair.second->FindVar("fc_1.b_0")->Get(); + framework::LoDTensor cpu; + framework::TensorCopy(b, platform::CPUPlace(), &cpu); + platform::DeviceContextPool::Instance().Get(b.place())->Wait(); + LOG(INFO) << *cpu.data(); } + #else PADDLE_THROW("Not compiled with CUDA"); #endif diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 3604fdb285..85a9f7697f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -52,7 +52,7 @@ class ParallelExecutor(unittest.TestCase): adam = fluid.optimizer.Adam() adam.minimize(loss) act_places = [] - for each in [fluid.CUDAPlace(0)]: + for each in [fluid.CUDAPlace(0), fluid.CUDAPlace(1)]: p = fluid.core.Place() p.set_place(each) act_places.append(p) From 9fc0b596a92cf63e6c0df18b7f59842758411c5d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 15:39:52 +0800 Subject: [PATCH 021/314] Test more --- paddle/fluid/framework/parallel_executor.cc | 1 + .../paddle/fluid/tests/unittests/test_parallel_executor.py | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 25b31f8636..ea5ce3f2e9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -502,6 +502,7 @@ void ParallelExecutor::BCastParamsToGPUs( } } + // Debug code, bias should be 1.0f. for (auto &pair : member_->local_scopes_) { member_->GetNCCLCtx(pair.first).ctx_->Wait(); diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 85a9f7697f..2a614700b0 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -45,7 +45,12 @@ class ParallelExecutor(unittest.TestCase): lod_levels=[0, 0], dtypes=['float32', 'int64']) img, label = fluid.layers.read_file(reader) - hidden = fluid.layers.fc(img, size=200, act='tanh') + hidden = fluid.layers.fc( + img, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) loss = fluid.layers.mean(loss) From d470763f6c0e7641367641bdb6cb1f28b8cf39c3 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 15:53:36 +0800 Subject: [PATCH 022/314] Stash --- paddle/fluid/framework/parallel_executor.cc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ea5ce3f2e9..215ee38ac5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -154,6 +154,8 @@ class ParallelExecutorPrivate { std::unordered_map local_scopes_; + std::vector places_; + #ifdef PADDLE_WITH_CUDA struct NCCLContext { std::unique_ptr ctx_; @@ -246,6 +248,8 @@ ParallelExecutor::ParallelExecutor( const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope) : member_(new ParallelExecutorPrivate()) { + member_->places_ = places; + // Step 1. RunStartupProgram and Bcast the params to devs. Executor exe(places[0]); exe.Run(startup_program, scope, 0); @@ -489,14 +493,14 @@ void ParallelExecutor::BCastParamsToGPUs( platform::dynload::ncclGroupStart(); - for (auto &pair : member_->local_scopes_) { - auto local_scope = pair.second; + for (auto &place : member_->places_) { + auto local_scope = member_->local_scopes_[place]; auto *t = local_scope->Var(var_desc->Name())->GetMutable(); t->Resize(dims); - auto &nccl_ctx = member_->GetNCCLCtx(pair.first); - platform::dynload::ncclBcast( - t->mutable_data(pair.first, main_tensor.type()), numel, data_type, - 0, nccl_ctx.comm, nccl_ctx.stream()); + auto &nccl_ctx = member_->GetNCCLCtx(place); + platform::dynload::ncclBcast(t->mutable_data(place, main_tensor.type()), + numel, data_type, 0, nccl_ctx.comm, + nccl_ctx.stream()); } platform::dynload::ncclGroupEnd(); } @@ -506,7 +510,7 @@ void ParallelExecutor::BCastParamsToGPUs( for (auto &pair : member_->local_scopes_) { member_->GetNCCLCtx(pair.first).ctx_->Wait(); - auto &b = pair.second->FindVar("fc_1.b_0")->Get(); + auto &b = pair.second->FindVar("fc_0.b_0")->Get(); framework::LoDTensor cpu; framework::TensorCopy(b, platform::CPUPlace(), &cpu); platform::DeviceContextPool::Instance().Get(b.place())->Wait(); From c15d2c9edc1dbea3e3d5b5948bb2c5b0cc81eb88 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 16:13:44 +0800 Subject: [PATCH 023/314] Update --- paddle/fluid/framework/parallel_executor.cc | 34 +++++++++++++-------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 215ee38ac5..996273c720 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -171,27 +171,28 @@ class ParallelExecutorPrivate { return boost::get(ctx_->GetPlace()).device; } - static void InitNCCLContext(std::map &contexts) { + static void InitNCCLContext(std::unordered_map &contexts, + const std::vector &places) { std::vector comms; std::vector devs; comms.resize(contexts.size()); devs.reserve(contexts.size()); - for (auto &ctx : contexts) { - devs.push_back(ctx.first); + for (auto &p : places) { + devs.push_back(boost::get(p).device); } NCCL_INVOKE(platform::dynload::ncclCommInitAll( &comms[0], static_cast(contexts.size()), &devs[0])); int i = 0; - for (auto &ctx : contexts) { - ctx.second.comm = comms[i++]; + for (auto &dev_id : devs) { + contexts.at(dev_id).comm = comms[i++]; } } }; - std::map communication_streams_; + std::unordered_map communication_streams_; NCCLContext &GetNCCLCtx(platform::Place p) { int dev_id = boost::get(p).device; @@ -493,13 +494,20 @@ void ParallelExecutor::BCastParamsToGPUs( platform::dynload::ncclGroupStart(); - for (auto &place : member_->places_) { - auto local_scope = member_->local_scopes_[place]; - auto *t = local_scope->Var(var_desc->Name())->GetMutable(); - t->Resize(dims); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; + if (i == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[place]; + auto *t = local_scope->Var(var_desc->Name())->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + auto &nccl_ctx = member_->GetNCCLCtx(place); - platform::dynload::ncclBcast(t->mutable_data(place, main_tensor.type()), - numel, data_type, 0, nccl_ctx.comm, + platform::dynload::ncclBcast(buffer, numel, data_type, 0, nccl_ctx.comm, nccl_ctx.stream()); } platform::dynload::ncclGroupEnd(); @@ -533,7 +541,7 @@ void ParallelExecutor::BuildNCCLCommunicator() const { } ParallelExecutorPrivate::NCCLContext::InitNCCLContext( - member_->communication_streams_); + member_->communication_streams_, member_->places_); #endif } From 8f0590e7c5924e9281a957cf0d355176c4bed301 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 16:31:58 +0800 Subject: [PATCH 024/314] Add ncclAllReduce --- paddle/fluid/framework/parallel_executor.cc | 50 +++++++++++++++++---- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 996273c720..ec5eb57910 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -138,14 +138,6 @@ struct ScaleLossGradOpHandle : public OpHandle { } }; -struct NCCLAllReduceOpHandle : public OpHandle { - void Run() override { - if (this->inputs_.size() == 1) { - return; // No need to all reduce when GPU count = 1; - } - } -}; - class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(size_t num_threads = 12) @@ -243,6 +235,46 @@ ncclDataType_t ToNCCLDataType(std::type_index type) { } } +struct NCCLAllReduceOpHandle : public OpHandle { + ParallelExecutorPrivate *member_; + + explicit NCCLAllReduceOpHandle(ParallelExecutorPrivate *member) + : member_(member) {} + + void Run() override { + if (this->inputs_.size() == 1) { + return; // No need to all reduce when GPU count = 1; + } else { + auto &var_name = static_cast(this->inputs_[0])->name_; + + int dtype = -1; + size_t numel = 0; + + for (auto &p : member_->places_) { + int dev_id = boost::get(p).device; + + Scope *s = member_->local_scopes_[p]; + auto &lod_tensor = s->FindVar(var_name)->Get(); + void *buffer = const_cast(lod_tensor.data()); + if (dtype == -1) { + dtype = ToNCCLDataType(lod_tensor.type()); + } + + if (numel == 0) { + numel = static_cast(lod_tensor.numel()); + } + + auto &nccl_ctx = member_->communication_streams_.at(dev_id); + + ncclAllReduce(buffer, buffer, numel, static_cast(dtype), + ncclSum, nccl_ctx.comm, nccl_ctx.stream()); + } + + ncclGroupEnd(); + } + } +}; + ParallelExecutor::ParallelExecutor( const std::vector &places, const std::unordered_set ¶ms, @@ -361,7 +393,7 @@ void ParallelExecutor::ConstructDependencyGraph( for (auto &og : var_names) { if (grads.count(og) != 0) { // is param grad // Insert NCCL AllReduce Op - member_->ops_.emplace_back(new NCCLAllReduceOpHandle()); + member_->ops_.emplace_back(new NCCLAllReduceOpHandle(member_)); auto *op_handle = member_->ops_.back().get(); for (auto &pair : member_->local_scopes_) { From e8a7e5d1e6e854ab542644f1df7ae90c8565cc5b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 16:35:56 +0800 Subject: [PATCH 025/314] Update --- paddle/fluid/framework/parallel_executor.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ec5eb57910..5870eac811 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -250,6 +250,8 @@ struct NCCLAllReduceOpHandle : public OpHandle { int dtype = -1; size_t numel = 0; + platform::dynload::ncclGroupStart(); + for (auto &p : member_->places_) { int dev_id = boost::get(p).device; @@ -266,11 +268,12 @@ struct NCCLAllReduceOpHandle : public OpHandle { auto &nccl_ctx = member_->communication_streams_.at(dev_id); - ncclAllReduce(buffer, buffer, numel, static_cast(dtype), - ncclSum, nccl_ctx.comm, nccl_ctx.stream()); + platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(dtype), ncclSum, + nccl_ctx.comm, nccl_ctx.stream()); } - ncclGroupEnd(); + platform::dynload::ncclGroupEnd(); } } }; From b2c7a9b82850c2e4ffaf7027e82f49fa463defc5 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 16:43:49 +0800 Subject: [PATCH 026/314] Wait by stream --- paddle/fluid/framework/parallel_executor.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 5870eac811..d46adf291b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -77,7 +77,7 @@ struct OpHandle { virtual ~OpHandle() {} virtual void Run() { PADDLE_THROW("Not implemented"); } - virtual void Wait() {} + virtual void Wait(platform::DeviceContext *waited_dev) {} }; struct ComputationOpHandle : public OpHandle { @@ -97,13 +97,17 @@ struct ComputationOpHandle : public OpHandle { auto *cur_ctx = dev_ctx_[place_]; for (auto *in : inputs_) { if (in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx) { - in->generated_op_->Wait(); + in->generated_op_->Wait(cur_ctx); } } op_->Run(*scope_, place_); LOG(INFO) << "Done " << this; } + + void Wait(platform::DeviceContext *waited_dev) override { + this->dev_ctx_.at(place_)->Wait(); + } }; struct ScaleLossGradOpHandle : public OpHandle { @@ -136,6 +140,10 @@ struct ScaleLossGradOpHandle : public OpHandle { ->stream()); } } + + void Wait(platform::DeviceContext *waited_dev) override { + this->dev_ctx_.at(place_)->Wait(); + } }; class ParallelExecutorPrivate { @@ -276,6 +284,10 @@ struct NCCLAllReduceOpHandle : public OpHandle { platform::dynload::ncclGroupEnd(); } } + + void Wait(platform::DeviceContext *waited_dev) override { + this->dev_ctx_.at(waited_dev->GetPlace())->Wait(); + } }; ParallelExecutor::ParallelExecutor( From 254d7ff4f5e5793d44aecde15ee375ec76d4ea4b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 16 Mar 2018 17:23:43 +0800 Subject: [PATCH 027/314] Refactor local_scopes --- paddle/fluid/framework/parallel_executor.cc | 76 ++++++++------------- 1 file changed, 28 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d46adf291b..edc24cc131 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -151,11 +151,10 @@ class ParallelExecutorPrivate { explicit ParallelExecutorPrivate(size_t num_threads = 12) : pool_(num_threads) {} - std::unordered_map - local_scopes_; - std::vector places_; + std::vector local_scopes_; + #ifdef PADDLE_WITH_CUDA struct NCCLContext { std::unique_ptr ctx_; @@ -260,10 +259,11 @@ struct NCCLAllReduceOpHandle : public OpHandle { platform::dynload::ncclGroupStart(); - for (auto &p : member_->places_) { + for (size_t i = 0; i < member_->local_scopes_.size(); ++i) { + auto &p = member_->places_[i]; + auto *s = member_->local_scopes_[i]; int dev_id = boost::get(p).device; - Scope *s = member_->local_scopes_[p]; auto &lod_tensor = s->FindVar(var_name)->Get(); void *buffer = const_cast(lod_tensor.data()); if (dtype == -1) { @@ -302,8 +302,8 @@ ParallelExecutor::ParallelExecutor( Executor exe(places[0]); exe.Run(startup_program, scope, 0); // Create local scopes - for (auto &place : places) { - member_->local_scopes_[place] = &scope->NewScope(); + for (size_t i = 0; i < member_->places_.size(); ++i) { + member_->local_scopes_.push_back(&scope->NewScope()); } member_->main_place_ = places[0]; @@ -320,9 +320,7 @@ ParallelExecutor::ParallelExecutor( ConstructDependencyGraph(params, main_program, loss_var_name); // Step 3. Create vars in each scope; - for (auto &pair : member_->local_scopes_) { - auto *scope = pair.second; - + for (auto *scope : member_->local_scopes_) { for (auto *var : main_program.Block(0).AllVars()) { if (scope->FindVar(var->Name()) != nullptr) { continue; @@ -353,46 +351,44 @@ void ParallelExecutor::ConstructDependencyGraph( } } - for (auto &pair : member_->local_scopes_) { - member_->ops_.emplace_back( - new ComputationOpHandle(*op, pair.second, pair.first)); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto &p = member_->places_[i]; + auto *s = member_->local_scopes_[i]; + + member_->ops_.emplace_back(new ComputationOpHandle(*op, s, p)); auto *op_handle = member_->ops_.back().get(); - op_handle->dev_ctx_[pair.first] = const_cast( - platform::DeviceContextPool::Instance().Get(pair.first)); + op_handle->dev_ctx_[p] = const_cast( + platform::DeviceContextPool::Instance().Get(p)); auto var_names = op->InputArgumentNames(); for (auto &each_var_name : var_names) { - auto &place = pair.first; - VarHandle *var = GetVarHandle(each_var_name, place); + VarHandle *var = GetVarHandle(each_var_name, p); op_handle->inputs_.emplace_back(var); var->pending_ops_.emplace_back(op_handle); } var_names = op->OutputArgumentNames(); for (auto &each_var_name : var_names) { - auto &place = pair.first; - GenerateVar(op_handle, each_var_name, place); + GenerateVar(op_handle, each_var_name, p); } if (is_forwarding) { if (var_names.size() == 1 && var_names[0] == loss_var_name) { // Insert ScaleCost OpHandle member_->ops_.emplace_back(new ScaleLossGradOpHandle( - this->member_->local_scopes_.size(), pair.second, pair.first)); + this->member_->local_scopes_.size(), s, p)); op_handle = member_->ops_.back().get(); - op_handle->dev_ctx_[pair.first] = - member_->CommunicationDevCtx(pair.first); + op_handle->dev_ctx_[p] = member_->CommunicationDevCtx(p); - auto &place = pair.first; // FIXME: Currently ScaleLossGradOp only use device_count as scale // factor. So it does not depend on any other operators. // VarHandle *loss = GetVarHandle(loss_var_name, place); // loss->pending_ops_.emplace_back(op_handle); // op_handle->inputs_.emplace_back(loss); - GenerateVar(op_handle, loss_var_name + "@GRAD", place); + GenerateVar(op_handle, loss_var_name + "@GRAD", p); change_forward = true; LOG(INFO) << "Scale Loss " << op_handle->DebugString(); } @@ -411,9 +407,9 @@ void ParallelExecutor::ConstructDependencyGraph( member_->ops_.emplace_back(new NCCLAllReduceOpHandle(member_)); auto *op_handle = member_->ops_.back().get(); - for (auto &pair : member_->local_scopes_) { - auto &place = pair.first; - auto &vars = member_->vars_[place][og]; + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto &p = member_->places_[i]; + auto &vars = member_->vars_[p][og]; if (vars.empty()) { // This device has no data. continue. continue; @@ -422,16 +418,13 @@ void ParallelExecutor::ConstructDependencyGraph( op_handle->inputs_.emplace_back(prev_grad); prev_grad->pending_ops_.emplace_back(op_handle); auto &var = vars[vars.size()]; - var.place_ = place; + var.place_ = p; var.generated_op_ = op_handle; var.name_ = og; var.version_ = vars.size() - 1; op_handle->outputs_.emplace_back(&var); - for (auto &pair : member_->local_scopes_) { - op_handle->dev_ctx_[pair.first] = - member_->CommunicationDevCtx(pair.first); - } + op_handle->dev_ctx_[p] = member_->CommunicationDevCtx(p); } } } @@ -529,7 +522,7 @@ VarHandle *ParallelExecutor::GetVarHandle(const std::string &each_var_name, void ParallelExecutor::BCastParamsToGPUs( const ProgramDesc &startup_program) const { #ifdef PADDLE_WITH_CUDA - auto *main_scope = member_->local_scopes_[member_->main_place_]; + auto *main_scope = member_->local_scopes_[0]; for (auto *var_desc : startup_program.Block(0).AllVars()) { if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { @@ -547,7 +540,7 @@ void ParallelExecutor::BCastParamsToGPUs( if (i == 0) { buffer = const_cast(main_tensor.data()); } else { - auto local_scope = member_->local_scopes_[place]; + auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var_desc->Name())->GetMutable(); t->Resize(dims); buffer = t->mutable_data(place, main_tensor.type()); @@ -560,18 +553,6 @@ void ParallelExecutor::BCastParamsToGPUs( platform::dynload::ncclGroupEnd(); } } - - // Debug code, bias should be 1.0f. - for (auto &pair : member_->local_scopes_) { - member_->GetNCCLCtx(pair.first).ctx_->Wait(); - - auto &b = pair.second->FindVar("fc_0.b_0")->Get(); - framework::LoDTensor cpu; - framework::TensorCopy(b, platform::CPUPlace(), &cpu); - platform::DeviceContextPool::Instance().Get(b.place())->Wait(); - LOG(INFO) << *cpu.data(); - } - #else PADDLE_THROW("Not compiled with CUDA"); #endif @@ -579,8 +560,7 @@ void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BuildNCCLCommunicator() const { #ifdef PADDLE_WITH_CUDA - for (auto &place_pair : member_->local_scopes_) { - auto place = place_pair.first; + for (auto &place : member_->places_) { int dev_id = boost::get(place).device; member_->communication_streams_.emplace( From 45c988d86a43bf34667ce7110972fff8dcaf20de Mon Sep 17 00:00:00 2001 From: sabreshao Date: Fri, 16 Mar 2018 17:27:19 +0800 Subject: [PATCH 028/314] Demostration of cmake refine for HIP support. 1. Add option WITH_AMD_GPU. 2. Add cmake/hip.cmake for HIP toolchain. 3. Some external module such as eigen may need HIP port. 4. Add macro hip_library/hip_binary/hip_test to cmake/generic.cmake. 5. Add one HIP source concat.hip.cu as an example. Each .cu may have its corresponding .hip.cu. --- CMakeLists.txt | 9 + cmake/configure.cmake | 15 +- cmake/external/eigen.cmake | 43 +++- cmake/generic.cmake | 76 ++++++ cmake/hip.cmake | 46 ++++ paddle/fluid/operators/CMakeLists.txt | 3 + paddle/fluid/operators/math/CMakeLists.txt | 6 + paddle/fluid/operators/math/concat.hip.cu | 281 +++++++++++++++++++++ paddle/fluid/pybind/CMakeLists.txt | 21 +- paddle/scripts/docker/build.sh | 4 + 10 files changed, 477 insertions(+), 27 deletions(-) create mode 100644 cmake/hip.cmake create mode 100644 paddle/fluid/operators/math/concat.hip.cu mode change 100644 => 100755 paddle/scripts/docker/build.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 0ec65bac84..399bf50748 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) +option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND}) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) @@ -69,6 +70,9 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() +if(WITH_AMD_GPU) +endif() + if(ANDROID OR IOS) if(ANDROID) if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") @@ -180,6 +184,11 @@ if(WITH_GPU) include(cuda) endif(WITH_GPU) +if(WITH_AMD_GPU) + find_package(HIP) + include(hip) +endif(WITH_AMD_GPU) + if(WITH_MKLML) list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB}) endif() diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 0f76f55270..f726405c47 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -57,11 +57,7 @@ if(NOT WITH_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG) endif(NOT WITH_GOLANG) -if(NOT WITH_GPU) - add_definitions(-DHPPL_STUB_FUNC) - - list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) -else() +if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) FIND_PACKAGE(CUDA REQUIRED) @@ -84,7 +80,14 @@ else() # Include cuda and cudnn include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDA_TOOLKIT_INCLUDE}) -endif(NOT WITH_GPU) +elseif(WITH_AMD_GPU) + add_definitions(-DPADDLE_WITH_HIP) + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__") +else() + add_definitions(-DHPPL_STUB_FUNC) + list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) +endif() if (WITH_MKLML AND MKLML_IOMP_LIB) message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 6a701e076c..5d88c5a0b0 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,21 +1,36 @@ INCLUDE(ExternalProject) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) -SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) -INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) -ExternalProject_Add( - extern_eigen3 - ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 - PREFIX ${EIGEN_SOURCE_DIR} - UPDATE_COMMAND "" - CONFIGURE_COMMAND "" - BUILD_COMMAND "" - INSTALL_COMMAND "" - TEST_COMMAND "" -) +INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) + +if(WITH_AMD_GPU) + ExternalProject_Add( + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" + GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + PREFIX ${EIGEN_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) +else() + ExternalProject_Add( + extern_eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" + GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10 + PREFIX ${EIGEN_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" + ) +endif() if (${CMAKE_VERSION} VERSION_LESS "3.3.0") set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 471e392906..c749c97f13 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -317,6 +317,82 @@ function(nv_test TARGET_NAME) endif() endfunction(nv_test) +function(hip_library TARGET_NAME) + if (WITH_AMD_GPU) + set(options STATIC static SHARED shared) + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(_sources ${hip_library_SRCS}) + HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + if(hip_library_SRCS) + if (hip_library_SHARED OR hip_library_shared) # build *.so + add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) + else() + add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) + target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a) + find_fluid_modules(${TARGET_NAME}) + endif() + if (hip_library_DEPS) + add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) + target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + endif() + # cpplint code style + foreach(source_file ${hip_library_SRCS}) + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() + endforeach() + add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS}) + else(hip_library_SRCS) + if (hip_library_DEPS) + merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) + else() + message(FATAL "Please specify source file or library in nv_library.") + endif() + endif(hip_library_SRCS) + endif() +endfunction(hip_library) + +function(hip_binary TARGET_NAME) + if (WITH_AMD_GPU) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS}) + if(hip_binary_DEPS) + target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS}) + add_dependencies(${TARGET_NAME} ${hip_binary_DEPS}) + endif() + endif() +endfunction(hip_binary) + +function(hip_test TARGET_NAME) + if (WITH_AMD_GPU AND WITH_TESTING) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(_sources ${hip_test_SRCS}) + HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options}) + if(_source_files) + list(REMOVE_ITEM _sources ${_source_files}) + endif() + add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources}) + set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP) + target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_test(${TARGET_NAME} ${TARGET_NAME}) + endif() +endfunction(hip_test) + function(go_library TARGET_NAME) set(options STATIC static SHARED shared) set(oneValueArgs "") diff --git a/cmake/hip.cmake b/cmake/hip.cmake new file mode 100644 index 0000000000..cd880603a7 --- /dev/null +++ b/cmake/hip.cmake @@ -0,0 +1,46 @@ +if(NOT WITH_AMD_GPU) + return() +endif() + +include_directories("/opt/rocm/include") +include_directories("/opt/rocm/hipblas/include") +include_directories("/opt/rocm/hiprand/include") +include_directories("/opt/rocm/rocrand/include") +include_directories("/opt/rocm/rccl/include") +include_directories("/opt/rocm/thrust") + +list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc") + +set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" ) + +if(WITH_DSO) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO") +endif(WITH_DSO) + +if(WITH_DOUBLE) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE") +endif(WITH_DOUBLE) + +if(WITH_TESTING) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") +endif(WITH_TESTING) + +if(CMAKE_BUILD_TYPE STREQUAL "Debug") + list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) +elseif(CMAKE_BUILD_TYPE STREQUAL "Release") +# Disable optimization since one eigen symbol will be removed in math_function.cu + #list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) +elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") + list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) +elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") + list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL}) +endif() + +if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") +endif() + +set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o ") +set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") +set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} -o -shared") + diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d30124d4a3..26d1dab1e9 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -76,6 +76,9 @@ function(op_library TARGET) if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) + elseif (WITH_AMD_GPU) + hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS + ${op_library_DEPS} ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index fba1612d10..1cac62472c 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -6,6 +6,7 @@ function(math_library TARGET) # But it handle split GPU/CPU code and link some common library. set(cc_srcs) set(cu_srcs) + set(hip_srcs) set(math_common_deps device_context framework_proto) set(multiValueArgs DEPS) cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" @@ -17,10 +18,15 @@ function(math_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) + list(APPEND hip_srcs ${TARGET}.hip.cu) + endif() list(LENGTH cc_srcs cc_srcs_len) if (WITH_GPU) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif (WITH_AMD_GPU) + hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) elseif(${cc_srcs_len} GREATER 0) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) endif() diff --git a/paddle/fluid/operators/math/concat.hip.cu b/paddle/fluid/operators/math/concat.hip.cu new file mode 100644 index 0000000000..91efd8ea57 --- /dev/null +++ b/paddle/fluid/operators/math/concat.hip.cu @@ -0,0 +1,281 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "hip/hip_runtime.h" +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__device__ T upper_bound(const T* first, T count, T val) { + const T* orig = first; + const T* it = nullptr; + T step = 0; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (!(val < *it)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first - orig; +} + +template +__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, + const int output_rows, const int output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int segment = upper_bound(input_cols, col_size, tid_x) - 1; + + int curr_offset = input_cols[segment]; + int curr_segment = segment; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + T curr_col_offset; + while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* input_ptr = inputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * segment_width + local_col]; + } +} + +template +__global__ void KernelConcat(T** inputs, const int input_col, + const int output_rows, const int output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + double inv_input_col = 1.0 / input_col; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * inv_input_col; + int in_offset = tid_x - split * input_col; + T* input_ptr = inputs[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * input_col + in_offset]; + } + } +} + +template +__global__ void KernelConcatGrad(const T* input, const int input_row, + const int input_col, const int* output_cols, + int col_size, T** outputs) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int segment = upper_bound(output_cols, col_size, tid_x) - 1; + int curr_offset = output_cols[segment]; + int curr_segment = segment; + for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + T curr_col_offset; + while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input[tid_y * input_col + tid_x]; + } +} + +template +__global__ void KernelConcatGrad(const T* input, const int input_row, + const int input_col, const int output_cols, + T** outputs) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + double inv_input_col = 1.0 / input_col; + for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * inv_input_col; + int in_offset = tid_x - split * input_col; + T* output_ptr = outputs[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * output_cols + in_offset] = + input[tid_y * input_col + tid_x]; + } +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. + */ +template +class ConcatFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const std::vector& input, const int axis, + framework::Tensor* output) { + // TODO(zcd): Add input data validity checking + int num = input.size(); + int rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int cols = input[0].numel() / rows; + int out_rows = rows, out_cols = 0; + + framework::Vector inputs_data(num * sizeof(T*) / 2); + framework::Vector inputs_cols(num + 1); + inputs_cols[0] = 0; + T** inputs_ptr = reinterpret_cast(inputs_data.data()); + + bool sameShape = true; + for (int i = 0; i < num; ++i) { + int t_cols = input[i].numel() / rows; + if (sameShape) { + if (t_cols != cols) sameShape = false; + } + out_cols += t_cols; + inputs_cols[i + 1] = out_cols; + inputs_ptr[i] = const_cast(input[i].data()); + } + + T** ins_gpu = + reinterpret_cast(inputs_data.CUDAMutableData(context.GetPlace())); + const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace()); + + // computation + // set the thread block and grid according to CurrentDeviceId + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((out_cols + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (sameShape) { + hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), + ins_gpu, cols, out_rows, out_cols, output->data()); + } else { + hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), + ins_gpu, ins_col_gpu, static_cast(inputs_cols.size()), out_rows, + out_cols, output->data()); + } + } +}; + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. + */ +template +class ConcatGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, const int axis, + std::vector& outputs) { + // TODO(zcd): Add input data validity checking + int num = outputs.size(); + int input_row = 1; + auto dim_0 = outputs[0].dims(); + for (int i = 0; i < axis; ++i) { + input_row *= dim_0[i]; + } + + int output_col_0 = outputs[0].numel() / input_row; + int input_col = 0; + bool sameShape = true; + + framework::Vector outputs_data(num * sizeof(T*) / 2); + framework::Vector outputs_cols(num + 1); + outputs_cols[0] = 0; + T** outputs_ptr = reinterpret_cast(outputs_data.data()); + + for (int i = 0; i < num; ++i) { + int t_col = outputs[i].numel() / input_row; + if (sameShape) { + if (t_col != output_col_0) sameShape = false; + } + input_col += t_col; + outputs_cols[i + 1] = input_col; + outputs_ptr[i] = outputs[i].data(); + } + + T** outs_gpu = + reinterpret_cast(outputs_data.CUDAMutableData(context.GetPlace())); + const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace()); + + // computation + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((input_col + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((input_col + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (sameShape) { + hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), + input.data(), input_row, input_col, output_col_0, outs_gpu); + } else { + hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), + input.data(), input_row, input_col, outs_col_gpu, + static_cast(outputs_cols.size()), outs_gpu); + } + } +}; + +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; + +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 8942b5c943..d523ad7f73 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,9 +1,16 @@ if(WITH_PYTHON) - cc_library(paddle_pybind SHARED - SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc - DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method - ${GLOB_OP_LIB}) - if(NOT APPLE AND NOT ANDROID) - target_link_libraries(paddle_pybind rt) - endif(NOT APPLE AND NOT ANDROID) + if(WITH_AMD_GPU) + hip_library(paddle_pybind SHARED + SRCS pybind.cc exception.cc protobuf.cc const_value.cc + DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + ${GLOB_OP_LIB}) + else() + cc_library(paddle_pybind SHARED + SRCS pybind.cc exception.cc protobuf.cc const_value.cc + DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method + ${GLOB_OP_LIB}) + if(NOT APPLE AND NOT ANDROID) + target_link_libraries(paddle_pybind rt) + endif(NOT APPLE AND NOT ANDROID) + endif(WITH_AMD_GPU) endif(WITH_PYTHON) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh old mode 100644 new mode 100755 index 6be2bd8fad..02f2d7ba12 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -37,6 +37,7 @@ function cmake_gen() { -DWITH_DSO=ON -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU:-OFF} + -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} @@ -50,6 +51,7 @@ function cmake_gen() { -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_FAST_BUNDLE_TEST=ON + -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== EOF @@ -62,6 +64,7 @@ EOF -DWITH_DSO=ON \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ + -DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ @@ -74,6 +77,7 @@ EOF -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_FAST_BUNDLE_TEST=ON \ + -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON } From 9cb8f503026c6d3d25fa80e34b8fa2ca0bea6d2f Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 14:58:50 +0800 Subject: [PATCH 029/314] Complete fetch op --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/parallel_executor.cc | 123 +++++++++++++++--- paddle/fluid/framework/parallel_executor.h | 3 +- paddle/fluid/operators/math/concat.h | 1 + paddle/fluid/pybind/pybind.cc | 2 +- .../tests/unittests/test_parallel_executor.py | 15 ++- 6 files changed, 124 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index fadc24ae5d..6522a7a69f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -87,7 +87,7 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope - framework_proto backward glog lod_rank_table feed_fetch_method executor simple_threadpool) + framework_proto backward glog lod_rank_table feed_fetch_method executor simple_threadpool concat) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index edc24cc131..cfaa2dbd1f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include "ThreadPool.h" #include "executor.h" #include "lod_tensor.h" +#include "lod_tensor_array.h" #include "op_registry.h" +#include "paddle/fluid/operators/math/concat.h" namespace paddle { namespace framework { @@ -34,7 +36,7 @@ struct VarHandleBase { virtual std::string DebugString() const = 0; OpHandle *generated_op_; - std::vector pending_ops_; + std::unordered_set pending_ops_; }; struct VarHandle : public VarHandleBase { @@ -93,7 +95,6 @@ struct ComputationOpHandle : public OpHandle { void Run() override { // Wait other op if necessary - LOG(INFO) << "Run " << this << " " << DebugString(); auto *cur_ctx = dev_ctx_[place_]; for (auto *in : inputs_) { if (in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx) { @@ -102,7 +103,6 @@ struct ComputationOpHandle : public OpHandle { } op_->Run(*scope_, place_); - LOG(INFO) << "Done " << this; } void Wait(platform::DeviceContext *waited_dev) override { @@ -122,8 +122,6 @@ struct ScaleLossGradOpHandle : public OpHandle { place_(place) {} void Run() override { - LOG(INFO) << "Run Scale Loss Grad"; - std::string var_name = static_cast(this->outputs_[0])->name_; float *tmp = scope_->FindVar(var_name) @@ -146,6 +144,64 @@ struct ScaleLossGradOpHandle : public OpHandle { } }; +struct FetchedData { + public: + std::vector tensors_; + + explicit FetchedData(size_t num_fetched) { tensors_.resize(num_fetched); } +}; + +struct FetchOpHandle : public OpHandle { + std::shared_ptr data_; + size_t offset_; + std::vector *local_scopes_; + std::vector tensors_; + + ~FetchOpHandle() { + for (auto *input_var : inputs_) { + input_var->pending_ops_.erase(this); + } + for (auto &pair : dev_ctx_) { + pair.second->Wait(); + } + + // Lazily merge tensors. Will faster code. + MergeTensors(); + } + + void Run() override { + tensors_.resize(inputs_.size()); + auto *var = static_cast(inputs_[0]); + auto &var_name = var->name_; + platform::CPUPlace cpu; + auto &scopes = *local_scopes_; + + for (size_t i = 0; i < scopes.size(); ++i) { + auto &scope = scopes[i]; + auto &t = scope->FindVar(var_name)->Get(); + if (platform::is_gpu_place(var->place_)) { + TensorCopy(t, cpu, *dev_ctx_[t.place()], &tensors_[i]); + } else { + tensors_[i].ShareDataWith(t); + tensors_[i].set_lod(t.lod()); + } + } + } + + void Wait(platform::DeviceContext *waited_dev) override { + PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error"); + } + + private: + void MergeTensors() const { + std::vector tensors_ptr; + for (auto &t : tensors_) { + tensors_ptr.emplace_back(&t); + } + data_->tensors_[offset_].MergeLoDTensor(tensors_ptr, platform::CPUPlace()); + } +}; + class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(size_t num_threads = 12) @@ -154,6 +210,7 @@ class ParallelExecutorPrivate { std::vector places_; std::vector local_scopes_; + Scope *global_scope_; #ifdef PADDLE_WITH_CUDA struct NCCLContext { @@ -297,7 +354,7 @@ ParallelExecutor::ParallelExecutor( const std::string &loss_var_name, Scope *scope) : member_(new ParallelExecutorPrivate()) { member_->places_ = places; - + member_->global_scope_ = scope; // Step 1. RunStartupProgram and Bcast the params to devs. Executor exe(places[0]); exe.Run(startup_program, scope, 0); @@ -308,9 +365,9 @@ ParallelExecutor::ParallelExecutor( member_->main_place_ = places[0]; // Bcast Parameters to all GPUs + BuildNCCLCommunicator(); if (platform::is_gpu_place(member_->main_place_) && member_->local_scopes_.size() != 1) { // Is CUDA - BuildNCCLCommunicator(); BCastParamsToGPUs(startup_program); } // Startup Program has been run. All local scopes has correct parameters. @@ -365,7 +422,7 @@ void ParallelExecutor::ConstructDependencyGraph( for (auto &each_var_name : var_names) { VarHandle *var = GetVarHandle(each_var_name, p); op_handle->inputs_.emplace_back(var); - var->pending_ops_.emplace_back(op_handle); + var->pending_ops_.emplace(op_handle); } var_names = op->OutputArgumentNames(); @@ -390,7 +447,6 @@ void ParallelExecutor::ConstructDependencyGraph( GenerateVar(op_handle, loss_var_name + "@GRAD", p); change_forward = true; - LOG(INFO) << "Scale Loss " << op_handle->DebugString(); } } } @@ -416,7 +472,7 @@ void ParallelExecutor::ConstructDependencyGraph( } auto *prev_grad = &vars[vars.size() - 1]; op_handle->inputs_.emplace_back(prev_grad); - prev_grad->pending_ops_.emplace_back(op_handle); + prev_grad->pending_ops_.emplace(op_handle); auto &var = vars[vars.size()]; var.place_ = p; var.generated_op_ = op_handle; @@ -463,10 +519,6 @@ void ParallelExecutor::PolishGraphToSupportDataHarzaeds() const { continue; } - LOG(INFO) << "Link " << it_new->second.DebugString() << " From " - << it_old->second.version_ << " To " - << it_new->second.version_; - for (auto *read_op : read_ops) { // Manually add a dependency var from read_op to write_op; if (read_op == write_op) { @@ -479,7 +531,7 @@ void ParallelExecutor::PolishGraphToSupportDataHarzaeds() const { dep_var->generated_op_ = read_op; read_op->outputs_.emplace_back(dep_var); - dep_var->pending_ops_.emplace_back(write_op); + dep_var->pending_ops_.emplace(write_op); write_op->inputs_.emplace_back(dep_var); member_->dep_vars_.emplace(dep_var); } @@ -572,8 +624,9 @@ void ParallelExecutor::BuildNCCLCommunicator() const { #endif } -std::vector ParallelExecutor::Run( - const std::vector &fetch_tensors) { +void ParallelExecutor::Run(const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + auto fetched_data = std::make_shared(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); std::unordered_map pending_vars; @@ -602,6 +655,38 @@ std::vector ParallelExecutor::Run( } } + std::unordered_map> fetched_vars; + + for (auto &fetch_var_name : fetch_tensors) { + for (auto &pair : member_->vars_) { + auto it = pair.second.find(fetch_var_name); + if (it != pair.second.end()) { + fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); + } + } + } + + std::vector fetch_ops; + + for (size_t i = 0; i < fetch_tensors.size(); ++i) { + auto &var_name = fetch_tensors[i]; + auto &vars = fetched_vars[var_name]; + fetch_ops.emplace_back(); + FetchOpHandle *op = &fetch_ops.back(); + op->data_ = fetched_data; + op->offset_ = i; + op->local_scopes_ = &member_->local_scopes_; + for (auto &p : member_->places_) { + op->dev_ctx_[p] = this->member_->GetNCCLCtx(p).ctx_.get(); + } + + for (auto *var : vars) { + var->pending_ops_.emplace(op); + op->inputs_.emplace_back(var); + } + pending_ops.insert({op, op->inputs_.size()}); + } + for (auto *op : to_run) { RunOp(pending_vars, op); } @@ -642,7 +727,9 @@ std::vector ParallelExecutor::Run( RunOp(pending_vars, op); } } - return std::vector(); + fetch_ops.clear(); + *member_->global_scope_->Var(fetched_var_name)->GetMutable() = + fetched_data->tensors_; } void ParallelExecutor::RunOp( diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 30416563f8..e4857f0eef 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -40,7 +40,8 @@ class ParallelExecutor { const ProgramDesc& main_program, const std::string& loss_var_name, Scope* scope); - std::vector Run(const std::vector& fetch_tensors); + void Run(const std::vector& fetch_tensors, + const std::string& fetched_var_name = "fetched_var"); private: ParallelExecutorPrivate* member_; diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h index 22147d79e4..c0e983e4aa 100644 --- a/paddle/fluid/operators/math/concat.h +++ b/paddle/fluid/operators/math/concat.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/tensor.h" namespace paddle { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index c2348d9686..929c343f7a 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -508,7 +508,7 @@ All parameter, weight, gradient are variables in Paddle. new (&self) ParallelExecutor(places, params, startup_program, main_program, loss_var_name, scope); }) - .def("run", [](ParallelExecutor &self) { self.Run({}); }); + .def("run", &ParallelExecutor::Run); BindRecordIOWriter(m); return m.ptr(); diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 2a614700b0..1cea14fb96 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -16,6 +16,7 @@ import unittest import paddle.fluid as fluid import paddle.v2 as paddle import paddle.v2.dataset.mnist as mnist +import numpy class ParallelExecutor(unittest.TestCase): @@ -66,4 +67,16 @@ class ParallelExecutor(unittest.TestCase): act_places, set([p.name for p in main.global_block().iter_parameters()]), startup.desc, main.desc, loss.name, fluid.global_scope()) - exe.run() + exe.run([loss.name], 'fetched_var') + + first_loss = numpy.array(fluid.global_scope().find_var('fetched_var') + .get_lod_tensor_array()[0]) + + for i in xrange(10): + exe.run([], 'fetched_var') + exe.run([loss.name], 'fetched_var') + last_loss = numpy.array(fluid.global_scope().find_var('fetched_var') + .get_lod_tensor_array()[0]) + + print first_loss, last_loss + self.assertGreater(first_loss[0], last_loss[0]) From e18a2697054f02d87d1289f7feed1081cf3599c3 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 15:08:09 +0800 Subject: [PATCH 030/314] Add debug code --- paddle/fluid/framework/parallel_executor.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index cfaa2dbd1f..b3bf2b8fb6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -46,6 +46,8 @@ struct VarHandle : public VarHandleBase { return ss.str(); } + // version field currently is not used, however, just store the version to + // debug easily. size_t version_; std::string name_; platform::Place place_; @@ -742,7 +744,7 @@ void ParallelExecutor::RunOp( auto op_run = [ready_buffer, op, this] { try { - // TODO(yy) Check Previous Op has same dev ctx. + VLOG(10) << op->DebugString(); op->Run(); for (auto *ready : ready_buffer) { *ready = true; From 389ea18a4e95f19cfc78cae6fc46d5096a648a91 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 15:13:04 +0800 Subject: [PATCH 031/314] Debug code --- .../tests/unittests/test_parallel_executor.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 1cea14fb96..e8976ff052 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -71,12 +71,13 @@ class ParallelExecutor(unittest.TestCase): first_loss = numpy.array(fluid.global_scope().find_var('fetched_var') .get_lod_tensor_array()[0]) - - for i in xrange(10): - exe.run([], 'fetched_var') - exe.run([loss.name], 'fetched_var') - last_loss = numpy.array(fluid.global_scope().find_var('fetched_var') - .get_lod_tensor_array()[0]) - - print first_loss, last_loss - self.assertGreater(first_loss[0], last_loss[0]) + print first_loss + # + # for i in xrange(10): + # exe.run([], 'fetched_var') + # exe.run([loss.name], 'fetched_var') + # last_loss = numpy.array(fluid.global_scope().find_var('fetched_var') + # .get_lod_tensor_array()[0]) + # + # print first_loss, last_loss + # self.assertGreater(first_loss[0], last_loss[0]) From f8141d90c845c71cda03df10649b0dfc747f2c1a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 15:16:40 +0800 Subject: [PATCH 032/314] Debug --- paddle/fluid/framework/parallel_executor.cc | 1 + .../tests/unittests/test_parallel_executor.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b3bf2b8fb6..c42101e21a 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -345,6 +345,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { } void Wait(platform::DeviceContext *waited_dev) override { + VLOG(3) << "Wait NCCL AllReduce"; this->dev_ctx_.at(waited_dev->GetPlace())->Wait(); } }; diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index e8976ff052..e156d5b60e 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -72,12 +72,12 @@ class ParallelExecutor(unittest.TestCase): first_loss = numpy.array(fluid.global_scope().find_var('fetched_var') .get_lod_tensor_array()[0]) print first_loss - # - # for i in xrange(10): - # exe.run([], 'fetched_var') - # exe.run([loss.name], 'fetched_var') - # last_loss = numpy.array(fluid.global_scope().find_var('fetched_var') - # .get_lod_tensor_array()[0]) - # - # print first_loss, last_loss - # self.assertGreater(first_loss[0], last_loss[0]) + + for i in xrange(10): + exe.run([], 'fetched_var') + exe.run([loss.name], 'fetched_var') + last_loss = numpy.array(fluid.global_scope().find_var('fetched_var') + .get_lod_tensor_array()[0]) + + print first_loss, last_loss + self.assertGreater(first_loss[0], last_loss[0]) From 09935ab936364257f3172f7cc0986a813057ecd0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 15:24:21 +0800 Subject: [PATCH 033/314] Debug --- paddle/fluid/framework/parallel_executor.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c42101e21a..1782430927 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -345,8 +345,9 @@ struct NCCLAllReduceOpHandle : public OpHandle { } void Wait(platform::DeviceContext *waited_dev) override { - VLOG(3) << "Wait NCCL AllReduce"; - this->dev_ctx_.at(waited_dev->GetPlace())->Wait(); + for (auto &pair : member_->communication_streams_) { + pair.second.ctx_->Wait(); + } } }; From a6e64242d8f73f1a597f2a6634a98453cd07edf1 Mon Sep 17 00:00:00 2001 From: caoying03 Date: Mon, 19 Mar 2018 11:08:33 +0800 Subject: [PATCH 034/314] follow comments. --- paddle/fluid/operators/reshape_op.cc | 64 +++++++++++++++++-------- paddle/fluid/operators/reshape_op.h | 14 +++++- python/paddle/fluid/layers/detection.py | 4 +- python/paddle/fluid/layers/nn.py | 52 ++++++++++++-------- 4 files changed, 91 insertions(+), 43 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index c0d08cc690..489742b492 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -44,22 +44,22 @@ class ReshapeOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", x_dims); } else { ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - - // FIXME(caoying): When shape of the output tensor is determined during - // runtime, LoD information of X will not passed to the output. - if (shape[0] == x_dims[0]) { - // Only pass LoD when the first dimension of output and Input(X) - // are the same. - ctx->ShareLoD("X", /*->*/ "Out"); - } } + + // NOTE: Reshape op cannot reshape an input sequence batch into an output + // sequence batch that has a different number of time steps. + // Here output always shares the LoD information with input. But if + // Attr(shape) contains 0 or -1, the actual output shape can only be + // determined during runtime. The check for wheather it is a valid output + // sequence batch is performed in runtime. + ctx->ShareLoD("X", /*->*/ "Out"); } private: bool ValidateShape(const std::vector &shape, const framework::DDim &input_dim, std::vector &output_shape) const { - // only one dimension canbe set to -1, whose size will be automatically + // only one dimension can be set to -1, whose size will be automatically // infered. const int64_t unknown_index = -1; const auto in_size = framework::product(input_dim); @@ -82,7 +82,7 @@ class ReshapeOp : public framework::OperatorWithKernel { } PADDLE_ENFORCE_LE( neg_dims_idx.size(), 1, - "Only one input dimension of Attr(shape) may be unknown."); + "Only one input dimension of Attr(shape) can be unknown."); output_shape.resize(shape.size(), 0); std::transform(shape.begin(), shape.end(), output_shape.begin(), @@ -113,22 +113,46 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>( "shape", "(std::vector) Target shape of reshape operator."); AddAttr("inplace", - "Change the source tensor's shape without copy memory.") - .SetDefault(true); + "(default: false) Change the source tensor's shape without " + "memory copy. When Attr(inplace) is set true, the output " + "tensor shares memory with Input(X), otherwise, a new output " + "tensor is created, and its data are copied from Input(x).") + .SetDefault(false); AddComment(R"DOC( Reshape Operator. -Reshape Input(X) into the shape specified by Attr(shape). +Reshape Input(X) into the shape specified by Attr(shape). The data in Input(X) +are unchanged. + +Examples: + +1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X) +into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged. + +1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform +Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data +unchanged. In this case, one and only dimension of Attr(shape) can be set to -1, +the value of this dimension is inferred from the total element number of +Input(X) and remaining dimensions. + +1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform +Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data +unchanged. In this case, besides -1, 0 means the actual dimension value is going +to be copied from the corresponding dimension of Input(X). -An example: -Given a 2-D tensor X with 2 rows and 2 columns : [[1, 2], [3, 4]] +Note: -and target shape = [1, 4], the reshape operator will transform -the tensor X into a 2-D tensor: [[1, 2, 3, 4]] +1. One and only one dimension in Attr(shape) can be set -1. In this case, +the actual dimension value will be infered from the total element number of +Input(X) and remaining dimensions. +1. More than one dimensions in Attr(shape) can be set to 0, which means the real +dimension value will be copied from Input(X) at runtime. Note that the index of +0 can not access Rank(X). For example, Input(X) is a 3-D tensor with shape +[2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. -One dimension in the target shape can be set -1, representing that its -size is unknown. In this case, the real dimension will be infered from -the original shape of Input(X) and other dimensions in the target shape. )DOC"); } }; diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 9dbc5cec6b..dd8eaf3e4f 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -24,11 +24,21 @@ template class ReshapeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { - auto* out = ctx.Output("Out"); - auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto* in = ctx.Input("X"); auto out_dims = ValidateShape(ctx.Attr>("shape"), in->dims()); + + if (!in->lod().empty()) { + PADDLE_ENFORCE_EQ( + out_dims[0], in->dims()[0], + "Reshape operator cannot reshape an input sequence batch " + "into an output sequence batch that has a different " + "number of time steps. Please consider using " + "sequence_reshape op."); + } + bool inplace = ctx.Attr("inplace"); if (!inplace) { out->mutable_data(ctx.GetPlace()); diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3ced35d6ce..ec4afa8067 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -130,9 +130,9 @@ def detection_output(loc, code_type='decode_center_size') old_shape = scores.shape - scores = ops.reshape(x=scores, shape=(-1, old_shape[-1])) + scores = nn.reshape(x=scores, shape=(-1, old_shape[-1])) scores = nn.softmax(input=scores) - scores = ops.reshape(x=scores, shape=old_shape) + scores = nn.reshape(x=scores, shape=old_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 48d244f3f6..85693578e1 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3299,13 +3299,35 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): def reshape(x, shape, act=None, inplace=True, name=None): """ - Gives a new shape to Tensor without changing its data. - This layer takes a tensor as input and the attribute shape specifying the - new shape. The shape attribute must be specified. At most one dimension of - the new shape can be -1. In this case, the value is inferred from the size - of the tensor and the remaining dimensions. A dimension could also be 0, - in which case the actual dimension value is going to be copied from the - input tensor. + Gives a new shape to the input Tensor without changing its data. + + This layer takes a tensor and the attribute shape which specifies the + new shape as its inputs. The shape attribute must be given. It cannot be + empty. One and only one dimension of shape can be -1. More than one + dimension of shape can be 0. + + -1 means the value of this dimension is inferred from the total element + number of x and remaining dimensions. + + 0 means the actual dimension value is going to be copied from the + corresponding dimension of x. + + 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + specified by Attr(shape) is [6, 8], the reshape operator will transform x + into a 2-D tensor with shape [6, 8] and leaving x's data unchanged. + + 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will + transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data + unchanged. In this case, one and only dimension of Attr(shape) can be set + to -1, the value of this dimension is inferred from the total element number + of x and remaining dimensions. + + 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will + transform x into a 4-D tensor with shape [2, 4, 3, 2] and leaving x's data + unchanged. In this case, besides -1, 0 means the actual dimension value is + going to be copied from the corresponding dimension of x during runtime. Args: input(variable): The input tensor. @@ -3320,18 +3342,10 @@ def reshape(x, shape, act=None, inplace=True, name=None): Examples: .. code-block:: python - - Given a 2-D tensor X with shape [2 x 2], and the new shape: [1, 4]. - The reshape layer will change tensor X into a 2-D tensor with - shape [1 x 4] with its data unchanged. - - Given a 3-D tensor x with shape [2, 3, 4] and the new shape: [3, -1]. - The reshape layer will change tensor X into a 2-D tensor with shape: - [3 x 8] with its data unchanged. - - Given a 3-D tensor x with shape [2, 3, 8] and the new shape: - [-1, 0, 2, 2]. The reshape layer will change tensor X into a 4-D tensor - with shape [4, 3, 2, 2] with its data unchanged. + data = fluid.layers.data(name='data', shape=[2, 4, 6], dtype='float32') + reshaped = fluid.layers.reshape( + x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True + ) """ From 0023c3bcf52c7bde221a32fb898f52a9aac635c2 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 16:29:41 +0800 Subject: [PATCH 035/314] Use atomic bool --- paddle/fluid/framework/parallel_executor.cc | 6 +++--- paddle/fluid/framework/parallel_executor.h | 5 +++-- paddle/fluid/platform/profiler_test.cc | 9 +++++++++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1782430927..c8dd3f9151 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -633,7 +633,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, auto fetched_data = std::make_shared(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); - std::unordered_map pending_vars; + std::unordered_map> pending_vars; std::unordered_map pending_ops; for (auto &place_pair : member_->vars_) { @@ -737,9 +737,9 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } void ParallelExecutor::RunOp( - std::unordered_map &pending_vars, + std::unordered_map> &pending_vars, OpHandle *op) const { - std::vector ready_buffer; + std::vector *> ready_buffer; for (auto *var : op->outputs_) { ready_buffer.emplace_back(&pending_vars[var]); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index e4857f0eef..c3cebcfc57 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -60,8 +60,9 @@ class ParallelExecutor { void BuildNCCLCommunicator() const; - void RunOp(std::unordered_map& pending_vars, - OpHandle* op) const; + void RunOp( + std::unordered_map>& pending_vars, + OpHandle* op) const; void PolishGraphToSupportDataHarzaeds() const; }; diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index fc77e0f321..366c82bf96 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" +#include "cuda_runtime.h" #include "gtest/gtest.h" TEST(Event, CpuElapsedTime) { @@ -157,3 +158,11 @@ TEST(RecordEvent, RecordEvent) { // Will remove parsing-related code from test later DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler"); } + +TEST(TMP, stream_wait) { + cudaStream_t stream; + cudaStreamCreate(&stream); + cudaStreamSynchronize(stream); + cudaStreamSynchronize(stream); + cudaStreamSynchronize(stream); +} From f52714d391d49230e0cfc630a5fcbb35c06c941a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 16:33:35 +0800 Subject: [PATCH 036/314] Debug --- paddle/fluid/framework/parallel_executor.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c8dd3f9151..1e1a5477a0 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -172,6 +172,10 @@ struct FetchOpHandle : public OpHandle { } void Run() override { + for (auto *input : inputs_) { + input->generated_op_->Wait(nullptr); + } + tensors_.resize(inputs_.size()); auto *var = static_cast(inputs_[0]); auto &var_name = var->name_; From 5957f28b862c154add5bdf1c35b9826d3b77ed39 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 16:39:29 +0800 Subject: [PATCH 037/314] Debug --- paddle/fluid/framework/parallel_executor.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1e1a5477a0..5b483849b1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -714,6 +714,12 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, throw * member_->exception_; } + { + for (auto &pair : pending_vars) { + VLOG(3) << pair.first->DebugString(); + } + } + std::this_thread::yield(); continue; } From 36e0415220312ba9920777f1850d8f18cfa97d36 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 16:59:08 +0800 Subject: [PATCH 038/314] Single Thread --- paddle/fluid/framework/parallel_executor.cc | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 5b483849b1..2898c5ffd9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -714,12 +714,6 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, throw * member_->exception_; } - { - for (auto &pair : pending_vars) { - VLOG(3) << pair.first->DebugString(); - } - } - std::this_thread::yield(); continue; } @@ -768,7 +762,8 @@ void ParallelExecutor::RunOp( } }; - member_->pool_.enqueue(op_run); + op_run(); + // member_->pool_.enqueue(op_run); } } // namespace framework } // namespace paddle From f3e983e49987b32af57e2e7924be8b245041ec4d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 17:08:19 +0800 Subject: [PATCH 039/314] Memory order --- paddle/fluid/framework/parallel_executor.cc | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2898c5ffd9..875b5d8ba7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -702,7 +702,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, while (!pending_ops.empty()) { VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { - if (pair.second) { + if (pair.second.load(std::memory_order_acquire)) { ready_var = pair.first; } } @@ -714,7 +714,6 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, throw * member_->exception_; } - std::this_thread::yield(); continue; } @@ -753,7 +752,7 @@ void ParallelExecutor::RunOp( VLOG(10) << op->DebugString(); op->Run(); for (auto *ready : ready_buffer) { - *ready = true; + ready->store(true, std::memory_order_release); } } catch (platform::EnforceNotMet ex) { member_->exception_.reset(new platform::EnforceNotMet(ex)); @@ -762,8 +761,7 @@ void ParallelExecutor::RunOp( } }; - op_run(); - // member_->pool_.enqueue(op_run); + member_->pool_.enqueue(op_run); } } // namespace framework } // namespace paddle From b57b880b055a0eab250e5092eb6a5b3e9b1b9ee3 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 17:15:45 +0800 Subject: [PATCH 040/314] Debug --- paddle/fluid/framework/parallel_executor.cc | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 875b5d8ba7..b5b1e43abf 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -742,26 +742,29 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, void ParallelExecutor::RunOp( std::unordered_map> &pending_vars, OpHandle *op) const { - std::vector *> ready_buffer; + std::vector *> *ready_buffer = + new std::vector *>(); for (auto *var : op->outputs_) { - ready_buffer.emplace_back(&pending_vars[var]); + ready_buffer->emplace_back(&pending_vars[var]); } auto op_run = [ready_buffer, op, this] { try { VLOG(10) << op->DebugString(); op->Run(); - for (auto *ready : ready_buffer) { + for (auto *ready : *ready_buffer) { ready->store(true, std::memory_order_release); } + delete ready_buffer; } catch (platform::EnforceNotMet ex) { member_->exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { LOG(FATAL) << "Unknown exception catched"; } }; - + VLOG(3) << "Enqueue"; member_->pool_.enqueue(op_run); + VLOG(3) << "Done"; } } // namespace framework } // namespace paddle From b1cb8bbd405ecb602446da0a6e5822d5b696afbd Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 17:20:14 +0800 Subject: [PATCH 041/314] Debug --- paddle/fluid/framework/parallel_executor.cc | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b5b1e43abf..a0bd01e0c8 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -700,13 +700,14 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } while (!pending_ops.empty()) { + VLOG(1) << "1"; VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { if (pair.second.load(std::memory_order_acquire)) { ready_var = pair.first; } } - + VLOG(1) << "1"; if (ready_var == nullptr) { // FIXME use conditional var instead of busy wait. @@ -716,11 +717,11 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, continue; } - + VLOG(1) << "1"; pending_vars.erase(ready_var); - + VLOG(1) << "1"; to_run.clear(); - + VLOG(1) << "1"; for (auto *op : ready_var->pending_ops_) { auto &deps = pending_ops[op]; --deps; @@ -728,13 +729,16 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, to_run.emplace_back(op); } } - + VLOG(1) << "1"; for (auto *op : to_run) { pending_ops.erase(op); RunOp(pending_vars, op); } + VLOG(1) << "1"; } + VLOG(1) << "1"; fetch_ops.clear(); + VLOG(1) << "1"; *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetched_data->tensors_; } From 1f063d0900d79c0d09809419d6393bc2ecebbb2b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 17:30:16 +0800 Subject: [PATCH 042/314] Memorder --- paddle/fluid/framework/parallel_executor.cc | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a0bd01e0c8..7d2ba74086 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -643,14 +643,16 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, for (auto &place_pair : member_->vars_) { for (auto &name_pair : place_pair.second) { for (auto &version_pair : name_pair.second) { - pending_vars[&version_pair.second] = - version_pair.second.generated_op_ == nullptr; + pending_vars[&version_pair.second].store( + version_pair.second.generated_op_ == nullptr, + std::memory_order_relaxed); } } } for (auto &var : member_->dep_vars_) { - pending_vars[var.get()] = var->generated_op_ == nullptr; + pending_vars[var.get()].store(var->generated_op_ == nullptr, + std::memory_order_relaxed); } std::vector to_run; @@ -700,14 +702,12 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } while (!pending_ops.empty()) { - VLOG(1) << "1"; VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { if (pair.second.load(std::memory_order_acquire)) { ready_var = pair.first; } } - VLOG(1) << "1"; if (ready_var == nullptr) { // FIXME use conditional var instead of busy wait. @@ -717,11 +717,8 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, continue; } - VLOG(1) << "1"; pending_vars.erase(ready_var); - VLOG(1) << "1"; to_run.clear(); - VLOG(1) << "1"; for (auto *op : ready_var->pending_ops_) { auto &deps = pending_ops[op]; --deps; @@ -729,16 +726,12 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, to_run.emplace_back(op); } } - VLOG(1) << "1"; for (auto *op : to_run) { pending_ops.erase(op); RunOp(pending_vars, op); } - VLOG(1) << "1"; } - VLOG(1) << "1"; fetch_ops.clear(); - VLOG(1) << "1"; *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetched_data->tensors_; } From 515e516e770e648a6adf41d6aa0bd839b4683007 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 17:36:00 +0800 Subject: [PATCH 043/314] Add more log --- paddle/fluid/framework/parallel_executor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7d2ba74086..57dc663c41 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -747,8 +747,9 @@ void ParallelExecutor::RunOp( auto op_run = [ready_buffer, op, this] { try { - VLOG(10) << op->DebugString(); + VLOG(10) << op->DebugString() << " " << this; op->Run(); + VLOG(10) << "Done " << this; for (auto *ready : *ready_buffer) { ready->store(true, std::memory_order_release); } From 192cc5dd3260bede2ff9cadd90f9249d853f0cf0 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Tue, 13 Mar 2018 11:07:08 -0400 Subject: [PATCH 044/314] Implementation of MKLDNN LRN --- paddle/fluid/operators/lrn_mkldnn_op.cc | 189 ++++++++++++++++++ paddle/fluid/operators/lrn_op.cc | 55 ++++- .../fluid/tests/unittests/test_lrn_op.py | 10 + 3 files changed, 253 insertions(+), 1 deletion(-) create mode 100644 paddle/fluid/operators/lrn_mkldnn_op.cc diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc new file mode 100644 index 0000000000..334597ab05 --- /dev/null +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/lrn_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; + +namespace { +mkldnn::algorithm LRNAlgorithm(const paddle::framework::ExecutionContext& ctx) { + mkldnn::algorithm algorithm = mkldnn::lrn_across_channels; + + std::string algorithm_str = ctx.Attr("algorithm"); + if (algorithm_str == "WITHIN_CHANNEL") { + algorithm = mkldnn::lrn_within_channel; + } + return algorithm; +} +} // namespace + +template +class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(std::is_same::value, + "MKLDNN LRN must use float data."); + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "MKLDNN LRN must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto mid = ctx.Output("MidOut"); + + auto input_data = x->data(); + auto output_data = out->mutable_data(ctx.GetPlace()); + mid->mutable_data(ctx.GetPlace()); + + const std::string key = ctx.op().Output("Out"); + const std::string key_src_memory = key + "@lrn_src_memory"; + const std::string key_pd = key + "@lrn_pd"; + const std::string key_workspace_memory = key + "@lrn_workspace_memory"; + + const int n = ctx.Attr("n"); + const float alpha = ctx.Attr("alpha"); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + + auto algorithm = LRNAlgorithm(ctx); + + auto e_mid = framework::EigenTensor::From(*mid); + e_mid = e_mid.constant(k); + + auto dims = paddle::framework::vectorize2int(x->dims()); + + auto src_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto dst_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto forward_desc = mkldnn::lrn_forward::desc{ + mkldnn::prop_kind::forward, algorithm, src_md, n, alpha, beta, k}; + + auto forward_pd = std::make_shared( + forward_desc, mkldnn_engine); + + dev_ctx.SetBlob(key_pd, forward_pd); + + auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; + auto src_memory = std::make_shared( + src_memory_pd, static_cast(const_cast(input_data))); + + dev_ctx.SetBlob(key_src_memory, src_memory); + auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, + static_cast(output_data)}; + + auto workspace_md = forward_pd->workspace_primitive_desc(); + auto workspace_memory = std::make_shared(workspace_md); + + dev_ctx.SetBlob(key_workspace_memory, workspace_memory); + + auto forward_op = mkldnn::lrn_forward{*forward_pd, *src_memory, + *workspace_memory, dst_memory}; + + std::vector pipeline = {forward_op}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } +}; + +template +class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(std::is_same::value, + "MKLDNN LRN must use float data."); + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "MKLDNN LRN must use CPUPlace."); + + auto x = ctx.Input("X"); + + auto out_grad = ctx.Input(framework::GradVarName("Out")); + auto x_grad = ctx.Output(framework::GradVarName("X")); + + const std::string key = ctx.op().Input("Out"); + const std::string key_src_memory = key + "@lrn_src_memory"; + const std::string key_pd = key + "@lrn_pd"; + const std::string key_workspace_memory = key + "@lrn_workspace_memory"; + + const int n = ctx.Attr("n"); + const float alpha = ctx.Attr("alpha"); + const float beta = ctx.Attr("beta"); + const float k = ctx.Attr("k"); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto x_grad_data = x_grad->mutable_data(ctx.GetPlace()); + auto out_grad_data = out_grad->data(); + + auto dims = paddle::framework::vectorize2int(x->dims()); + + auto src_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto diff_src_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto diff_dst_md = paddle::platform::MKLDNNMemDesc( + dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto diff_dst_memory = + mkldnn::memory{{diff_dst_md, mkldnn_engine}, + static_cast(const_cast(out_grad_data))}; + + auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine}, + static_cast(x_grad_data)}; + + auto algorithm = LRNAlgorithm(ctx); + + auto backward_desc = mkldnn::lrn_backward::desc{ + algorithm, src_md, diff_src_md, n, alpha, beta, k}; + + auto forward_pd = dev_ctx.GetBlob(key_pd); + + auto backward_pd = mkldnn::lrn_backward::primitive_desc{ + backward_desc, mkldnn_engine, + *static_cast(forward_pd.get())}; + + std::shared_ptr workspace_memory = + dev_ctx.GetBlob(key_workspace_memory); + + auto src_memory = dev_ctx.GetBlob(key_src_memory); + auto backward_op = mkldnn::lrn_backward{ + backward_pd, *static_cast(src_memory.get()), + diff_dst_memory, *static_cast(workspace_memory.get()), + diff_src_memory}; + + std::vector pipeline = {backward_op}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(lrn, MKLDNN, paddle::platform::CPUPlace, + ops::LRNMKLDNNOpKernel); +REGISTER_OP_KERNEL(lrn_grad, MKLDNN, paddle::platform::CPUPlace, + ops::LRNMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 692e85dcff..6bd451a118 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/lrn_op.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -135,6 +138,24 @@ class LRNOp : public framework::OperatorWithKernel { ctx->SetOutputDim("MidOut", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); + } }; template @@ -176,6 +197,21 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { "beta is the power number.") .SetDefault(0.75) .GreaterThan(0.0); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + AddAttr("algorithm", + "(string default ACROSS_CHANNELS" + "An optional string: \"ACROSS_CHANNELS\", " + "\"WITHIN_CHANNEL\". Used by MKLDNN library") + .SetDefault("ACROSS_CHANNELS"); AddComment(R"DOC( Local Response Normalization Operator. @@ -223,8 +259,25 @@ class LRNOpGrad : public framework::OperatorWithKernel { auto x_dims = ctx->GetInputDim("X"); ctx->SetOutputDim(framework::GradVarName("X"), x_dims); } -}; + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); + } +}; } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index eaff45cbb2..2268eafdbd 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -87,5 +87,15 @@ class TestLRNOp(OpTest): self.check_grad(['X'], 'Out', max_relative_error=0.01) +class TestLRNMKLDNNOp(TestLRNOp): + def get_attrs(self): + attrs = TestLRNOp.get_attrs(self) + attrs['use_mkldnn'] = True + return attrs + + def test_check_output(self): + self.check_output(atol=0.002) + + if __name__ == "__main__": unittest.main() From c51c446221ce63890a0c099da7f26b9bfa41cb48 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Fri, 16 Mar 2018 10:05:54 -0400 Subject: [PATCH 045/314] Content of GetExpectedKernelType moved to standalone function --- paddle/fluid/operators/lrn_op.cc | 54 ++++++++++++++------------------ 1 file changed, 24 insertions(+), 30 deletions(-) diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 6bd451a118..00db09ece3 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -119,6 +119,26 @@ struct LRNGradFunctor { template struct LRNGradFunctor; template struct LRNGradFunctor; +namespace { + framework::OpKernelType GetExpectedLRNKernel( + const framework::ExecutionContext& ctx) { + framework::LibraryType library_{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } +#endif + + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); + } +} + class LRNOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -140,21 +160,8 @@ class LRNOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - framework::LibraryType library_{framework::LibraryType::kPlain}; -#ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; - } -#endif - - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + const framework::ExecutionContext& ctx) const override { + return GetExpectedLRNKernel(ctx); } }; @@ -261,21 +268,8 @@ class LRNOpGrad : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - framework::LibraryType library_{framework::LibraryType::kPlain}; -#ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; - } -#endif - - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); + const framework::ExecutionContext& ctx) const override { + return GetExpectedLRNKernel(ctx); } }; } // namespace operators From 2d95527527fe3b27e06f254965c8eb4fbacb4abf Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Mon, 19 Mar 2018 06:10:27 -0400 Subject: [PATCH 046/314] Removing WITHIN_CHANNEL algorithm for lrn. CPU lrn operator works only with ACROSS_CHANNELS --- paddle/fluid/operators/lrn_mkldnn_op.cc | 27 ++++++-------------- paddle/fluid/operators/lrn_op.cc | 33 +++++++++++-------------- 2 files changed, 22 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 334597ab05..a2971fcd14 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -22,18 +22,6 @@ namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; -namespace { -mkldnn::algorithm LRNAlgorithm(const paddle::framework::ExecutionContext& ctx) { - mkldnn::algorithm algorithm = mkldnn::lrn_across_channels; - - std::string algorithm_str = ctx.Attr("algorithm"); - if (algorithm_str == "WITHIN_CHANNEL") { - algorithm = mkldnn::lrn_within_channel; - } - return algorithm; -} -} // namespace - template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -64,8 +52,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); - auto algorithm = LRNAlgorithm(ctx); - auto e_mid = framework::EigenTensor::From(*mid); e_mid = e_mid.constant(k); @@ -77,8 +63,13 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dst_md = paddle::platform::MKLDNNMemDesc( dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); - auto forward_desc = mkldnn::lrn_forward::desc{ - mkldnn::prop_kind::forward, algorithm, src_md, n, alpha, beta, k}; + auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward, + mkldnn::lrn_across_channels, + src_md, + n, + alpha, + beta, + k}; auto forward_pd = std::make_shared( forward_desc, mkldnn_engine); @@ -154,10 +145,8 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine}, static_cast(x_grad_data)}; - auto algorithm = LRNAlgorithm(ctx); - auto backward_desc = mkldnn::lrn_backward::desc{ - algorithm, src_md, diff_src_md, n, alpha, beta, k}; + mkldnn::lrn_across_channels, src_md, diff_src_md, n, alpha, beta, k}; auto forward_pd = dev_ctx.GetBlob(key_pd); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 00db09ece3..bd72f0435e 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -120,24 +120,24 @@ template struct LRNGradFunctor; template struct LRNGradFunctor; namespace { - framework::OpKernelType GetExpectedLRNKernel( - const framework::ExecutionContext& ctx) { - framework::LibraryType library_{framework::LibraryType::kPlain}; +framework::OpKernelType GetExpectedLRNKernel( + const framework::ExecutionContext& ctx) { + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_MKLDNN - if (library_ == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library_ = framework::LibraryType::kMKLDNN; - } + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; + } #endif - std::string data_format = ctx.Attr("data_format"); - // TODO(pzelazko-intel): enable MKLDNN layout when it's ready - framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout_, library_); - } + std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready + framework::DataLayout layout_ = framework::StringToDataLayout(data_format); + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), + layout_, library_); } +} // namespace class LRNOp : public framework::OperatorWithKernel { public: @@ -214,11 +214,6 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); - AddAttr("algorithm", - "(string default ACROSS_CHANNELS" - "An optional string: \"ACROSS_CHANNELS\", " - "\"WITHIN_CHANNEL\". Used by MKLDNN library") - .SetDefault("ACROSS_CHANNELS"); AddComment(R"DOC( Local Response Normalization Operator. From ea11a0a8533affaa9681d7859713d07eed8fddd8 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 18:19:39 +0800 Subject: [PATCH 047/314] Use volitie --- paddle/fluid/framework/parallel_executor.cc | 24 +++++++++++---------- paddle/fluid/framework/parallel_executor.h | 5 ++--- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 57dc663c41..450df244b7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -97,6 +97,10 @@ struct ComputationOpHandle : public OpHandle { void Run() override { // Wait other op if necessary + if (platform::is_gpu_place(place_)) { + int dev_id = boost::get(place_).device; + cudaSetDevice(dev_id); + } auto *cur_ctx = dev_ctx_[place_]; for (auto *in : inputs_) { if (in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx) { @@ -637,22 +641,20 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, auto fetched_data = std::make_shared(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); - std::unordered_map> pending_vars; + std::unordered_map pending_vars; std::unordered_map pending_ops; for (auto &place_pair : member_->vars_) { for (auto &name_pair : place_pair.second) { for (auto &version_pair : name_pair.second) { - pending_vars[&version_pair.second].store( - version_pair.second.generated_op_ == nullptr, - std::memory_order_relaxed); + pending_vars[&version_pair.second] = + version_pair.second.generated_op_ == nullptr; } } } for (auto &var : member_->dep_vars_) { - pending_vars[var.get()].store(var->generated_op_ == nullptr, - std::memory_order_relaxed); + pending_vars[var.get()] = var->generated_op_ == nullptr; } std::vector to_run; @@ -704,7 +706,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, while (!pending_ops.empty()) { VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { - if (pair.second.load(std::memory_order_acquire)) { + if (pair.second) { ready_var = pair.first; } } @@ -737,10 +739,10 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } void ParallelExecutor::RunOp( - std::unordered_map> &pending_vars, + std::unordered_map &pending_vars, OpHandle *op) const { - std::vector *> *ready_buffer = - new std::vector *>(); + std::vector *ready_buffer = + new std::vector(); for (auto *var : op->outputs_) { ready_buffer->emplace_back(&pending_vars[var]); } @@ -751,7 +753,7 @@ void ParallelExecutor::RunOp( op->Run(); VLOG(10) << "Done " << this; for (auto *ready : *ready_buffer) { - ready->store(true, std::memory_order_release); + *ready = true; } delete ready_buffer; } catch (platform::EnforceNotMet ex) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index c3cebcfc57..150b429f94 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -60,9 +60,8 @@ class ParallelExecutor { void BuildNCCLCommunicator() const; - void RunOp( - std::unordered_map>& pending_vars, - OpHandle* op) const; + void RunOp(std::unordered_map& pending_vars, + OpHandle* op) const; void PolishGraphToSupportDataHarzaeds() const; }; From a87ce91c4b93561a913a47350043ef6048f29912 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 18:30:15 +0800 Subject: [PATCH 048/314] Use mtx --- paddle/fluid/framework/parallel_executor.cc | 7 +++---- paddle/fluid/framework/parallel_executor.h | 23 ++++++++++++++++++++- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 450df244b7..773e5c0074 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -641,7 +641,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, auto fetched_data = std::make_shared(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); - std::unordered_map pending_vars; + std::unordered_map pending_vars; std::unordered_map pending_ops; for (auto &place_pair : member_->vars_) { @@ -739,10 +739,9 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } void ParallelExecutor::RunOp( - std::unordered_map &pending_vars, + std::unordered_map &pending_vars, OpHandle *op) const { - std::vector *ready_buffer = - new std::vector(); + std::vector *ready_buffer = new std::vector(); for (auto *var : op->outputs_) { ready_buffer->emplace_back(&pending_vars[var]); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 150b429f94..b6fa6fb2d8 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -32,6 +32,27 @@ class ParallelExecutorPrivate; class VarHandle; class OpHandle; class VarHandleBase; + +struct GuardedBool { + public: + GuardedBool() {} + + operator bool() const { + std::lock_guard g(mtx_); + return value_; + } + + GuardedBool& operator=(bool o) { + std::lock_guard g(mtx_); + value_ = o; + return *this; + } + + private: + mutable std::mutex mtx_; + bool value_; +}; + class ParallelExecutor { public: explicit ParallelExecutor(const std::vector& places, @@ -60,7 +81,7 @@ class ParallelExecutor { void BuildNCCLCommunicator() const; - void RunOp(std::unordered_map& pending_vars, + void RunOp(std::unordered_map& pending_vars, OpHandle* op) const; void PolishGraphToSupportDataHarzaeds() const; From a5ba704de060f3e23eac74fcdc3e635c1cf6c2a7 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 18:38:36 +0800 Subject: [PATCH 049/314] Counter --- paddle/fluid/framework/parallel_executor.cc | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 773e5c0074..ab0d9f72f7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -748,9 +748,9 @@ void ParallelExecutor::RunOp( auto op_run = [ready_buffer, op, this] { try { - VLOG(10) << op->DebugString() << " " << this; + VLOG(10) << op->DebugString() << " " << op; op->Run(); - VLOG(10) << "Done " << this; + VLOG(10) << "Done " << op; for (auto *ready : *ready_buffer) { *ready = true; } @@ -761,9 +761,7 @@ void ParallelExecutor::RunOp( LOG(FATAL) << "Unknown exception catched"; } }; - VLOG(3) << "Enqueue"; member_->pool_.enqueue(op_run); - VLOG(3) << "Done"; } } // namespace framework } // namespace paddle From d3e55fde032c08e45c8cab83204d73a27c99cfc8 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 18:40:03 +0800 Subject: [PATCH 050/314] Guard devctx --- paddle/fluid/platform/device_context.cc | 1 + paddle/fluid/platform/device_context.h | 1 + 2 files changed, 2 insertions(+) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 98b4178177..37a77c7ea7 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -159,6 +159,7 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { + std::lock_guard guard(this->mutex_); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 603b890af1..c43207b641 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -110,6 +110,7 @@ class CUDADeviceContext : public DeviceContext { int compute_capability; int multi_process; int max_threads_per_mp; + mutable std::mutex mutex_; }; template <> From 866f6f1be09bc38a8ed3b51bcfc475b52c07a28a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 18:56:15 +0800 Subject: [PATCH 051/314] Debug --- paddle/fluid/framework/parallel_executor.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ab0d9f72f7..08d508d542 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -703,7 +703,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, RunOp(pending_vars, op); } - while (!pending_ops.empty()) { + while (!pending_vars.empty()) { VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { if (pair.second) { @@ -716,6 +716,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, if (member_->exception_) { throw * member_->exception_; } + VLOG(3) << pending_vars.size(); continue; } @@ -748,9 +749,7 @@ void ParallelExecutor::RunOp( auto op_run = [ready_buffer, op, this] { try { - VLOG(10) << op->DebugString() << " " << op; op->Run(); - VLOG(10) << "Done " << op; for (auto *ready : *ready_buffer) { *ready = true; } From 7bff02b2ca6ab5206406bcda10a46448c5f3a71e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:00:34 +0800 Subject: [PATCH 052/314] Change to pending op --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 08d508d542..ac2c878453 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -703,7 +703,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, RunOp(pending_vars, op); } - while (!pending_vars.empty()) { + while (!pending_ops.empty()) { VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { if (pair.second) { @@ -716,8 +716,8 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, if (member_->exception_) { throw * member_->exception_; } - VLOG(3) << pending_vars.size(); + VLOG(3) << pending_vars.size(); continue; } pending_vars.erase(ready_var); From 5fa535b71785cc2abc58f3e0f76a2e7c73dfd497 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:09:45 +0800 Subject: [PATCH 053/314] Wait all thread done --- paddle/fluid/framework/parallel_executor.cc | 16 ++++++++++++---- paddle/fluid/framework/parallel_executor.h | 7 ++++--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index ac2c878453..938f4317b1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -699,8 +699,11 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, pending_ops.insert({op, op->inputs_.size()}); } + std::vector> op_threads; + op_threads.reserve(pending_ops.size() + to_run.size()); + for (auto *op : to_run) { - RunOp(pending_vars, op); + op_threads.emplace_back(RunOp(pending_vars, op)); } while (!pending_ops.empty()) { @@ -731,15 +734,20 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } for (auto *op : to_run) { pending_ops.erase(op); - RunOp(pending_vars, op); + op_threads.emplace_back(RunOp(pending_vars, op)); } } + + for (auto &t : op_threads) { + t.get(); // Join all workers + } + fetch_ops.clear(); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetched_data->tensors_; } -void ParallelExecutor::RunOp( +std::future ParallelExecutor::RunOp( std::unordered_map &pending_vars, OpHandle *op) const { std::vector *ready_buffer = new std::vector(); @@ -760,7 +768,7 @@ void ParallelExecutor::RunOp( LOG(FATAL) << "Unknown exception catched"; } }; - member_->pool_.enqueue(op_run); + return member_->pool_.enqueue(op_run); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index b6fa6fb2d8..badf7c5ea7 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,8 +14,8 @@ limitations under the License. */ #pragma once +#include #include - #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/program_desc.h" @@ -81,8 +81,9 @@ class ParallelExecutor { void BuildNCCLCommunicator() const; - void RunOp(std::unordered_map& pending_vars, - OpHandle* op) const; + std::future RunOp( + std::unordered_map& pending_vars, + OpHandle* op) const; void PolishGraphToSupportDataHarzaeds() const; }; From c7beac142609c89343ab862d9a3695e0c077d4cf Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:18:01 +0800 Subject: [PATCH 054/314] Add dummy var --- paddle/fluid/framework/parallel_executor.cc | 32 +++++++++++---------- paddle/fluid/framework/parallel_executor.h | 5 ++-- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 938f4317b1..2fb274d3a5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -53,6 +53,10 @@ struct VarHandle : public VarHandleBase { platform::Place place_; }; +struct DummyVarHandle : public VarHandleBase { + std::string DebugString() const override { return "dummy"; } +}; + struct DependencyVarHandle : public VarHandleBase { std::string DebugString() const override { return "Dependency Variable"; } }; @@ -643,6 +647,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, member_->exception_.reset(); std::unordered_map pending_vars; std::unordered_map pending_ops; + std::vector dummy_vars; for (auto &place_pair : member_->vars_) { for (auto &name_pair : place_pair.second) { @@ -696,17 +701,21 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, var->pending_ops_.emplace(op); op->inputs_.emplace_back(var); } + + dummy_vars.emplace_back(); + auto *var = &dummy_vars.back(); + op->outputs_.emplace_back(var); + var->generated_op_ = op; + pending_vars[var] = false; + pending_ops.insert({op, op->inputs_.size()}); } - std::vector> op_threads; - op_threads.reserve(pending_ops.size() + to_run.size()); - for (auto *op : to_run) { - op_threads.emplace_back(RunOp(pending_vars, op)); + RunOp(pending_vars, op); } - while (!pending_ops.empty()) { + while (!pending_vars.empty()) { VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { if (pair.second) { @@ -715,12 +724,9 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } if (ready_var == nullptr) { // FIXME use conditional var instead of busy wait. - if (member_->exception_) { throw * member_->exception_; } - - VLOG(3) << pending_vars.size(); continue; } pending_vars.erase(ready_var); @@ -734,20 +740,16 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } for (auto *op : to_run) { pending_ops.erase(op); - op_threads.emplace_back(RunOp(pending_vars, op)); + RunOp(pending_vars, op); } } - for (auto &t : op_threads) { - t.get(); // Join all workers - } - fetch_ops.clear(); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetched_data->tensors_; } -std::future ParallelExecutor::RunOp( +void ParallelExecutor::RunOp( std::unordered_map &pending_vars, OpHandle *op) const { std::vector *ready_buffer = new std::vector(); @@ -768,7 +770,7 @@ std::future ParallelExecutor::RunOp( LOG(FATAL) << "Unknown exception catched"; } }; - return member_->pool_.enqueue(op_run); + member_->pool_.enqueue(op_run); } } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index badf7c5ea7..8fe93fb62e 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -81,9 +81,8 @@ class ParallelExecutor { void BuildNCCLCommunicator() const; - std::future RunOp( - std::unordered_map& pending_vars, - OpHandle* op) const; + void RunOp(std::unordered_map& pending_vars, + OpHandle* op) const; void PolishGraphToSupportDataHarzaeds() const; }; From 1f53193a630bc3b6289154dd5f5334a45ddb9285 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:22:03 +0800 Subject: [PATCH 055/314] Use atomic code --- paddle/fluid/framework/parallel_executor.cc | 13 ++++++----- paddle/fluid/framework/parallel_executor.h | 25 +++------------------ 2 files changed, 10 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2fb274d3a5..fa6763b5b5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -645,7 +645,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, auto fetched_data = std::make_shared(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); - std::unordered_map pending_vars; + std::unordered_map> pending_vars; std::unordered_map pending_ops; std::vector dummy_vars; @@ -694,7 +694,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, op->offset_ = i; op->local_scopes_ = &member_->local_scopes_; for (auto &p : member_->places_) { - op->dev_ctx_[p] = this->member_->GetNCCLCtx(p).ctx_.get(); + op->dev_ctx_[p] = member_->GetNCCLCtx(p).ctx_.get(); } for (auto *var : vars) { @@ -718,7 +718,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, while (!pending_vars.empty()) { VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { - if (pair.second) { + if (pair.second.load(std::memory_order_consume)) { ready_var = pair.first; } } @@ -750,9 +750,10 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } void ParallelExecutor::RunOp( - std::unordered_map &pending_vars, + std::unordered_map> &pending_vars, OpHandle *op) const { - std::vector *ready_buffer = new std::vector(); + std::vector *> *ready_buffer = + new std::vector *>(); for (auto *var : op->outputs_) { ready_buffer->emplace_back(&pending_vars[var]); } @@ -761,7 +762,7 @@ void ParallelExecutor::RunOp( try { op->Run(); for (auto *ready : *ready_buffer) { - *ready = true; + ready->store(true, std::memory_order_release); } delete ready_buffer; } catch (platform::EnforceNotMet ex) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 8fe93fb62e..03bf60b8bc 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -33,26 +33,6 @@ class VarHandle; class OpHandle; class VarHandleBase; -struct GuardedBool { - public: - GuardedBool() {} - - operator bool() const { - std::lock_guard g(mtx_); - return value_; - } - - GuardedBool& operator=(bool o) { - std::lock_guard g(mtx_); - value_ = o; - return *this; - } - - private: - mutable std::mutex mtx_; - bool value_; -}; - class ParallelExecutor { public: explicit ParallelExecutor(const std::vector& places, @@ -81,8 +61,9 @@ class ParallelExecutor { void BuildNCCLCommunicator() const; - void RunOp(std::unordered_map& pending_vars, - OpHandle* op) const; + void RunOp( + std::unordered_map>& pending_vars, + OpHandle* op) const; void PolishGraphToSupportDataHarzaeds() const; }; From 3aa7051b980c10eb73c591302f379671540042bd Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:23:40 +0800 Subject: [PATCH 056/314] Remove DevCtx lock --- paddle/fluid/platform/device_context.cc | 1 - paddle/fluid/platform/device_context.h | 1 - 2 files changed, 2 deletions(-) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 37a77c7ea7..98b4178177 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -159,7 +159,6 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { - std::lock_guard guard(this->mutex_); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index c43207b641..603b890af1 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -110,7 +110,6 @@ class CUDADeviceContext : public DeviceContext { int compute_capability; int multi_process; int max_threads_per_mp; - mutable std::mutex mutex_; }; template <> From d7badb3ed2d4fdcc42a81dffedf68e131daf5fdb Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:33:35 +0800 Subject: [PATCH 057/314] Use event to sync stream --- paddle/fluid/framework/parallel_executor.cc | 30 ++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index fa6763b5b5..6777aec488 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -315,9 +315,21 @@ ncclDataType_t ToNCCLDataType(std::type_index type) { struct NCCLAllReduceOpHandle : public OpHandle { ParallelExecutorPrivate *member_; + std::vector events_; explicit NCCLAllReduceOpHandle(ParallelExecutorPrivate *member) - : member_(member) {} + : member_(member) { + events_.resize(member_->places_.size()); + for (auto &ev : events_) { + cudaEventCreateWithFlags(&ev, cudaEventDisableTiming); + } + } + + ~NCCLAllReduceOpHandle() { + for (auto &ev : events_) { + cudaEventDestroy(ev); + } + } void Run() override { if (this->inputs_.size() == 1) { @@ -350,6 +362,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, nccl_ctx.comm, nccl_ctx.stream()); + cudaEventRecord(events_[i], nccl_ctx.stream()); } platform::dynload::ncclGroupEnd(); @@ -357,8 +370,19 @@ struct NCCLAllReduceOpHandle : public OpHandle { } void Wait(platform::DeviceContext *waited_dev) override { - for (auto &pair : member_->communication_streams_) { - pair.second.ctx_->Wait(); + if (platform::is_cpu_place( + waited_dev->GetPlace())) { // Wait by CPU, just sync stream + for (auto &pair : member_->communication_streams_) { + pair.second.ctx_->Wait(); + } + } else { + if (events_.size() > 1) { + auto stream = + static_cast(waited_dev)->stream(); + for (auto &ev : events_) { + cudaStreamWaitEvent(stream, ev, 0); + } + } } } }; From 29cc9f308d151c23ddbaeef69530f3c7c56a6ce4 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:39:13 +0800 Subject: [PATCH 058/314] SetDev for nccl --- paddle/fluid/framework/parallel_executor.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 6777aec488..f7dc833937 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -358,7 +358,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { } auto &nccl_ctx = member_->communication_streams_.at(dev_id); - + cudaSetDevice(dev_id); platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, nccl_ctx.comm, nccl_ctx.stream()); @@ -519,7 +519,6 @@ void ParallelExecutor::ConstructDependencyGraph( var.name_ = og; var.version_ = vars.size() - 1; op_handle->outputs_.emplace_back(&var); - op_handle->dev_ctx_[p] = member_->CommunicationDevCtx(p); } } From 8af57706e216131937b26ddbd83338883de0d5d1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:44:31 +0800 Subject: [PATCH 059/314] Only wait same device --- paddle/fluid/framework/parallel_executor.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f7dc833937..1d9584939f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -315,19 +315,19 @@ ncclDataType_t ToNCCLDataType(std::type_index type) { struct NCCLAllReduceOpHandle : public OpHandle { ParallelExecutorPrivate *member_; - std::vector events_; + std::unordered_map events_; explicit NCCLAllReduceOpHandle(ParallelExecutorPrivate *member) : member_(member) { - events_.resize(member_->places_.size()); - for (auto &ev : events_) { - cudaEventCreateWithFlags(&ev, cudaEventDisableTiming); + for (auto &nccl : member_->communication_streams_) { + cudaEventCreate(&events_[nccl.second.device_id()], + cudaEventDisableTiming); } } ~NCCLAllReduceOpHandle() { for (auto &ev : events_) { - cudaEventDestroy(ev); + cudaEventDestroy(ev.second); } } @@ -362,7 +362,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, nccl_ctx.comm, nccl_ctx.stream()); - cudaEventRecord(events_[i], nccl_ctx.stream()); + cudaEventRecord(events_[dev_id], nccl_ctx.stream()); } platform::dynload::ncclGroupEnd(); @@ -377,11 +377,11 @@ struct NCCLAllReduceOpHandle : public OpHandle { } } else { if (events_.size() > 1) { + int dev_id = + boost::get(waited_dev->GetPlace()).device; auto stream = static_cast(waited_dev)->stream(); - for (auto &ev : events_) { - cudaStreamWaitEvent(stream, ev, 0); - } + cudaStreamWaitEvent(stream, events_[dev_id], 0); } } } From 071043c388990465531c14a3ec7644fb80204f08 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:47:55 +0800 Subject: [PATCH 060/314] Add paddle enforce --- paddle/fluid/framework/parallel_executor.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1d9584939f..2e13b3c8c1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -320,14 +320,14 @@ struct NCCLAllReduceOpHandle : public OpHandle { explicit NCCLAllReduceOpHandle(ParallelExecutorPrivate *member) : member_(member) { for (auto &nccl : member_->communication_streams_) { - cudaEventCreate(&events_[nccl.second.device_id()], - cudaEventDisableTiming); + PADDLE_ENFORCE(cudaEventCreate(&events_[nccl.second.device_id()], + cudaEventDisableTiming)); } } ~NCCLAllReduceOpHandle() { for (auto &ev : events_) { - cudaEventDestroy(ev.second); + PADDLE_ENFORCE(cudaEventDestroy(ev.second)); } } @@ -362,7 +362,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, nccl_ctx.comm, nccl_ctx.stream()); - cudaEventRecord(events_[dev_id], nccl_ctx.stream()); + PADDLE_ENFORCE(cudaEventRecord(events_[dev_id], nccl_ctx.stream())); } platform::dynload::ncclGroupEnd(); @@ -381,7 +381,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { boost::get(waited_dev->GetPlace()).device; auto stream = static_cast(waited_dev)->stream(); - cudaStreamWaitEvent(stream, events_[dev_id], 0); + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events_[dev_id], 0)); } } } From 9824e8f31160e5a7c6723d58060a9e3d515a684a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 19:55:39 +0800 Subject: [PATCH 061/314] Scale loss op use event --- paddle/fluid/framework/parallel_executor.cc | 24 +++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2e13b3c8c1..dc614fc6ba 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -124,12 +124,17 @@ struct ScaleLossGradOpHandle : public OpHandle { float coeff_; Scope *scope_; platform::Place place_; + cudaEvent_t ev_; explicit ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place) : coeff_(static_cast(1.0 / num_dev)), scope_(scope), - place_(place) {} + place_(place) { + PADDLE_ENFORCE(cudaEventCreateWithFlags(&ev_, cudaEventDisableTiming)); + } + + ~ScaleLossGradOpHandle() { PADDLE_ENFORCE(cudaEventDestroy(ev_)); } void Run() override { std::string var_name = static_cast(this->outputs_[0])->name_; @@ -141,16 +146,23 @@ struct ScaleLossGradOpHandle : public OpHandle { if (platform::is_cpu_place(place_)) { *tmp = coeff_; } else { - memory::Copy( - boost::get(place_), tmp, platform::CPUPlace(), - &coeff_, sizeof(float), + auto stream = static_cast(this->dev_ctx_[place_]) - ->stream()); + ->stream(); + memory::Copy(boost::get(place_), tmp, + platform::CPUPlace(), &coeff_, sizeof(float), stream); + PADDLE_ENFORCE(cudaEventRecord(ev_, stream)); } } void Wait(platform::DeviceContext *waited_dev) override { - this->dev_ctx_.at(place_)->Wait(); + if (platform::is_cpu_place(waited_dev->GetPlace())) { + this->dev_ctx_.at(place_)->Wait(); + } else { + auto stream = + static_cast(waited_dev)->stream(); + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev_, 0)); + } } }; From 4a330094f9f3e090847a287bb4fe707852c45fc3 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:04:35 +0800 Subject: [PATCH 062/314] Add log --- paddle/fluid/framework/parallel_executor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dc614fc6ba..94c61461c0 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -795,6 +795,7 @@ void ParallelExecutor::RunOp( auto op_run = [ready_buffer, op, this] { try { + VLOG(10) << op->DebugString(); op->Run(); for (auto *ready : *ready_buffer) { ready->store(true, std::memory_order_release); From bade579826d0e6e82b62b6f0b630dbfee35f65d2 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:08:52 +0800 Subject: [PATCH 063/314] Wait code --- paddle/fluid/framework/parallel_executor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 94c61461c0..bc9035b302 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -193,7 +193,8 @@ struct FetchOpHandle : public OpHandle { void Run() override { for (auto *input : inputs_) { - input->generated_op_->Wait(nullptr); + auto *var = static_cast(input); + var->generated_op_->Wait(this->dev_ctx_[var->place_]); } tensors_.resize(inputs_.size()); From 7fd0d24e0cf185251d861a81eabcda3a37b907fa Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:13:35 +0800 Subject: [PATCH 064/314] Add lgo --- paddle/fluid/framework/parallel_executor.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index bc9035b302..df04cfc461 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -149,9 +149,15 @@ struct ScaleLossGradOpHandle : public OpHandle { auto stream = static_cast(this->dev_ctx_[place_]) ->stream(); + VLOG(3) << "1"; + PADDLE_ENFORCE(cudaGetLastError()); + VLOG(3) << "2"; memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); + PADDLE_ENFORCE(cudaGetLastError()); + VLOG(3) << "3"; PADDLE_ENFORCE(cudaEventRecord(ev_, stream)); + VLOG(3) << "4"; } } From dad7bdabd42ac2eeef7b3cb004ca64b6ad388cde Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:17:32 +0800 Subject: [PATCH 065/314] Add setDev --- paddle/fluid/framework/parallel_executor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index df04cfc461..c3a90149a1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -149,6 +149,7 @@ struct ScaleLossGradOpHandle : public OpHandle { auto stream = static_cast(this->dev_ctx_[place_]) ->stream(); + cudaSetDevice(boost::get(place_).device); VLOG(3) << "1"; PADDLE_ENFORCE(cudaGetLastError()); VLOG(3) << "2"; @@ -163,7 +164,7 @@ struct ScaleLossGradOpHandle : public OpHandle { void Wait(platform::DeviceContext *waited_dev) override { if (platform::is_cpu_place(waited_dev->GetPlace())) { - this->dev_ctx_.at(place_)->Wait(); + dev_ctx_.at(place_)->Wait(); } else { auto stream = static_cast(waited_dev)->stream(); From 932364a27597e141b167694d9ec94e615965cbfc Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:21:50 +0800 Subject: [PATCH 066/314] Sync dev --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c3a90149a1..67e7078fbc 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -155,7 +155,7 @@ struct ScaleLossGradOpHandle : public OpHandle { VLOG(3) << "2"; memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); - PADDLE_ENFORCE(cudaGetLastError()); + PADDLE_ENFORCE(cudaDeviceSynchronize()); VLOG(3) << "3"; PADDLE_ENFORCE(cudaEventRecord(ev_, stream)); VLOG(3) << "4"; From d55a03d916f2a587d5fd9d2eefc750f20813d3b0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:25:00 +0800 Subject: [PATCH 067/314] Scale loss on place --- paddle/fluid/framework/parallel_executor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 67e7078fbc..21d9fd259c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -146,6 +146,7 @@ struct ScaleLossGradOpHandle : public OpHandle { if (platform::is_cpu_place(place_)) { *tmp = coeff_; } else { + VLOG(3) << "Scale loss on place" << place_; auto stream = static_cast(this->dev_ctx_[place_]) ->stream(); From d26f093f9d1f5c3a64f42821cb52fda95b4a54c1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:32:02 +0800 Subject: [PATCH 068/314] Log --- paddle/fluid/framework/parallel_executor.cc | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 21d9fd259c..1a2e6a5f86 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -132,9 +132,13 @@ struct ScaleLossGradOpHandle : public OpHandle { scope_(scope), place_(place) { PADDLE_ENFORCE(cudaEventCreateWithFlags(&ev_, cudaEventDisableTiming)); + VLOG(3) << "Create " << ev_; } - ~ScaleLossGradOpHandle() { PADDLE_ENFORCE(cudaEventDestroy(ev_)); } + ~ScaleLossGradOpHandle() { + VLOG(3) << "Destroy " << ev_; + PADDLE_ENFORCE(cudaEventDestroy(ev_)); + } void Run() override { std::string var_name = static_cast(this->outputs_[0])->name_; @@ -146,20 +150,13 @@ struct ScaleLossGradOpHandle : public OpHandle { if (platform::is_cpu_place(place_)) { *tmp = coeff_; } else { - VLOG(3) << "Scale loss on place" << place_; auto stream = static_cast(this->dev_ctx_[place_]) ->stream(); cudaSetDevice(boost::get(place_).device); - VLOG(3) << "1"; - PADDLE_ENFORCE(cudaGetLastError()); - VLOG(3) << "2"; memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); - PADDLE_ENFORCE(cudaDeviceSynchronize()); - VLOG(3) << "3"; PADDLE_ENFORCE(cudaEventRecord(ev_, stream)); - VLOG(3) << "4"; } } From 99f85a9fbc704424ab99a0327d09f49d46f82be0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:35:07 +0800 Subject: [PATCH 069/314] Set dev --- paddle/fluid/framework/parallel_executor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1a2e6a5f86..b78dc3b8ae 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -131,6 +131,7 @@ struct ScaleLossGradOpHandle : public OpHandle { : coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) { + cudaSetDevice(boost::get(place_).device); PADDLE_ENFORCE(cudaEventCreateWithFlags(&ev_, cudaEventDisableTiming)); VLOG(3) << "Create " << ev_; } From b94ffacbd722b752871715a78cee52a151fd5445 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:38:43 +0800 Subject: [PATCH 070/314] SetDev --- paddle/fluid/framework/parallel_executor.cc | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b78dc3b8ae..3a92494e7e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -132,12 +132,12 @@ struct ScaleLossGradOpHandle : public OpHandle { scope_(scope), place_(place) { cudaSetDevice(boost::get(place_).device); + // Must set device before create event PADDLE_ENFORCE(cudaEventCreateWithFlags(&ev_, cudaEventDisableTiming)); - VLOG(3) << "Create " << ev_; } ~ScaleLossGradOpHandle() { - VLOG(3) << "Destroy " << ev_; + cudaSetDevice(boost::get(place_).device); PADDLE_ENFORCE(cudaEventDestroy(ev_)); } @@ -339,13 +339,15 @@ struct NCCLAllReduceOpHandle : public OpHandle { explicit NCCLAllReduceOpHandle(ParallelExecutorPrivate *member) : member_(member) { for (auto &nccl : member_->communication_streams_) { - PADDLE_ENFORCE(cudaEventCreate(&events_[nccl.second.device_id()], - cudaEventDisableTiming)); + int dev_id = nccl.second.device_id(); + cudaSetDevice(dev_id); + PADDLE_ENFORCE(cudaEventCreate(&events_[dev_id], cudaEventDisableTiming)); } } ~NCCLAllReduceOpHandle() { for (auto &ev : events_) { + cudaSetDevice(ev.first); PADDLE_ENFORCE(cudaEventDestroy(ev.second)); } } From ee697b8b5a8522d2cec7e44520c28dfc43054c67 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:44:12 +0800 Subject: [PATCH 071/314] Larger model --- .../tests/unittests/test_parallel_executor.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index e156d5b60e..148f0ce5bb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -46,12 +46,14 @@ class ParallelExecutor(unittest.TestCase): lod_levels=[0, 0], dtypes=['float32', 'int64']) img, label = fluid.layers.read_file(reader) - hidden = fluid.layers.fc( - img, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) + hidden = img + for _ in xrange(10): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) prediction = fluid.layers.fc(hidden, size=10, act='softmax') loss = fluid.layers.cross_entropy(input=prediction, label=label) loss = fluid.layers.mean(loss) From 48619bc9817c0df92f63e5cbaa5206f7f6ab983b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:45:50 +0800 Subject: [PATCH 072/314] Shrink model --- python/paddle/fluid/tests/unittests/test_parallel_executor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 148f0ce5bb..c0ec6442de 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -47,7 +47,7 @@ class ParallelExecutor(unittest.TestCase): dtypes=['float32', 'int64']) img, label = fluid.layers.read_file(reader) hidden = img - for _ in xrange(10): + for _ in xrange(2): hidden = fluid.layers.fc( hidden, size=200, From c372ce2885684f9d4af26e2e894d70c33e5d4cc8 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 19 Mar 2018 20:54:55 +0800 Subject: [PATCH 073/314] Add event for computational op --- paddle/fluid/framework/parallel_executor.cc | 26 +++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3a92494e7e..f841b3b7fa 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -92,12 +92,22 @@ struct ComputationOpHandle : public OpHandle { std::unique_ptr op_; Scope *scope_; platform::Place place_; + cudaEvent_t event_; explicit ComputationOpHandle(const OpDesc &op_desc, Scope *scope, platform::Place place) : op_(framework::OpRegistry::CreateOp(op_desc)), scope_(scope), - place_(place) {} + place_(place) { + if (platform::is_gpu_place(place)) { + cudaSetDevice(boost::get(place_).device); + cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); + } + } + + ~ComputationOpHandle() { + // FIXME: Destroy Event + } void Run() override { // Wait other op if necessary @@ -113,10 +123,22 @@ struct ComputationOpHandle : public OpHandle { } op_->Run(*scope_, place_); + if (platform::is_gpu_place(place_)) { + auto stream = static_cast(dev_ctx_[place_]) + ->stream(); + PADDLE_ENFORCE(cudaEventRecord(event_, stream)); + } } void Wait(platform::DeviceContext *waited_dev) override { - this->dev_ctx_.at(place_)->Wait(); + if (platform::is_cpu_place(waited_dev->GetPlace()) || + platform::is_cpu_place(place_)) { + this->dev_ctx_.at(place_)->Wait(); + } else { + auto stream = + static_cast(waited_dev)->stream(); + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, event_, 0)); + } } }; From c18c2f6ab01082e14e76fdbcf384f577239bcc0f Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 12:15:06 +0800 Subject: [PATCH 074/314] Sync all computation streams at the end of run --- paddle/fluid/framework/parallel_executor.cc | 12 +++++++++--- paddle/fluid/framework/parallel_executor.h | 2 +- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f841b3b7fa..0f9bc86972 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -482,7 +482,6 @@ void ParallelExecutor::ConstructDependencyGraph( bool is_forwarding = true; for (auto *op : main_program.Block(0).AllOps()) { bool change_forward = false; - if (!is_forwarding) { // FIXME(yy): Do not hard code like this if (op->OutputArgumentNames().size() == 1 && @@ -573,7 +572,7 @@ void ParallelExecutor::ConstructDependencyGraph( Dependency graph has been constructed. However, there are still data harzaeds need to be handled. */ - PolishGraphToSupportDataHarzaeds(); + PolishGraphToSupportDataHazards(); } /** @@ -583,7 +582,7 @@ void ParallelExecutor::ConstructDependencyGraph( * * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) */ -void ParallelExecutor::PolishGraphToSupportDataHarzaeds() const { +void ParallelExecutor::PolishGraphToSupportDataHazards() const { for (auto &place_pair : member_->vars_) { for (auto &name_pair : place_pair.second) { if (name_pair.second.size() <= 1) { @@ -813,6 +812,13 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, fetch_ops.clear(); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetched_data->tensors_; + + // FIXME: + // It could be optimized by using multiple events in an operator. + // Manually sync computation during iter. + for (auto &p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } } void ParallelExecutor::RunOp( diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 03bf60b8bc..cb93c0cd41 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -65,7 +65,7 @@ class ParallelExecutor { std::unordered_map>& pending_vars, OpHandle* op) const; - void PolishGraphToSupportDataHarzaeds() const; + void PolishGraphToSupportDataHazards() const; }; } // namespace framework From d3c82c356e806d17d399f152948dee3c8ac169e8 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 12:18:37 +0800 Subject: [PATCH 075/314] Wait multiple stream --- paddle/fluid/framework/parallel_executor.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 0f9bc86972..f4f5ab6a6f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -816,6 +816,10 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, // FIXME: // It could be optimized by using multiple events in an operator. // Manually sync computation during iter. + for (auto &s : member_->communication_streams_) { + s.second.ctx_->Wait(); + } + for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } From 3da4159f88e8715abb60f6a8c475b4d59b8f3ef6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 12:20:56 +0800 Subject: [PATCH 076/314] Add run iter --- paddle/fluid/framework/parallel_executor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f4f5ab6a6f..1847a4dfa5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -707,6 +707,7 @@ void ParallelExecutor::BuildNCCLCommunicator() const { void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { + VLOG(3) << "Run iter"; auto fetched_data = std::make_shared(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); From 4137bb4eda7692b06b986ed7ede8f09ec2f28fb0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 12:28:40 +0800 Subject: [PATCH 077/314] Add wait --- paddle/fluid/framework/parallel_executor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1847a4dfa5..d3122353af 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -813,7 +813,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, fetch_ops.clear(); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetched_data->tensors_; - + VLOG(3) << "Before Wait"; // FIXME: // It could be optimized by using multiple events in an operator. // Manually sync computation during iter. @@ -824,6 +824,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } + VLOG(3) << "Done wait"; } void ParallelExecutor::RunOp( From d2cb3790e9aecc74cd9915b12346a4c7076f5510 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 12:38:15 +0800 Subject: [PATCH 078/314] Wait all evernts --- paddle/fluid/framework/parallel_executor.cc | 6 +++--- .../paddle/fluid/tests/unittests/test_parallel_executor.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d3122353af..cb1b080eea 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -420,11 +420,11 @@ struct NCCLAllReduceOpHandle : public OpHandle { } } else { if (events_.size() > 1) { - int dev_id = - boost::get(waited_dev->GetPlace()).device; auto stream = static_cast(waited_dev)->stream(); - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, events_[dev_id], 0)); + for (auto &ev : events_) { + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); + } } } } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index c0ec6442de..cabb8e769d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -47,7 +47,7 @@ class ParallelExecutor(unittest.TestCase): dtypes=['float32', 'int64']) img, label = fluid.layers.read_file(reader) hidden = img - for _ in xrange(2): + for _ in xrange(4): hidden = fluid.layers.fc( hidden, size=200, @@ -60,7 +60,7 @@ class ParallelExecutor(unittest.TestCase): adam = fluid.optimizer.Adam() adam.minimize(loss) act_places = [] - for each in [fluid.CUDAPlace(0), fluid.CUDAPlace(1)]: + for each in [fluid.CUDAPlace(0)]: p = fluid.core.Place() p.set_place(each) act_places.append(p) From 8a9de67e179bea067302da949e76d36822ccd9dd Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 12:42:27 +0800 Subject: [PATCH 079/314] Remove wait --- paddle/fluid/framework/parallel_executor.cc | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index cb1b080eea..409cb3fbb9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -813,18 +813,6 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, fetch_ops.clear(); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetched_data->tensors_; - VLOG(3) << "Before Wait"; - // FIXME: - // It could be optimized by using multiple events in an operator. - // Manually sync computation during iter. - for (auto &s : member_->communication_streams_) { - s.second.ctx_->Wait(); - } - - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - VLOG(3) << "Done wait"; } void ParallelExecutor::RunOp( From 3238ce06727d1daadfd5c93c12b7e9073f75e695 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 12:47:01 +0800 Subject: [PATCH 080/314] Add wait --- paddle/fluid/framework/parallel_executor.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 409cb3fbb9..6408ecdd37 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -813,6 +813,16 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, fetch_ops.clear(); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetched_data->tensors_; + // FIXME: + // It could be optimized by using multiple events in an operator. + // Manually sync computation during iter. + for (auto &s : member_->communication_streams_) { + s.second.ctx_->Wait(); + } + + for (auto &p : member_->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } } void ParallelExecutor::RunOp( From e025e284c662ccab9089359eadb07637ae32f19a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 12:56:03 +0800 Subject: [PATCH 081/314] Exchange wait op --- paddle/fluid/framework/parallel_executor.cc | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 6408ecdd37..07dfddfa30 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -810,19 +810,13 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } } - fetch_ops.clear(); - *member_->global_scope_->Var(fetched_var_name)->GetMutable() = - fetched_data->tensors_; - // FIXME: - // It could be optimized by using multiple events in an operator. - // Manually sync computation during iter. - for (auto &s : member_->communication_streams_) { - s.second.ctx_->Wait(); - } - for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } + + fetch_ops.clear(); + *member_->global_scope_->Var(fetched_var_name)->GetMutable() = + fetched_data->tensors_; } void ParallelExecutor::RunOp( From 260cfe3b865d48a09ff903bb1f7816d1d055da73 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 13:08:46 +0800 Subject: [PATCH 082/314] Stop Wait NCCL Stream --- paddle/fluid/framework/parallel_executor.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 07dfddfa30..d0c4d8dd8b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -211,9 +211,6 @@ struct FetchOpHandle : public OpHandle { for (auto *input_var : inputs_) { input_var->pending_ops_.erase(this); } - for (auto &pair : dev_ctx_) { - pair.second->Wait(); - } // Lazily merge tensors. Will faster code. MergeTensors(); From feb569f8ea9808dadce26e9ebdad43d9a7e67587 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 14:59:12 +0800 Subject: [PATCH 083/314] Add log --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d0c4d8dd8b..f9fc35d8ce 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -376,7 +376,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { return; // No need to all reduce when GPU count = 1; } else { auto &var_name = static_cast(this->inputs_[0])->name_; - + VLOG(3) << "Invoke NCCL AllReduce"; int dtype = -1; size_t numel = 0; From e50205e744753f5a6c93f49bd74e00aa7cc642d2 Mon Sep 17 00:00:00 2001 From: sabreshao Date: Tue, 20 Mar 2018 13:46:48 +0800 Subject: [PATCH 084/314] CMake refine for HIP support. 1. Add option WITH_AMD_GPU. 2. Add cmake/hip.cmake for HIP toolchain. 3. Some external module such as eigen may need HIP port. 4. Add macro hip_library/hip_binary/hip_test to cmake/generic.cmake. 5. Add one HIP source concat.hip.cu as an example. Each .cu may have its corresponding .hip.cu. --- CMakeLists.txt | 3 - cmake/external/eigen.cmake | 4 +- cmake/hip.cmake | 3 - paddle/fluid/operators/CMakeLists.txt | 33 ++- paddle/fluid/operators/math/concat.hip.cu | 268 +--------------------- paddle/scripts/docker/build.sh | 4 +- 6 files changed, 33 insertions(+), 282 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 399bf50748..1e11f86d0e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -70,9 +70,6 @@ if(NOT CMAKE_BUILD_TYPE) FORCE) endif() -if(WITH_AMD_GPU) -endif() - if(ANDROID OR IOS) if(ANDROID) if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16") diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 5d88c5a0b0..73d70c34dc 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -1,8 +1,8 @@ INCLUDE(ExternalProject) SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) - -INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) +SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3) +INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR}) if(WITH_AMD_GPU) ExternalProject_Add( diff --git a/cmake/hip.cmake b/cmake/hip.cmake index cd880603a7..bfe491bd6b 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -27,9 +27,6 @@ endif(WITH_TESTING) if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) -elseif(CMAKE_BUILD_TYPE STREQUAL "Release") -# Disable optimization since one eigen symbol will be removed in math_function.cu - #list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}) elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 26d1dab1e9..c0245379ac 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -12,6 +12,8 @@ function(op_library TARGET) set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) set(cc_srcs) set(cu_srcs) + set(hip_cu_srcs) + set(miopen_hip_cc_srcs) set(cu_cc_srcs) set(cudnn_cu_cc_srcs) set(CUDNN_FILE) @@ -36,10 +38,19 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu) + list(APPEND hip_cu_srcs ${TARGET}.hip.cu) + endif() string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}") if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) endif() + if(WITH_AMD_GPU) + string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc) + list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc) + endif() + endif() if(WITH_MKLDNN) string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc) @@ -48,10 +59,14 @@ function(op_library TARGET) endif() else() foreach(src ${op_library_SRCS}) - if (${src} MATCHES ".*\\.cu$") + if (${src} MATCHES ".*\\.hip.cu$") + list(APPEND hip_cu_srcs ${src}) + elseif (${src} MATCHES ".*\\.cu$") list(APPEND cu_srcs ${src}) elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") list(APPEND cudnn_cu_cc_srcs ${src}) + elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$") + list(APPEND miopen_hip_cc_srcs ${src}) elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") @@ -77,8 +92,8 @@ function(op_library TARGET) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) elseif (WITH_AMD_GPU) - hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS - ${op_library_DEPS} ${op_common_deps}) + hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) else() cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) @@ -91,7 +106,7 @@ function(op_library TARGET) endif() endforeach() - # The registration of USE_OP, please refer to paddle/framework/op_registry.h. + # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. # And for detail pybind information, please see generated paddle/pybind/pybind.h. file(READ ${TARGET}.cc TARGET_CONTENT) @@ -117,7 +132,10 @@ function(op_library TARGET) list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) + list(LENGTH hip_cu_srcs hip_cu_srcs_len) + list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len) + if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND + ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -128,6 +146,11 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() + # pybind USE_OP_DEVICE_KERNEL for MIOPEN + if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n") + endif() + # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") diff --git a/paddle/fluid/operators/math/concat.hip.cu b/paddle/fluid/operators/math/concat.hip.cu index 91efd8ea57..eacef04388 100644 --- a/paddle/fluid/operators/math/concat.hip.cu +++ b/paddle/fluid/operators/math/concat.hip.cu @@ -12,270 +12,4 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hip/hip_runtime.h" -#include "paddle/fluid/framework/mixed_vector.h" -#include "paddle/fluid/operators/math/concat.h" -#include "paddle/fluid/platform/cuda_helper.h" - -namespace paddle { -namespace operators { -namespace math { - -template -__device__ T upper_bound(const T* first, T count, T val) { - const T* orig = first; - const T* it = nullptr; - T step = 0; - while (count > 0) { - it = first; - step = count / 2; - it += step; - if (!(val < *it)) { - first = ++it; - count -= step + 1; - } else { - count = step; - } - } - return first - orig; -} - -template -__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, - const int output_rows, const int output_cols, - T* output) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(input_cols, col_size, tid_x) - 1; - - int curr_offset = input_cols[segment]; - int curr_segment = segment; - for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - T curr_col_offset; - while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { - curr_offset = curr_col_offset; - ++curr_segment; - } - - int local_col = tid_x - curr_offset; - int segment_width = curr_col_offset - curr_offset; - T* input_ptr = inputs[curr_segment]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) - output[tid_y * output_cols + tid_x] = - input_ptr[tid_y * segment_width + local_col]; - } -} - -template -__global__ void KernelConcat(T** inputs, const int input_col, - const int output_rows, const int output_cols, - T* output) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - double inv_input_col = 1.0 / input_col; - for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * inv_input_col; - int in_offset = tid_x - split * input_col; - T* input_ptr = inputs[split]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { - output[tid_y * output_cols + tid_x] = - input_ptr[tid_y * input_col + in_offset]; - } - } -} - -template -__global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int* output_cols, - int col_size, T** outputs) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(output_cols, col_size, tid_x) - 1; - int curr_offset = output_cols[segment]; - int curr_segment = segment; - for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { - T curr_col_offset; - while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { - curr_offset = curr_col_offset; - ++curr_segment; - } - - int local_col = tid_x - curr_offset; - int segment_width = curr_col_offset - curr_offset; - T* output_ptr = outputs[curr_segment]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * segment_width + local_col] = - input[tid_y * input_col + tid_x]; - } -} - -template -__global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int output_cols, - T** outputs) { - int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - double inv_input_col = 1.0 / input_col; - for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * inv_input_col; - int in_offset = tid_x - split * input_col; - T* output_ptr = outputs[split]; - int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * output_cols + in_offset] = - input[tid_y * input_col + tid_x]; - } -} - -/* - * All tensors' dimension should be the same and the values of - * each dimension are the same, except the axis dimension. - */ -template -class ConcatFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const std::vector& input, const int axis, - framework::Tensor* output) { - // TODO(zcd): Add input data validity checking - int num = input.size(); - int rows = 1; - auto dim_0 = input[0].dims(); - for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; - } - int cols = input[0].numel() / rows; - int out_rows = rows, out_cols = 0; - - framework::Vector inputs_data(num * sizeof(T*) / 2); - framework::Vector inputs_cols(num + 1); - inputs_cols[0] = 0; - T** inputs_ptr = reinterpret_cast(inputs_data.data()); - - bool sameShape = true; - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; - if (sameShape) { - if (t_cols != cols) sameShape = false; - } - out_cols += t_cols; - inputs_cols[i + 1] = out_cols; - inputs_ptr[i] = const_cast(input[i].data()); - } - - T** ins_gpu = - reinterpret_cast(inputs_data.CUDAMutableData(context.GetPlace())); - const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace()); - - // computation - // set the thread block and grid according to CurrentDeviceId - const int kThreadsPerBlock = 1024; - int block_cols = kThreadsPerBlock; - if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. - block_cols = ((out_cols + 31) >> 5) << 5; - } - int block_rows = kThreadsPerBlock / block_cols; - dim3 block_size = dim3(block_cols, block_rows, 1); - - int max_threads = context.GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - - int grid_cols = - std::min((out_cols + block_cols - 1) / block_cols, max_blocks); - int grid_rows = - std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); - dim3 grid_size = dim3(grid_cols, grid_rows, 1); - - if (sameShape) { - hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), - ins_gpu, cols, out_rows, out_cols, output->data()); - } else { - hipLaunchKernelGGL((KernelConcat), dim3(grid_size), dim3(block_size), 0, context.stream(), - ins_gpu, ins_col_gpu, static_cast(inputs_cols.size()), out_rows, - out_cols, output->data()); - } - } -}; - -/* - * All tensors' dimension should be the same and the values of - * each dimension are the same, except the axis dimension. - */ -template -class ConcatGradFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, const int axis, - std::vector& outputs) { - // TODO(zcd): Add input data validity checking - int num = outputs.size(); - int input_row = 1; - auto dim_0 = outputs[0].dims(); - for (int i = 0; i < axis; ++i) { - input_row *= dim_0[i]; - } - - int output_col_0 = outputs[0].numel() / input_row; - int input_col = 0; - bool sameShape = true; - - framework::Vector outputs_data(num * sizeof(T*) / 2); - framework::Vector outputs_cols(num + 1); - outputs_cols[0] = 0; - T** outputs_ptr = reinterpret_cast(outputs_data.data()); - - for (int i = 0; i < num; ++i) { - int t_col = outputs[i].numel() / input_row; - if (sameShape) { - if (t_col != output_col_0) sameShape = false; - } - input_col += t_col; - outputs_cols[i + 1] = input_col; - outputs_ptr[i] = outputs[i].data(); - } - - T** outs_gpu = - reinterpret_cast(outputs_data.CUDAMutableData(context.GetPlace())); - const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace()); - - // computation - const int kThreadsPerBlock = 1024; - int block_cols = kThreadsPerBlock; - if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. - block_cols = ((input_col + 31) >> 5) << 5; - } - int block_rows = kThreadsPerBlock / block_cols; - dim3 block_size = dim3(block_cols, block_rows, 1); - - int max_threads = context.GetMaxPhysicalThreadCount(); - int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); - - int grid_cols = - std::min((input_col + block_cols - 1) / block_cols, max_blocks); - int grid_rows = - std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); - dim3 grid_size = dim3(grid_cols, grid_rows, 1); - - if (sameShape) { - hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), - input.data(), input_row, input_col, output_col_0, outs_gpu); - } else { - hipLaunchKernelGGL((KernelConcatGrad), dim3(grid_size), dim3(block_size), 0, context.stream(), - input.data(), input_row, input_col, outs_col_gpu, - static_cast(outputs_cols.size()), outs_gpu); - } - } -}; - -template class ConcatFunctor; -template class ConcatFunctor; -template class ConcatFunctor; -template class ConcatFunctor; - -template class ConcatGradFunctor; -template class ConcatGradFunctor; -template class ConcatGradFunctor; -template class ConcatGradFunctor; - -} // namespace math -} // namespace operators -} // namespace paddle +#include diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 02f2d7ba12..a0fc391c7c 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -51,7 +51,7 @@ function cmake_gen() { -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_FAST_BUNDLE_TEST=ON - -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake + -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ======================================== EOF @@ -77,7 +77,7 @@ EOF -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_FAST_BUNDLE_TEST=ON \ - -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ + -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON } From 9b1f4d5d621d2d0d24f884c4afde8e974fd9ed9c Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 15:31:57 +0800 Subject: [PATCH 085/314] After nccl add event --- paddle/fluid/framework/parallel_executor.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f9fc35d8ce..21a19cb5b2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -402,10 +402,13 @@ struct NCCLAllReduceOpHandle : public OpHandle { platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, nccl_ctx.comm, nccl_ctx.stream()); - PADDLE_ENFORCE(cudaEventRecord(events_[dev_id], nccl_ctx.stream())); } - platform::dynload::ncclGroupEnd(); + + for (auto &ev : events_) { + PADDLE_ENFORCE(cudaEventRecord( + ev.second, member_->communication_streams_.at(ev.first).stream())); + } } } From 631aa3d10a33a1fbb52f9c6ec0ebd5022b80ede7 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 15:38:26 +0800 Subject: [PATCH 086/314] Wait all inputs ready --- paddle/fluid/framework/parallel_executor.cc | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 21a19cb5b2..248a1b4a25 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -375,6 +375,12 @@ struct NCCLAllReduceOpHandle : public OpHandle { if (this->inputs_.size() == 1) { return; // No need to all reduce when GPU count = 1; } else { + // Wait input done + for (auto *in : inputs_) { + auto &p = static_cast(in)->place_; + in->generated_op_->Wait(dev_ctx_[p]); + } + auto &var_name = static_cast(this->inputs_[0])->name_; VLOG(3) << "Invoke NCCL AllReduce"; int dtype = -1; From 4185dd48e4bc506d7a579e8b1ed95d1b65336698 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 15:59:05 +0800 Subject: [PATCH 087/314] Disable multi-thread --- paddle/fluid/framework/parallel_executor.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 248a1b4a25..25f8d7afde 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -84,8 +84,8 @@ struct OpHandle { virtual ~OpHandle() {} - virtual void Run() { PADDLE_THROW("Not implemented"); } - virtual void Wait(platform::DeviceContext *waited_dev) {} + virtual void Run() = 0; + virtual void Wait(platform::DeviceContext *waited_dev) = 0; }; struct ComputationOpHandle : public OpHandle { @@ -382,7 +382,6 @@ struct NCCLAllReduceOpHandle : public OpHandle { } auto &var_name = static_cast(this->inputs_[0])->name_; - VLOG(3) << "Invoke NCCL AllReduce"; int dtype = -1; size_t numel = 0; @@ -848,7 +847,8 @@ void ParallelExecutor::RunOp( LOG(FATAL) << "Unknown exception catched"; } }; - member_->pool_.enqueue(op_run); + op_run(); + // member_->pool_.enqueue(op_run); } } // namespace framework } // namespace paddle From 1dd216dc3b7a293bcecda34da00ad1ef8ca6f192 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 16:04:20 +0800 Subject: [PATCH 088/314] Wait bcast param --- paddle/fluid/framework/parallel_executor.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 25f8d7afde..66ad3f33d9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -690,6 +690,10 @@ void ParallelExecutor::BCastParamsToGPUs( } platform::dynload::ncclGroupEnd(); } + + for (auto &stream : member_->communication_streams_) { + stream.second.ctx_->Wait(); + } } #else PADDLE_THROW("Not compiled with CUDA"); From 236b7dd2bde254f83479ca632756b4dfaa1b8bdc Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 20 Mar 2018 14:28:07 +0800 Subject: [PATCH 089/314] add pinned memory --- .../fluid/memory/detail/system_allocator.cc | 41 ++++++++++++++ paddle/fluid/memory/detail/system_allocator.h | 12 +++++ paddle/fluid/memory/memory.cc | 53 ++++++++++++++++--- paddle/fluid/memory/memory.h | 12 +++-- 4 files changed, 107 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 8ac8978120..df9d28ede8 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -119,6 +119,47 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { bool GPUAllocator::UseGpu() const { return true; } +void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { + if (size <= 0) return nullptr; + void* p; + // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size + // of host fallback allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. + + size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; + + if (size > usable) return nullptr; + + cudaError_t result = cudaMallocHost(&p, size); + if (result == cudaSuccess) { + index = 1; + fallback_alloc_size_ += size; + return p; + } + + return nullptr; +} + +void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + PADDLE_ASSERT(index == 1); + + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shutdown. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + if (err != cudaErrorCudartUnloading) { + PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free."); + } +} + +bool CUDAPinnedAllocator::UseGpu() const { return true; } + #endif } // namespace detail diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e93c2c1e32..3e024125fa 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator { size_t gpu_alloc_size_ = 0; size_t fallback_alloc_size_ = 0; }; + +class CUDAPinnedAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t gpu_alloc_size_ = + 0; // TODO(zcd): how to define the upper limit of CUDAPinnedMemory? + size_t fallback_alloc_size_ = 0; +}; #endif } // namespace detail diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc index d07f89439a..c5577587aa 100644 --- a/paddle/fluid/memory/memory.cc +++ b/paddle/fluid/memory/memory.cc @@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() { } template <> -void* Alloc(platform::CPUPlace place, size_t size) { +void* Alloc(platform::CPUPlace place, size_t size, + bool use_pinned) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); VLOG(10) << " pointer=" << p; @@ -46,7 +47,8 @@ void* Alloc(platform::CPUPlace place, size_t size) { } template <> -void Free(platform::CPUPlace place, void* p) { +void Free(platform::CPUPlace place, void* p, + bool use_pinned) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { return as[gpu_id]; } +BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GetCUDADeviceCount(); + as = new BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + as[gpu] = nullptr; + } + } + platform::SetDeviceId(gpu_id); + if (!as[gpu_id]) { + as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + VLOG(10) << "\n\nNOTE: each GPU device use " + << FLAGS_fraction_of_gpu_memory_to_use * 100 + << "% of GPU memory.\n" + << "You can set GFlags environment variable '" + << "FLAGS_fraction_of_gpu_memory_to_use" + << "' to change the fraction of GPU usage.\n\n"; + } + return as[gpu_id]; +} + template <> size_t Used(platform::CUDAPlace place) { return GetGPUBuddyAllocator(place.device)->Used(); } template <> -void* Alloc(platform::CUDAPlace place, size_t size) { - auto* buddy_allocator = GetGPUBuddyAllocator(place.device); - auto* ptr = buddy_allocator->Alloc(size); +void* Alloc(platform::CUDAPlace place, size_t size, + bool use_pinned) { + void* ptr; + if (use_pinned) { + auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device); + ptr = buddy_allocator->Alloc(size); + } else { + auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + ptr = buddy_allocator->Alloc(size); + } + if (ptr == nullptr) { int cur_dev = platform::GetCurrentDeviceId(); platform::SetDeviceId(place.device); @@ -108,8 +142,13 @@ void* Alloc(platform::CUDAPlace place, size_t size) { } template <> -void Free(platform::CUDAPlace place, void* p) { - GetGPUBuddyAllocator(place.device)->Free(p); +void Free(platform::CUDAPlace place, void* p, + bool use_pinned) { + if (use_pinned) { + GetCUDAPinnedBuddyAllocator(place.device)->Free(p); + } else { + GetGPUBuddyAllocator(place.device)->Free(p); + } } #endif diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h index 7c5db815d6..9bc48ac68f 100644 --- a/paddle/fluid/memory/memory.h +++ b/paddle/fluid/memory/memory.h @@ -33,7 +33,7 @@ namespace memory { * address is valid or not. */ template -void* Alloc(Place place, size_t size); +void* Alloc(Place place, size_t size, bool use_pinned = false); /** * \brief Free memory block in one place. @@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size); * */ template -void Free(Place place, void* ptr); +void Free(Place place, void* ptr, bool use_pinned = false); /** * \brief Total size of used memory in one place. @@ -74,11 +74,15 @@ class PODDeleter { static_assert(std::is_pod::value, "T must be POD"); public: - explicit PODDeleter(Place place) : place_(place) {} - void operator()(T* ptr) { Free(place_, static_cast(ptr)); } + explicit PODDeleter(Place place, bool use_pinned = false) + : place_(place), use_pinned_(use_pinned) {} + void operator()(T* ptr) { + Free(place_, static_cast(ptr), use_pinned_); + } private: Place place_; + bool use_pinned_; }; /** From f251a58e852503054eaba612665733b6d34bb7e9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 16:28:09 +0800 Subject: [PATCH 090/314] Use base class manage events --- paddle/fluid/framework/parallel_executor.cc | 156 ++++++++------------ 1 file changed, 60 insertions(+), 96 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 66ad3f33d9..335a063c4b 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -68,6 +68,8 @@ struct OpHandle { platform::PlaceHash> dev_ctx_; + std::unordered_map events_; + std::string DebugString() { std::stringstream ss; ss << "("; @@ -84,32 +86,57 @@ struct OpHandle { virtual ~OpHandle() {} - virtual void Run() = 0; - virtual void Wait(platform::DeviceContext *waited_dev) = 0; + void Run() { + if (events_.empty()) { + for (auto &p : dev_ctx_) { + int dev_id = boost::get(p.first).device; + cudaSetDevice(dev_id); + cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming); + } + } + + RunImpl(); + + for (auto &p : dev_ctx_) { + int dev_id = boost::get(p.first).device; + auto stream = + static_cast(p.second)->stream(); + cudaEventRecord(events_.at(dev_id), stream); + } + } + + virtual void Wait(platform::DeviceContext *waited_dev) { + if (platform::is_cpu_place(waited_dev->GetPlace())) { + for (auto &dev_ctx : dev_ctx_) { + dev_ctx.second->Wait(); + } + } else { + auto stream = + static_cast(waited_dev)->stream(); + + for (auto &ev : events_) { + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); + } + } + } + + protected: + virtual void RunImpl() = 0; }; struct ComputationOpHandle : public OpHandle { std::unique_ptr op_; Scope *scope_; platform::Place place_; - cudaEvent_t event_; explicit ComputationOpHandle(const OpDesc &op_desc, Scope *scope, platform::Place place) : op_(framework::OpRegistry::CreateOp(op_desc)), scope_(scope), - place_(place) { - if (platform::is_gpu_place(place)) { - cudaSetDevice(boost::get(place_).device); - cudaEventCreateWithFlags(&event_, cudaEventDisableTiming); - } - } - - ~ComputationOpHandle() { - // FIXME: Destroy Event - } + place_(place) {} - void Run() override { + protected: + void RunImpl() override { // Wait other op if necessary if (platform::is_gpu_place(place_)) { int dev_id = boost::get(place_).device; @@ -123,22 +150,6 @@ struct ComputationOpHandle : public OpHandle { } op_->Run(*scope_, place_); - if (platform::is_gpu_place(place_)) { - auto stream = static_cast(dev_ctx_[place_]) - ->stream(); - PADDLE_ENFORCE(cudaEventRecord(event_, stream)); - } - } - - void Wait(platform::DeviceContext *waited_dev) override { - if (platform::is_cpu_place(waited_dev->GetPlace()) || - platform::is_cpu_place(place_)) { - this->dev_ctx_.at(place_)->Wait(); - } else { - auto stream = - static_cast(waited_dev)->stream(); - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, event_, 0)); - } } }; @@ -146,7 +157,6 @@ struct ScaleLossGradOpHandle : public OpHandle { float coeff_; Scope *scope_; platform::Place place_; - cudaEvent_t ev_; explicit ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place) @@ -154,16 +164,14 @@ struct ScaleLossGradOpHandle : public OpHandle { scope_(scope), place_(place) { cudaSetDevice(boost::get(place_).device); - // Must set device before create event - PADDLE_ENFORCE(cudaEventCreateWithFlags(&ev_, cudaEventDisableTiming)); } ~ScaleLossGradOpHandle() { cudaSetDevice(boost::get(place_).device); - PADDLE_ENFORCE(cudaEventDestroy(ev_)); } - void Run() override { + protected: + void RunImpl() override { std::string var_name = static_cast(this->outputs_[0])->name_; float *tmp = scope_->FindVar(var_name) @@ -176,20 +184,8 @@ struct ScaleLossGradOpHandle : public OpHandle { auto stream = static_cast(this->dev_ctx_[place_]) ->stream(); - cudaSetDevice(boost::get(place_).device); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); - PADDLE_ENFORCE(cudaEventRecord(ev_, stream)); - } - } - - void Wait(platform::DeviceContext *waited_dev) override { - if (platform::is_cpu_place(waited_dev->GetPlace())) { - dev_ctx_.at(place_)->Wait(); - } else { - auto stream = - static_cast(waited_dev)->stream(); - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev_, 0)); } } }; @@ -216,7 +212,12 @@ struct FetchOpHandle : public OpHandle { MergeTensors(); } - void Run() override { + void Wait(platform::DeviceContext *waited_dev) override { + PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error"); + } + + protected: + void RunImpl() override { for (auto *input : inputs_) { auto *var = static_cast(input); var->generated_op_->Wait(this->dev_ctx_[var->place_]); @@ -240,10 +241,6 @@ struct FetchOpHandle : public OpHandle { } } - void Wait(platform::DeviceContext *waited_dev) override { - PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error"); - } - private: void MergeTensors() const { std::vector tensors_ptr; @@ -256,8 +253,8 @@ struct FetchOpHandle : public OpHandle { class ParallelExecutorPrivate { public: - explicit ParallelExecutorPrivate(size_t num_threads = 12) - : pool_(num_threads) {} + explicit ParallelExecutorPrivate(size_t num_threads = 0) + : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {} std::vector places_; @@ -333,7 +330,7 @@ class ParallelExecutorPrivate { std::vector> ops_; // Use a simpler thread pool, might be faster. - ThreadPool pool_; + std::unique_ptr pool_; std::unique_ptr exception_; }; @@ -353,25 +350,12 @@ ncclDataType_t ToNCCLDataType(std::type_index type) { struct NCCLAllReduceOpHandle : public OpHandle { ParallelExecutorPrivate *member_; - std::unordered_map events_; explicit NCCLAllReduceOpHandle(ParallelExecutorPrivate *member) - : member_(member) { - for (auto &nccl : member_->communication_streams_) { - int dev_id = nccl.second.device_id(); - cudaSetDevice(dev_id); - PADDLE_ENFORCE(cudaEventCreate(&events_[dev_id], cudaEventDisableTiming)); - } - } + : member_(member) {} - ~NCCLAllReduceOpHandle() { - for (auto &ev : events_) { - cudaSetDevice(ev.first); - PADDLE_ENFORCE(cudaEventDestroy(ev.second)); - } - } - - void Run() override { + protected: + void RunImpl() override { if (this->inputs_.size() == 1) { return; // No need to all reduce when GPU count = 1; } else { @@ -403,34 +387,11 @@ struct NCCLAllReduceOpHandle : public OpHandle { } auto &nccl_ctx = member_->communication_streams_.at(dev_id); - cudaSetDevice(dev_id); platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, nccl_ctx.comm, nccl_ctx.stream()); } platform::dynload::ncclGroupEnd(); - - for (auto &ev : events_) { - PADDLE_ENFORCE(cudaEventRecord( - ev.second, member_->communication_streams_.at(ev.first).stream())); - } - } - } - - void Wait(platform::DeviceContext *waited_dev) override { - if (platform::is_cpu_place( - waited_dev->GetPlace())) { // Wait by CPU, just sync stream - for (auto &pair : member_->communication_streams_) { - pair.second.ctx_->Wait(); - } - } else { - if (events_.size() > 1) { - auto stream = - static_cast(waited_dev)->stream(); - for (auto &ev : events_) { - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); - } - } } } }; @@ -851,8 +812,11 @@ void ParallelExecutor::RunOp( LOG(FATAL) << "Unknown exception catched"; } }; - op_run(); - // member_->pool_.enqueue(op_run); + if (member_->pool_) { + member_->pool_->enqueue(op_run); + } else { + op_run(); + } } } // namespace framework } // namespace paddle From ca4b3d25326d0c1f910a1b68e883eac17b1dc143 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 16:37:50 +0800 Subject: [PATCH 091/314] Use 12 threads --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 335a063c4b..344587897f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -253,7 +253,7 @@ struct FetchOpHandle : public OpHandle { class ParallelExecutorPrivate { public: - explicit ParallelExecutorPrivate(size_t num_threads = 0) + explicit ParallelExecutorPrivate(size_t num_threads = 12) : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {} std::vector places_; From 7643c2cbab8d9efb7b0dbb96d1d418abedd7d043 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 16:43:53 +0800 Subject: [PATCH 092/314] Add flag for use event --- paddle/fluid/framework/parallel_executor.cc | 29 ++++++++++++--------- paddle/fluid/framework/parallel_executor.h | 1 + 2 files changed, 17 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 344587897f..121302880c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -86,8 +86,8 @@ struct OpHandle { virtual ~OpHandle() {} - void Run() { - if (events_.empty()) { + void Run(bool use_event) { + if (events_.empty() && use_event) { for (auto &p : dev_ctx_) { int dev_id = boost::get(p.first).device; cudaSetDevice(dev_id); @@ -97,16 +97,18 @@ struct OpHandle { RunImpl(); - for (auto &p : dev_ctx_) { - int dev_id = boost::get(p.first).device; - auto stream = - static_cast(p.second)->stream(); - cudaEventRecord(events_.at(dev_id), stream); + if (use_event) { + for (auto &p : dev_ctx_) { + int dev_id = boost::get(p.first).device; + auto stream = + static_cast(p.second)->stream(); + cudaEventRecord(events_.at(dev_id), stream); + } } } virtual void Wait(platform::DeviceContext *waited_dev) { - if (platform::is_cpu_place(waited_dev->GetPlace())) { + if (platform::is_cpu_place(waited_dev->GetPlace()) && events_.empty()) { for (auto &dev_ctx : dev_ctx_) { dev_ctx.second->Wait(); } @@ -677,7 +679,7 @@ void ParallelExecutor::BuildNCCLCommunicator() const { void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { - VLOG(3) << "Run iter"; + bool use_event = false; auto fetched_data = std::make_shared(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); @@ -748,7 +750,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } for (auto *op : to_run) { - RunOp(pending_vars, op); + RunOp(use_event, pending_vars, op); } while (!pending_vars.empty()) { @@ -776,7 +778,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } for (auto *op : to_run) { pending_ops.erase(op); - RunOp(pending_vars, op); + RunOp(use_event, pending_vars, op); } } @@ -790,6 +792,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } void ParallelExecutor::RunOp( + bool use_event, std::unordered_map> &pending_vars, OpHandle *op) const { std::vector *> *ready_buffer = @@ -798,10 +801,10 @@ void ParallelExecutor::RunOp( ready_buffer->emplace_back(&pending_vars[var]); } - auto op_run = [ready_buffer, op, this] { + auto op_run = [ready_buffer, op, this, use_event] { try { VLOG(10) << op->DebugString(); - op->Run(); + op->Run(use_event); for (auto *ready : *ready_buffer) { ready->store(true, std::memory_order_release); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index cb93c0cd41..2345bffcc7 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -62,6 +62,7 @@ class ParallelExecutor { void BuildNCCLCommunicator() const; void RunOp( + bool use_event, std::unordered_map>& pending_vars, OpHandle* op) const; From fbbcedda01656e8e2183b2e88d5db2dbd2b08c7a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 16:46:55 +0800 Subject: [PATCH 093/314] Fix bug --- paddle/fluid/framework/parallel_executor.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 121302880c..2a1652f749 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -108,14 +108,13 @@ struct OpHandle { } virtual void Wait(platform::DeviceContext *waited_dev) { - if (platform::is_cpu_place(waited_dev->GetPlace()) && events_.empty()) { + if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) { for (auto &dev_ctx : dev_ctx_) { dev_ctx.second->Wait(); } } else { auto stream = static_cast(waited_dev)->stream(); - for (auto &ev : events_) { PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); } From f8f1a963d9508cbdbd37c61554e8ffac9bf4a6ab Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 16:52:20 +0800 Subject: [PATCH 094/314] Add debug code --- paddle/fluid/framework/parallel_executor.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2a1652f749..d1652a3030 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -365,6 +365,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { auto &p = static_cast(in)->place_; in->generated_op_->Wait(dev_ctx_[p]); } + PADDLE_ENFORCE(cudaDeviceSynchronize()); auto &var_name = static_cast(this->inputs_[0])->name_; int dtype = -1; @@ -393,6 +394,8 @@ struct NCCLAllReduceOpHandle : public OpHandle { nccl_ctx.comm, nccl_ctx.stream()); } platform::dynload::ncclGroupEnd(); + + PADDLE_ENFORCE(cudaDeviceSynchronize()); } } }; From 3c9cea597e1e3075f8b56d0c7d11febe1a384033 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 16:58:37 +0800 Subject: [PATCH 095/314] Add more log --- paddle/fluid/framework/parallel_executor.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d1652a3030..24a9dcacf2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -365,6 +365,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { auto &p = static_cast(in)->place_; in->generated_op_->Wait(dev_ctx_[p]); } + VLOG(3) << "Before NCCL"; PADDLE_ENFORCE(cudaDeviceSynchronize()); auto &var_name = static_cast(this->inputs_[0])->name_; @@ -394,8 +395,9 @@ struct NCCLAllReduceOpHandle : public OpHandle { nccl_ctx.comm, nccl_ctx.stream()); } platform::dynload::ncclGroupEnd(); - PADDLE_ENFORCE(cudaDeviceSynchronize()); + + VLOG(3) << "After NCCL"; } } }; From a8bd7b9809a1953396b7f985e6154e42b13b82e6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 17:03:13 +0800 Subject: [PATCH 096/314] Add log --- paddle/fluid/framework/parallel_executor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 24a9dcacf2..e0b75b2342 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -109,6 +109,7 @@ struct OpHandle { virtual void Wait(platform::DeviceContext *waited_dev) { if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) { + VLOG(4) << "I am here"; for (auto &dev_ctx : dev_ctx_) { dev_ctx.second->Wait(); } From e53b6aba63a1635b137a57b15410f2eeda180e8e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 17:06:41 +0800 Subject: [PATCH 097/314] Use no thread --- paddle/fluid/framework/parallel_executor.cc | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index e0b75b2342..31a49575f1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -109,7 +109,6 @@ struct OpHandle { virtual void Wait(platform::DeviceContext *waited_dev) { if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) { - VLOG(4) << "I am here"; for (auto &dev_ctx : dev_ctx_) { dev_ctx.second->Wait(); } @@ -255,7 +254,7 @@ struct FetchOpHandle : public OpHandle { class ParallelExecutorPrivate { public: - explicit ParallelExecutorPrivate(size_t num_threads = 12) + explicit ParallelExecutorPrivate(size_t num_threads = 0) : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {} std::vector places_; @@ -397,8 +396,6 @@ struct NCCLAllReduceOpHandle : public OpHandle { } platform::dynload::ncclGroupEnd(); PADDLE_ENFORCE(cudaDeviceSynchronize()); - - VLOG(3) << "After NCCL"; } } }; From dbed1233823b081071752275bbc770125d08fff0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 17:08:53 +0800 Subject: [PATCH 098/314] Debug --- paddle/fluid/framework/parallel_executor.cc | 3 --- 1 file changed, 3 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 31a49575f1..d3e846d10d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -365,8 +365,6 @@ struct NCCLAllReduceOpHandle : public OpHandle { auto &p = static_cast(in)->place_; in->generated_op_->Wait(dev_ctx_[p]); } - VLOG(3) << "Before NCCL"; - PADDLE_ENFORCE(cudaDeviceSynchronize()); auto &var_name = static_cast(this->inputs_[0])->name_; int dtype = -1; @@ -395,7 +393,6 @@ struct NCCLAllReduceOpHandle : public OpHandle { nccl_ctx.comm, nccl_ctx.stream()); } platform::dynload::ncclGroupEnd(); - PADDLE_ENFORCE(cudaDeviceSynchronize()); } } }; From 4e43b713779971d681b8d224b336bfb29abb67e2 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 17:13:00 +0800 Subject: [PATCH 099/314] Add wait log --- paddle/fluid/framework/parallel_executor.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d3e846d10d..8630e51d0d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -146,6 +146,7 @@ struct ComputationOpHandle : public OpHandle { auto *cur_ctx = dev_ctx_[place_]; for (auto *in : inputs_) { if (in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx) { + VLOG(3) << "Wait " << in->generated_op_->DebugString(); in->generated_op_->Wait(cur_ctx); } } @@ -163,13 +164,9 @@ struct ScaleLossGradOpHandle : public OpHandle { platform::Place place) : coeff_(static_cast(1.0 / num_dev)), scope_(scope), - place_(place) { - cudaSetDevice(boost::get(place_).device); - } + place_(place) {} - ~ScaleLossGradOpHandle() { - cudaSetDevice(boost::get(place_).device); - } + ~ScaleLossGradOpHandle() {} protected: void RunImpl() override { From a0494f8e5548aa0b6493e7205fd890cf3c24df83 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 17:16:06 +0800 Subject: [PATCH 100/314] Mutex lock wait --- paddle/fluid/platform/device_context.cc | 1 + paddle/fluid/platform/device_context.h | 1 + 2 files changed, 2 insertions(+) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 98b4178177..ab02a95f26 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -159,6 +159,7 @@ CUDADeviceContext::~CUDADeviceContext() { Place CUDADeviceContext::GetPlace() const { return place_; } void CUDADeviceContext::Wait() const { + std::lock_guard guard(mutex_); PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); PADDLE_ENFORCE(cudaGetLastError()); } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 603b890af1..df0a427b48 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -103,6 +103,7 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; + mutable std::mutex mutex_; cudaStream_t stream_; cudnnHandle_t cudnn_handle_; cublasHandle_t cublas_handle_; From 1c2b6100b05f99bf8351c3a1124a42e1a3cd83c1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 17:16:36 +0800 Subject: [PATCH 101/314] Add --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8630e51d0d..aa52cbb7bf 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -251,7 +251,7 @@ struct FetchOpHandle : public OpHandle { class ParallelExecutorPrivate { public: - explicit ParallelExecutorPrivate(size_t num_threads = 0) + explicit ParallelExecutorPrivate(size_t num_threads = 12) : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {} std::vector places_; From 798e6907b42a8f60b730d99033a0d5715a6698df Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 18:00:06 +0800 Subject: [PATCH 102/314] Change mem order --- paddle/fluid/framework/parallel_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index aa52cbb7bf..b869097662 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -752,7 +752,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, while (!pending_vars.empty()) { VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { - if (pair.second.load(std::memory_order_consume)) { + if (pair.second.load(std::memory_order_acquire)) { ready_var = pair.first; } } From 95a0d7c7c14f5df4e4a455de76d30b905ee0df22 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 18:05:56 +0800 Subject: [PATCH 103/314] Illegal memory access --- paddle/fluid/framework/parallel_executor.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b869097662..daa19eb17c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -138,15 +138,9 @@ struct ComputationOpHandle : public OpHandle { protected: void RunImpl() override { - // Wait other op if necessary - if (platform::is_gpu_place(place_)) { - int dev_id = boost::get(place_).device; - cudaSetDevice(dev_id); - } auto *cur_ctx = dev_ctx_[place_]; for (auto *in : inputs_) { if (in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx) { - VLOG(3) << "Wait " << in->generated_op_->DebugString(); in->generated_op_->Wait(cur_ctx); } } From ed7727e8f04c215f4ff77f486e46347efe0ad3cd Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 18:17:13 +0800 Subject: [PATCH 104/314] Fix bug in system allocator --- paddle/fluid/memory/detail/system_allocator.cc | 11 +++++++++++ paddle/fluid/memory/detail/system_allocator.h | 3 +++ paddle/fluid/memory/memory.cc | 2 +- 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 8ac8978120..9949d80434 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -79,7 +79,18 @@ void* GPUAllocator::Alloc(size_t& index, size_t size) { // if size is 0. We just make sure it does. if (size <= 0) return nullptr; void* p; + int prev_id; + cudaGetDevice(&prev_id); + if (prev_id != gpu_id_) { + cudaSetDevice(gpu_id_); + } + cudaError_t result = cudaMalloc(&p, size); + + if (prev_id != gpu_id_) { + cudaSetDevice(prev_id); + } + if (result == cudaSuccess) { index = 0; gpu_alloc_size_ += size; diff --git a/paddle/fluid/memory/detail/system_allocator.h b/paddle/fluid/memory/detail/system_allocator.h index e93c2c1e32..c103d08640 100644 --- a/paddle/fluid/memory/detail/system_allocator.h +++ b/paddle/fluid/memory/detail/system_allocator.h @@ -43,6 +43,8 @@ class CPUAllocator : public SystemAllocator { #ifdef PADDLE_WITH_CUDA class GPUAllocator : public SystemAllocator { public: + explicit GPUAllocator(int gpu_id) : gpu_id_(gpu_id) {} + virtual void* Alloc(size_t& index, size_t size); virtual void Free(void* p, size_t size, size_t index); virtual bool UseGpu() const; @@ -50,6 +52,7 @@ class GPUAllocator : public SystemAllocator { private: size_t gpu_alloc_size_ = 0; size_t fallback_alloc_size_ = 0; + int gpu_id_; }; #endif diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc index d07f89439a..1985f1f4e6 100644 --- a/paddle/fluid/memory/memory.cc +++ b/paddle/fluid/memory/memory.cc @@ -69,7 +69,7 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { } platform::SetDeviceId(gpu_id); if (!as[gpu_id]) { - as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator, + as[gpu_id] = new BuddyAllocator(new detail::GPUAllocator(gpu_id), platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE: each GPU device use " From 176277b824ec0c8fad774b731dff176c30ce17cd Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 18:26:28 +0800 Subject: [PATCH 105/314] Add log --- paddle/fluid/memory/memory.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc index 1985f1f4e6..a12cdd45aa 100644 --- a/paddle/fluid/memory/memory.cc +++ b/paddle/fluid/memory/memory.cc @@ -90,6 +90,7 @@ size_t Used(platform::CUDAPlace place) { template <> void* Alloc(platform::CUDAPlace place, size_t size) { auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + VLOG(30) << "Allocating " << size << " bytes on " << place; auto* ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { int cur_dev = platform::GetCurrentDeviceId(); From 1533bf12dfa057bc7e34be540a391cb83d4dc9bb Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 18:38:02 +0800 Subject: [PATCH 106/314] Use event and single thread --- paddle/fluid/framework/parallel_executor.cc | 4 ++-- paddle/fluid/memory/memory.cc | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index daa19eb17c..f1b8a20e41 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -245,7 +245,7 @@ struct FetchOpHandle : public OpHandle { class ParallelExecutorPrivate { public: - explicit ParallelExecutorPrivate(size_t num_threads = 12) + explicit ParallelExecutorPrivate(size_t num_threads = 0) : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {} std::vector places_; @@ -669,7 +669,7 @@ void ParallelExecutor::BuildNCCLCommunicator() const { void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { - bool use_event = false; + bool use_event = true; auto fetched_data = std::make_shared(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc index a12cdd45aa..1985f1f4e6 100644 --- a/paddle/fluid/memory/memory.cc +++ b/paddle/fluid/memory/memory.cc @@ -90,7 +90,6 @@ size_t Used(platform::CUDAPlace place) { template <> void* Alloc(platform::CUDAPlace place, size_t size) { auto* buddy_allocator = GetGPUBuddyAllocator(place.device); - VLOG(30) << "Allocating " << size << " bytes on " << place; auto* ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { int cur_dev = platform::GetCurrentDeviceId(); From ba227df9419bbb2f8b3ac5636674c176cced3f19 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 18:41:57 +0800 Subject: [PATCH 107/314] Expose num_threads --- paddle/fluid/framework/parallel_executor.cc | 6 +++--- paddle/fluid/framework/parallel_executor.h | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index f1b8a20e41..bbfaac7339 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -245,7 +245,7 @@ struct FetchOpHandle : public OpHandle { class ParallelExecutorPrivate { public: - explicit ParallelExecutorPrivate(size_t num_threads = 0) + explicit ParallelExecutorPrivate(size_t num_threads) : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {} std::vector places_; @@ -389,11 +389,11 @@ struct NCCLAllReduceOpHandle : public OpHandle { }; ParallelExecutor::ParallelExecutor( - const std::vector &places, + size_t num_threads, const std::vector &places, const std::unordered_set ¶ms, const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope) - : member_(new ParallelExecutorPrivate()) { + : member_(new ParallelExecutorPrivate(num_threads)) { member_->places_ = places; member_->global_scope_ = scope; // Step 1. RunStartupProgram and Bcast the params to devs. diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 2345bffcc7..c206e726a7 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -35,7 +35,8 @@ class VarHandleBase; class ParallelExecutor { public: - explicit ParallelExecutor(const std::vector& places, + explicit ParallelExecutor(size_t num_threads, + const std::vector& places, const std::unordered_set& params, const ProgramDesc& startup_program, const ProgramDesc& main_program, From d42117e7422facdbffbd77d3f5b2841fe6ad5ed9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 18:42:40 +0800 Subject: [PATCH 108/314] Set NumThreads --- paddle/fluid/pybind/pybind.cc | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 929c343f7a..60662244cc 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -498,16 +498,17 @@ All parameter, weight, gradient are variables in Paddle. m.def("reset_profiler", platform::ResetProfiler); py::class_(m, "ParallelExecutor") - .def( - "__init__", - [](ParallelExecutor &self, const std::vector &places, - const std::unordered_set ¶ms, - const ProgramDesc &startup_program, - const ProgramDesc &main_program, const std::string &loss_var_name, - Scope *scope) { - new (&self) ParallelExecutor(places, params, startup_program, - main_program, loss_var_name, scope); - }) + .def("__init__", + [](ParallelExecutor &self, size_t num_threads, + const std::vector &places, + const std::unordered_set ¶ms, + const ProgramDesc &startup_program, + const ProgramDesc &main_program, const std::string &loss_var_name, + Scope *scope) { + new (&self) + ParallelExecutor(num_threads, places, params, startup_program, + main_program, loss_var_name, scope); + }) .def("run", &ParallelExecutor::Run); BindRecordIOWriter(m); From 65bc7d17d52741cd124a00444bf063195e4f9c5d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 18:46:20 +0800 Subject: [PATCH 109/314] Add mtx to ncclAllReduce --- paddle/fluid/framework/parallel_executor.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index bbfaac7339..d61f1438a6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -340,6 +340,8 @@ ncclDataType_t ToNCCLDataType(std::type_index type) { } } +static std::mutex g_nccl_mtx_; + struct NCCLAllReduceOpHandle : public OpHandle { ParallelExecutorPrivate *member_; @@ -361,6 +363,8 @@ struct NCCLAllReduceOpHandle : public OpHandle { int dtype = -1; size_t numel = 0; + std::lock_guard g(g_nccl_mtx_); + platform::dynload::ncclGroupStart(); for (size_t i = 0; i < member_->local_scopes_.size(); ++i) { From eb0a580e78da1418e66358278fc2270b6406ef80 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 19:08:44 +0800 Subject: [PATCH 110/314] Add enforce --- paddle/fluid/framework/parallel_executor.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d61f1438a6..b8751662c3 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -246,7 +246,7 @@ struct FetchOpHandle : public OpHandle { class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(size_t num_threads) - : pool_(num_threads == 0 ? nullptr : new ThreadPool(num_threads)) {} + : pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) {} std::vector places_; @@ -365,7 +365,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { std::lock_guard g(g_nccl_mtx_); - platform::dynload::ncclGroupStart(); + PADDLE_ENFORCE(platform::dynload::ncclGroupStart()); for (size_t i = 0; i < member_->local_scopes_.size(); ++i) { auto &p = member_->places_[i]; @@ -383,11 +383,11 @@ struct NCCLAllReduceOpHandle : public OpHandle { } auto &nccl_ctx = member_->communication_streams_.at(dev_id); - platform::dynload::ncclAllReduce( + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, - nccl_ctx.comm, nccl_ctx.stream()); + nccl_ctx.comm, nccl_ctx.stream())); } - platform::dynload::ncclGroupEnd(); + PADDLE_ENFORCE(platform::dynload::ncclGroupEnd()); } } }; From 82693e72273599da5a0ffc8e21790665279d4a4b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 19:14:27 +0800 Subject: [PATCH 111/314] Wait nccl all reduce --- paddle/fluid/framework/parallel_executor.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b8751662c3..8ee2e57324 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -348,6 +348,11 @@ struct NCCLAllReduceOpHandle : public OpHandle { explicit NCCLAllReduceOpHandle(ParallelExecutorPrivate *member) : member_(member) {} + void Wait(platform::DeviceContext *waited_dev) override { + VLOG(3) << "Wait nccl all reduce op"; + OpHandle::Wait(waited_dev); + } + protected: void RunImpl() override { if (this->inputs_.size() == 1) { @@ -381,7 +386,6 @@ struct NCCLAllReduceOpHandle : public OpHandle { if (numel == 0) { numel = static_cast(lod_tensor.numel()); } - auto &nccl_ctx = member_->communication_streams_.at(dev_id); PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, From e335f01826143452c8733495f02a60f7d668d3c7 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 19:20:37 +0800 Subject: [PATCH 112/314] Add more logs --- paddle/fluid/framework/parallel_executor.cc | 54 ++++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8ee2e57324..82df86bebd 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -125,30 +125,6 @@ struct OpHandle { virtual void RunImpl() = 0; }; -struct ComputationOpHandle : public OpHandle { - std::unique_ptr op_; - Scope *scope_; - platform::Place place_; - - explicit ComputationOpHandle(const OpDesc &op_desc, Scope *scope, - platform::Place place) - : op_(framework::OpRegistry::CreateOp(op_desc)), - scope_(scope), - place_(place) {} - - protected: - void RunImpl() override { - auto *cur_ctx = dev_ctx_[place_]; - for (auto *in : inputs_) { - if (in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx) { - in->generated_op_->Wait(cur_ctx); - } - } - - op_->Run(*scope_, place_); - } -}; - struct ScaleLossGradOpHandle : public OpHandle { float coeff_; Scope *scope_; @@ -396,6 +372,36 @@ struct NCCLAllReduceOpHandle : public OpHandle { } }; +struct ComputationOpHandle : public OpHandle { + std::unique_ptr op_; + Scope *scope_; + platform::Place place_; + + explicit ComputationOpHandle(const OpDesc &op_desc, Scope *scope, + platform::Place place) + : op_(framework::OpRegistry::CreateOp(op_desc)), + scope_(scope), + place_(place) {} + + protected: + void RunImpl() override { + auto *cur_ctx = dev_ctx_[place_]; + for (auto *in : inputs_) { + bool need_wait = + in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx; + if (dynamic_cast(in->generated_op_)) { + VLOG(3) << "Input is nccl all reduce, need to wait" << need_wait; + } + + if (need_wait) { + in->generated_op_->Wait(cur_ctx); + } + } + + op_->Run(*scope_, place_); + } +}; + ParallelExecutor::ParallelExecutor( size_t num_threads, const std::vector &places, const std::unordered_set ¶ms, From 43e54079a89a31a3970989b34178391a2120f0e8 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 19:32:35 +0800 Subject: [PATCH 113/314] Debug code --- paddle/fluid/framework/parallel_executor.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 82df86bebd..382e13451f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -545,6 +545,13 @@ void ParallelExecutor::ConstructDependencyGraph( harzaeds need to be handled. */ PolishGraphToSupportDataHazards(); + + for (auto &g : grads) { + LOG(INFO) << member_->vars_.begin() + ->second[g] + .rbegin() + ->second.pending_ops_.size(); + } } /** From 599f7a87ba6f87b42141f16b06ca28721a6982e9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 20 Mar 2018 19:34:38 +0800 Subject: [PATCH 114/314] Refine code --- paddle/fluid/framework/parallel_executor.cc | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 382e13451f..c008da9493 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -389,10 +389,6 @@ struct ComputationOpHandle : public OpHandle { for (auto *in : inputs_) { bool need_wait = in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx; - if (dynamic_cast(in->generated_op_)) { - VLOG(3) << "Input is nccl all reduce, need to wait" << need_wait; - } - if (need_wait) { in->generated_op_->Wait(cur_ctx); } @@ -545,13 +541,6 @@ void ParallelExecutor::ConstructDependencyGraph( harzaeds need to be handled. */ PolishGraphToSupportDataHazards(); - - for (auto &g : grads) { - LOG(INFO) << member_->vars_.begin() - ->second[g] - .rbegin() - ->second.pending_ops_.size(); - } } /** From dc2bc077a2f2479fcfb55c5b029d6eed6bb628c9 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Tue, 20 Mar 2018 19:40:03 +0800 Subject: [PATCH 115/314] Build basic sphinx doctree for doc/fluid --- doc/CMakeLists.txt | 1 + doc/fluid/CMakeLists.txt | 49 ++++++++++++++++++++++++ doc/fluid/build_and_install/index_cn.rst | 2 + doc/fluid/build_and_install/index_en.rst | 2 + doc/fluid/design/index_cn.rst | 2 + doc/fluid/design/index_en.rst | 2 + doc/fluid/dev/index_cn.rst | 2 + doc/fluid/dev/index_en.rst | 4 ++ doc/fluid/faq/index_cn.rst | 2 + doc/fluid/faq/index_en.rst | 2 + doc/fluid/getstarted/index_cn.rst | 4 ++ doc/fluid/getstarted/index_en.rst | 4 ++ doc/fluid/howto/index_cn.rst | 2 + doc/fluid/howto/index_en.rst | 4 ++ doc/fluid/index_cn.rst | 12 ++++++ doc/fluid/index_en.rst | 12 ++++++ 16 files changed, 106 insertions(+) create mode 100644 doc/fluid/CMakeLists.txt create mode 100644 doc/fluid/build_and_install/index_cn.rst create mode 100644 doc/fluid/build_and_install/index_en.rst create mode 100644 doc/fluid/design/index_cn.rst create mode 100644 doc/fluid/design/index_en.rst create mode 100644 doc/fluid/dev/index_cn.rst create mode 100644 doc/fluid/dev/index_en.rst create mode 100644 doc/fluid/faq/index_cn.rst create mode 100644 doc/fluid/faq/index_en.rst create mode 100644 doc/fluid/getstarted/index_cn.rst create mode 100644 doc/fluid/getstarted/index_en.rst create mode 100644 doc/fluid/howto/index_cn.rst create mode 100644 doc/fluid/howto/index_en.rst create mode 100644 doc/fluid/index_cn.rst create mode 100644 doc/fluid/index_en.rst diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index da67701ec1..a9b27933a5 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(v2) +add_subdirectory(fluid) diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt new file mode 100644 index 0000000000..cc999f5a8d --- /dev/null +++ b/doc/fluid/CMakeLists.txt @@ -0,0 +1,49 @@ +if(NOT DEFINED SPHINX_THEME) + set(SPHINX_THEME default) +endif() + +if(NOT DEFINED SPHINX_THEME_DIR) + set(SPHINX_THEME_DIR) +endif() + +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") + +# HTML output director +set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in" + "${BINARY_BUILD_DIR_EN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_fluid_docs + html + ${BINARY_BUILD_DIR_EN} + ${SPHINX_CACHE_DIR_EN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_EN}) + +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees") + +# HTML output directory +set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.cn.in" + "${BINARY_BUILD_DIR_CN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_fluid_docs_cn + html + ${BINARY_BUILD_DIR_CN} + ${SPHINX_CACHE_DIR_CN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_CN}) diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst new file mode 100644 index 0000000000..9276236f9f --- /dev/null +++ b/doc/fluid/build_and_install/index_cn.rst @@ -0,0 +1,2 @@ +安装与使用 +------------ diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst new file mode 100644 index 0000000000..cc1e61a58a --- /dev/null +++ b/doc/fluid/build_and_install/index_en.rst @@ -0,0 +1,2 @@ +Build and Install +------------ diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst new file mode 100644 index 0000000000..f1887be690 --- /dev/null +++ b/doc/fluid/design/index_cn.rst @@ -0,0 +1,2 @@ +设计思想 +------------ diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst new file mode 100644 index 0000000000..18a4b4122f --- /dev/null +++ b/doc/fluid/design/index_en.rst @@ -0,0 +1,2 @@ +Design +------------ diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst new file mode 100644 index 0000000000..e1edf079fa --- /dev/null +++ b/doc/fluid/dev/index_cn.rst @@ -0,0 +1,2 @@ +开发标准 +------------ diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst new file mode 100644 index 0000000000..faf9dfcd31 --- /dev/null +++ b/doc/fluid/dev/index_en.rst @@ -0,0 +1,4 @@ +Development +------------ + +This is Development page diff --git a/doc/fluid/faq/index_cn.rst b/doc/fluid/faq/index_cn.rst new file mode 100644 index 0000000000..395c110989 --- /dev/null +++ b/doc/fluid/faq/index_cn.rst @@ -0,0 +1,2 @@ +FAQ +------------ diff --git a/doc/fluid/faq/index_en.rst b/doc/fluid/faq/index_en.rst new file mode 100644 index 0000000000..395c110989 --- /dev/null +++ b/doc/fluid/faq/index_en.rst @@ -0,0 +1,2 @@ +FAQ +------------ diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst new file mode 100644 index 0000000000..c4d8525f23 --- /dev/null +++ b/doc/fluid/getstarted/index_cn.rst @@ -0,0 +1,4 @@ +新手入门 +------------ + +新手入门 diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst new file mode 100644 index 0000000000..a4efd05e2f --- /dev/null +++ b/doc/fluid/getstarted/index_en.rst @@ -0,0 +1,4 @@ +GET STARTED +------------ + +This is get started page diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst new file mode 100644 index 0000000000..a92abad0c5 --- /dev/null +++ b/doc/fluid/howto/index_cn.rst @@ -0,0 +1,2 @@ +进阶使用 +------------ diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst new file mode 100644 index 0000000000..06036bdce5 --- /dev/null +++ b/doc/fluid/howto/index_en.rst @@ -0,0 +1,4 @@ +HOW TO +------------ + +This is how to page diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst new file mode 100644 index 0000000000..be3bed4393 --- /dev/null +++ b/doc/fluid/index_cn.rst @@ -0,0 +1,12 @@ + PaddlePaddle Fluid +========================== + +.. toctree:: + :maxdepth: 1 + + getstarted/index_cn.rst + design/index_cn.rst + build_and_install/index_cn.rst + howto/index_cn.rst + dev/index_cn.rst + faq/index_cn.rst diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst new file mode 100644 index 0000000000..87c831420a --- /dev/null +++ b/doc/fluid/index_en.rst @@ -0,0 +1,12 @@ + PaddlePaddle Fluid +========================== + +.. toctree:: + :maxdepth: 1 + + getstarted/index_en.rst + design/index_en.rst + build_and_install/index_en.rst + howto/index_en.rst + dev/index_en.rst + faq/index_en.rst From eaa90d38ad121ae019688f024380526cf7d504c8 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 20 Mar 2018 15:12:15 +0800 Subject: [PATCH 116/314] add use_pinned --- paddle/fluid/framework/tensor.h | 32 +++++++++++++++++++--------- paddle/fluid/framework/tensor_impl.h | 23 ++++++++++++-------- 2 files changed, 36 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 6f878541e6..aa8f44ea30 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -45,10 +45,11 @@ class Tensor { friend struct EigenVector; public: - Tensor() : offset_(0) {} + Tensor() : offset_(0), use_pinned_(false) {} /*! Constructor with place should only be used in pybind. */ - explicit Tensor(const platform::Place& place) : offset_(0) { + explicit Tensor(const platform::Place& place) + : offset_(0), use_pinned_(false) { holder_->set_place(place); } @@ -69,11 +70,12 @@ class Tensor { * @note If not exist, then allocation. */ template - inline T* mutable_data(platform::Place place); + inline T* mutable_data(platform::Place place, bool use_pinned = false); - inline void* mutable_data(platform::Place place, std::type_index type); + inline void* mutable_data(platform::Place place, std::type_index type, + bool use_pinned = false); - inline void* mutable_data(platform::Place place); + inline void* mutable_data(platform::Place place, bool use_pinned = false); /** * @brief Return a pointer to mutable memory block. @@ -84,7 +86,8 @@ class Tensor { * @note If not exist, then allocation. */ template - inline T* mutable_data(DDim dims, platform::Place place); + inline T* mutable_data(DDim dims, platform::Place place, + bool use_pinned = false); /*! Return the dimensions of the memory block. */ inline const DDim& dims() const; @@ -92,6 +95,9 @@ class Tensor { /*! Return the numel of the memory block. */ inline int64_t numel() const; + /*! Return the numel of the memory block. */ + inline bool isPinned() const; + /*! Resize the dimensions of the memory block. */ inline Tensor& Resize(const DDim& dims); @@ -146,12 +152,14 @@ class Tensor { template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(Place place, size_t size, std::type_index type) - : ptr_(static_cast(memory::Alloc(place, size)), - memory::PODDeleter(place)), + PlaceholderImpl(Place place, size_t size, std::type_index type, + bool use_pinned = false) + : ptr_(static_cast(memory::Alloc(place, size, use_pinned)), + memory::PODDeleter(place, use_pinned)), place_(place), size_(size), - type_(type) { + type_(type), + use_pinned_(use_pinned) { PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", (is_cpu_place(place_) ? "CPU" : "GPU")); } @@ -174,6 +182,9 @@ class Tensor { /* the current type of memory */ std::type_index type_; + + /*! use pinned memory or not. */ + bool use_pinned_; }; /*! holds the memory block if allocated. */ @@ -208,6 +219,7 @@ class Tensor { * PlaceHolder::ptr_ and where the tensor data really begins. */ size_t offset_; + bool use_pinned_; }; inline void Tensor::switch_place(platform::Place new_place) { diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 638bd0db9d..e882cce69e 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -101,19 +101,21 @@ inline T* Tensor::data() { } template -inline T* Tensor::mutable_data(DDim dims, platform::Place place) { +inline T* Tensor::mutable_data(DDim dims, platform::Place place, + bool use_pinned) { static_assert(std::is_pod::value, "T must be POD"); Resize(dims); - return mutable_data(place); + return mutable_data(place, use_pinned); } template -inline T* Tensor::mutable_data(platform::Place place) { +inline T* Tensor::mutable_data(platform::Place place, bool use_pinned) { static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(place, typeid(T))); + return reinterpret_cast(mutable_data(place, typeid(T), use_pinned)); } -inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { +inline void* Tensor::mutable_data(platform::Place place, std::type_index type, + bool use_pinned) { if (holder_ != nullptr) { holder_->set_type(type); } @@ -127,26 +129,27 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { holder_->size() < size + offset_) { if (platform::is_cpu_place(place)) { holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); + boost::get(place), size, type, use_pinned)); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); } #else holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); + boost::get(place), size, type, use_pinned)); } #endif offset_ = 0; + use_pinned_ = use_pinned; } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } -inline void* Tensor::mutable_data(platform::Place place) { +inline void* Tensor::mutable_data(platform::Place place, bool use_pinned) { PADDLE_ENFORCE(this->holder_ != nullptr, "Cannot invoke mutable data if current hold nothing"); - return mutable_data(place, holder_->type()); + return mutable_data(place, holder_->type(), use_pinned); } inline Tensor& Tensor::ShareDataWith(const Tensor& src) { @@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; } inline int64_t Tensor::numel() const { return product(dims_); } +inline bool Tensor::isPinned() const { return use_pinned_; } + inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { Tensor res; res.ShareDataWith(src); From 18461d093505f2b889cfae3ae99ea55c12afe540 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 21 Mar 2018 10:48:46 +0800 Subject: [PATCH 117/314] wip --- paddle/fluid/operators/listen_and_serv_op.cc | 42 ++++++++++++++------ 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index a594de67e0..bd6e25449f 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -95,6 +95,13 @@ class ListenAndServOp : public framework::OperatorBase { "server program should have at least 2 blocks"); framework::Executor executor(dev_place); + std::vector blk_ctx_list; + blk_ctx_list.push_back(nullptr); // block0 is not used. + for (int blkid = 1; blkid < num_blocks; ++blkid) { + auto *exe_ctx = executor.Prepare(*program, blkid); + VLOG(2) << "prepare ctx: " << exe_ctx; + blk_ctx_list.push_back(exe_ctx); + } // TODO(typhoonzero): change this to a while_op for every cluster-batch. bool exit_flag = false; @@ -145,23 +152,30 @@ class ListenAndServOp : public framework::OperatorBase { std::vector> fs; // block0 contains only listen_and_serv op, start run from block1. for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { - fs.push_back(framework::Async([&executor, &program, &recv_scope, - blkid]() { - int run_block = blkid; // thread local - try { - executor.Run(*program, &recv_scope, run_block, - false /*create_local_scope*/, false /*create_vars*/); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - })); + fs.push_back(framework::Async( + [&executor, &program, &recv_scope, &blk_ctx_list, blkid]() { + int run_block = blkid; // thread local + try { + VLOG(2) << "run ctx: " << blk_ctx_list[run_block] + << " block: " << run_block; + executor.RunPreparedContext(blk_ctx_list[run_block], + &recv_scope, false, false); + // executor.Run(*program, &recv_scope, run_block, + // false /*create_local_scope*/, + // false /*create_vars*/); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + })); } for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait(); // Run global block at final step, or block1 if there are only 2 blocks if (num_blocks >= 2) { try { - executor.Run(*program, &recv_scope, num_blocks - 1, - false /*create_local_scope*/, false /*create_vars*/); + executor.RunPreparedContext(blk_ctx_list[num_blocks - 1], &recv_scope, + false, false); + // executor.Run(*program, &recv_scope, num_blocks - 1, + // false /*create_local_scope*/, false /*create_vars*/); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -180,6 +194,10 @@ class ListenAndServOp : public framework::OperatorBase { rpc_service_->WaitClientGet(fan_in); sparse_vars.clear(); } // while(true) + + for (int i = 0; i < num_blocks; ++i) { + delete blk_ctx_list[i]; + } } protected: From 7ac969b88c53ab7e6bc345f20033f6e0fbd934dd Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 11:33:09 +0800 Subject: [PATCH 118/314] Debug * add Check align * Make FetchData not shared_ptr * Remove FetchData * Wait & Fetch Data --- paddle/fluid/framework/parallel_executor.cc | 55 +++++++++++---------- 1 file changed, 29 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index c008da9493..8d8004fc6d 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "lod_tensor.h" #include "lod_tensor_array.h" #include "op_registry.h" +#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/operators/math/concat.h" namespace paddle { @@ -158,15 +159,8 @@ struct ScaleLossGradOpHandle : public OpHandle { } }; -struct FetchedData { - public: - std::vector tensors_; - - explicit FetchedData(size_t num_fetched) { tensors_.resize(num_fetched); } -}; - struct FetchOpHandle : public OpHandle { - std::shared_ptr data_; + FeedFetchList *data_; size_t offset_; std::vector *local_scopes_; std::vector tensors_; @@ -175,15 +169,26 @@ struct FetchOpHandle : public OpHandle { for (auto *input_var : inputs_) { input_var->pending_ops_.erase(this); } - - // Lazily merge tensors. Will faster code. - MergeTensors(); } void Wait(platform::DeviceContext *waited_dev) override { PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error"); } + void WaitAndMergeCPUTensors() const { + // Wait fetch stream done. + for (auto &ctx : dev_ctx_) { + ctx.second->Wait(); + } + + std::vector tensors_ptr; + tensors_ptr.reserve(tensors_.size()); + for (auto &t : tensors_) { + tensors_ptr.emplace_back(&t); + } + data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace()); + } + protected: void RunImpl() override { for (auto *input : inputs_) { @@ -208,15 +213,6 @@ struct FetchOpHandle : public OpHandle { } } } - - private: - void MergeTensors() const { - std::vector tensors_ptr; - for (auto &t : tensors_) { - tensors_ptr.emplace_back(&t); - } - data_->tensors_[offset_].MergeLoDTensor(tensors_ptr, platform::CPUPlace()); - } }; class ParallelExecutorPrivate { @@ -325,7 +321,6 @@ struct NCCLAllReduceOpHandle : public OpHandle { : member_(member) {} void Wait(platform::DeviceContext *waited_dev) override { - VLOG(3) << "Wait nccl all reduce op"; OpHandle::Wait(waited_dev); } @@ -355,6 +350,11 @@ struct NCCLAllReduceOpHandle : public OpHandle { auto &lod_tensor = s->FindVar(var_name)->Get(); void *buffer = const_cast(lod_tensor.data()); + uintptr_t buf = reinterpret_cast(buffer); + if (buf % sizeof(float) != 0) { + VLOG(3) << "Buffer is not aligned " << buf; + } + if (dtype == -1) { dtype = ToNCCLDataType(lod_tensor.type()); } @@ -680,7 +680,7 @@ void ParallelExecutor::BuildNCCLCommunicator() const { void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { bool use_event = true; - auto fetched_data = std::make_shared(fetch_tensors.size()); + FeedFetchList fetched_data(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); std::unordered_map> pending_vars; @@ -728,7 +728,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, auto &vars = fetched_vars[var_name]; fetch_ops.emplace_back(); FetchOpHandle *op = &fetch_ops.back(); - op->data_ = fetched_data; + op->data_ = &fetched_data; op->offset_ = i; op->local_scopes_ = &member_->local_scopes_; for (auto &p : member_->places_) { @@ -786,9 +786,12 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, platform::DeviceContextPool::Instance().Get(p)->Wait(); } - fetch_ops.clear(); - *member_->global_scope_->Var(fetched_var_name)->GetMutable() = - fetched_data->tensors_; + for (auto &fetch_op : fetch_ops) { + fetch_op.WaitAndMergeCPUTensors(); + } + + *member_->global_scope_->Var(fetched_var_name)->GetMutable() = + fetched_data; } void ParallelExecutor::RunOp( From 90f980167d8b2f706e1c1cba98eb1bbc5356eec3 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 11:35:03 +0800 Subject: [PATCH 119/314] Do not wait computation stream --- paddle/fluid/framework/parallel_executor.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8d8004fc6d..fce1bf4724 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -782,10 +782,6 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } } - for (auto &p : member_->places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); - } - for (auto &fetch_op : fetch_ops) { fetch_op.WaitAndMergeCPUTensors(); } From 99fe83a0200af9054457ebb677a46b02627011bc Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 12:23:55 +0800 Subject: [PATCH 120/314] Move nccl helper --- paddle/fluid/framework/parallel_executor.cc | 18 ++-------- paddle/fluid/platform/nccl_helper.h | 37 +++++++++++++++++++++ 2 files changed, 40 insertions(+), 15 deletions(-) create mode 100644 paddle/fluid/platform/nccl_helper.h diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index fce1bf4724..991a0c8238 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include "op_registry.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace framework { @@ -299,19 +300,6 @@ class ParallelExecutorPrivate { std::unique_ptr exception_; }; -// TODO(yy): Move this function somewhere -ncclDataType_t ToNCCLDataType(std::type_index type) { - if (type == typeid(float)) { // NOLINT - return ncclFloat; - } else if (type == typeid(double)) { // NOLINT - return ncclDouble; - } else if (type == typeid(int)) { // NOLINT - return ncclInt; - } else { - PADDLE_THROW("Not supported"); - } -} - static std::mutex g_nccl_mtx_; struct NCCLAllReduceOpHandle : public OpHandle { @@ -356,7 +344,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { } if (dtype == -1) { - dtype = ToNCCLDataType(lod_tensor.type()); + dtype = platform::ToNCCLDataType(lod_tensor.type()); } if (numel == 0) { @@ -629,7 +617,7 @@ void ParallelExecutor::BCastParamsToGPUs( if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { auto &main_tensor = main_scope->FindVar(var_desc->Name())->Get(); - ncclDataType_t data_type = ToNCCLDataType(main_tensor.type()); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); auto &dims = main_tensor.dims(); size_t numel = main_tensor.numel(); diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h new file mode 100644 index 0000000000..e20f99bc6b --- /dev/null +++ b/paddle/fluid/platform/nccl_helper.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/platform/dynload/nccl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +inline ncclDataType_t ToNCCLDataType(std::type_index type) { + if (type == typeid(float)) { // NOLINT + return ncclFloat; + } else if (type == typeid(double)) { // NOLINT + return ncclDouble; + } else if (type == typeid(int)) { // NOLINT + return ncclInt; + } else { + PADDLE_THROW("Not supported"); + } +} + +} // namespace platform +} // namespace paddle From 41ad63234181e2c6dcec464db51c08270c18ac3c Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 12:35:39 +0800 Subject: [PATCH 121/314] Add NCCL Group Guard --- paddle/fluid/framework/parallel_executor.cc | 7 +------ paddle/fluid/platform/nccl_helper.h | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 991a0c8238..1823cefe42 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -300,8 +300,6 @@ class ParallelExecutorPrivate { std::unique_ptr exception_; }; -static std::mutex g_nccl_mtx_; - struct NCCLAllReduceOpHandle : public OpHandle { ParallelExecutorPrivate *member_; @@ -327,9 +325,7 @@ struct NCCLAllReduceOpHandle : public OpHandle { int dtype = -1; size_t numel = 0; - std::lock_guard g(g_nccl_mtx_); - - PADDLE_ENFORCE(platform::dynload::ncclGroupStart()); + platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->local_scopes_.size(); ++i) { auto &p = member_->places_[i]; @@ -355,7 +351,6 @@ struct NCCLAllReduceOpHandle : public OpHandle { buffer, buffer, numel, static_cast(dtype), ncclSum, nccl_ctx.comm, nccl_ctx.stream())); } - PADDLE_ENFORCE(platform::dynload::ncclGroupEnd()); } } }; diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index e20f99bc6b..cceceda8ad 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/platform/dynload/nccl.h" #include "paddle/fluid/platform/enforce.h" @@ -33,5 +34,24 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { } } +class NCCLGroupGuard { + public: + inline NCCLGroupGuard() { + mutex().lock(); + PADDLE_ENFORCE(dynload::ncclGroupStart()); + } + + inline ~NCCLGroupGuard() { + PADDLE_ENFORCE(dynload::ncclGroupEnd()); + mutex().unlock(); + } + + private: + static std::mutex& mutex() { + static std::mutex mtx; + return mtx; + } +}; + } // namespace platform } // namespace paddle From f2685bed81d492e13e471b16fefd31ce834962e9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 12:38:42 +0800 Subject: [PATCH 122/314] Clean code --- paddle/fluid/framework/parallel_executor.cc | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 1823cefe42..d06613b573 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -25,12 +25,6 @@ limitations under the License. */ namespace paddle { namespace framework { -#ifdef PADDLE_WITH_CUDA - -// FIXME: CHECK the return value of x; -#define NCCL_INVOKE(x) x -#endif - struct OpHandle; struct VarHandleBase { @@ -59,10 +53,6 @@ struct DummyVarHandle : public VarHandleBase { std::string DebugString() const override { return "dummy"; } }; -struct DependencyVarHandle : public VarHandleBase { - std::string DebugString() const override { return "Dependency Variable"; } -}; - struct OpHandle { std::vector inputs_; std::vector outputs_; @@ -252,7 +242,7 @@ class ParallelExecutorPrivate { devs.push_back(boost::get(p).device); } - NCCL_INVOKE(platform::dynload::ncclCommInitAll( + PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( &comms[0], static_cast(contexts.size()), &devs[0])); int i = 0; @@ -558,7 +548,7 @@ void ParallelExecutor::PolishGraphToSupportDataHazards() const { continue; } - auto *dep_var = new DependencyVarHandle(); + auto *dep_var = new DummyVarHandle(); dep_var->generated_op_ = read_op; read_op->outputs_.emplace_back(dep_var); From a478a11e0b381c19bc392efd85d016dfaa62df22 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 12:43:23 +0800 Subject: [PATCH 123/314] NCCL Guard for bcast --- paddle/fluid/framework/parallel_executor.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d06613b573..a5221d03d6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -606,7 +606,7 @@ void ParallelExecutor::BCastParamsToGPUs( auto &dims = main_tensor.dims(); size_t numel = main_tensor.numel(); - platform::dynload::ncclGroupStart(); + platform::NCCLGroupGuard guard; for (size_t i = 0; i < member_->places_.size(); ++i) { auto place = member_->places_[i]; @@ -624,7 +624,6 @@ void ParallelExecutor::BCastParamsToGPUs( platform::dynload::ncclBcast(buffer, numel, data_type, 0, nccl_ctx.comm, nccl_ctx.stream()); } - platform::dynload::ncclGroupEnd(); } for (auto &stream : member_->communication_streams_) { From 6ebc6bf5337bb7b30c379bb242d00ae15f53ee82 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 13:41:58 +0800 Subject: [PATCH 124/314] ReorganizeCode --- paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/details/CMakeLists.txt | 1 + paddle/fluid/framework/details/var_handle.cc | 32 +++ paddle/fluid/framework/details/var_handle.h | 66 +++++ paddle/fluid/framework/parallel_executor.cc | 268 +++++++----------- paddle/fluid/framework/parallel_executor.h | 14 - paddle/fluid/platform/nccl_helper.h | 36 ++- 7 files changed, 244 insertions(+), 176 deletions(-) create mode 100644 paddle/fluid/framework/details/CMakeLists.txt create mode 100644 paddle/fluid/framework/details/var_handle.cc create mode 100644 paddle/fluid/framework/details/var_handle.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 6522a7a69f..9d2dc29028 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(details) # ddim lib proto_library(framework_proto SRCS framework.proto) @@ -87,7 +88,7 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope - framework_proto backward glog lod_rank_table feed_fetch_method executor simple_threadpool concat) + framework_proto backward glog lod_rank_table feed_fetch_method executor simple_threadpool var_handle) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt new file mode 100644 index 0000000000..5074715e2e --- /dev/null +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -0,0 +1 @@ +cc_library(var_handle SRCS var_handle.cc DEPS place) diff --git a/paddle/fluid/framework/details/var_handle.cc b/paddle/fluid/framework/details/var_handle.cc new file mode 100644 index 0000000000..6f00abd947 --- /dev/null +++ b/paddle/fluid/framework/details/var_handle.cc @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/var_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +VarHandleBase::~VarHandleBase() {} + +std::string VarHandle::DebugString() const { + std::stringstream ss; + ss << name_ << ":" << place_; + return ss.str(); +} + +std::string DummyVarHandle::DebugString() const { return "dummy"; } +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h new file mode 100644 index 0000000000..613ff901b1 --- /dev/null +++ b/paddle/fluid/framework/details/var_handle.h @@ -0,0 +1,66 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include + +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { + +struct OpHandleBase; + +namespace details { + +// VarHandleBase is the var node in the dependency graph. +// A variable can only be generated by a single operator. i.e. +// This is a single assignment graph. +struct VarHandleBase { + virtual ~VarHandleBase(); + virtual std::string DebugString() const = 0; + + // The operator who generate this variable. nullptr if the variable + // is a root node. + OpHandleBase *generated_op_; + + // Operators which depend on this variable ready. + std::unordered_set pending_ops_; +}; + +// VarHandle is actually a single version of Runtime Variable. +// Variable in Runtime mapped to many VarHandles in Graph. +// Each assignment will generate a new var handle with newer version. +// +// NOTE: runtime variables have place. +struct VarHandle : public VarHandleBase { + std::string DebugString() const override; + + // version field currently is not used, however, just store the version to + // debug easily. + size_t version_; + std::string name_; + platform::Place place_; +}; + +// Dummy Variable. It is used to represent dependencies between operators +struct DummyVarHandle : public VarHandleBase { + std::string DebugString() const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index a5221d03d6..2b094eba1e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "lod_tensor.h" #include "lod_tensor_array.h" #include "op_registry.h" +#include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/platform/nccl_helper.h" @@ -25,35 +26,11 @@ limitations under the License. */ namespace paddle { namespace framework { -struct OpHandle; +using details::DummyVarHandle; +using details::VarHandle; +using details::VarHandleBase; -struct VarHandleBase { - virtual ~VarHandleBase() {} - virtual std::string DebugString() const = 0; - - OpHandle *generated_op_; - std::unordered_set pending_ops_; -}; - -struct VarHandle : public VarHandleBase { - std::string DebugString() const override { - std::stringstream ss; - ss << name_ << ":" << place_; - return ss.str(); - } - - // version field currently is not used, however, just store the version to - // debug easily. - size_t version_; - std::string name_; - platform::Place place_; -}; - -struct DummyVarHandle : public VarHandleBase { - std::string DebugString() const override { return "dummy"; } -}; - -struct OpHandle { +struct OpHandleBase { std::vector inputs_; std::vector outputs_; std::unordered_map *local_scopes_; @@ -216,51 +193,13 @@ class ParallelExecutorPrivate { std::vector local_scopes_; Scope *global_scope_; -#ifdef PADDLE_WITH_CUDA - struct NCCLContext { - std::unique_ptr ctx_; - ncclComm_t comm; - - explicit NCCLContext(int dev_id) { - ctx_.reset(new platform::CUDADeviceContext(platform::CUDAPlace(dev_id))); - } - - cudaStream_t stream() const { return ctx_->stream(); } - - int device_id() const { - return boost::get(ctx_->GetPlace()).device; - } - - static void InitNCCLContext(std::unordered_map &contexts, - const std::vector &places) { - std::vector comms; - std::vector devs; - comms.resize(contexts.size()); - devs.reserve(contexts.size()); - - for (auto &p : places) { - devs.push_back(boost::get(p).device); - } - - PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( - &comms[0], static_cast(contexts.size()), &devs[0])); - - int i = 0; - for (auto &dev_id : devs) { - contexts.at(dev_id).comm = comms[i++]; - } - } - }; - - std::unordered_map communication_streams_; + std::unordered_map communication_streams_; - NCCLContext &GetNCCLCtx(platform::Place p) { + platform::NCCLContext &GetNCCLCtx(platform::Place p) { int dev_id = boost::get(p).device; return communication_streams_.at(dev_id); } -#endif - platform::DeviceContext *CommunicationDevCtx(const platform::Place &place) { if (platform::is_cpu_place(place) || local_scopes_.size() == 1) { return const_cast( @@ -282,27 +221,95 @@ class ParallelExecutorPrivate { vars_; std::unordered_set> dep_vars_; - std::vector> ops_; + std::vector> ops_; // Use a simpler thread pool, might be faster. std::unique_ptr pool_; std::unique_ptr exception_; -}; -struct NCCLAllReduceOpHandle : public OpHandle { - ParallelExecutorPrivate *member_; + VarHandle *GetVarHandle(const std::string &each_var_name, + const platform::Place &place) { + auto &var_holders = vars_[place]; + auto &var_holder = var_holders[each_var_name]; + VarHandle *var = nullptr; + if (var_holder.empty()) { + auto &init_var = var_holder[0]; + init_var.place_ = place; + init_var.name_ = each_var_name; + init_var.generated_op_ = nullptr; + init_var.version_ = 0; + var = &init_var; + } else { + var = &var_holder.rbegin()->second; + } + return var; + } - explicit NCCLAllReduceOpHandle(ParallelExecutorPrivate *member) - : member_(member) {} + void RunOp( + bool use_event, + std::unordered_map> &pending_vars, + OpHandleBase *op) { + std::vector *> *ready_buffer = + new std::vector *>(); + for (auto *var : op->outputs_) { + ready_buffer->emplace_back(&pending_vars[var]); + } + + auto op_run = [ready_buffer, op, this, use_event] { + try { + VLOG(10) << op->DebugString(); + op->Run(use_event); + for (auto *ready : *ready_buffer) { + ready->store(true, std::memory_order_release); + } + delete ready_buffer; + } catch (platform::EnforceNotMet ex) { + exception_.reset(new platform::EnforceNotMet(ex)); + } catch (...) { + LOG(FATAL) << "Unknown exception catched"; + } + }; + if (pool_) { + pool_->enqueue(op_run); + } else { + op_run(); + } + } + + void GenerateVar(OpHandleBase *op_handle, const std::string &each_var_name, + const platform::Place &place) { + auto &vars = vars_[place][each_var_name]; + size_t version = vars.size(); + auto &var = vars[version]; + var.version_ = version; + var.generated_op_ = op_handle; + var.name_ = each_var_name; + var.place_ = place; + op_handle->outputs_.emplace_back(&var); + } +}; // namespace framework + +struct NCCLAllReduceOpHandle : public OpHandleBase { + const std::vector &local_scopes_; + const std::vector &places_; + const std::unordered_map &communication_ctxs_; + + explicit NCCLAllReduceOpHandle( + const std::vector &local_scopes, + const std::vector &places, + const std::unordered_map &ctxs) + : local_scopes_(local_scopes), + places_(places), + communication_ctxs_(ctxs) {} void Wait(platform::DeviceContext *waited_dev) override { - OpHandle::Wait(waited_dev); + OpHandleBase::Wait(waited_dev); } protected: void RunImpl() override { - if (this->inputs_.size() == 1) { + if (inputs_.size() == 1) { return; // No need to all reduce when GPU count = 1; } else { // Wait input done @@ -317,9 +324,9 @@ struct NCCLAllReduceOpHandle : public OpHandle { platform::NCCLGroupGuard guard; - for (size_t i = 0; i < member_->local_scopes_.size(); ++i) { - auto &p = member_->places_[i]; - auto *s = member_->local_scopes_[i]; + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + auto *s = local_scopes_[i]; int dev_id = boost::get(p).device; auto &lod_tensor = s->FindVar(var_name)->Get(); @@ -336,16 +343,16 @@ struct NCCLAllReduceOpHandle : public OpHandle { if (numel == 0) { numel = static_cast(lod_tensor.numel()); } - auto &nccl_ctx = member_->communication_streams_.at(dev_id); + auto &nccl_ctx = communication_ctxs_.at(dev_id); PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, - nccl_ctx.comm, nccl_ctx.stream())); + nccl_ctx.comm_, nccl_ctx.stream())); } } } }; -struct ComputationOpHandle : public OpHandle { +struct ComputationOpHandle : public OpHandleBase { std::unique_ptr op_; Scope *scope_; platform::Place place_; @@ -443,14 +450,14 @@ void ParallelExecutor::ConstructDependencyGraph( auto var_names = op->InputArgumentNames(); for (auto &each_var_name : var_names) { - VarHandle *var = GetVarHandle(each_var_name, p); + VarHandle *var = member_->GetVarHandle(each_var_name, p); op_handle->inputs_.emplace_back(var); var->pending_ops_.emplace(op_handle); } var_names = op->OutputArgumentNames(); for (auto &each_var_name : var_names) { - GenerateVar(op_handle, each_var_name, p); + member_->GenerateVar(op_handle, each_var_name, p); } if (is_forwarding) { @@ -468,7 +475,7 @@ void ParallelExecutor::ConstructDependencyGraph( // loss->pending_ops_.emplace_back(op_handle); // op_handle->inputs_.emplace_back(loss); - GenerateVar(op_handle, loss_var_name + "@GRAD", p); + member_->GenerateVar(op_handle, loss_var_name + "@GRAD", p); change_forward = true; } } @@ -483,7 +490,9 @@ void ParallelExecutor::ConstructDependencyGraph( for (auto &og : var_names) { if (grads.count(og) != 0) { // is param grad // Insert NCCL AllReduce Op - member_->ops_.emplace_back(new NCCLAllReduceOpHandle(member_)); + member_->ops_.emplace_back(new NCCLAllReduceOpHandle( + member_->local_scopes_, member_->places_, + member_->communication_streams_)); auto *op_handle = member_->ops_.back().get(); for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -562,37 +571,6 @@ void ParallelExecutor::PolishGraphToSupportDataHazards() const { } } -void ParallelExecutor::GenerateVar(OpHandle *op_handle, - const std::string &each_var_name, - const platform::Place &place) const { - auto &vars = member_->vars_[place][each_var_name]; - size_t version = vars.size(); - auto &var = vars[version]; - var.version_ = version; - var.generated_op_ = op_handle; - var.name_ = each_var_name; - var.place_ = place; - op_handle->outputs_.emplace_back(&var); -} - -VarHandle *ParallelExecutor::GetVarHandle(const std::string &each_var_name, - const platform::Place &place) const { - auto &var_holders = member_->vars_[place]; - auto &var_holder = var_holders[each_var_name]; - VarHandle *var = nullptr; - if (var_holder.empty()) { - auto &init_var = var_holder[0]; - init_var.place_ = place; - init_var.name_ = each_var_name; - init_var.generated_op_ = nullptr; - init_var.version_ = 0; - var = &init_var; - } else { - var = &var_holder.rbegin()->second; - } - return var; -} - void ParallelExecutor::BCastParamsToGPUs( const ProgramDesc &startup_program) const { #ifdef PADDLE_WITH_CUDA @@ -621,8 +599,8 @@ void ParallelExecutor::BCastParamsToGPUs( } auto &nccl_ctx = member_->GetNCCLCtx(place); - platform::dynload::ncclBcast(buffer, numel, data_type, 0, nccl_ctx.comm, - nccl_ctx.stream()); + platform::dynload::ncclBcast(buffer, numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); } } @@ -640,12 +618,12 @@ void ParallelExecutor::BuildNCCLCommunicator() const { for (auto &place : member_->places_) { int dev_id = boost::get(place).device; - member_->communication_streams_.emplace( - dev_id, ParallelExecutorPrivate::NCCLContext(dev_id)); + member_->communication_streams_.emplace(dev_id, + platform::NCCLContext(dev_id)); } - ParallelExecutorPrivate::NCCLContext::InitNCCLContext( - member_->communication_streams_, member_->places_); + platform::NCCLContext::InitNCCLContext(member_->communication_streams_, + member_->places_); #endif } @@ -656,7 +634,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, // Version --> VarHandle member_->exception_.reset(); std::unordered_map> pending_vars; - std::unordered_map pending_ops; + std::unordered_map pending_ops; std::vector dummy_vars; for (auto &place_pair : member_->vars_) { @@ -672,7 +650,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, pending_vars[var.get()] = var->generated_op_ == nullptr; } - std::vector to_run; + std::vector to_run; for (auto &op : member_->ops_) { if (op->inputs_.empty()) { // Special case, Op has no input. @@ -722,7 +700,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } for (auto *op : to_run) { - RunOp(use_event, pending_vars, op); + member_->RunOp(use_event, pending_vars, op); } while (!pending_vars.empty()) { @@ -750,7 +728,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } for (auto *op : to_run) { pending_ops.erase(op); - RunOp(use_event, pending_vars, op); + member_->RunOp(use_event, pending_vars, op); } } @@ -762,35 +740,5 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, fetched_data; } -void ParallelExecutor::RunOp( - bool use_event, - std::unordered_map> &pending_vars, - OpHandle *op) const { - std::vector *> *ready_buffer = - new std::vector *>(); - for (auto *var : op->outputs_) { - ready_buffer->emplace_back(&pending_vars[var]); - } - - auto op_run = [ready_buffer, op, this, use_event] { - try { - VLOG(10) << op->DebugString(); - op->Run(use_event); - for (auto *ready : *ready_buffer) { - ready->store(true, std::memory_order_release); - } - delete ready_buffer; - } catch (platform::EnforceNotMet ex) { - member_->exception_.reset(new platform::EnforceNotMet(ex)); - } catch (...) { - LOG(FATAL) << "Unknown exception catched"; - } - }; - if (member_->pool_) { - member_->pool_->enqueue(op_run); - } else { - op_run(); - } -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index c206e726a7..466b5f5f62 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -29,9 +29,6 @@ namespace paddle { namespace framework { class ParallelExecutorPrivate; -class VarHandle; -class OpHandle; -class VarHandleBase; class ParallelExecutor { public: @@ -50,23 +47,12 @@ class ParallelExecutor { void BCastParamsToGPUs(const ProgramDesc& startup_program) const; - VarHandle* GetVarHandle(const std::string& each_var_name, - const platform::Place& place) const; - - void GenerateVar(OpHandle* op_handle, const std::string& each_var_name, - const platform::Place& place) const; - void ConstructDependencyGraph(const std::unordered_set& params, const ProgramDesc& main_program, const std::string& loss_var_name) const; void BuildNCCLCommunicator() const; - void RunOp( - bool use_event, - std::unordered_map>& pending_vars, - OpHandle* op) const; - void PolishGraphToSupportDataHazards() const; }; diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index cceceda8ad..3db846b024 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -47,11 +47,45 @@ class NCCLGroupGuard { } private: - static std::mutex& mutex() { + static std::mutex &mutex() { static std::mutex mtx; return mtx; } }; +struct NCCLContext { + std::unique_ptr ctx_; + ncclComm_t comm_; + + explicit NCCLContext(int dev_id) + : ctx_(new CUDADeviceContext(CUDAPlace(dev_id))) {} + + cudaStream_t stream() const { return ctx_->stream(); } + + int device_id() const { + return boost::get(ctx_->GetPlace()).device; + } + + static void InitNCCLContext(std::unordered_map &contexts, + const std::vector &places) { + std::vector comms; + std::vector devs; + comms.resize(contexts.size()); + devs.reserve(contexts.size()); + + for (auto &p : places) { + devs.push_back(boost::get(p).device); + } + + PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( + &comms[0], static_cast(contexts.size()), &devs[0])); + + int i = 0; + for (auto &dev_id : devs) { + contexts.at(dev_id).comm_ = comms[i++]; + } + } +}; + } // namespace platform } // namespace paddle From e9d815e32b7cdb6e030bfd3aa649d3327bf4f195 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 21 Mar 2018 14:46:10 +0800 Subject: [PATCH 125/314] prepare and create op before run --- paddle/fluid/operators/listen_and_serv_op.cc | 9 +-------- paddle/fluid/operators/send_op.cc | 1 + 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index bd6e25449f..da44128cdd 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -99,7 +99,6 @@ class ListenAndServOp : public framework::OperatorBase { blk_ctx_list.push_back(nullptr); // block0 is not used. for (int blkid = 1; blkid < num_blocks; ++blkid) { auto *exe_ctx = executor.Prepare(*program, blkid); - VLOG(2) << "prepare ctx: " << exe_ctx; blk_ctx_list.push_back(exe_ctx); } @@ -149,6 +148,7 @@ class ListenAndServOp : public framework::OperatorBase { // should be global ops. // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads // and this will still work. + std::vector> fs; // block0 contains only listen_and_serv op, start run from block1. for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { @@ -156,13 +156,8 @@ class ListenAndServOp : public framework::OperatorBase { [&executor, &program, &recv_scope, &blk_ctx_list, blkid]() { int run_block = blkid; // thread local try { - VLOG(2) << "run ctx: " << blk_ctx_list[run_block] - << " block: " << run_block; executor.RunPreparedContext(blk_ctx_list[run_block], &recv_scope, false, false); - // executor.Run(*program, &recv_scope, run_block, - // false /*create_local_scope*/, - // false /*create_vars*/); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -174,8 +169,6 @@ class ListenAndServOp : public framework::OperatorBase { try { executor.RunPreparedContext(blk_ctx_list[num_blocks - 1], &recv_scope, false, false); - // executor.Run(*program, &recv_scope, num_blocks - 1, - // false /*create_local_scope*/, false /*create_vars*/); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 443f40e803..2df25ae5a6 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -66,6 +66,7 @@ class SendOp : public framework::OperatorBase { auto* client_var = scope.FindVar(client_var_name); detail::RPCClient* rpc_client = client_var->GetMutable(); + ctx.Wait(); // wait before sending for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; From fe7ed285d131ba99e82538e76cb7ac5381e97809 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 14:49:02 +0800 Subject: [PATCH 126/314] Extract NCCLCtxMap --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 1 + .../fluid/framework/details/op_handle_base.cc | 84 +++++++++++++ .../fluid/framework/details/op_handle_base.h | 48 ++++++++ paddle/fluid/framework/details/var_handle.h | 4 +- paddle/fluid/framework/parallel_executor.cc | 114 +++--------------- paddle/fluid/platform/nccl_helper.h | 46 +++++++ 7 files changed, 196 insertions(+), 103 deletions(-) create mode 100644 paddle/fluid/framework/details/op_handle_base.cc create mode 100644 paddle/fluid/framework/details/op_handle_base.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 9d2dc29028..afc7ec9d66 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -88,7 +88,7 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope - framework_proto backward glog lod_rank_table feed_fetch_method executor simple_threadpool var_handle) + framework_proto backward glog lod_rank_table simple_threadpool var_handle op_handle_base) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 5074715e2e..d9bdf0b94d 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -1 +1,2 @@ cc_library(var_handle SRCS var_handle.cc DEPS place) +cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context) diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc new file mode 100644 index 0000000000..094b62cc94 --- /dev/null +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/op_handle_base.h" + +namespace paddle { +namespace framework { +namespace details { +std::string OpHandleBase::DebugString() const { + std::stringstream ss; + ss << "("; + for (auto *var : inputs_) { + ss << var->DebugString() << ", "; + } + ss << ") --> ("; + for (auto *var : outputs_) { + ss << var->DebugString() << ", "; + } + ss << ")\n"; + return ss.str(); +} + +OpHandleBase::~OpHandleBase() {} + +void OpHandleBase::Run(bool use_event) { +#ifdef PADDLE_WITH_CUDA + if (events_.empty() && use_event) { + for (auto &p : dev_ctx_) { + int dev_id = boost::get(p.first).device; + cudaSetDevice(dev_id); + cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming); + } + } +#else + PADDLE_ENFORCE(!use_event); +#endif + + RunImpl(); + +#ifdef PADDLE_WITH_CUDA + if (use_event) { + for (auto &p : dev_ctx_) { + int dev_id = boost::get(p.first).device; + auto stream = + static_cast(p.second)->stream(); + cudaEventRecord(events_.at(dev_id), stream); + } + } +#endif +} + +void OpHandleBase::Wait(platform::DeviceContext *waited_dev) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) { + for (auto &dev_ctx : dev_ctx_) { + dev_ctx.second->Wait(); + } + } else { + auto stream = + static_cast(waited_dev)->stream(); + for (auto &ev : events_) { + PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); + } + } +#else + for (auto &dev_ctx : dev_ctx_) { + dev_ctx.second->Wait(); + } +#endif +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h new file mode 100644 index 0000000000..bdfd1f78ad --- /dev/null +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/platform/device_context.h" +namespace paddle { +namespace framework { +namespace details { + +struct OpHandleBase { + std::vector inputs_; + std::vector outputs_; + std::unordered_map + dev_ctx_; + +#ifdef PADDLE_WITH_CUDA + std::unordered_map events_; +#endif + + std::string DebugString() const; + + virtual ~OpHandleBase(); + + void Run(bool use_event); + + virtual void Wait(platform::DeviceContext *waited_dev); + + protected: + virtual void RunImpl() = 0; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 613ff901b1..893cc15f6c 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -21,10 +21,8 @@ namespace paddle { namespace framework { - -struct OpHandleBase; - namespace details { +struct OpHandleBase; // VarHandleBase is the var node in the dependency graph. // A variable can only be generated by a single operator. i.e. diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 2b094eba1e..3c24fa4bdf 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -14,86 +14,22 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include "ThreadPool.h" -#include "executor.h" #include "lod_tensor.h" #include "lod_tensor_array.h" #include "op_registry.h" +#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/feed_fetch_type.h" -#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace framework { using details::DummyVarHandle; +using details::OpHandleBase; using details::VarHandle; using details::VarHandleBase; -struct OpHandleBase { - std::vector inputs_; - std::vector outputs_; - std::unordered_map - dev_ctx_; - - std::unordered_map events_; - - std::string DebugString() { - std::stringstream ss; - ss << "("; - for (auto *var : inputs_) { - ss << var->DebugString() << ", "; - } - ss << ") --> ("; - for (auto *var : outputs_) { - ss << var->DebugString() << ", "; - } - ss << ")\n"; - return ss.str(); - } - - virtual ~OpHandleBase() {} - - void Run(bool use_event) { - if (events_.empty() && use_event) { - for (auto &p : dev_ctx_) { - int dev_id = boost::get(p.first).device; - cudaSetDevice(dev_id); - cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming); - } - } - - RunImpl(); - - if (use_event) { - for (auto &p : dev_ctx_) { - int dev_id = boost::get(p.first).device; - auto stream = - static_cast(p.second)->stream(); - cudaEventRecord(events_.at(dev_id), stream); - } - } - } - - virtual void Wait(platform::DeviceContext *waited_dev) { - if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) { - for (auto &dev_ctx : dev_ctx_) { - dev_ctx.second->Wait(); - } - } else { - auto stream = - static_cast(waited_dev)->stream(); - for (auto &ev : events_) { - PADDLE_ENFORCE(cudaStreamWaitEvent(stream, ev.second, 0)); - } - } - } - - protected: - virtual void RunImpl() = 0; -}; - struct ScaleLossGradOpHandle : public OpHandleBase { float coeff_; Scope *scope_; @@ -193,12 +129,7 @@ class ParallelExecutorPrivate { std::vector local_scopes_; Scope *global_scope_; - std::unordered_map communication_streams_; - - platform::NCCLContext &GetNCCLCtx(platform::Place p) { - int dev_id = boost::get(p).device; - return communication_streams_.at(dev_id); - } + std::unique_ptr nccl_ctxs_; platform::DeviceContext *CommunicationDevCtx(const platform::Place &place) { if (platform::is_cpu_place(place) || local_scopes_.size() == 1) { @@ -206,7 +137,7 @@ class ParallelExecutorPrivate { platform::DeviceContextPool::Instance().Get(place)); } else { #ifdef PADDLE_WITH_CUDA - return GetNCCLCtx(place).ctx_.get(); + return nccl_ctxs_->DevCtx(place); #else PADDLE_THROW("Not compiled with CUDA") #endif @@ -293,15 +224,12 @@ class ParallelExecutorPrivate { struct NCCLAllReduceOpHandle : public OpHandleBase { const std::vector &local_scopes_; const std::vector &places_; - const std::unordered_map &communication_ctxs_; + const platform::NCCLContextMap &nccl_ctxs_; - explicit NCCLAllReduceOpHandle( - const std::vector &local_scopes, - const std::vector &places, - const std::unordered_map &ctxs) - : local_scopes_(local_scopes), - places_(places), - communication_ctxs_(ctxs) {} + explicit NCCLAllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap &ctxs) + : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {} void Wait(platform::DeviceContext *waited_dev) override { OpHandleBase::Wait(waited_dev); @@ -343,7 +271,7 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { if (numel == 0) { numel = static_cast(lod_tensor.numel()); } - auto &nccl_ctx = communication_ctxs_.at(dev_id); + auto &nccl_ctx = nccl_ctxs_.at(dev_id); PADDLE_ENFORCE(platform::dynload::ncclAllReduce( buffer, buffer, numel, static_cast(dtype), ncclSum, nccl_ctx.comm_, nccl_ctx.stream())); @@ -491,8 +419,7 @@ void ParallelExecutor::ConstructDependencyGraph( if (grads.count(og) != 0) { // is param grad // Insert NCCL AllReduce Op member_->ops_.emplace_back(new NCCLAllReduceOpHandle( - member_->local_scopes_, member_->places_, - member_->communication_streams_)); + member_->local_scopes_, member_->places_, *member_->nccl_ctxs_)); auto *op_handle = member_->ops_.back().get(); for (size_t i = 0; i < member_->places_.size(); ++i) { @@ -598,15 +525,12 @@ void ParallelExecutor::BCastParamsToGPUs( buffer = t->mutable_data(place, main_tensor.type()); } - auto &nccl_ctx = member_->GetNCCLCtx(place); + auto &nccl_ctx = member_->nccl_ctxs_->at(place); platform::dynload::ncclBcast(buffer, numel, data_type, 0, nccl_ctx.comm_, nccl_ctx.stream()); } } - - for (auto &stream : member_->communication_streams_) { - stream.second.ctx_->Wait(); - } + member_->nccl_ctxs_->WaitAll(); } #else PADDLE_THROW("Not compiled with CUDA"); @@ -615,15 +539,7 @@ void ParallelExecutor::BCastParamsToGPUs( void ParallelExecutor::BuildNCCLCommunicator() const { #ifdef PADDLE_WITH_CUDA - for (auto &place : member_->places_) { - int dev_id = boost::get(place).device; - - member_->communication_streams_.emplace(dev_id, - platform::NCCLContext(dev_id)); - } - - platform::NCCLContext::InitNCCLContext(member_->communication_streams_, - member_->places_); + member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); #endif } @@ -682,7 +598,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, op->offset_ = i; op->local_scopes_ = &member_->local_scopes_; for (auto &p : member_->places_) { - op->dev_ctx_[p] = member_->GetNCCLCtx(p).ctx_.get(); + op->dev_ctx_[p] = member_->nccl_ctxs_->DevCtx(p); } for (auto *var : vars) { diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 3db846b024..2999004320 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -87,5 +87,51 @@ struct NCCLContext { } }; +struct NCCLContextMap { + std::unordered_map contexts_; + std::vector order_; + + NCCLContextMap(const std::vector &places) { + order_.reserve(places.size()); + for (auto &p : places) { + int dev_id = boost::get(p).device; + order_.emplace_back(dev_id); + contexts_.emplace(dev_id, NCCLContext(dev_id)); + } + PADDLE_ENFORCE_EQ( + order_.size(), contexts_.size(), + "NCCL Context Map does not support contain two or more same device"); + + std::vector comms; + comms.resize(order_.size()); + + PADDLE_ENFORCE(platform::dynload::ncclCommInitAll( + &comms[0], static_cast(order_.size()), &order_[0])); + + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } + } + + CUDADeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + + CUDADeviceContext *DevCtx(platform::Place p) const { + return DevCtx(boost::get(p).device); + } + + const NCCLContext &at(platform::Place p) const { + return this->at(boost::get(p).device); + } + + const NCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } + + void WaitAll() { + for (auto &p : contexts_) { + p.second.ctx_->Wait(); + } + } +}; + } // namespace platform } // namespace paddle From 1eec9261245028b48fb0b6bc80c85e8bd87851d4 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 21 Mar 2018 14:52:16 +0800 Subject: [PATCH 127/314] updates --- paddle/fluid/operators/send_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 2df25ae5a6..443f40e803 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -66,7 +66,6 @@ class SendOp : public framework::OperatorBase { auto* client_var = scope.FindVar(client_var_name); detail::RPCClient* rpc_client = client_var->GetMutable(); - ctx.Wait(); // wait before sending for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; From 5368e50d845bd70d9c9f38a5a75db6cba949f48a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 14:58:28 +0800 Subject: [PATCH 128/314] Reorganize code --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 1 + .../details/scale_loss_grad_op_handle.cc | 47 +++++++++++++++++++ .../details/scale_loss_grad_op_handle.h | 39 +++++++++++++++ paddle/fluid/framework/parallel_executor.cc | 35 +------------- 5 files changed, 90 insertions(+), 34 deletions(-) create mode 100644 paddle/fluid/framework/details/scale_loss_grad_op_handle.cc create mode 100644 paddle/fluid/framework/details/scale_loss_grad_op_handle.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index afc7ec9d66..123b9cb735 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -88,7 +88,7 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope - framework_proto backward glog lod_rank_table simple_threadpool var_handle op_handle_base) + framework_proto backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index d9bdf0b94d..427785d518 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -1,2 +1,3 @@ cc_library(var_handle SRCS var_handle.cc DEPS place) cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context) +cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc new file mode 100644 index 0000000000..df9ca37180 --- /dev/null +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { +ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope, + platform::Place place) + : coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) {} + +ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} + +void ScaleLossGradOpHandle::RunImpl() { + std::string var_name = static_cast(this->outputs_[0])->name_; + + float *tmp = + scope_->FindVar(var_name)->GetMutable()->mutable_data( + make_ddim({1}), place_); + + if (platform::is_cpu_place(place_)) { + *tmp = coeff_; + } else { +#ifdef PADDLE_WITH_CUDA + auto stream = + static_cast(this->dev_ctx_[place_]) + ->stream(); + memory::Copy(boost::get(place_), tmp, + platform::CPUPlace(), &coeff_, sizeof(float), stream); +#endif + } +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h new file mode 100644 index 0000000000..44a10e3375 --- /dev/null +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +namespace paddle { +namespace framework { +namespace details { + +struct ScaleLossGradOpHandle : public OpHandleBase { + float coeff_; + Scope *scope_; + platform::Place place_; + + ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place); + + ~ScaleLossGradOpHandle() final; + + protected: + void RunImpl() override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 3c24fa4bdf..5dba3e94c1 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "lod_tensor_array.h" #include "op_registry.h" #include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/platform/nccl_helper.h" @@ -27,42 +28,10 @@ namespace framework { using details::DummyVarHandle; using details::OpHandleBase; +using details::ScaleLossGradOpHandle; using details::VarHandle; using details::VarHandleBase; -struct ScaleLossGradOpHandle : public OpHandleBase { - float coeff_; - Scope *scope_; - platform::Place place_; - - explicit ScaleLossGradOpHandle(size_t num_dev, Scope *scope, - platform::Place place) - : coeff_(static_cast(1.0 / num_dev)), - scope_(scope), - place_(place) {} - - ~ScaleLossGradOpHandle() {} - - protected: - void RunImpl() override { - std::string var_name = static_cast(this->outputs_[0])->name_; - - float *tmp = scope_->FindVar(var_name) - ->GetMutable() - ->mutable_data(make_ddim({1}), place_); - - if (platform::is_cpu_place(place_)) { - *tmp = coeff_; - } else { - auto stream = - static_cast(this->dev_ctx_[place_]) - ->stream(); - memory::Copy(boost::get(place_), tmp, - platform::CPUPlace(), &coeff_, sizeof(float), stream); - } - } -}; - struct FetchOpHandle : public OpHandleBase { FeedFetchList *data_; size_t offset_; From 15f5f10ed5b09b47bd897f8d0df916bed3fcf0f6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 15:43:21 +0800 Subject: [PATCH 129/314] AddInput/AddOutput for OpHandle --- paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/details/CMakeLists.txt | 1 + .../framework/details/fetch_op_handle.cc | 77 ++++++++++ .../fluid/framework/details/fetch_op_handle.h | 47 ++++++ .../fluid/framework/details/op_handle_base.cc | 11 ++ .../fluid/framework/details/op_handle_base.h | 4 + .../details/scale_loss_grad_op_handle.cc | 7 +- .../details/scale_loss_grad_op_handle.h | 4 +- paddle/fluid/framework/parallel_executor.cc | 140 +++++------------- 9 files changed, 190 insertions(+), 104 deletions(-) create mode 100644 paddle/fluid/framework/details/fetch_op_handle.cc create mode 100644 paddle/fluid/framework/details/fetch_op_handle.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 123b9cb735..cf288e7804 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -88,7 +88,8 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope - framework_proto backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle) + framework_proto backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle + fetch_op_handle) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 427785d518..aed444d9aa 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -1,3 +1,4 @@ cc_library(var_handle SRCS var_handle.cc DEPS place) cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) +cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc new file mode 100644 index 0000000000..ab552081a4 --- /dev/null +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -0,0 +1,77 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/fetch_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset, + std::vector *local_scopes) + : data_(data), offset_(offset), local_scopes_(local_scopes) {} + +FetchOpHandle::~FetchOpHandle() { + for (auto *input_var : inputs_) { + input_var->pending_ops_.erase(this); + } +} + +void FetchOpHandle::Wait(platform::DeviceContext *waited_dev) { + PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error"); +} + +void FetchOpHandle::WaitAndMergeCPUTensors() const { + // Wait fetch stream done. + for (auto &ctx : dev_ctx_) { + ctx.second->Wait(); + } + + std::vector tensors_ptr; + tensors_ptr.reserve(tensors_.size()); + for (auto &t : tensors_) { + tensors_ptr.emplace_back(&t); + } + data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace()); +} + +void FetchOpHandle::RunImpl() { + for (auto *input : inputs_) { + auto *var = static_cast(input); + var->generated_op_->Wait(this->dev_ctx_[var->place_]); + } + + tensors_.resize(inputs_.size()); + auto *var = static_cast(inputs_[0]); + auto &var_name = var->name_; + platform::CPUPlace cpu; + auto &scopes = *local_scopes_; + + for (size_t i = 0; i < scopes.size(); ++i) { + auto &scope = scopes[i]; + auto &t = scope->FindVar(var_name)->Get(); + if (platform::is_gpu_place(var->place_)) { +#ifdef PADDLE_WITH_CUDA + TensorCopy(t, cpu, *dev_ctx_[t.place()], &tensors_[i]); +#endif + } else { + tensors_[i].ShareDataWith(t); + tensors_[i].set_lod(t.lod()); + } + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h new file mode 100644 index 0000000000..3123f7ba23 --- /dev/null +++ b/paddle/fluid/framework/details/fetch_op_handle.h @@ -0,0 +1,47 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/feed_fetch_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { + +struct FetchOpHandle : public OpHandleBase { + FeedFetchList *data_; + size_t offset_; + std::vector *local_scopes_; + std::vector tensors_; + + FetchOpHandle(FeedFetchList *data, size_t offset, + std::vector *local_scopes); + + ~FetchOpHandle(); + + void Wait(platform::DeviceContext *waited_dev) override; + + void WaitAndMergeCPUTensors() const; + + protected: + void RunImpl() override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 094b62cc94..ca354a63c6 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -79,6 +79,17 @@ void OpHandleBase::Wait(platform::DeviceContext *waited_dev) { } #endif } + +void OpHandleBase::AddInput(VarHandleBase *in) { + this->inputs_.emplace_back(in); + in->pending_ops_.insert(this); +} + +void OpHandleBase::AddOutput(VarHandleBase *out) { + outputs_.emplace_back(out); + out->generated_op_ = this; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index bdfd1f78ad..5178b51d8d 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -39,6 +39,10 @@ struct OpHandleBase { virtual void Wait(platform::DeviceContext *waited_dev); + void AddInput(VarHandleBase *in); + + void AddOutput(VarHandleBase *out); + protected: virtual void RunImpl() = 0; }; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index df9ca37180..2e69f1e5e8 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -18,8 +18,11 @@ namespace paddle { namespace framework { namespace details { ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope, - platform::Place place) - : coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) {} + platform::Place place, + platform::DeviceContext *dev_ctx) + : coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) { + dev_ctx_[place_] = dev_ctx; +} ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 44a10e3375..3a35574919 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" + namespace paddle { namespace framework { namespace details { @@ -26,7 +27,8 @@ struct ScaleLossGradOpHandle : public OpHandleBase { Scope *scope_; platform::Place place_; - ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place); + ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place, + platform::DeviceContext *context); ~ScaleLossGradOpHandle() final; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 5dba3e94c1..7064828b21 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -17,77 +17,22 @@ limitations under the License. */ #include "lod_tensor.h" #include "lod_tensor_array.h" #include "op_registry.h" +#include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/var_handle.h" -#include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace framework { using details::DummyVarHandle; +using details::FetchOpHandle; using details::OpHandleBase; using details::ScaleLossGradOpHandle; using details::VarHandle; using details::VarHandleBase; -struct FetchOpHandle : public OpHandleBase { - FeedFetchList *data_; - size_t offset_; - std::vector *local_scopes_; - std::vector tensors_; - - ~FetchOpHandle() { - for (auto *input_var : inputs_) { - input_var->pending_ops_.erase(this); - } - } - - void Wait(platform::DeviceContext *waited_dev) override { - PADDLE_THROW("Nobody should wait FetchOp. Unexpceted Error"); - } - - void WaitAndMergeCPUTensors() const { - // Wait fetch stream done. - for (auto &ctx : dev_ctx_) { - ctx.second->Wait(); - } - - std::vector tensors_ptr; - tensors_ptr.reserve(tensors_.size()); - for (auto &t : tensors_) { - tensors_ptr.emplace_back(&t); - } - data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace()); - } - - protected: - void RunImpl() override { - for (auto *input : inputs_) { - auto *var = static_cast(input); - var->generated_op_->Wait(this->dev_ctx_[var->place_]); - } - - tensors_.resize(inputs_.size()); - auto *var = static_cast(inputs_[0]); - auto &var_name = var->name_; - platform::CPUPlace cpu; - auto &scopes = *local_scopes_; - - for (size_t i = 0; i < scopes.size(); ++i) { - auto &scope = scopes[i]; - auto &t = scope->FindVar(var_name)->Get(); - if (platform::is_gpu_place(var->place_)) { - TensorCopy(t, cpu, *dev_ctx_[t.place()], &tensors_[i]); - } else { - tensors_[i].ShareDataWith(t); - tensors_[i].set_lod(t.lod()); - } - } - } -}; - class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(size_t num_threads) @@ -99,19 +44,9 @@ class ParallelExecutorPrivate { Scope *global_scope_; std::unique_ptr nccl_ctxs_; - - platform::DeviceContext *CommunicationDevCtx(const platform::Place &place) { - if (platform::is_cpu_place(place) || local_scopes_.size() == 1) { - return const_cast( - platform::DeviceContextPool::Instance().Get(place)); - } else { -#ifdef PADDLE_WITH_CUDA - return nccl_ctxs_->DevCtx(place); -#else - PADDLE_THROW("Not compiled with CUDA") -#endif - } - } + std::unordered_map + fetch_dev_ctxs_; platform::Place main_place_; @@ -119,6 +54,7 @@ class ParallelExecutorPrivate { std::unordered_map>, platform::PlaceHash> vars_; + std::unordered_set> dep_vars_; std::vector> ops_; @@ -183,10 +119,9 @@ class ParallelExecutorPrivate { size_t version = vars.size(); auto &var = vars[version]; var.version_ = version; - var.generated_op_ = op_handle; var.name_ = each_var_name; var.place_ = place; - op_handle->outputs_.emplace_back(&var); + op_handle->AddOutput(&var); } }; // namespace framework @@ -198,7 +133,11 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { explicit NCCLAllReduceOpHandle(const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap &ctxs) - : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) {} + : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) { + for (auto &p : places_) { + this->dev_ctx_[p] = nccl_ctxs_.DevCtx(p); + } + } void Wait(platform::DeviceContext *waited_dev) override { OpHandleBase::Wait(waited_dev); @@ -283,6 +222,17 @@ ParallelExecutor::ParallelExecutor( : member_(new ParallelExecutorPrivate(num_threads)) { member_->places_ = places; member_->global_scope_ = scope; + + if (platform::is_cpu_place(places[0])) { + member_->fetch_dev_ctxs_[places[0]] = const_cast( + platform::DeviceContextPool::Instance().Get(places[0])); + } else { + for (auto &p : member_->places_) { + member_->fetch_dev_ctxs_[p] = + new platform::CUDADeviceContext(boost::get(p)); + } + } + // Step 1. RunStartupProgram and Bcast the params to devs. Executor exe(places[0]); exe.Run(startup_program, scope, 0); @@ -348,8 +298,7 @@ void ParallelExecutor::ConstructDependencyGraph( for (auto &each_var_name : var_names) { VarHandle *var = member_->GetVarHandle(each_var_name, p); - op_handle->inputs_.emplace_back(var); - var->pending_ops_.emplace(op_handle); + op_handle->AddInput(var); } var_names = op->OutputArgumentNames(); @@ -360,11 +309,10 @@ void ParallelExecutor::ConstructDependencyGraph( if (is_forwarding) { if (var_names.size() == 1 && var_names[0] == loss_var_name) { // Insert ScaleCost OpHandle - member_->ops_.emplace_back(new ScaleLossGradOpHandle( - this->member_->local_scopes_.size(), s, p)); - op_handle = member_->ops_.back().get(); - - op_handle->dev_ctx_[p] = member_->CommunicationDevCtx(p); + op_handle = + new ScaleLossGradOpHandle(this->member_->local_scopes_.size(), s, + p, member_->nccl_ctxs_->DevCtx(p)); + member_->ops_.emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale // factor. So it does not depend on any other operators. @@ -399,15 +347,14 @@ void ParallelExecutor::ConstructDependencyGraph( continue; } auto *prev_grad = &vars[vars.size() - 1]; - op_handle->inputs_.emplace_back(prev_grad); - prev_grad->pending_ops_.emplace(op_handle); + op_handle->AddInput(prev_grad); + auto &var = vars[vars.size()]; var.place_ = p; - var.generated_op_ = op_handle; var.name_ = og; var.version_ = vars.size() - 1; - op_handle->outputs_.emplace_back(&var); - op_handle->dev_ctx_[p] = member_->CommunicationDevCtx(p); + + op_handle->AddOutput(&var); } } } @@ -454,12 +401,8 @@ void ParallelExecutor::PolishGraphToSupportDataHazards() const { } auto *dep_var = new DummyVarHandle(); - - dep_var->generated_op_ = read_op; - read_op->outputs_.emplace_back(dep_var); - - dep_var->pending_ops_.emplace(write_op); - write_op->inputs_.emplace_back(dep_var); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); member_->dep_vars_.emplace(dep_var); } } @@ -561,24 +504,21 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, for (size_t i = 0; i < fetch_tensors.size(); ++i) { auto &var_name = fetch_tensors[i]; auto &vars = fetched_vars[var_name]; - fetch_ops.emplace_back(); + fetch_ops.emplace_back(&fetched_data, i, &member_->local_scopes_); FetchOpHandle *op = &fetch_ops.back(); - op->data_ = &fetched_data; - op->offset_ = i; - op->local_scopes_ = &member_->local_scopes_; + + // FIXME: Use new device context for (auto &p : member_->places_) { - op->dev_ctx_[p] = member_->nccl_ctxs_->DevCtx(p); + op->dev_ctx_[p] = member_->fetch_dev_ctxs_[p]; } for (auto *var : vars) { - var->pending_ops_.emplace(op); - op->inputs_.emplace_back(var); + op->AddInput(var); } dummy_vars.emplace_back(); auto *var = &dummy_vars.back(); - op->outputs_.emplace_back(var); - var->generated_op_ = op; + op->AddOutput(var); pending_vars[var] = false; pending_ops.insert({op, op->inputs_.size()}); From 5c333e414380f064696a1c152d26cc6b5d6750e4 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 16:21:18 +0800 Subject: [PATCH 130/314] Add dctor for dev_ctx --- paddle/fluid/framework/parallel_executor.cc | 27 +++++----------- paddle/fluid/platform/device_context.cc | 34 +++++++++++---------- paddle/fluid/platform/device_context.h | 17 ++--------- paddle/fluid/platform/place.h | 3 +- 4 files changed, 31 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 7064828b21..8c29aacab6 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -35,18 +35,18 @@ using details::VarHandleBase; class ParallelExecutorPrivate { public: - explicit ParallelExecutorPrivate(size_t num_threads) - : pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) {} + explicit ParallelExecutorPrivate(size_t num_threads, + const std::vector &places) + : places_(places), + fetch_dev_ctxs_(places), + pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) {} std::vector places_; - + platform::DeviceContextPool fetch_dev_ctxs_; std::vector local_scopes_; Scope *global_scope_; std::unique_ptr nccl_ctxs_; - std::unordered_map - fetch_dev_ctxs_; platform::Place main_place_; @@ -219,20 +219,9 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set ¶ms, const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope) - : member_(new ParallelExecutorPrivate(num_threads)) { - member_->places_ = places; + : member_(new ParallelExecutorPrivate(num_threads, places)) { member_->global_scope_ = scope; - if (platform::is_cpu_place(places[0])) { - member_->fetch_dev_ctxs_[places[0]] = const_cast( - platform::DeviceContextPool::Instance().Get(places[0])); - } else { - for (auto &p : member_->places_) { - member_->fetch_dev_ctxs_[p] = - new platform::CUDADeviceContext(boost::get(p)); - } - } - // Step 1. RunStartupProgram and Bcast the params to devs. Executor exe(places[0]); exe.Run(startup_program, scope, 0); @@ -509,7 +498,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, // FIXME: Use new device context for (auto &p : member_->places_) { - op->dev_ctx_[p] = member_->fetch_dev_ctxs_[p]; + op->dev_ctx_[p] = member_->fetch_dev_ctxs_.Get(p); } for (auto *var : vars) { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index ab02a95f26..59b76a1edb 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -10,43 +10,45 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/device_context.h" +#include #include "paddle/fluid/memory/memory.h" - namespace paddle { namespace platform { DeviceContextPool* DeviceContextPool::pool = nullptr; -const platform::DeviceContext* DeviceContextPool::Get( - const platform::Place& place) { +platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) { auto it = device_contexts_.find(place); if (it == device_contexts_.end()) { PADDLE_THROW( "'Place' is not supported, Please re-compile with WITH_GPU " "option"); } - return it->second; + return it->second.get(); } DeviceContextPool::DeviceContextPool( const std::vector& places) { PADDLE_ENFORCE_GT(places.size(), 0); - for (size_t i = 0; i < places.size(); i++) { - if (platform::is_cpu_place(places[i])) { + using PtrType = std::unique_ptr; + std::unordered_set set; + for (auto& p : places) { + set.insert(p); + } + + for (auto& p : set) { + if (platform::is_cpu_place(p)) { #ifdef PADDLE_WITH_MKLDNN - device_contexts_.emplace(places[i], - new platform::MKLDNNDeviceContext( - boost::get(places[i]))); + device_contexts_.emplace( + p, PtrType(new MKLDNNDeviceContext(boost::get(p)))); #else - device_contexts_.emplace(places[i], - new platform::CPUDeviceContext( - boost::get(places[i]))); + device_contexts_.emplace( + p, PtrType(new CPUDeviceContext(boost::get(p)))); #endif - } else if (platform::is_gpu_place(places[i])) { + } else if (platform::is_gpu_place(p)) { #ifdef PADDLE_WITH_CUDA - device_contexts_.emplace(places[i], - new platform::CUDADeviceContext( - boost::get(places[i]))); + device_contexts_.emplace( + p, PtrType(new CUDADeviceContext(boost::get(p)))); #else PADDLE_THROW( "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index df0a427b48..202394c7be 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -160,7 +160,7 @@ class DeviceContextPool { } /*! \brief Return handle of single device context. */ - const platform::DeviceContext* Get(const platform::Place& place); + platform::DeviceContext* Get(const platform::Place& place); template const typename DefaultDeviceContextType::TYPE* GetByPlace( @@ -173,19 +173,8 @@ class DeviceContextPool { private: static DeviceContextPool* pool; - constexpr static int LEFT_SHIFT = 8; - struct Hash { - std::hash hash_; - size_t operator()(const platform::Place& place) const { - int pre_hash = place.which() << LEFT_SHIFT; - if (platform::is_gpu_place(place)) { - pre_hash += boost::get(place).GetDeviceId(); - } - return hash_(pre_hash); - } - }; - std::unordered_map + std::unordered_map, PlaceHash> device_contexts_; DISABLE_COPY_AND_ASSIGN(DeviceContextPool); }; diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 633251eb47..4cc8b377b8 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -67,12 +67,13 @@ bool is_same_place(const Place &, const Place &); struct PlaceHash { std::size_t operator()(const Place &p) const { + constexpr size_t num_dev_bits = 4; std::hash ihash; size_t dev_id = 0; if (is_gpu_place(p)) { dev_id = boost::get(p).device; } - return ihash(dev_id << 2 | p.which()); + return ihash(dev_id << num_dev_bits | p.which()); } }; From f28ae6e4b16322310ec91fa3e7f6916f2aa79889 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 16:48:44 +0800 Subject: [PATCH 131/314] Reorganize Code --- paddle/fluid/framework/CMakeLists.txt | 8 +- paddle/fluid/framework/details/CMakeLists.txt | 2 + .../details/nccl_all_reduce_op_handle.cc | 74 +++++++++++++++++++ .../details/nccl_all_reduce_op_handle.h | 41 ++++++++++ paddle/fluid/framework/parallel_executor.cc | 65 +--------------- 5 files changed, 126 insertions(+), 64 deletions(-) create mode 100644 paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc create mode 100644 paddle/fluid/framework/details/nccl_all_reduce_op_handle.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cf288e7804..12d6541b8f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -87,9 +87,15 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) + +if(WITH_GPU) + set(parallel_executor_cuda_deps nccl_all_reduce_op_handle) +else() + set(parallel_executor_cuda_deps) +endif() cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle - fetch_op_handle) + fetch_op_handle ${parallel_executor_cuda_deps}) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index aed444d9aa..fb276ea703 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -2,3 +2,5 @@ cc_library(var_handle SRCS var_handle.cc DEPS place) cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context) cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) +nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory + dynload_cuda) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc new file mode 100644 index 0000000000..a79c61f359 --- /dev/null +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { +NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( + const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap &ctxs) + : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) { + for (auto &p : places_) { + this->dev_ctx_[p] = nccl_ctxs_.DevCtx(p); + } +} + +void NCCLAllReduceOpHandle::RunImpl() { + if (inputs_.size() == 1) { + return; // No need to all reduce when GPU count = 1; + } else { + // Wait input done + for (auto *in : inputs_) { + auto &p = static_cast(in)->place_; + in->generated_op_->Wait(dev_ctx_[p]); + } + + auto &var_name = static_cast(this->inputs_[0])->name_; + int dtype = -1; + size_t numel = 0; + + platform::NCCLGroupGuard guard; + + for (size_t i = 0; i < local_scopes_.size(); ++i) { + auto &p = places_[i]; + auto *s = local_scopes_[i]; + int dev_id = boost::get(p).device; + + auto &lod_tensor = s->FindVar(var_name)->Get(); + void *buffer = const_cast(lod_tensor.data()); + uintptr_t buf = reinterpret_cast(buffer); + if (buf % sizeof(float) != 0) { + VLOG(3) << "Buffer is not aligned " << buf; + } + + if (dtype == -1) { + dtype = platform::ToNCCLDataType(lod_tensor.type()); + } + + if (numel == 0) { + numel = static_cast(lod_tensor.numel()); + } + auto &nccl_ctx = nccl_ctxs_.at(dev_id); + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(dtype), ncclSum, + nccl_ctx.comm_, nccl_ctx.stream())); + } + } +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h new file mode 100644 index 0000000000..7152d1a587 --- /dev/null +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/nccl_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +struct NCCLAllReduceOpHandle : public OpHandleBase { + const std::vector &local_scopes_; + const std::vector &places_; + const platform::NCCLContextMap &nccl_ctxs_; + + NCCLAllReduceOpHandle(const std::vector &local_scopes, + const std::vector &places, + const platform::NCCLContextMap &ctxs); + + protected: + void RunImpl() override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8c29aacab6..93db5ad3e5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include "lod_tensor_array.h" #include "op_registry.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/var_handle.h" @@ -28,6 +29,7 @@ namespace framework { using details::DummyVarHandle; using details::FetchOpHandle; +using details::NCCLAllReduceOpHandle; using details::OpHandleBase; using details::ScaleLossGradOpHandle; using details::VarHandle; @@ -123,69 +125,6 @@ class ParallelExecutorPrivate { var.place_ = place; op_handle->AddOutput(&var); } -}; // namespace framework - -struct NCCLAllReduceOpHandle : public OpHandleBase { - const std::vector &local_scopes_; - const std::vector &places_; - const platform::NCCLContextMap &nccl_ctxs_; - - explicit NCCLAllReduceOpHandle(const std::vector &local_scopes, - const std::vector &places, - const platform::NCCLContextMap &ctxs) - : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) { - for (auto &p : places_) { - this->dev_ctx_[p] = nccl_ctxs_.DevCtx(p); - } - } - - void Wait(platform::DeviceContext *waited_dev) override { - OpHandleBase::Wait(waited_dev); - } - - protected: - void RunImpl() override { - if (inputs_.size() == 1) { - return; // No need to all reduce when GPU count = 1; - } else { - // Wait input done - for (auto *in : inputs_) { - auto &p = static_cast(in)->place_; - in->generated_op_->Wait(dev_ctx_[p]); - } - - auto &var_name = static_cast(this->inputs_[0])->name_; - int dtype = -1; - size_t numel = 0; - - platform::NCCLGroupGuard guard; - - for (size_t i = 0; i < local_scopes_.size(); ++i) { - auto &p = places_[i]; - auto *s = local_scopes_[i]; - int dev_id = boost::get(p).device; - - auto &lod_tensor = s->FindVar(var_name)->Get(); - void *buffer = const_cast(lod_tensor.data()); - uintptr_t buf = reinterpret_cast(buffer); - if (buf % sizeof(float) != 0) { - VLOG(3) << "Buffer is not aligned " << buf; - } - - if (dtype == -1) { - dtype = platform::ToNCCLDataType(lod_tensor.type()); - } - - if (numel == 0) { - numel = static_cast(lod_tensor.numel()); - } - auto &nccl_ctx = nccl_ctxs_.at(dev_id); - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - buffer, buffer, numel, static_cast(dtype), ncclSum, - nccl_ctx.comm_, nccl_ctx.stream())); - } - } - } }; struct ComputationOpHandle : public OpHandleBase { From 31815010130249033096ea584bc2c89983a7e367 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 17:02:51 +0800 Subject: [PATCH 132/314] Rerange code --- paddle/fluid/framework/CMakeLists.txt | 4 +- paddle/fluid/framework/details/CMakeLists.txt | 1 + .../details/computation_op_handle.cc | 40 +++++++++++++++++++ .../framework/details/computation_op_handle.h | 39 ++++++++++++++++++ paddle/fluid/framework/parallel_executor.cc | 28 +------------ 5 files changed, 84 insertions(+), 28 deletions(-) create mode 100644 paddle/fluid/framework/details/computation_op_handle.cc create mode 100644 paddle/fluid/framework/details/computation_op_handle.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 12d6541b8f..2b90bb5abd 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -94,8 +94,8 @@ else() set(parallel_executor_cuda_deps) endif() cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope - framework_proto backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle - fetch_op_handle ${parallel_executor_cuda_deps}) + backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle + fetch_op_handle computation_op_handle ${parallel_executor_cuda_deps}) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index fb276ea703..7565bc4c9c 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -4,3 +4,4 @@ cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_h cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory) nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda) +cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc new file mode 100644 index 0000000000..5867f8fc55 --- /dev/null +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/computation_op_handle.h" + +namespace paddle { +namespace framework { +namespace details { +ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope, + platform::Place place) + : op_(framework::OpRegistry::CreateOp(op_desc)), + scope_(scope), + place_(place) {} + +void ComputationOpHandle::RunImpl() { + auto *cur_ctx = dev_ctx_[place_]; + for (auto *in : inputs_) { + bool need_wait = + in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx; + if (need_wait) { + in->generated_op_->Wait(cur_ctx); + } + } + + op_->Run(*scope_, place_); +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h new file mode 100644 index 0000000000..1fbfd4eabe --- /dev/null +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace framework { +namespace details { +struct ComputationOpHandle : public OpHandleBase { + std::unique_ptr op_; + Scope *scope_; + platform::Place place_; + + ComputationOpHandle(const OpDesc &op_desc, Scope *scope, + platform::Place place); + + protected: + void RunImpl() override; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 93db5ad3e5..440040a2ef 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -17,6 +17,7 @@ limitations under the License. */ #include "lod_tensor.h" #include "lod_tensor_array.h" #include "op_registry.h" +#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/op_handle_base.h" @@ -34,6 +35,7 @@ using details::OpHandleBase; using details::ScaleLossGradOpHandle; using details::VarHandle; using details::VarHandleBase; +using details::ComputationOpHandle; class ParallelExecutorPrivate { public: @@ -127,32 +129,6 @@ class ParallelExecutorPrivate { } }; -struct ComputationOpHandle : public OpHandleBase { - std::unique_ptr op_; - Scope *scope_; - platform::Place place_; - - explicit ComputationOpHandle(const OpDesc &op_desc, Scope *scope, - platform::Place place) - : op_(framework::OpRegistry::CreateOp(op_desc)), - scope_(scope), - place_(place) {} - - protected: - void RunImpl() override { - auto *cur_ctx = dev_ctx_[place_]; - for (auto *in : inputs_) { - bool need_wait = - in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx; - if (need_wait) { - in->generated_op_->Wait(cur_ctx); - } - } - - op_->Run(*scope_, place_); - } -}; - ParallelExecutor::ParallelExecutor( size_t num_threads, const std::vector &places, const std::unordered_set ¶ms, From 8dec4ad7a1c37b705b584e64c3eef4d6df320c13 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 17:12:27 +0800 Subject: [PATCH 133/314] Use int not Place for vars --- paddle/fluid/framework/parallel_executor.cc | 46 ++++++++++----------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 440040a2ef..d3919f0d51 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -28,6 +28,7 @@ limitations under the License. */ namespace paddle { namespace framework { +using details::ComputationOpHandle; using details::DummyVarHandle; using details::FetchOpHandle; using details::NCCLAllReduceOpHandle; @@ -35,7 +36,6 @@ using details::OpHandleBase; using details::ScaleLossGradOpHandle; using details::VarHandle; using details::VarHandleBase; -using details::ComputationOpHandle; class ParallelExecutorPrivate { public: @@ -43,7 +43,9 @@ class ParallelExecutorPrivate { const std::vector &places) : places_(places), fetch_dev_ctxs_(places), - pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) {} + pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) { + vars_.resize(places.size()); + } std::vector places_; platform::DeviceContextPool fetch_dev_ctxs_; @@ -52,12 +54,7 @@ class ParallelExecutorPrivate { std::unique_ptr nccl_ctxs_; - platform::Place main_place_; - - std::unordered_map>, - platform::PlaceHash> - vars_; + std::vector>> vars_; std::unordered_set> dep_vars_; @@ -69,8 +66,8 @@ class ParallelExecutorPrivate { std::unique_ptr exception_; VarHandle *GetVarHandle(const std::string &each_var_name, - const platform::Place &place) { - auto &var_holders = vars_[place]; + const platform::Place &place, size_t place_offset) { + auto &var_holders = vars_[place_offset]; auto &var_holder = var_holders[each_var_name]; VarHandle *var = nullptr; if (var_holder.empty()) { @@ -118,8 +115,8 @@ class ParallelExecutorPrivate { } void GenerateVar(OpHandleBase *op_handle, const std::string &each_var_name, - const platform::Place &place) { - auto &vars = vars_[place][each_var_name]; + const platform::Place &place, size_t place_offset) { + auto &vars = vars_[place_offset][each_var_name]; size_t version = vars.size(); auto &var = vars[version]; var.version_ = version; @@ -144,11 +141,10 @@ ParallelExecutor::ParallelExecutor( for (size_t i = 0; i < member_->places_.size(); ++i) { member_->local_scopes_.push_back(&scope->NewScope()); } - member_->main_place_ = places[0]; // Bcast Parameters to all GPUs BuildNCCLCommunicator(); - if (platform::is_gpu_place(member_->main_place_) && + if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1) { // Is CUDA BCastParamsToGPUs(startup_program); } @@ -201,13 +197,13 @@ void ParallelExecutor::ConstructDependencyGraph( auto var_names = op->InputArgumentNames(); for (auto &each_var_name : var_names) { - VarHandle *var = member_->GetVarHandle(each_var_name, p); + VarHandle *var = member_->GetVarHandle(each_var_name, p, i); op_handle->AddInput(var); } var_names = op->OutputArgumentNames(); for (auto &each_var_name : var_names) { - member_->GenerateVar(op_handle, each_var_name, p); + member_->GenerateVar(op_handle, each_var_name, p, i); } if (is_forwarding) { @@ -224,7 +220,7 @@ void ParallelExecutor::ConstructDependencyGraph( // loss->pending_ops_.emplace_back(op_handle); // op_handle->inputs_.emplace_back(loss); - member_->GenerateVar(op_handle, loss_var_name + "@GRAD", p); + member_->GenerateVar(op_handle, loss_var_name + "@GRAD", p, i); change_forward = true; } } @@ -245,7 +241,7 @@ void ParallelExecutor::ConstructDependencyGraph( for (size_t i = 0; i < member_->places_.size(); ++i) { auto &p = member_->places_[i]; - auto &vars = member_->vars_[p][og]; + auto &vars = member_->vars_[i][og]; if (vars.empty()) { // This device has no data. continue. continue; @@ -280,8 +276,8 @@ void ParallelExecutor::ConstructDependencyGraph( * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) */ void ParallelExecutor::PolishGraphToSupportDataHazards() const { - for (auto &place_pair : member_->vars_) { - for (auto &name_pair : place_pair.second) { + for (auto &var_map : member_->vars_) { + for (auto &name_pair : var_map) { if (name_pair.second.size() <= 1) { return; } @@ -369,8 +365,8 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, std::unordered_map pending_ops; std::vector dummy_vars; - for (auto &place_pair : member_->vars_) { - for (auto &name_pair : place_pair.second) { + for (auto &var_map : member_->vars_) { + for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { pending_vars[&version_pair.second] = version_pair.second.generated_op_ == nullptr; @@ -395,9 +391,9 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, std::unordered_map> fetched_vars; for (auto &fetch_var_name : fetch_tensors) { - for (auto &pair : member_->vars_) { - auto it = pair.second.find(fetch_var_name); - if (it != pair.second.end()) { + for (auto &var_map : member_->vars_) { + auto it = var_map.find(fetch_var_name); + if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); } } From 64d7a3027157c0de8dcfdbb27e5d013620a68151 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 18:11:23 +0800 Subject: [PATCH 134/314] Extract SSAGraph --- paddle/fluid/framework/parallel_executor.cc | 189 ++++++++++---------- paddle/fluid/framework/parallel_executor.h | 2 - 2 files changed, 98 insertions(+), 93 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d3919f0d51..37bfdc0df5 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -37,6 +37,86 @@ using details::ScaleLossGradOpHandle; using details::VarHandle; using details::VarHandleBase; +struct SSAGraph { + std::vector>> vars_; + std::unordered_set> dep_vars_; + std::vector> ops_; +}; + +/** + * We only handle write after read(WAR), since it should not have a write + * after write in program. If there are write after write operators, we need + * prune them. + * + * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + */ +static void PolishGraphToSupportDataHazards(SSAGraph *graph) { + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + if (name_pair.second.size() <= 1) { + return; + } + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + auto *write_op = it_new->second.generated_op_; + auto &read_ops = it_old->second.pending_ops_; + auto *ex_write_op = it_old->second.generated_op_; + + if (ex_write_op == nullptr) { // Nobody write this var. + continue; + } + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. + continue; + } + + auto *dep_var = new DummyVarHandle(); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); + graph->dep_vars_.emplace(dep_var); + } + } + } + } +} + +static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph, + const std::string &each_var_name, + const platform::Place &place, + size_t place_offset) { + auto &var_holders = graph->vars_[place_offset]; + auto &var_holder = var_holders[each_var_name]; + VarHandle *var = nullptr; + if (var_holder.empty()) { + auto &init_var = var_holder[0]; + init_var.place_ = place; + init_var.name_ = each_var_name; + init_var.generated_op_ = nullptr; + init_var.version_ = 0; + var = &init_var; + } else { + var = &var_holder.rbegin()->second; + } + return var; +} + +static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, + const std::string &each_var_name, + const platform::Place &place, size_t place_offset) { + auto &vars = graph->vars_[place_offset][each_var_name]; + size_t version = vars.size(); + auto &var = vars[version]; + var.version_ = version; + var.name_ = each_var_name; + var.place_ = place; + op_handle->AddOutput(&var); +} + class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(size_t num_threads, @@ -44,7 +124,7 @@ class ParallelExecutorPrivate { : places_(places), fetch_dev_ctxs_(places), pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) { - vars_.resize(places.size()); + graph_.vars_.resize(places.size()); } std::vector places_; @@ -54,35 +134,13 @@ class ParallelExecutorPrivate { std::unique_ptr nccl_ctxs_; - std::vector>> vars_; - - std::unordered_set> dep_vars_; - - std::vector> ops_; + SSAGraph graph_; // Use a simpler thread pool, might be faster. std::unique_ptr pool_; std::unique_ptr exception_; - VarHandle *GetVarHandle(const std::string &each_var_name, - const platform::Place &place, size_t place_offset) { - auto &var_holders = vars_[place_offset]; - auto &var_holder = var_holders[each_var_name]; - VarHandle *var = nullptr; - if (var_holder.empty()) { - auto &init_var = var_holder[0]; - init_var.place_ = place; - init_var.name_ = each_var_name; - init_var.generated_op_ = nullptr; - init_var.version_ = 0; - var = &init_var; - } else { - var = &var_holder.rbegin()->second; - } - return var; - } - void RunOp( bool use_event, std::unordered_map> &pending_vars, @@ -113,17 +171,6 @@ class ParallelExecutorPrivate { op_run(); } } - - void GenerateVar(OpHandleBase *op_handle, const std::string &each_var_name, - const platform::Place &place, size_t place_offset) { - auto &vars = vars_[place_offset][each_var_name]; - size_t version = vars.size(); - auto &var = vars[version]; - var.version_ = version; - var.name_ = each_var_name; - var.place_ = place; - op_handle->AddOutput(&var); - } }; ParallelExecutor::ParallelExecutor( @@ -189,21 +236,22 @@ void ParallelExecutor::ConstructDependencyGraph( auto &p = member_->places_[i]; auto *s = member_->local_scopes_[i]; - member_->ops_.emplace_back(new ComputationOpHandle(*op, s, p)); - auto *op_handle = member_->ops_.back().get(); + member_->graph_.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); + auto *op_handle = member_->graph_.ops_.back().get(); op_handle->dev_ctx_[p] = const_cast( platform::DeviceContextPool::Instance().Get(p)); auto var_names = op->InputArgumentNames(); for (auto &each_var_name : var_names) { - VarHandle *var = member_->GetVarHandle(each_var_name, p, i); + VarHandle *var = + CreateOrGetLatestVarHandle(&member_->graph_, each_var_name, p, i); op_handle->AddInput(var); } var_names = op->OutputArgumentNames(); for (auto &each_var_name : var_names) { - member_->GenerateVar(op_handle, each_var_name, p, i); + CreateOpOutput(&member_->graph_, op_handle, each_var_name, p, i); } if (is_forwarding) { @@ -212,7 +260,7 @@ void ParallelExecutor::ConstructDependencyGraph( op_handle = new ScaleLossGradOpHandle(this->member_->local_scopes_.size(), s, p, member_->nccl_ctxs_->DevCtx(p)); - member_->ops_.emplace_back(op_handle); + member_->graph_.ops_.emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale // factor. So it does not depend on any other operators. @@ -220,7 +268,8 @@ void ParallelExecutor::ConstructDependencyGraph( // loss->pending_ops_.emplace_back(op_handle); // op_handle->inputs_.emplace_back(loss); - member_->GenerateVar(op_handle, loss_var_name + "@GRAD", p, i); + CreateOpOutput(&member_->graph_, op_handle, loss_var_name + "@GRAD", + p, i); change_forward = true; } } @@ -235,13 +284,13 @@ void ParallelExecutor::ConstructDependencyGraph( for (auto &og : var_names) { if (grads.count(og) != 0) { // is param grad // Insert NCCL AllReduce Op - member_->ops_.emplace_back(new NCCLAllReduceOpHandle( + member_->graph_.ops_.emplace_back(new NCCLAllReduceOpHandle( member_->local_scopes_, member_->places_, *member_->nccl_ctxs_)); - auto *op_handle = member_->ops_.back().get(); + auto *op_handle = member_->graph_.ops_.back().get(); for (size_t i = 0; i < member_->places_.size(); ++i) { auto &p = member_->places_[i]; - auto &vars = member_->vars_[i][og]; + auto &vars = member_->graph_.vars_[i][og]; if (vars.empty()) { // This device has no data. continue. continue; @@ -265,49 +314,7 @@ void ParallelExecutor::ConstructDependencyGraph( Dependency graph has been constructed. However, there are still data harzaeds need to be handled. */ - PolishGraphToSupportDataHazards(); -} - -/** - * We only handle write after read(WAR), since it should not have a write - * after write in program. If there are write after write operators, we need - * prune them. - * - * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) - */ -void ParallelExecutor::PolishGraphToSupportDataHazards() const { - for (auto &var_map : member_->vars_) { - for (auto &name_pair : var_map) { - if (name_pair.second.size() <= 1) { - return; - } - auto it_new = name_pair.second.rbegin(); - auto it_old = name_pair.second.rbegin(); - ++it_old; - for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { - auto *write_op = it_new->second.generated_op_; - auto &read_ops = it_old->second.pending_ops_; - auto *ex_write_op = it_old->second.generated_op_; - - if (ex_write_op == nullptr) { // Nobody write this var. - continue; - } - - for (auto *read_op : read_ops) { - // Manually add a dependency var from read_op to write_op; - if (read_op == write_op) { - // Read Write is the same op. - continue; - } - - auto *dep_var = new DummyVarHandle(); - read_op->AddOutput(dep_var); - write_op->AddInput(dep_var); - member_->dep_vars_.emplace(dep_var); - } - } - } - } + PolishGraphToSupportDataHazards(&member_->graph_); } void ParallelExecutor::BCastParamsToGPUs( @@ -365,7 +372,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, std::unordered_map pending_ops; std::vector dummy_vars; - for (auto &var_map : member_->vars_) { + for (auto &var_map : member_->graph_.vars_) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { pending_vars[&version_pair.second] = @@ -374,13 +381,13 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } } - for (auto &var : member_->dep_vars_) { + for (auto &var : member_->graph_.dep_vars_) { pending_vars[var.get()] = var->generated_op_ == nullptr; } std::vector to_run; - for (auto &op : member_->ops_) { + for (auto &op : member_->graph_.ops_) { if (op->inputs_.empty()) { // Special case, Op has no input. to_run.emplace_back(op.get()); } else { @@ -391,7 +398,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, std::unordered_map> fetched_vars; for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : member_->vars_) { + for (auto &var_map : member_->graph_.vars_) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 466b5f5f62..8c91c45d14 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -52,8 +52,6 @@ class ParallelExecutor { const std::string& loss_var_name) const; void BuildNCCLCommunicator() const; - - void PolishGraphToSupportDataHazards() const; }; } // namespace framework From eb12cbe764a5e80cc8136fe6b96f6783f77ae474 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 21 Mar 2018 18:13:00 +0800 Subject: [PATCH 135/314] Refine reshape_op infershape --- paddle/fluid/operators/reshape_op.cc | 89 +------------------- paddle/fluid/operators/reshape_op.h | 119 +++++++++++++++++++-------- 2 files changed, 84 insertions(+), 124 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 489742b492..ed153e7722 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -17,93 +17,6 @@ limitations under the License. */ namespace paddle { namespace operators { -class ReshapeOp : public framework::OperatorWithKernel { - public: - ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReshapeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReshapeOp should not be null."); - - const std::vector &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(!shape.empty(), - "The shape information must be set by Attr(shape)."); - - std::vector output_shape; - auto x_dims = ctx->GetInputDim("X"); - bool need_copy_dim = ValidateShape(shape, x_dims, output_shape); - - if (need_copy_dim) { - // Some dimensions can only be determined during runtime. Here temporarily - // set output tensor's shape the same as that of the input tensor. - ctx->SetOutputDim("Out", x_dims); - } else { - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - } - - // NOTE: Reshape op cannot reshape an input sequence batch into an output - // sequence batch that has a different number of time steps. - // Here output always shares the LoD information with input. But if - // Attr(shape) contains 0 or -1, the actual output shape can only be - // determined during runtime. The check for wheather it is a valid output - // sequence batch is performed in runtime. - ctx->ShareLoD("X", /*->*/ "Out"); - } - - private: - bool ValidateShape(const std::vector &shape, - const framework::DDim &input_dim, - std::vector &output_shape) const { - // only one dimension can be set to -1, whose size will be automatically - // infered. - const int64_t unknown_index = -1; - const auto in_size = framework::product(input_dim); - const auto x_rank = input_dim.size(); - - bool need_dim_copy = false; - std::vector neg_dims_idx; - for (size_t i = 0; i < shape.size(); ++i) { - PADDLE_ENFORCE(shape[i] >= 0 || shape[i] == unknown_index, - "Each input dimension of Attr(shape) must be positive, or " - "only one input dimension can be -1."); - if (shape[i] == unknown_index) { - neg_dims_idx.push_back(i); - } else if (shape[i] == 0) { - PADDLE_ENFORCE_LT( - i, x_rank, - "Only dimension less than rank of Input(X) can be set to 0."); - need_dim_copy = true; - } - } - PADDLE_ENFORCE_LE( - neg_dims_idx.size(), 1, - "Only one input dimension of Attr(shape) can be unknown."); - - output_shape.resize(shape.size(), 0); - std::transform(shape.begin(), shape.end(), output_shape.begin(), - [](int a) { return static_cast(a); }); - - // some dimension can only be determinted during runtime. - if (need_dim_copy) return need_dim_copy; - - int64_t inferred_dim = 0; - if (neg_dims_idx.size()) { - int64_t capacity = std::accumulate(shape.begin(), shape.end(), 1, - std::multiplies()); - inferred_dim = in_size / (-capacity); - PADDLE_ENFORCE_EQ(inferred_dim * (-capacity), in_size, - "Invalid shape is given."); - output_shape[neg_dims_idx[0]] = inferred_dim; - } - return false; - } -}; - class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { public: ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) @@ -150,7 +63,7 @@ the actual dimension value will be infered from the total element number of Input(X) and remaining dimensions. 1. More than one dimensions in Attr(shape) can be set to 0, which means the real dimension value will be copied from Input(X) at runtime. Note that the index of -0 can not access Rank(X). For example, Input(X) is a 3-D tensor with shape +0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. )DOC"); diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index dd8eaf3e4f..db632577d7 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -20,15 +20,90 @@ limitations under the License. */ namespace paddle { namespace operators { +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + const std::vector &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(!shape.empty(), + "The shape information must be set by Attr(shape)."); + + std::vector output_shape; + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ValidateShape(shape, x_dims); + ctx->SetOutputDim("Out", out_dims); + // NOTE: Reshape op cannot reshape an input sequence batch into an + // output sequence batch that has a different number of time steps. Here + // output always shares the LoD information with input. But if + // Attr(shape) contains 0 or -1, the actual output shape can only be + // determined during runtime. The check for wheather it is a valid + // output sequence batch is performed in runtime. + ctx->ShareLoD("X", /*->*/ "Out"); + } + + static framework::DDim ValidateShape(const std::vector shape, + const framework::DDim &in_dims) { + const int64_t in_size = framework::product(in_dims); + // only one dimension canbe set to -1, whose size will be automatically + // infered. + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE( + unk_dim_idx == -1, + "Only one input dimension of Attr(shape) can be unknown."); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE( + static_cast(i) < in_dims.size(), + "The index of dimension to copy from input shape must be less " + "than the size of input shape."); + } else { + PADDLE_ENFORCE( + shape[i] > 0, + "Each input dimension of Attr(shape) must not be negtive except " + "one unknown dimension."); + } + + capacity *= (shape[i] ? shape[i] : in_dims[i]); + output_shape[i] = + (shape[i] ? static_cast(shape[i]) : in_dims[i]); + } + + if (unk_dim_idx != -1) { + output_shape[unk_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, + "Invalid shape is given."); + } else { + PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); + } + return framework::make_ddim(output_shape); + } +}; + template class ReshapeKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* out = ctx.Output("Out"); - auto* in = ctx.Input("X"); + void Compute(const framework::ExecutionContext &ctx) const { + auto *out = ctx.Output("Out"); + auto *in = ctx.Input("X"); - auto out_dims = - ValidateShape(ctx.Attr>("shape"), in->dims()); + auto out_dims = ReshapeOp::ValidateShape( + ctx.Attr>("shape"), in->dims()); if (!in->lod().empty()) { PADDLE_ENFORCE_EQ( @@ -49,42 +124,14 @@ class ReshapeKernel : public framework::OpKernel { out->Resize(out_dims); } } - - private: - framework::DDim ValidateShape(const std::vector shape_attr, - const framework::DDim& in_dims) const { - const int64_t in_size = framework::product(in_dims); - // only one dimension canbe set to -1, whose size will be automatically - // infered. - const int64_t unknown_index = -1; - - std::vector output_shape(shape_attr.size(), 0); - int64_t capacity = 1; - int neg_dim_idx = -1; - for (size_t i = 0; i < shape_attr.size(); ++i) { - if (shape_attr[i] == unknown_index) neg_dim_idx = i; - capacity *= (shape_attr[i] ? shape_attr[i] : in_dims[i]); - output_shape[i] = - (shape_attr[i] ? static_cast(shape_attr[i]) : in_dims[i]); - } - - if (neg_dim_idx != -1) { - output_shape[neg_dim_idx] = -in_size / capacity; - PADDLE_ENFORCE_EQ(output_shape[neg_dim_idx] * capacity, -in_size, - "Invalid shape is given."); - } else { - PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); - } - return framework::make_ddim(output_shape); - } }; template class ReshapeGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* d_x = ctx.Output(framework::GradVarName("X")); + void Compute(const framework::ExecutionContext &ctx) const { + auto *d_out = ctx.Input(framework::GradVarName("Out")); + auto *d_x = ctx.Output(framework::GradVarName("X")); d_x->mutable_data(ctx.GetPlace()); bool inplace = ctx.Attr("inplace"); From 454b0a96be7ff319a9ed05f45f23c513e70eb19f Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 21 Mar 2018 18:39:58 +0800 Subject: [PATCH 136/314] Remove the extra call of ValidateShape in ReshapeKernel --- paddle/fluid/operators/reshape_op.cc | 76 +++++++++++++++++++++++++++ paddle/fluid/operators/reshape_op.h | 78 +--------------------------- 2 files changed, 77 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index ed153e7722..c817b35693 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -17,6 +17,82 @@ limitations under the License. */ namespace paddle { namespace operators { +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + const std::vector &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(!shape.empty(), + "The shape information must be set by Attr(shape)."); + + std::vector output_shape; + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ValidateShape(shape, x_dims); + ctx->SetOutputDim("Out", out_dims); + // NOTE: Reshape op cannot reshape an input sequence batch into an + // output sequence batch that has a different number of time steps. Here + // output always shares the LoD information with input. But if + // Attr(shape) contains 0 or -1, the actual output shape can only be + // determined during runtime. The check for wheather it is a valid + // output sequence batch is performed in runtime. + ctx->ShareLoD("X", /*->*/ "Out"); + } + + private: + framework::DDim ValidateShape(const std::vector shape, + const framework::DDim &in_dims) const { + const int64_t in_size = framework::product(in_dims); + // only one dimension canbe set to -1, whose size will be automatically + // infered. + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE( + unk_dim_idx == -1, + "Only one input dimension of Attr(shape) can be unknown."); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE( + static_cast(i) < in_dims.size(), + "The index of dimension to copy from input shape must be less " + "than the size of input shape."); + } else { + PADDLE_ENFORCE( + shape[i] > 0, + "Each input dimension of Attr(shape) must not be negtive except " + "one unknown dimension."); + } + + capacity *= (shape[i] ? shape[i] : in_dims[i]); + output_shape[i] = + (shape[i] ? static_cast(shape[i]) : in_dims[i]); + } + + if (unk_dim_idx != -1) { + output_shape[unk_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, + "Invalid shape is given."); + } else { + PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); + } + return framework::make_ddim(output_shape); + } +}; + class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { public: ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index db632577d7..59adb5e87c 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -20,81 +20,6 @@ limitations under the License. */ namespace paddle { namespace operators { -class ReshapeOp : public framework::OperatorWithKernel { - public: - ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReshapeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReshapeOp should not be null."); - - const std::vector &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(!shape.empty(), - "The shape information must be set by Attr(shape)."); - - std::vector output_shape; - auto x_dims = ctx->GetInputDim("X"); - auto out_dims = ValidateShape(shape, x_dims); - ctx->SetOutputDim("Out", out_dims); - // NOTE: Reshape op cannot reshape an input sequence batch into an - // output sequence batch that has a different number of time steps. Here - // output always shares the LoD information with input. But if - // Attr(shape) contains 0 or -1, the actual output shape can only be - // determined during runtime. The check for wheather it is a valid - // output sequence batch is performed in runtime. - ctx->ShareLoD("X", /*->*/ "Out"); - } - - static framework::DDim ValidateShape(const std::vector shape, - const framework::DDim &in_dims) { - const int64_t in_size = framework::product(in_dims); - // only one dimension canbe set to -1, whose size will be automatically - // infered. - const int64_t unk_dim_val = -1; - const int64_t copy_dim_val = 0; - - std::vector output_shape(shape.size(), 0); - int64_t capacity = 1; - int unk_dim_idx = -1; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == unk_dim_val) { - PADDLE_ENFORCE( - unk_dim_idx == -1, - "Only one input dimension of Attr(shape) can be unknown."); - unk_dim_idx = i; - } else if (shape[i] == copy_dim_val) { - PADDLE_ENFORCE( - static_cast(i) < in_dims.size(), - "The index of dimension to copy from input shape must be less " - "than the size of input shape."); - } else { - PADDLE_ENFORCE( - shape[i] > 0, - "Each input dimension of Attr(shape) must not be negtive except " - "one unknown dimension."); - } - - capacity *= (shape[i] ? shape[i] : in_dims[i]); - output_shape[i] = - (shape[i] ? static_cast(shape[i]) : in_dims[i]); - } - - if (unk_dim_idx != -1) { - output_shape[unk_dim_idx] = -in_size / capacity; - PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, - "Invalid shape is given."); - } else { - PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); - } - return framework::make_ddim(output_shape); - } -}; - template class ReshapeKernel : public framework::OpKernel { public: @@ -102,8 +27,7 @@ class ReshapeKernel : public framework::OpKernel { auto *out = ctx.Output("Out"); auto *in = ctx.Input("X"); - auto out_dims = ReshapeOp::ValidateShape( - ctx.Attr>("shape"), in->dims()); + auto out_dims = out->dims(); if (!in->lod().empty()) { PADDLE_ENFORCE_EQ( From 0760aaf4401b2e87684a9ae8e7931cf9e51a74b8 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 19:20:49 +0800 Subject: [PATCH 137/314] Shrink batch_norm_grad's inputs --- paddle/fluid/operators/batch_norm_op.cc | 31 +++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 5d27f5b60c..36049ee6a4 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -457,12 +457,39 @@ class BatchNormGradKernel } }; +class BatchNormGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto *op = new framework::OpDesc(); + op->SetType("batch_norm_grad"); + op->SetInput("X", Input("X")); + op->SetInput(framework::GradVarName("Y"), OutputGrad("Y")); + + op->SetInput("Scale", Input("Scale")); + op->SetInput("SavedMean", Output("SavedMean")); + op->SetInput("SavedVariance", Output("SavedVariance")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("Scale"), InputGrad("Scale")); + op->SetOutput(framework::GradVarName("Bias"), InputGrad("Bias")); + + return std::unique_ptr(op); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, - batch_norm_grad, ops::BatchNormGradOp); +REGISTER_OPERATOR(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, + ops::BatchNormGradMaker); +REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp); + REGISTER_OP_CPU_KERNEL( batch_norm, ops::BatchNormKernel); From 2a4221ac074f50a242bdc988eab49cca17414fcb Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Wed, 21 Mar 2018 20:00:29 +0800 Subject: [PATCH 138/314] split send op to send_vars and send_barrier --- paddle/fluid/operators/CMakeLists.txt | 4 + paddle/fluid/operators/send_barrier_op.cc | 103 +++++++++++++++++ paddle/fluid/operators/send_vars_op.cc | 132 ++++++++++++++++++++++ 3 files changed, 239 insertions(+) create mode 100644 paddle/fluid/operators/send_barrier_op.cc create mode 100644 paddle/fluid/operators/send_vars_op.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index d30124d4a3..254f89d987 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -156,6 +156,10 @@ if(WITH_DISTRIBUTE) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) else() set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) diff --git a/paddle/fluid/operators/send_barrier_op.cc b/paddle/fluid/operators/send_barrier_op.cc new file mode 100644 index 0000000000..8d02a6f291 --- /dev/null +++ b/paddle/fluid/operators/send_barrier_op.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#include +#include "paddle/fluid/operators/detail/grpc_client.h" + +namespace paddle { +namespace operators { + +class SendBarrierOp : public framework::OperatorBase { + public: + SendBarrierOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + std::vector eps = Attr>("endpoints"); + + auto client_var_name = Output("RPCClient"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), + "Can not find variable '%s' in the scope.", + client_var_name); + auto* client_var = scope.FindVar(client_var_name); + detail::RPCClient* rpc_client = client_var->GetMutable(); + + // need to wait before sending send_barrier message + PADDLE_ENFORCE(rpc_client->Wait()); + + for (auto& ep : eps) { + VLOG(3) << "send barrier, ep: " << ep; + rpc_client->AsyncSendBatchBarrier(ep); + } + PADDLE_ENFORCE(rpc_client->Wait()); + } +}; + +class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SendBarrierOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddOutput("RPCClient", + "(RPCClient) The RPC client object which is" + "initialized at most once."); + AddComment(R"DOC( +SendBarrier operator + +This operator will send a send barrier signal to list_and_serv op, so that +the Parameter Server would knew all variables have been sent. +)DOC"); + + AddAttr>("endpoints", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints to send variables to.") + .SetDefault({"127.0.0.1:6164"}); + } +}; + +class SendBarrierOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output("RPCClient").front(); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class SendBarrierOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp, + paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker, + ops::SendBarrierOpVarTypeInference, + ops::SendBarrierOpShapeInference); diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc new file mode 100644 index 0000000000..af791bc8e2 --- /dev/null +++ b/paddle/fluid/operators/send_vars_op.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#include +#include "paddle/fluid/operators/detail/grpc_client.h" + +namespace paddle { +namespace operators { +static bool NeedSend(const framework::Scope& scope, + const std::string& varname) { + auto* var = scope.FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", + varname); + if (var->IsType()) { + return var->Get().IsInitialized(); + } else if (var->IsType()) { + return var->Get().rows().size() > 0UL; + } else { + PADDLE_THROW( + "Variable type in send side should be in " + "[LodTensor, SelectedRows]"); + } + return false; +} + +class SendVarsOp : public framework::OperatorBase { + public: + SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + auto ins = Inputs("X"); + + std::vector epmap = Attr>("epmap"); + int flag_wait = Attr("wait"); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + auto client_var_name = Output("RPCClient"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), + "Can not find variable '%s' in the scope.", + client_var_name); + auto* client_var = scope.FindVar(client_var_name); + detail::RPCClient* rpc_client = client_var->GetMutable(); + + for (size_t i = 0; i < ins.size(); i++) { + if (NeedSend(scope, ins[i])) { + VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); + } else { + VLOG(3) << "don't send no-initialied variable: " << ins[i]; + } + } + if (flag_wait) { + rpc_client->Wait(); + } + } +}; + +class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SendVarsOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") + .AsDuplicable(); + AddOutput("RPCClient", + "(RPCClient) The RPC client object which is" + "initialized at most once."); + AddComment(R"DOC( +Send operator + +This operator will send variables to listen_and_serve op at the parameter server. +)DOC"); + AddAttr("wait", + "(int, default 0)" + "whether watting for all send request have been sent.") + .SetDefault(0); + AddAttr>("epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input " + "variables for mapping") + .SetDefault({"127.0.0.1:6164"}); + } +}; + +class SendVarsOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output("RPCClient").front(); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class SendVarsOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send_vars, ops::SendVarsOp, + paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker, + ops::SendVarsOpVarTypeInference, + ops::SendVarsOpShapeInference); From 79989c902530fcaf525161b8d1b3eaee9d634291 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 21 Mar 2018 20:17:11 +0800 Subject: [PATCH 139/314] Add SSA builder --- paddle/fluid/framework/parallel_executor.cc | 369 +++++++++++--------- paddle/fluid/framework/parallel_executor.h | 4 - 2 files changed, 199 insertions(+), 174 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 37bfdc0df5..b2be3d1305 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -43,79 +43,211 @@ struct SSAGraph { std::vector> ops_; }; -/** - * We only handle write after read(WAR), since it should not have a write - * after write in program. If there are write after write operators, we need - * prune them. - * - * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) - */ -static void PolishGraphToSupportDataHazards(SSAGraph *graph) { - for (auto &var_map : graph->vars_) { - for (auto &name_pair : var_map) { - if (name_pair.second.size() <= 1) { - return; - } - auto it_new = name_pair.second.rbegin(); - auto it_old = name_pair.second.rbegin(); - ++it_old; - for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { - auto *write_op = it_new->second.generated_op_; - auto &read_ops = it_old->second.pending_ops_; - auto *ex_write_op = it_old->second.generated_op_; - - if (ex_write_op == nullptr) { // Nobody write this var. - continue; +class SSAGraphBuilder { + public: + virtual ~SSAGraphBuilder() {} + virtual void Build(const ProgramDesc &program, SSAGraph *graph) const = 0; + + protected: + /** + * We only handle write after read(WAR), since it should not have a write + * after write in program. If there are write after write operators, we need + * prune them. + * + * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + */ + static void PolishGraphToSupportDataHazards(SSAGraph *graph) { + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + if (name_pair.second.size() <= 1) { + return; } - - for (auto *read_op : read_ops) { - // Manually add a dependency var from read_op to write_op; - if (read_op == write_op) { - // Read Write is the same op. + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + auto *write_op = it_new->second.generated_op_; + auto &read_ops = it_old->second.pending_ops_; + auto *ex_write_op = it_old->second.generated_op_; + + if (ex_write_op == nullptr) { // Nobody write this var. continue; } - auto *dep_var = new DummyVarHandle(); - read_op->AddOutput(dep_var); - write_op->AddInput(dep_var); - graph->dep_vars_.emplace(dep_var); + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. + continue; + } + + auto *dep_var = new DummyVarHandle(); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); + graph->dep_vars_.emplace(dep_var); + } } } } } -} -static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph, - const std::string &each_var_name, - const platform::Place &place, - size_t place_offset) { - auto &var_holders = graph->vars_[place_offset]; - auto &var_holder = var_holders[each_var_name]; - VarHandle *var = nullptr; - if (var_holder.empty()) { - auto &init_var = var_holder[0]; - init_var.place_ = place; - init_var.name_ = each_var_name; - init_var.generated_op_ = nullptr; - init_var.version_ = 0; - var = &init_var; - } else { - var = &var_holder.rbegin()->second; + static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph, + const std::string &each_var_name, + const platform::Place &place, + size_t place_offset) { + auto &var_holders = graph->vars_[place_offset]; + auto &var_holder = var_holders[each_var_name]; + VarHandle *var = nullptr; + if (var_holder.empty()) { + auto &init_var = var_holder[0]; + init_var.place_ = place; + init_var.name_ = each_var_name; + init_var.generated_op_ = nullptr; + init_var.version_ = 0; + var = &init_var; + } else { + var = &var_holder.rbegin()->second; + } + return var; } - return var; -} -static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, - const std::string &each_var_name, - const platform::Place &place, size_t place_offset) { - auto &vars = graph->vars_[place_offset][each_var_name]; - size_t version = vars.size(); - auto &var = vars[version]; - var.version_ = version; - var.name_ = each_var_name; - var.place_ = place; - op_handle->AddOutput(&var); -} + static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, + const std::string &each_var_name, + const platform::Place &place, + size_t place_offset) { + auto &vars = graph->vars_[place_offset][each_var_name]; + size_t version = vars.size(); + auto &var = vars[version]; + var.version_ = version; + var.name_ = each_var_name; + var.place_ = place; + op_handle->AddOutput(&var); + } +}; + +class MultiDevSSAGraphBuilder : public SSAGraphBuilder { + public: + MultiDevSSAGraphBuilder(const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶ms, + const std::vector &local_scopes, + platform::NCCLContextMap *nccl_ctxs) + : loss_var_name_(loss_var_name), + places_(places), + local_scopes_(local_scopes), + nccl_ctxs_(nccl_ctxs) { + for (auto &p : params) { + grad_names_.insert(GradVarName(p)); + } + } + + void Build(const ProgramDesc &program, SSAGraph *graph) const override { + SSAGraph &result = *graph; + result.vars_.resize(places_.size()); + + bool is_forwarding = true; + for (auto *op : program.Block(0).AllOps()) { + bool change_forward = false; + if (!is_forwarding) { + // FIXME(yy): Do not hard code like this + if (op->OutputArgumentNames().size() == 1 && + op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) { + continue; // Drop fill 1. for backward coeff; + } + } + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + auto *s = local_scopes_[i]; + + result.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); + auto *op_handle = result.ops_.back().get(); + op_handle->dev_ctx_[p] = const_cast( + platform::DeviceContextPool::Instance().Get(p)); + + auto var_names = op->InputArgumentNames(); + + for (auto &each_var_name : var_names) { + VarHandle *var = + CreateOrGetLatestVarHandle(&result, each_var_name, p, i); + op_handle->AddInput(var); + } + var_names = op->OutputArgumentNames(); + + for (auto &each_var_name : var_names) { + CreateOpOutput(&result, op_handle, each_var_name, p, i); + } + + if (is_forwarding) { + if (var_names.size() == 1 && var_names[0] == loss_var_name_) { + // Insert ScaleCost OpHandle + op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p, + nccl_ctxs_->DevCtx(p)); + result.ops_.emplace_back(op_handle); + + // FIXME: Currently ScaleLossGradOp only use device_count as scale + // factor. So it does not depend on any other operators. + // VarHandle *loss = GetVarHandle(loss_var_name, place); + // loss->pending_ops_.emplace_back(op_handle); + // op_handle->inputs_.emplace_back(loss); + + CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, + i); + change_forward = true; + } + } + } + + if (change_forward) { + is_forwarding = false; + } + + if (!is_forwarding) { + auto var_names = op->OutputArgumentNames(); + for (auto &og : var_names) { + if (grad_names_.count(og) != 0) { // is param grad + // Insert NCCL AllReduce Op + result.ops_.emplace_back( + new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); + auto *op_handle = result.ops_.back().get(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + auto &vars = result.vars_[i][og]; + + if (vars.empty()) { // This device has no data. continue. + continue; + } + auto *prev_grad = &vars[vars.size() - 1]; + op_handle->AddInput(prev_grad); + + auto &var = vars[vars.size()]; + var.place_ = p; + var.name_ = og; + var.version_ = vars.size() - 1; + + op_handle->AddOutput(&var); + } + } + } + } + } + + /* + Dependency graph has been constructed. However, there are still data + harzaeds need to be handled. + */ + PolishGraphToSupportDataHazards(&result); + } + + private: + std::string loss_var_name_; + const std::vector &places_; + const std::vector &local_scopes_; + platform::NCCLContextMap *nccl_ctxs_; + + std::unordered_set grad_names_; +}; class ParallelExecutorPrivate { public: @@ -123,9 +255,7 @@ class ParallelExecutorPrivate { const std::vector &places) : places_(places), fetch_dev_ctxs_(places), - pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) { - graph_.vars_.resize(places.size()); - } + pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) {} std::vector places_; platform::DeviceContextPool fetch_dev_ctxs_; @@ -199,7 +329,10 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp - ConstructDependencyGraph(params, main_program, loss_var_name); + MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, params, + member_->local_scopes_, + member_->nccl_ctxs_.get()); + builder.Build(main_program, &member_->graph_); // Step 3. Create vars in each scope; for (auto *scope : member_->local_scopes_) { @@ -213,110 +346,6 @@ ParallelExecutor::ParallelExecutor( } } -void ParallelExecutor::ConstructDependencyGraph( - const std::unordered_set ¶ms, - const ProgramDesc &main_program, const std::string &loss_var_name) const { - std::unordered_set grads; - for (auto &each_param : params) { - grads.insert(each_param + "@GRAD"); - } - - bool is_forwarding = true; - for (auto *op : main_program.Block(0).AllOps()) { - bool change_forward = false; - if (!is_forwarding) { - // FIXME(yy): Do not hard code like this - if (op->OutputArgumentNames().size() == 1 && - op->OutputArgumentNames()[0] == loss_var_name + "@GRAD") { - continue; // Drop fill 1. for backward coeff; - } - } - - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto &p = member_->places_[i]; - auto *s = member_->local_scopes_[i]; - - member_->graph_.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); - auto *op_handle = member_->graph_.ops_.back().get(); - op_handle->dev_ctx_[p] = const_cast( - platform::DeviceContextPool::Instance().Get(p)); - - auto var_names = op->InputArgumentNames(); - - for (auto &each_var_name : var_names) { - VarHandle *var = - CreateOrGetLatestVarHandle(&member_->graph_, each_var_name, p, i); - op_handle->AddInput(var); - } - var_names = op->OutputArgumentNames(); - - for (auto &each_var_name : var_names) { - CreateOpOutput(&member_->graph_, op_handle, each_var_name, p, i); - } - - if (is_forwarding) { - if (var_names.size() == 1 && var_names[0] == loss_var_name) { - // Insert ScaleCost OpHandle - op_handle = - new ScaleLossGradOpHandle(this->member_->local_scopes_.size(), s, - p, member_->nccl_ctxs_->DevCtx(p)); - member_->graph_.ops_.emplace_back(op_handle); - - // FIXME: Currently ScaleLossGradOp only use device_count as scale - // factor. So it does not depend on any other operators. - // VarHandle *loss = GetVarHandle(loss_var_name, place); - // loss->pending_ops_.emplace_back(op_handle); - // op_handle->inputs_.emplace_back(loss); - - CreateOpOutput(&member_->graph_, op_handle, loss_var_name + "@GRAD", - p, i); - change_forward = true; - } - } - } - - if (change_forward) { - is_forwarding = false; - } - - if (!is_forwarding) { - auto var_names = op->OutputArgumentNames(); - for (auto &og : var_names) { - if (grads.count(og) != 0) { // is param grad - // Insert NCCL AllReduce Op - member_->graph_.ops_.emplace_back(new NCCLAllReduceOpHandle( - member_->local_scopes_, member_->places_, *member_->nccl_ctxs_)); - auto *op_handle = member_->graph_.ops_.back().get(); - - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto &p = member_->places_[i]; - auto &vars = member_->graph_.vars_[i][og]; - - if (vars.empty()) { // This device has no data. continue. - continue; - } - auto *prev_grad = &vars[vars.size() - 1]; - op_handle->AddInput(prev_grad); - - auto &var = vars[vars.size()]; - var.place_ = p; - var.name_ = og; - var.version_ = vars.size() - 1; - - op_handle->AddOutput(&var); - } - } - } - } - } - - /* - Dependency graph has been constructed. However, there are still data - harzaeds need to be handled. - */ - PolishGraphToSupportDataHazards(&member_->graph_); -} - void ParallelExecutor::BCastParamsToGPUs( const ProgramDesc &startup_program) const { #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 8c91c45d14..39a1c51b9e 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -47,10 +47,6 @@ class ParallelExecutor { void BCastParamsToGPUs(const ProgramDesc& startup_program) const; - void ConstructDependencyGraph(const std::unordered_set& params, - const ProgramDesc& main_program, - const std::string& loss_var_name) const; - void BuildNCCLCommunicator() const; }; From 72cc64e40e5d624bcc97bd81f144fcb446167a21 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Wed, 21 Mar 2018 10:20:29 -0400 Subject: [PATCH 140/314] Device blobs are created only in training. Added testing attribute --- paddle/fluid/operators/lrn_mkldnn_op.cc | 71 ++++++++++++++++++------- paddle/fluid/operators/lrn_op.cc | 1 + 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index a2971fcd14..3bead16ce4 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -22,6 +22,22 @@ namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; +namespace { +template +std::shared_ptr insert_to_context(const std::string& key, + const MKLDNNDeviceContext& dev_ctx, + Args&&... args) { + auto p = std::static_pointer_cast(dev_ctx.GetBlob(key)); + + if (!p) { + p = std::make_shared(args...); + dev_ctx.SetBlob(key, std::static_pointer_cast(p)); + } + + return p; +} +} // namespace + template class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -42,15 +58,11 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto output_data = out->mutable_data(ctx.GetPlace()); mid->mutable_data(ctx.GetPlace()); - const std::string key = ctx.op().Output("Out"); - const std::string key_src_memory = key + "@lrn_src_memory"; - const std::string key_pd = key + "@lrn_pd"; - const std::string key_workspace_memory = key + "@lrn_workspace_memory"; - const int n = ctx.Attr("n"); const float alpha = ctx.Attr("alpha"); const float beta = ctx.Attr("beta"); const float k = ctx.Attr("k"); + const bool is_test = ctx.Attr("is_test"); auto e_mid = framework::EigenTensor::From(*mid); e_mid = e_mid.constant(k); @@ -71,28 +83,47 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { beta, k}; - auto forward_pd = std::make_shared( - forward_desc, mkldnn_engine); - - dev_ctx.SetBlob(key_pd, forward_pd); - auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine}; - auto src_memory = std::make_shared( - src_memory_pd, static_cast(const_cast(input_data))); - - dev_ctx.SetBlob(key_src_memory, src_memory); auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, static_cast(output_data)}; - auto workspace_md = forward_pd->workspace_primitive_desc(); - auto workspace_memory = std::make_shared(workspace_md); + std::unique_ptr forward_op = nullptr; + + if (!is_test) { + const std::string key = ctx.op().Output("Out"); + const std::string key_src_memory = key + "@lrn_src_memory"; + const std::string key_pd = key + "@lrn_pd"; + const std::string key_workspace_memory = key + "@lrn_workspace_memory"; + + auto forward_pd = insert_to_context( + key_pd, dev_ctx, forward_desc, mkldnn_engine); + + auto src_memory = insert_to_context( + key_src_memory, dev_ctx, src_memory_pd); + + src_memory->set_data_handle( + static_cast(const_cast(input_data))); + + auto workspace_memory = insert_to_context( + key_workspace_memory, dev_ctx, + forward_pd->workspace_primitive_desc()); + + forward_op.reset(new mkldnn::lrn_forward{*forward_pd, *src_memory, + *workspace_memory, dst_memory}); - dev_ctx.SetBlob(key_workspace_memory, workspace_memory); + } else { + auto forward_pd = + mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; + auto src_memory = mkldnn::memory{ + src_memory_pd, static_cast(const_cast(input_data))}; + auto workspace_memory = + mkldnn::memory{forward_pd.workspace_primitive_desc()}; - auto forward_op = mkldnn::lrn_forward{*forward_pd, *src_memory, - *workspace_memory, dst_memory}; + forward_op.reset(new mkldnn::lrn_forward{forward_pd, src_memory, + workspace_memory, dst_memory}); + } - std::vector pipeline = {forward_op}; + std::vector pipeline = {*forward_op}; mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } }; diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index bd72f0435e..2b1947a187 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -214,6 +214,7 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); + AddAttr("is_test", "").SetDefault(false); AddComment(R"DOC( Local Response Normalization Operator. From 8440046b7f69a34e4d593bf1b8c4fe997270a6d9 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Thu, 22 Mar 2018 10:14:48 +0800 Subject: [PATCH 141/314] fix doc --- python/paddle/trainer_config_helpers/layers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index eac2cb3168..3684d1e8f7 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2747,17 +2747,17 @@ def img_pool_layer(input, .. math:: - w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} + w & = 1 + ceil(\\frac{input\_width + 2 * padding - pool\_size}{stride}) - h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + h & = 1 + ceil(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y}) - ceil_mode=False: .. math:: - w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} + w & = 1 + floor(\\frac{input\_width + 2 * padding - pool\_size}{stride}) - h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + h & = 1 + floor(\\frac{input\_height + 2 * padding\_y - pool\_size\_y}{stride\_y}) The example usage is: From d70a70bcdac3c7382be999ee685ae8c7e50cd381 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Thu, 22 Mar 2018 10:18:10 +0800 Subject: [PATCH 142/314] Modified build.sh and remove build_doc.sh --- paddle/scripts/docker/build.sh | 6 +++--- paddle/scripts/tools/build_docs/.gitignore | 2 -- paddle/scripts/tools/build_docs/build_docs.sh | 8 -------- 3 files changed, 3 insertions(+), 13 deletions(-) delete mode 100644 paddle/scripts/tools/build_docs/.gitignore delete mode 100755 paddle/scripts/tools/build_docs/build_docs.sh diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 6be2bd8fad..2e9b088bfa 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -35,7 +35,7 @@ function cmake_gen() { -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} ${PYTHON_FLAGS} -DWITH_DSO=ON - -DWITH_DOC=OFF + -DWITH_DOC=${WITH_DOC:-OFF} -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_MKL=${WITH_MKL:-ON} @@ -60,7 +60,7 @@ EOF -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} \ ${PYTHON_FLAGS} \ -DWITH_DSO=ON \ - -DWITH_DOC=OFF \ + -DWITH_DOC=${WITH_DOC:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_MKL=${WITH_MKL:-ON} \ @@ -231,7 +231,7 @@ gen_capi_package gen_fluid_inference_lib if [[ ${WITH_C_API:-OFF} == "ON" ]]; then - printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" + printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n" else printf "If you need to install PaddlePaddle in develop docker image," printf "please make install or pip install build/python/dist/*.whl.\n" diff --git a/paddle/scripts/tools/build_docs/.gitignore b/paddle/scripts/tools/build_docs/.gitignore deleted file mode 100644 index 6ec14c8f5b..0000000000 --- a/paddle/scripts/tools/build_docs/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -doc -doc_cn diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh deleted file mode 100755 index f9bc8bf63a..0000000000 --- a/paddle/scripts/tools/build_docs/build_docs.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -docker run --rm \ - -v $(git rev-parse --show-toplevel):/paddle \ - -e "WITH_GPU=OFF" \ - -e "WITH_AVX=ON" \ - -e "WITH_DOC=ON" \ - -e "WOBOQ=ON" \ - ${1:-"paddlepaddle/paddle:latest-dev"} From 990d6396fed3708d1f1eaa5ad87a9a4c3e841c5c Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 22 Mar 2018 10:47:05 +0800 Subject: [PATCH 143/314] Reuduce memory copy when communication between trainer and pserver. (#9271) --- benchmark/cluster/vgg16/vgg16_fluid.py | 52 ++- benchmark/cluster/vgg16/vgg16_tf.py | 10 +- paddle/fluid/operators/detail/CMakeLists.txt | 6 +- .../operators/detail/bytebuffer_stream.h | 134 ++++++ paddle/fluid/operators/detail/grpc_client.cc | 39 +- paddle/fluid/operators/detail/grpc_client.h | 38 +- paddle/fluid/operators/detail/grpc_server.cc | 92 ++-- paddle/fluid/operators/detail/grpc_server.h | 36 +- paddle/fluid/operators/detail/grpc_service.h | 118 ++++++ paddle/fluid/operators/detail/send_recv.proto | 6 +- .../operators/detail/sendrecvop_utils.cc | 129 +----- .../fluid/operators/detail/sendrecvop_utils.h | 12 +- paddle/fluid/operators/detail/test_serde.cc | 177 ++++---- .../operators/detail/variable_response.cc | 400 ++++++++++++++++++ .../operators/detail/variable_response.h | 81 ++++ paddle/fluid/operators/listen_and_serv_op.cc | 9 +- python/paddle/fluid/debuger.py | 2 - python/paddle/fluid/distribute_transpiler.py | 2 + 18 files changed, 1021 insertions(+), 322 deletions(-) create mode 100644 paddle/fluid/operators/detail/grpc_service.h create mode 100644 paddle/fluid/operators/detail/variable_response.cc create mode 100644 paddle/fluid/operators/detail/variable_response.h diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 786f224608..8b29227cfa 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -18,12 +18,13 @@ import sys import time import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid as fluid -import paddle.v2.fluid.core as core -import paddle.v2.fluid.profiler as profiler +import paddle.fluid as fluid +import paddle.fluid.core as core +import paddle.fluid.profiler as profiler import argparse import functools import os +from paddle.fluid import debuger def str2bool(v): @@ -182,28 +183,27 @@ def main(): start_time = time.time() num_samples = 0 train_pass_acc.reset() - with profiler.profiler("CPU", 'total') as prof: - for batch_id, data in enumerate(train_reader()): - ts = time.time() - img_data = np.array( - map(lambda x: x[0].reshape(data_shape), data)).astype( - "float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = y_data.reshape([-1, 1]) - - loss, acc, b_size = exe.run( - trainer_prog, - feed={"pixel": img_data, - "label": y_data}, - fetch_list=[avg_cost, batch_acc, batch_size]) - iters += 1 - num_samples += len(data) - train_pass_acc.add(value=acc, weight=b_size) - print( - "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s" - % (pass_id, iters, loss, acc, - len(data) / (time.time() - ts)) - ) # The accuracy is the accumulation of batches, but not the current batch. + for batch_id, data in enumerate(train_reader()): + ts = time.time() + img_data = np.array( + map(lambda x: x[0].reshape(data_shape), data)).astype( + "float32") + y_data = np.array(map(lambda x: x[1], data)).astype("int64") + y_data = y_data.reshape([-1, 1]) + + loss, acc, b_size = exe.run( + trainer_prog, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost, batch_acc, batch_size]) + iters += 1 + num_samples += len(data) + train_pass_acc.add(value=acc, weight=b_size) + print( + "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s" + % (pass_id, iters, loss, acc, + len(data) / (time.time() - ts)) + ) # The accuracy is the accumulation of batches, but not the current batch. pass_elapsed = time.time() - start_time pass_train_acc = train_pass_acc.eval() @@ -254,9 +254,7 @@ def main(): pserver_prog = t.get_pserver_program(current_endpoint) pserver_startup = t.get_startup_program(current_endpoint, pserver_prog) - print("starting server side startup") exe.run(pserver_startup) - print("starting parameter server...") exe.run(pserver_prog) elif training_role == "TRAINER": # Parameter initialization diff --git a/benchmark/cluster/vgg16/vgg16_tf.py b/benchmark/cluster/vgg16/vgg16_tf.py index 996df0e314..2d220478ac 100644 --- a/benchmark/cluster/vgg16/vgg16_tf.py +++ b/benchmark/cluster/vgg16/vgg16_tf.py @@ -292,14 +292,18 @@ def run_benchmark(cluster_spec, server): return np.mean(test_accs) config = tf.ConfigProto( - intra_op_parallelism_threads=1, inter_op_parallelism_threads=1) + intra_op_parallelism_threads=1, + inter_op_parallelism_threads=1, + log_device_placement=True) config.gpu_options.allow_growth = True hooks = [tf.train.StopAtStepHook(last_step=1000000)] with tf.train.MonitoredTrainingSession( - master=server.target, is_chief=(args.task_index == 0), - hooks=hooks) as sess: + master=server.target, + is_chief=(args.task_index == 0), + hooks=hooks, + config=config) as sess: iters, num_samples, start_time = 0, 0, 0.0 for pass_id in range(args.num_passes): # train diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 94395ccfbc..2b19f04489 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1,6 +1,8 @@ if(WITH_DISTRIBUTE) - grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) + grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) + cc_test(serde_test SRCS test_serde.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr + cares zlib protobuf sendrecvop_grpc) endif() diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h index 099deb12d0..0cbe514d04 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.h +++ b/paddle/fluid/operators/detail/bytebuffer_stream.h @@ -23,9 +23,107 @@ limitations under the License. */ #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" +namespace grpc { +// A ZeroCopyInputStream that reads from grpc_byte_buffer +class GrpcBufferReader final + : public ::google::protobuf::io::ZeroCopyInputStream { + typedef void (CoreCodegenInterface::*OldReaderInitAPI)( + grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); + typedef int (CoreCodegenInterface::*NewReaderInitAPI)( + grpc_byte_buffer_reader* reader, grpc_byte_buffer* buffer); + void ReaderInit(OldReaderInitAPI ptr, grpc_byte_buffer_reader* reader, + grpc_byte_buffer* buffer) { + (g_core_codegen_interface->*ptr)(reader, buffer); + } + void ReaderInit(NewReaderInitAPI ptr, grpc_byte_buffer_reader* reader, + grpc_byte_buffer* buffer) { + int result = (g_core_codegen_interface->*ptr)(reader, buffer); + (void)result; + } + + public: + explicit GrpcBufferReader(grpc_byte_buffer* buffer) + : byte_count_(0), backup_count_(0) { + ReaderInit(&CoreCodegenInterface::grpc_byte_buffer_reader_init, &reader_, + buffer); + } + ~GrpcBufferReader() override { + g_core_codegen_interface->grpc_byte_buffer_reader_destroy(&reader_); + } + + bool Next(const void** data, int* size) override { + if (backup_count_ > 0) { + *data = GRPC_SLICE_START_PTR(slice_) + GRPC_SLICE_LENGTH(slice_) - + backup_count_; + GPR_CODEGEN_ASSERT(backup_count_ <= INT_MAX); + *size = (int)backup_count_; + backup_count_ = 0; + return true; + } + if (!g_core_codegen_interface->grpc_byte_buffer_reader_next(&reader_, + &slice_)) { + return false; + } + g_core_codegen_interface->grpc_slice_unref(slice_); + *data = GRPC_SLICE_START_PTR(slice_); + // On win x64, int is only 32bit + GPR_CODEGEN_ASSERT(GRPC_SLICE_LENGTH(slice_) <= INT_MAX); + byte_count_ += * size = (int)GRPC_SLICE_LENGTH(slice_); + return true; + } + + void BackUp(int count) override { backup_count_ = count; } + + bool Skip(int count) override { + const void* data; + int size; + while (Next(&data, &size)) { + if (size >= count) { + BackUp(size - count); + return true; + } + // size < count; + count -= size; + } + // error or we have too large count; + return false; + } + + ::google::protobuf::int64 ByteCount() const override { + return byte_count_ - backup_count_; + } + + private: + int64_t byte_count_; + int64_t backup_count_; + grpc_byte_buffer_reader reader_; + grpc_slice slice_; +}; + +}; // namespace grpc + namespace paddle { namespace operators { namespace detail { +// Source provides a way for a particular RPC implementation to provide +// received data to ParseFrom. +class Source { + public: + virtual ~Source() {} + + // Return the stream that contains the data to be parsed. + // Note that this method might be invoked more than once if + // ParseFrom needs to fall back to a more expensive parsing method. + // Every call must return a stream pointing at the beginning of + // the serialized RecvTensorResponse. + // + // Note that a subsequent call to contents() invalidates previous + // results of contents(). + // + // Ownership of the returned stream is retained by the Source and + // should not be deleted by the caller. + virtual ::google::protobuf::io::ZeroCopyInputStream* contents() = 0; +}; // A ZeroCopyInputStream that reads from a grpc::ByteBuffer. class GrpcByteBufferSource @@ -46,6 +144,42 @@ class GrpcByteBufferSource ::google::protobuf::int64 byte_count_; }; +class GrpcByteBufferSourceWrapper : public Source { + public: + GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) : source_(source) {} + virtual ::google::protobuf::io::ZeroCopyInputStream* contents() override { + return source_; + } + + private: + GrpcByteBufferSource* source_; +}; + +class GrpcByteSource : public Source { + public: + explicit GrpcByteSource(grpc_byte_buffer* buffer) : buffer_(buffer) {} + ~GrpcByteSource() override { DeleteStream(); } + + typedef ::grpc::GrpcBufferReader Reader; + + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + DeleteStream(); + stream_ = new (&space_) Reader(buffer_); + return stream_; + } + + private: + void DeleteStream() { + if (stream_) { + stream_->~Reader(); + } + } + + grpc_byte_buffer* buffer_; // Not owned + Reader* stream_ = nullptr; // Points into space_ if non-nullptr + char space_[sizeof(Reader)]; +}; + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index ddeeebec58..eb19685aa6 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "grpc_client.h" +#include #include "paddle/fluid/framework/threadpool.h" + namespace paddle { namespace operators { namespace detail { @@ -31,8 +33,9 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, framework::Async([var_name_val, p_ctx, ep_val, p_scope, time_out, ch, this] { auto* var = p_scope->FindVar(var_name_val); - sendrecv::VariableMessage req; - SerializeToMessage(var_name_val, var, *p_ctx, &req); + + ::grpc::ByteBuffer req; + SerializeToByteBuffer(var_name_val, var, *p_ctx, &req); // varhandle VarHandle var_h; @@ -46,8 +49,11 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, s->Prepare(var_h, time_out); s->response_call_back_ = NULL; - auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, (void*)s); + auto call = std::move(s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, + &cq_)); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, (void*)s); }); req_count_++; @@ -56,9 +62,19 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, } void ProcGetResponse(const VarHandle& var_h, - const sendrecv::VariableMessage& ret_msg) { - auto* outvar = var_h.scope->FindVar(var_h.name); - DeserializeFromMessage(ret_msg, *var_h.ctx, outvar); + // const sendrecv::VariableMessage& ret_msg) { + const ::grpc::ByteBuffer& ret_msg) { + framework::Variable* outvar = NULL; + DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, outvar); +} + +template +void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { + ::grpc::Slice slice(proto.ByteSizeLong()); + proto.SerializeWithCachedSizesToArray( + const_cast(reinterpret_cast(slice.begin()))); + ::grpc::ByteBuffer tmp(&slice, 1); + result->Swap(&tmp); } bool RPCClient::AsyncGetVariable(const std::string& ep, @@ -88,8 +104,13 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; - auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, (void*)s); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); + + auto call = std::move(s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_)); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, (void*)s); }); req_count_++; diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h index f520367dd9..8216ac52fb 100644 --- a/paddle/fluid/operators/detail/grpc_client.h +++ b/paddle/fluid/operators/detail/grpc_client.h @@ -25,6 +25,11 @@ limitations under the License. */ #include #include +#include +#include +#include +#include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" @@ -49,15 +54,11 @@ struct VarHandle { } }; -void ProcGetResponse(const VarHandle& var_h, - const sendrecv::VariableMessage& msg); +void ProcGetResponse(const VarHandle& var_h, const grpc::ByteBuffer& msg); class BaseProcessor { public: - explicit BaseProcessor(std::shared_ptr ch) { - stub_ = sendrecv::SendRecvService::NewStub(ch); - context_ = NULL; - } + explicit BaseProcessor(std::shared_ptr ch) { context_ = NULL; } virtual ~BaseProcessor() {} @@ -82,19 +83,18 @@ class BaseProcessor { virtual void Process() = 0; - std::unique_ptr stub_; std::unique_ptr context_; grpc::Status status_; VarHandle var_h_; }; -typedef std::function +typedef std::function RequestSendCallBack; class SendProcessor : public BaseProcessor { public: explicit SendProcessor(std::shared_ptr ch) - : BaseProcessor(ch) {} + : BaseProcessor(ch), stub_g_(ch) {} virtual ~SendProcessor() {} @@ -104,17 +104,18 @@ class SendProcessor : public BaseProcessor { } } - sendrecv::VoidMessage reply_; + ::grpc::GenericStub stub_g_; + ::grpc::ByteBuffer reply_; RequestSendCallBack response_call_back_ = NULL; }; -typedef std::function +typedef std::function RequestGetCallBack; class GetProcessor : public BaseProcessor { public: explicit GetProcessor(std::shared_ptr ch) - : BaseProcessor(ch) {} + : BaseProcessor(ch), stub_g_(ch) {} virtual ~GetProcessor() {} @@ -124,30 +125,37 @@ class GetProcessor : public BaseProcessor { } } - sendrecv::VariableMessage reply_; + ::grpc::ByteBuffer reply_; + ::grpc::GenericStub stub_g_; RequestGetCallBack response_call_back_ = ProcGetResponse; }; class BatchBarrierProcessor : public BaseProcessor { public: explicit BatchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) {} + : BaseProcessor(ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } virtual ~BatchBarrierProcessor() {} virtual void Process() {} sendrecv::VoidMessage reply_; + std::unique_ptr stub_; }; class FetchBarrierProcessor : public BaseProcessor { public: explicit FetchBarrierProcessor(std::shared_ptr ch) - : BaseProcessor(ch) {} + : BaseProcessor(ch) { + stub_ = sendrecv::SendRecvService::NewStub(ch); + } virtual ~FetchBarrierProcessor() {} virtual void Process() {} sendrecv::VariableMessage reply_; + std::unique_ptr stub_; }; class RPCClient { diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 8fff430cc4..9691d1e86b 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/grpc_server.h" -using grpc::ServerAsyncResponseWriter; +using ::grpc::ServerAsyncResponseWriter; namespace paddle { namespace operators { @@ -26,9 +26,10 @@ enum CallStatus { PROCESS = 0, FINISH }; // https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server class RequestBase { public: - explicit RequestBase(sendrecv::SendRecvService::AsyncService* service, - grpc::ServerCompletionQueue* cq) - : service_(service), cq_(cq), status_(PROCESS) { + explicit RequestBase(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + const platform::DeviceContext* dev_ctx) + : service_(service), cq_(cq), status_(PROCESS), dev_ctx_(dev_ctx) { PADDLE_ENFORCE(cq_); } virtual ~RequestBase() {} @@ -42,55 +43,58 @@ class RequestBase { } protected: - grpc::ServerContext ctx_; - sendrecv::SendRecvService::AsyncService* service_; - grpc::ServerCompletionQueue* cq_; + ::grpc::ServerContext ctx_; + GrpcService::AsyncService* service_; + ::grpc::ServerCompletionQueue* cq_; CallStatus status_; + const platform::DeviceContext* dev_ctx_; }; -typedef std::pair MessageWithName; - class RequestSend final : public RequestBase { public: - explicit RequestSend(sendrecv::SendRecvService::AsyncService* service, - grpc::ServerCompletionQueue* cq, - SimpleBlockQueue* queue) - : RequestBase(service, cq), queue_(queue), responder_(&ctx_) { - service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_, - this); + explicit RequestSend(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + framework::Scope* scope, ReceivedQueue* queue, + const platform::DeviceContext* dev_ctx) + : RequestBase(service, cq, dev_ctx), queue_(queue), responder_(&ctx_) { + request_.reset(new VariableResponse(scope, dev_ctx_)); + int method_id = static_cast(detail::GrpcMethod::kSendVariable); + service_->RequestAsyncUnary(method_id, &ctx_, request_.get(), &responder_, + cq_, cq_, this); } virtual ~RequestSend() {} - virtual std::string GetReqName() { return request_.varname(); } + virtual std::string GetReqName() { return request_->Varname(); } virtual void Process() { - MessageWithName msg_with_name = - std::make_pair(request_.varname(), std::move(request_)); - queue_->Push(std::move(msg_with_name)); - responder_.Finish(reply_, grpc::Status::OK, this); + queue_->Push(std::make_pair(request_->Varname(), request_)); + + sendrecv::VoidMessage reply; + responder_.Finish(reply, ::grpc::Status::OK, this); status_ = FINISH; } protected: - sendrecv::VariableMessage request_; - sendrecv::VoidMessage reply_; - SimpleBlockQueue* queue_; + std::shared_ptr request_; + ReceivedQueue* queue_; ServerAsyncResponseWriter responder_; }; class RequestGet final : public RequestBase { public: - explicit RequestGet(sendrecv::SendRecvService::AsyncService* service, - grpc::ServerCompletionQueue* cq, framework::Scope* scope, + explicit RequestGet(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + framework::Scope* scope, const platform::DeviceContext* dev_ctx, SimpleBlockQueue* queue) - : RequestBase(service, cq), + : RequestBase(service, cq, dev_ctx), responder_(&ctx_), scope_(scope), - dev_ctx_(dev_ctx), queue_(queue) { - service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this); + int method_id = static_cast(detail::GrpcMethod::kGetVariable); + service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, + cq_, this); } virtual ~RequestGet() {} @@ -101,24 +105,26 @@ class RequestGet final : public RequestBase { // proc request. std::string var_name = request_.varname(); auto* var = scope_->FindVar(var_name); + + ::grpc::ByteBuffer reply; if (var_name != FETCH_BARRIER_MESSAGE) { - SerializeToMessage(var_name, var, *dev_ctx_, &reply_); + SerializeToByteBuffer(var_name, var, *dev_ctx_, &reply); } - // TODO(gongwb): check var's info. - responder_.Finish(reply_, grpc::Status::OK, this); + + responder_.Finish(reply, ::grpc::Status::OK, this); status_ = FINISH; - MessageWithName msg_with_name = - // request name reply - std::make_pair(var_name, std::move(reply_)); - queue_->Push(msg_with_name); + + if (var_name == FETCH_BARRIER_MESSAGE) { + sendrecv::VariableMessage msg; + MessageWithName msg_with_name = std::make_pair(var_name, msg); + queue_->Push(msg_with_name); + } } protected: sendrecv::VariableMessage request_; - sendrecv::VariableMessage reply_; - ServerAsyncResponseWriter responder_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; framework::Scope* scope_; - const platform::DeviceContext* dev_ctx_; SimpleBlockQueue* queue_; }; @@ -133,8 +139,8 @@ void AsyncGRPCServer::WaitClientGet(int count) { } void AsyncGRPCServer::RunSyncUpdate() { - grpc::ServerBuilder builder; - builder.AddListeningPort(address_, grpc::InsecureServerCredentials()); + ::grpc::ServerBuilder builder; + builder.AddListeningPort(address_, ::grpc::InsecureServerCredentials()); builder.SetMaxSendMessageSize(std::numeric_limits::max()); builder.SetMaxReceiveMessageSize(std::numeric_limits::max()); builder.RegisterService(&service_); @@ -182,8 +188,8 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() { if (is_shut_down_) { return; } - RequestSend* send = - new RequestSend(&service_, cq_send_.get(), &var_recv_queue_); + RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_, + &var_recv_queue_, dev_ctx_); VLOG(4) << "Create RequestSend status:" << send->Status(); } @@ -198,7 +204,7 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() { } // FIXME(typhoonzero): change cq_name to enum. -void AsyncGRPCServer::HandleRequest(grpc::ServerCompletionQueue* cq, +void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, std::string cq_name, std::function TryToRegisterNewOne) { TryToRegisterNewOne(); diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index b6666bcf96..9c21a07432 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -14,28 +14,35 @@ limitations under the License. */ #pragma once +#include +#include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/simple_block_queue.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" -#include -#include -#include -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/grpc_service.h" + +//#include namespace paddle { namespace operators { namespace detail { +typedef std::pair> + ReceivedMessage; +typedef SimpleBlockQueue ReceivedQueue; + typedef std::pair MessageWithName; class RequestBase; -class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { +class AsyncGRPCServer final { public: explicit AsyncGRPCServer(const std::string &address) : address_(address) {} @@ -50,14 +57,16 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } - const MessageWithName Get() { return this->var_recv_queue_.Pop(); } + const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } - void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); } + void Push(const std::string &msg_name) { + this->var_recv_queue_.Push(std::make_pair(msg_name, nullptr)); + } void ShutDown(); protected: - void HandleRequest(grpc::ServerCompletionQueue *cq, std::string cq_name, + void HandleRequest(::grpc::ServerCompletionQueue *cq, std::string cq_name, std::function TryToRegisterNewOne); void TryToRegisterNewSendOne(); void TryToRegisterNewGetOne(); @@ -66,18 +75,19 @@ class AsyncGRPCServer final : public sendrecv::SendRecvService::Service { private: std::mutex cq_mutex_; volatile bool is_shut_down_ = false; - std::unique_ptr cq_send_; - std::unique_ptr cq_get_; + std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_; + std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_; - sendrecv::SendRecvService::AsyncService service_; - std::unique_ptr server_; + GrpcService::AsyncService service_; + std::unique_ptr<::grpc::Server> server_; std::string address_; framework::Scope *scope_; const platform::DeviceContext *dev_ctx_; + // received variable from RPC, operators fetch variable from this queue. - SimpleBlockQueue var_recv_queue_; SimpleBlockQueue var_get_queue_; + ReceivedQueue var_recv_queue_; // condition of the sub program std::mutex barrier_mutex_; diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h new file mode 100644 index 0000000000..ae6f9db3bd --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_service.h @@ -0,0 +1,118 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/operators/detail/variable_response.h" + +// NOTE: This method was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// method and did some modifications so that we can parse gRPC +// requests without too much copying of the tensor data. + +namespace grpc { +class CompletionQueue; +class Channel; +class RpcService; +class ServerCompletionQueue; +class ServerContext; + +// Support parsing/unparsing of tensorflow::VariableResponse. +// Wire-format is identical to RecvVariableResponse. +template <> +class SerializationTraits { + public: + static Status Serialize( + const paddle::operators::detail::VariableResponse& msg, + grpc_byte_buffer** bp, bool* own_buffer) { + PADDLE_ENFORCE(false, "SerializationTraits::Serialize not implemented!"); + return Status(); + } + static Status Deserialize(grpc_byte_buffer* buffer, + paddle::operators::detail::VariableResponse* msg, + int max_message_size = INT_MAX) { + if (buffer == nullptr) { + return Status(StatusCode::INTERNAL, "No payload"); + } + + Status result = g_core_codegen_interface->ok(); + if (result.ok()) { + paddle::operators::detail::GrpcByteSource source(buffer); + int ret = msg->Parse(&source); + if (ret != 0) { + result = Status(StatusCode::INTERNAL, "VariableResponse parse error"); + } + } + g_core_codegen_interface->grpc_byte_buffer_destroy(buffer); + return result; + } +}; +} // namespace grpc + +namespace paddle { +namespace operators { +namespace detail { + +enum class GrpcMethod { + kSendVariable, + kGetVariable, +}; + +static const int kGrpcNumMethods = + static_cast(GrpcMethod::kGetVariable) + 1; + +inline const char* GrpcMethodName(GrpcMethod id) { + switch (id) { + case GrpcMethod::kSendVariable: + return "/sendrecv.SendRecvService/SendVariable"; + case GrpcMethod::kGetVariable: + return "/sendrecv.SendRecvService/GetVariable"; + } + + // Shouldn't be reached. + PADDLE_ENFORCE(false, "Invalid id: not found valid method name"); + return nullptr; +} + +class GrpcService final { + public: + class AsyncService : public ::grpc::Service { + public: + AsyncService() { + for (int i = 0; i < kGrpcNumMethods; ++i) { + AddMethod(new ::grpc::internal::RpcServiceMethod( + GrpcMethodName(static_cast(i)), + ::grpc::internal::RpcMethod::NORMAL_RPC, nullptr)); + ::grpc::Service::MarkMethodAsync(i); + } + } + virtual ~AsyncService() {} + + // Make RequestAsyncUnary public for grpc_call.h + using ::grpc::Service::RequestAsyncUnary; + }; +}; + +} // namespace detail +} // namespace operator +} // namespace paddle diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index b0215d4a80..598aaa4c51 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -32,6 +32,9 @@ enum VarType { SELECTED_ROWS = 1; } +// NOTICE(gongwb):don't modify this proto if you are not +// not familar with how we serialize in sendrecvop_utils.h +// and deserilize it in variable_response.h. message VariableMessage { enum Type { // Pod Types @@ -45,7 +48,6 @@ message VariableMessage { } message LodData { repeated int64 lod_data = 1; } - string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; @@ -64,3 +66,5 @@ message VariableMessage { } message VoidMessage {} + +message TestMessage { int64 test_1 = 1; } diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index 39117eeeb6..d7bbf79c50 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -13,61 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include +#include #include "google/protobuf/io/coded_stream.h" #include "google/protobuf/io/zero_copy_stream.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/detail/bytebuffer_stream.h" #include "paddle/fluid/operators/detail/proto_encoder_helper.h" +#include "paddle/fluid/operators/detail/variable_response.h" namespace paddle { namespace operators { namespace detail { -void SerializeToMessage(const std::string& name, const framework::Variable* var, - const platform::DeviceContext& ctx, - sendrecv::VariableMessage* msg) { - msg->set_varname(name); - std::ostringstream oss; - switch (framework::ToVarType(var->Type())) { - case framework::proto::VarType_Type_LOD_TENSOR: - msg->set_type(sendrecv::VarType::LOD_TENSOR); - framework::SerializeToStream(oss, var->Get(), ctx); - break; - case framework::proto::VarType_Type_SELECTED_ROWS: - msg->set_type(sendrecv::VarType::SELECTED_ROWS); - framework::SerializeToStream(oss, var->Get(), - ctx); - break; - default: { - PADDLE_THROW("Serialize does not support type: %s", - typeid(var->Type()).name()); - break; - } - } - msg->set_serialized(oss.str()); -} - -void DeserializeFromMessage(const sendrecv::VariableMessage& msg, - const platform::DeviceContext& ctx, - framework::Variable* var) { - std::istringstream iss(msg.serialized()); - switch (msg.type()) { - case sendrecv::VarType::LOD_TENSOR: - DeserializeFromStream(iss, var->GetMutable(), ctx); - break; - case sendrecv::VarType::SELECTED_ROWS: { - DeserializeFromStream(iss, var->GetMutable(), - ctx); - break; - } - default: { - PADDLE_THROW("Deserialize does not support type: %s", - typeid(var->Type()).name()); - break; - } - } -} - void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg) { @@ -123,6 +81,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, static_cast(ctx); auto copy_size = tensor.memory_size(); payload = memory::Alloc(cpu, copy_size); + memory::Copy(cpu, payload, boost::get(tensor.place()), reinterpret_cast(tensor.data()), @@ -132,6 +91,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, platform::CPUPlace cpu; memory::Free(cpu, backing); }; + #endif } else { payload = tensor.data(); @@ -219,80 +179,11 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, - framework::Variable* var) { - sendrecv::VariableMessage meta; - GrpcByteBufferSource source; - source.Init(msg); - ::google::protobuf::io::CodedInputStream input(&source); - // do zerocopy parsing - PADDLE_ENFORCE(meta.ParseFromCodedStream(&input)); - PADDLE_ENFORCE(input.ConsumedEntireMessage()); - // dims is needed by both tensor and selectedrows - std::vector vecdims; - for (auto& d : meta.dims()) { - vecdims.push_back(d); - } - framework::DDim dims = framework::make_ddim(vecdims); - - if (meta.type() == sendrecv::LOD_TENSOR) { - auto* tensor = var->GetMutable(); - tensor->Resize(dims); - void* tensor_data = tensor->mutable_data( - ctx.GetPlace(), - paddle::operators::detail::ToTypeIndex(meta.data_type())); - framework::LoD lod; - for (int i = 0; i < meta.lod_level(); ++i) { - framework::Vector v; - for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) { - v.push_back(meta.lod(i).lod_data(j)); - } - lod.push_back(v); - } - tensor->set_lod(lod); - // How to avoid copying and use the message buffer directly? - // Maybe need to find a way to release all memory except tensor content. - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA - platform::CPUPlace cpu; - auto& gpu_dev_ctx = static_cast(ctx); - memory::Copy(boost::get(tensor->place()), - tensor_data, cpu, - reinterpret_cast(meta.serialized().data()), - meta.serialized().size(), gpu_dev_ctx.stream()); - ctx.Wait(); -#endif - } else { - memcpy(tensor_data, - reinterpret_cast(meta.serialized().data()), - meta.serialized().size()); - } - } else if (meta.type() == sendrecv::SELECTED_ROWS) { - auto* slr = var->GetMutable(); - auto* tensor = slr->mutable_value(); - int64_t* rows_data = slr->mutable_rows()->data(); - tensor->Resize(dims); - void* tensor_data = tensor->mutable_data( - ctx.GetPlace(), - paddle::operators::detail::ToTypeIndex(meta.data_type())); - if (platform::is_gpu_place(ctx.GetPlace())) { -#ifdef PADDLE_WITH_CUDA - platform::CPUPlace cpu; - auto& gpu_dev_ctx = static_cast(ctx); - memory::Copy(boost::get(tensor->place()), - tensor_data, cpu, - reinterpret_cast(meta.serialized().data()), - meta.serialized().size(), gpu_dev_ctx.stream()); - ctx.Wait(); -#endif - } else { - memcpy(tensor_data, - reinterpret_cast(meta.serialized().data()), - meta.serialized().size()); - } - // copy rows CPU data, GPU data will be copied lazly - memcpy(rows_data, reinterpret_cast(meta.rows().data()), - meta.rows().size()); - } + const framework::Scope* scope, + framework::Variable*& var) { + operators::detail::VariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); + var = resp.GetVar(); } } // namespace detail diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index 4fa6aefd3e..3b87562703 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -21,6 +21,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" @@ -36,21 +37,14 @@ namespace detail { typedef void (*DestroyCallback)(void*); -void SerializeToMessage(const std::string& name, const framework::Variable* var, - const platform::DeviceContext& ctx, - sendrecv::VariableMessage* msg); - -void DeserializeFromMessage(const sendrecv::VariableMessage& msg, - const platform::DeviceContext& ctx, - framework::Variable* var); - void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg); void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, const platform::DeviceContext& ctx, - framework::Variable* var); + const framework::Scope* scope, + framework::Variable*& var); inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { switch (type) { diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc index 2f06e5a686..4be5963794 100644 --- a/paddle/fluid/operators/detail/test_serde.cc +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -16,11 +16,13 @@ limitations under the License. */ #include #include +#include #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/variable_response.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" @@ -31,19 +33,21 @@ namespace operators = paddle::operators; namespace math = paddle::operators::math; namespace memory = paddle::memory; -void RunSerdeTestTensor(platform::Place place) { - // serialize var to ByteBuffer - framework::Variable var; - auto* tensor = var.GetMutable(); - tensor->Resize(framework::make_ddim({4, 8, 4, 2})); - framework::LoD lod; - lod.push_back(framework::Vector({1, 3, 8})); - tensor->set_lod(lod); - int tensor_numel = 4 * 8 * 4 * 2; +void RunSerdeTestSelectedRows(platform::Place place) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + framework::Variable var; + auto* slr = var.GetMutable(); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({2, 10})); tensor->mutable_data(place); - math::set_constant(ctx, tensor, 31.9); + int tensor_numel = 2 * 10; + math::set_constant(ctx, tensor, 32.7); + rows->push_back(3); + rows->push_back(10); ::grpc::ByteBuffer msg; operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); @@ -56,62 +60,67 @@ void RunSerdeTestTensor(platform::Place place) { for (const auto& s : slices) { tmp.append(reinterpret_cast(s.begin()), s.size()); } + sendrecv::VariableMessage varmsg; EXPECT_TRUE(varmsg.ParseFromString(tmp)); + EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 0); - EXPECT_EQ(varmsg.dims()[0], 4); - EXPECT_EQ(varmsg.dims()[1], 8); - EXPECT_EQ(varmsg.dims()[2], 4); - EXPECT_EQ(varmsg.dims()[3], 2); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + EXPECT_EQ(varmsg.type(), 1); const float* tensor_data = reinterpret_cast(varmsg.serialized().data()); + const int64_t* rows_data = + reinterpret_cast(varmsg.rows().data()); for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 31.9); + EXPECT_FLOAT_EQ(tensor_data[i], 32.7); } - + EXPECT_EQ(rows_data[0], 3); + EXPECT_EQ(rows_data[1], 10); // deserialize zero-copy - framework::Variable var2; - operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); - auto tensor2 = var2.Get(); + // framework::Variable var2; + // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + framework::Scope scope; + scope.Var("myvar"); + operators::detail::TensorResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto* slr2 = var2->GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); float* tensor_data2 = nullptr; framework::Tensor tmp_tensor; if (platform::is_gpu_place(ctx.GetPlace())) { platform::CPUPlace cpu; - framework::TensorCopy(tensor2, cpu, &tmp_tensor); + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); tensor_data2 = tmp_tensor.data(); } else { - tensor_data2 = const_cast(tensor2.data()); + tensor_data2 = const_cast(tensor2->data()); } + const int64_t* rows_data2 = rows2->data(); - EXPECT_EQ(varmsg.lod_level(), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); - EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); - EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); - for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + EXPECT_EQ(rows_data2[0], 3); + EXPECT_EQ(rows_data2[1], 10); } -void RunSerdeTestSelectedRows(platform::Place place) { - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - +void RunTestLodTensor(platform::Place place, int from_type = 0) { // serialize var to ByteBuffer framework::Variable var; - auto* slr = var.GetMutable(); - auto* tensor = slr->mutable_value(); - auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({2, 10})); + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({4, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + int tensor_numel = 4 * 8 * 4 * 2; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); tensor->mutable_data(place); - int tensor_numel = 2 * 10; - math::set_constant(ctx, tensor, 32.7); - rows->push_back(3); - rows->push_back(10); + math::set_constant(ctx, tensor, 31.9); ::grpc::ByteBuffer msg; operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); @@ -126,43 +135,75 @@ void RunSerdeTestSelectedRows(platform::Place place) { } sendrecv::VariableMessage varmsg; EXPECT_TRUE(varmsg.ParseFromString(tmp)); - EXPECT_EQ(varmsg.varname(), "myvar"); - EXPECT_EQ(varmsg.type(), 1); + EXPECT_EQ(varmsg.type(), 0); + EXPECT_EQ(varmsg.dims()[0], 4); + EXPECT_EQ(varmsg.dims()[1], 8); + EXPECT_EQ(varmsg.dims()[2], 4); + EXPECT_EQ(varmsg.dims()[3], 2); + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); const float* tensor_data = reinterpret_cast(varmsg.serialized().data()); - const int64_t* rows_data = - reinterpret_cast(varmsg.rows().data()); for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data[i], 32.7); + EXPECT_FLOAT_EQ(tensor_data[i], 31.9); } - EXPECT_EQ(rows_data[0], 3); - EXPECT_EQ(rows_data[1], 10); + + // message binary + std::string str; + varmsg.SerializeToString(&str); + + // message bytebuffer + ::grpc::Slice slices_2[1]; + int num_slices = 1; + slices_2[0] = ::grpc::Slice(str.length()); + memcpy(const_cast(slices_2[0].begin()), str.c_str(), str.length()); + ::grpc::ByteBuffer bytebuffer2(&slices_2[0], num_slices); + // deserialize zero-copy - framework::Variable var2; - operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + framework::Scope scope; + scope.Var("myvar"); + operators::detail::TensorResponse resp(&scope, &ctx); + if (from_type == 0) { + EXPECT_EQ(resp.Parse(msg), 0); + } else { + EXPECT_EQ(resp.Parse(bytebuffer2), 0); + } - auto* slr2 = var2.GetMutable(); - auto* tensor2 = slr2->mutable_value(); - auto* rows2 = slr2->mutable_rows(); + framework::Variable* var2 = resp.GetVar(); + + auto tensor2 = var2->Get(); float* tensor_data2 = nullptr; framework::Tensor tmp_tensor; if (platform::is_gpu_place(ctx.GetPlace())) { platform::CPUPlace cpu; - framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + framework::TensorCopy(tensor2, cpu, &tmp_tensor); tensor_data2 = tmp_tensor.data(); } else { - tensor_data2 = const_cast(tensor2->data()); + tensor_data2 = const_cast(tensor2.data()); } - const int64_t* rows_data2 = rows2->data(); - for (int i = 0; i < tensor_numel; ++i) { - EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); - } - EXPECT_EQ(rows_data2[0], 3); - EXPECT_EQ(rows_data2[1], 10); + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); +} + +TEST(LodTensor, GPU) { + platform::CUDAPlace place; + RunTestLodTensor(place); + RunTestLodTensor(place, 1); +} + +TEST(LodTensor, CPU) { + platform::CPUPlace place; + RunTestLodTensor(place); + RunTestLodTensor(place, 1); } TEST(SelectedRows, CPU) { @@ -174,13 +215,3 @@ TEST(SelectedRows, GPU) { platform::CUDAPlace place; RunSerdeTestSelectedRows(place); } - -TEST(Tensor, CPU) { - platform::CPUPlace place; - RunSerdeTestTensor(place); -} - -TEST(Tensor, GPU) { - platform::CUDAPlace place; - RunSerdeTestTensor(place); -} \ No newline at end of file diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc new file mode 100644 index 0000000000..12e8eb0b4d --- /dev/null +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -0,0 +1,400 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/detail/variable_response.h" +#include +#include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace detail { + +enum WireType { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, +}; + +inline int GetTagFieldNumber(uint32_t tag) { return tag >> 3; } + +inline WireType GetTagWireType(uint32_t tag) { + return static_cast(tag & 0x7); +} + +bool ReadVarintSizeAsInt(::google::protobuf::io::CodedInputStream* input, + int* result) { + uint64_t v; + if (input->ReadVarint64(&v) && v <= static_cast(INT_MAX)) { + *result = static_cast(v); + return true; + } else { + return false; + } +} + +bool ReadRaw(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& dev_ctx, platform::Place place, + void* dest, int size) { + const void* data = NULL; + int size_to_write = 0; + + if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA + auto& gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + + char* p = reinterpret_cast(dest); + while (size > 0) { + if (!input->GetDirectBufferPointer(&data, &size_to_write)) { + return false; + } + + memory::Copy(boost::get(place), + reinterpret_cast(p), cpu, data, size_to_write, + gpu_dev_ctx.stream()); + p += size_to_write; + size -= size_to_write; + + input->Skip(size_to_write); + } + gpu_dev_ctx.Wait(); +#else + PADDLE_THROW("Unexpected branch"); +#endif + return true; + } + + char* p = reinterpret_cast(dest); + while (size > 0) { + if (!input->GetDirectBufferPointer(&data, &size_to_write)) { + return false; + } + // TODO(gongwb): can we avoid copy? + platform::CPUPlace cpu; + memory::Copy(cpu, reinterpret_cast(p), cpu, data, size_to_write); + + p += size_to_write; + size -= size_to_write; + + input->Skip(size_to_write); + } + + return true; +} + +bool VariableResponse::CopyLodTensorData( + ::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, framework::DDim& dims, int length) { + auto var = scope_->FindVar(meta_.varname()); + auto* tensor = var->GetMutable(); + tensor->Resize(dims); + + framework::LoD lod; + for (int i = 0; i < meta_.lod_level(); ++i) { + framework::Vector v; + for (int j = 0; j < meta_.lod(i).lod_data_size(); ++j) { + v.push_back(meta_.lod(i).lod_data(j)); + } + lod.push_back(v); + } + tensor->set_lod(lod); + + void* tensor_data = + tensor->mutable_data(ctx.GetPlace(), ToTypeIndex(meta_.data_type())); + + if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { + return false; + } + + return true; +} + +inline framework::DDim GetDims( + const ::google::protobuf::RepeatedField<::google::protobuf::int64>& dims) { + std::vector vecdims; + for (auto& d : dims) { + vecdims.push_back(d); + } + return framework::make_ddim(vecdims); +} + +bool VariableResponse::CopySelectRowsTensorData( + ::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, framework::DDim& dims, int length) { + auto var = scope_->FindVar(meta_.varname()); + auto* slr = var->GetMutable(); + auto* tensor = slr->mutable_value(); + tensor->Resize(dims); + void* tensor_data = tensor->mutable_data( + ctx.GetPlace(), + paddle::operators::detail::ToTypeIndex(meta_.data_type())); + + if (!ReadRaw(input, ctx, tensor->place(), tensor_data, length)) { + return false; + } + + return true; +} + +bool VariableResponse::CopySelectRowsData( + ::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, int length) { + auto var = scope_->FindVar(meta_.varname()); + auto* slr = var->GetMutable(); + int64_t* rows_data = slr->mutable_rows()->data(); + + // copy rows CPU data, GPU data will be copied lazily. + platform::CPUPlace cpu; + if (!ReadRaw(input, ctx, cpu, rows_data, length)) { + return false; + } + + return true; +} + +bool ParseLodData(::google::protobuf::io::CodedInputStream* input, + std::vector* lod) { + while (true) { + auto p = input->ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + + if (!p.second) { + return (tag == 0); + } + + switch (tag) { + case sendrecv::VariableMessage_LodData::kLodDataFieldNumber: { + uint64_t v; + if (wt == WIRETYPE_VARINT) { + if (!input->ReadVarint64(&v)) { + return false; + } + lod->push_back(v); + break; + } + + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int length = 0; + if (!input->ReadVarintSizeAsInt(&length)) { + return tag; + } + + for (int i = 0; i < length; i++) { + uint64_t v; + if (!input->ReadVarint64(&v)) { + return false; + } + lod->push_back(v); + } + break; + } + + return false; + } + default: { return false; } + } + } + + return true; +} + +int VariableResponse::Parse(const ::grpc::ByteBuffer& byte_buffer) { + GrpcByteBufferSource source; + source.Init(byte_buffer); + GrpcByteBufferSourceWrapper r(&source); + + return Parse(&r); +} + +int VariableResponse::Parse(Source* source) { + ::google::protobuf::io::ZeroCopyInputStream* input_stream = + source->contents(); + ::google::protobuf::io::CodedInputStream input(input_stream); + input.SetTotalBytesLimit(INT_MAX, INT_MAX); + + while (true) { + auto p = input.ReadTagWithCutoff(127); + int tag = GetTagFieldNumber(p.first); + WireType wt = GetTagWireType(p.first); + if (!p.second) { + if (tag != 0) { + return -1; + } + + return 0; + } + + switch (tag) { + case sendrecv::VariableMessage::kVarnameFieldNumber: { + uint32_t length; + if ((wt != WIRETYPE_LENGTH_DELIMITED) || !input.ReadVarint32(&length)) { + return tag; + } + + std::string temp; + if (!input.ReadString(&temp, length)) { + return tag; + } + + meta_.set_varname(temp); + break; + } + case sendrecv::VariableMessage::kTypeFieldNumber: { + uint64_t v; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + + meta_.set_type(static_cast<::sendrecv::VarType>(v)); + break; + } + case sendrecv::VariableMessage::kDataTypeFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + + meta_.set_data_type(static_cast<::sendrecv::VariableMessage_Type>(v)); + break; + } + case sendrecv::VariableMessage::kDimsFieldNumber: { + // not packed + if (wt == WIRETYPE_VARINT) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + break; + } + + // packed + if (wt == WIRETYPE_LENGTH_DELIMITED) { + int length = 0; + if (!input.ReadVarintSizeAsInt(&length)) { + return tag; + } + for (int i = 0; i < length; i++) { + uint64_t v; + if (!input.ReadVarint64(&v)) { + return tag; + } + meta_.add_dims(v); + } + break; + } + + return tag; + } + case sendrecv::VariableMessage::kLodLevelFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_lod_level(static_cast(v)); + break; + } + case sendrecv::VariableMessage::kLodFieldNumber: { + int length = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &length)) { + return tag; + } + + std::pair<::google::protobuf::io::CodedInputStream::Limit, int> p = + input.IncrementRecursionDepthAndPushLimit(length); + + std::vector lod_data; + if (p.second < 0 || !ParseLodData(&input, &lod_data)) { + return tag; + } + + if (!input.DecrementRecursionDepthAndPopLimit(p.first)) { + return false; + } + + if (lod_data.size() == 0) { + break; + } + + auto lod = meta_.add_lod(); + for (uint32_t i = 0; i < lod_data.size(); i++) { + lod->add_lod_data(lod_data[i]); + } + break; + } + case sendrecv::VariableMessage::kSerializedFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + int length = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &length)) { + return tag; + } + + framework::DDim dims = GetDims(meta_.dims()); + if (meta_.type() == sendrecv::LOD_TENSOR) { + PADDLE_ENFORCE(meta_.lod_size() >= 0, + "lod info should be got first!"); + if (!CopyLodTensorData(&input, *dev_ctx_, dims, length)) { + return tag; + } + break; + } + + if (meta_.type() == sendrecv::SELECTED_ROWS) { + if (!CopySelectRowsTensorData(&input, *dev_ctx_, dims, length)) { + return tag; + } + break; + } + + return tag; + } + case sendrecv::VariableMessage::kRowsFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + int length = 0; + if (wt != WIRETYPE_LENGTH_DELIMITED || + !ReadVarintSizeAsInt(&input, &length)) { + return tag; + } + + if (!CopySelectRowsData(&input, *dev_ctx_, length)) { + return tag; + } + break; + } + + default: { + // Unknown tag, return unknown error. + return -1; + } + } + } + + return 0; +} + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h new file mode 100644 index 0000000000..c7bc7a46e7 --- /dev/null +++ b/paddle/fluid/operators/detail/variable_response.h @@ -0,0 +1,81 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" +#include "paddle/fluid/operators/detail/send_recv.pb.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/detail/bytebuffer_stream.h" + +namespace paddle { +namespace operators { +namespace detail { + +class VariableResponse { + public: + VariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx) + : scope_(scope), dev_ctx_(dev_ctx){}; + + virtual ~VariableResponse(){}; + + // return: + // 0:ok. + // -1: unkown error. + // other: number of error field. + int Parse(Source* source); + + // return: + // 0:ok. + // -1: unkown error. + // other: number of error field. + int Parse(const ::grpc::ByteBuffer& byte_buffer); + + inline std::string Varname() { return meta_.varname(); } + + // should call parse first. + framework::Variable* GetVar() { return scope_->FindVar(meta_.varname()); } + + private: + bool CopySelectRowsTensorData(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, + framework::DDim& dims, int length); + + bool CopySelectRowsData(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, int length); + + bool CopyLodTensorData(::google::protobuf::io::CodedInputStream* input, + const platform::DeviceContext& ctx, + framework::DDim& dims, int length); + + private: + const framework::Scope* scope_; + const platform::DeviceContext* dev_ctx_; + // only Skeleton + sendrecv::VariableMessage meta_; +}; + +}; // namespace detail +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index a594de67e0..31ea2a7e58 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -69,9 +69,7 @@ class ListenAndServOp : public framework::OperatorBase { } void Stop() override { - detail::MessageWithName term_msg; - term_msg.first = LISTEN_TERMINATE_MESSAGE; - rpc_service_->Push(term_msg); + rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); rpc_service_->ShutDown(); server_thread_->join(); } @@ -108,7 +106,7 @@ class ListenAndServOp : public framework::OperatorBase { size_t recv_var_cnt = 0; int batch_barrier = 0; while (batch_barrier != fan_in) { - const detail::MessageWithName &v = rpc_service_->Get(); + const detail::ReceivedMessage v = rpc_service_->Get(); auto recv_var_name = v.first; if (recv_var_name == LISTEN_TERMINATE_MESSAGE) { LOG(INFO) << "received terminate message and exit"; @@ -121,12 +119,11 @@ class ListenAndServOp : public framework::OperatorBase { } else { VLOG(3) << "received grad: " << recv_var_name; recv_var_cnt++; - auto *var = recv_scope.FindVar(recv_var_name); + auto var = v.second->GetVar(); if (var == nullptr) { LOG(ERROR) << "Can not find server side var: " << recv_var_name; PADDLE_THROW("Can not find server side var"); } - detail::DeserializeFromMessage(v.second, dev_ctx, var); if (var->IsType()) { sparse_vars.push_back(var); } diff --git a/python/paddle/fluid/debuger.py b/python/paddle/fluid/debuger.py index 97fa182c40..7b4afa9bf6 100644 --- a/python/paddle/fluid/debuger.py +++ b/python/paddle/fluid/debuger.py @@ -16,7 +16,6 @@ import sys import re from graphviz import GraphPreviewGenerator import proto.framework_pb2 as framework_pb2 -import paddle.fluid.core as core _vartype2str_ = [ "UNK", @@ -126,7 +125,6 @@ def pprint_block_codes(block_desc, show_backward=False): def is_var_backward(var_desc): return "@GRAD" in var_desc.name - #print(type(block_desc)) if type(block_desc) is not framework_pb2.BlockDesc: block_desc = framework_pb2.BlockDesc.FromString( block_desc.serialize_to_string()) diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index ad655ee96c..33cea96421 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -20,6 +20,7 @@ from layer_helper import LayerHelper from distributed_spliter import * import math from . import core +import debuger class VarBlock: @@ -289,6 +290,7 @@ class DistributeTranspiler: dtype=v.dtype, shape=v.shape) recv_inputs.append(var) + # step3 optimize_block = pserver_program.create_block(0) # step 4 From e0ac6bc436725a7750b46a674b97b89cccdef36b Mon Sep 17 00:00:00 2001 From: sabreshao Date: Thu, 22 Mar 2018 10:48:27 +0800 Subject: [PATCH 144/314] CMake refine for HIP support. Fix CI. --- paddle/fluid/pybind/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d523ad7f73..fe991033df 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,12 +1,12 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED - SRCS pybind.cc exception.cc protobuf.cc const_value.cc + SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method ${GLOB_OP_LIB}) else() cc_library(paddle_pybind SHARED - SRCS pybind.cc exception.cc protobuf.cc const_value.cc + SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method ${GLOB_OP_LIB}) if(NOT APPLE AND NOT ANDROID) From dd73d18bb7b7cb521cab2f3547633fd6736e8c12 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 22 Mar 2018 10:49:51 +0800 Subject: [PATCH 145/314] Extract SSAGraph --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/details/CMakeLists.txt | 2 ++ paddle/fluid/framework/details/ssa_graph.cc | 15 ++++++++ paddle/fluid/framework/details/ssa_graph.h | 34 +++++++++++++++++++ paddle/fluid/framework/parallel_executor.cc | 12 ++----- 5 files changed, 54 insertions(+), 11 deletions(-) create mode 100644 paddle/fluid/framework/details/ssa_graph.cc create mode 100644 paddle/fluid/framework/details/ssa_graph.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 2b90bb5abd..f1d19efa97 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -95,7 +95,7 @@ else() endif() cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle - fetch_op_handle computation_op_handle ${parallel_executor_cuda_deps}) + fetch_op_handle computation_op_handle ssa_graph ${parallel_executor_cuda_deps}) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 7565bc4c9c..9ed41ab94c 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -5,3 +5,5 @@ cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda) cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) + +cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) diff --git a/paddle/fluid/framework/details/ssa_graph.cc b/paddle/fluid/framework/details/ssa_graph.cc new file mode 100644 index 0000000000..1b8c889449 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph.cc @@ -0,0 +1,15 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph.h" diff --git a/paddle/fluid/framework/details/ssa_graph.h b/paddle/fluid/framework/details/ssa_graph.h new file mode 100644 index 0000000000..c1e041b8c0 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" + +namespace paddle { +namespace framework { +namespace details { + +struct SSAGraph { + std::vector>> vars_; + std::unordered_set> dep_vars_; + std::vector> ops_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index b2be3d1305..5c10595db9 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -15,15 +15,12 @@ limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" #include "ThreadPool.h" #include "lod_tensor.h" -#include "lod_tensor_array.h" #include "op_registry.h" #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" #include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" -#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" -#include "paddle/fluid/framework/details/var_handle.h" -#include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/framework/details/ssa_graph.h" namespace paddle { namespace framework { @@ -34,15 +31,10 @@ using details::FetchOpHandle; using details::NCCLAllReduceOpHandle; using details::OpHandleBase; using details::ScaleLossGradOpHandle; +using details::SSAGraph; using details::VarHandle; using details::VarHandleBase; -struct SSAGraph { - std::vector>> vars_; - std::unordered_set> dep_vars_; - std::vector> ops_; -}; - class SSAGraphBuilder { public: virtual ~SSAGraphBuilder() {} From ab5ecdf60ebecdd4e18dd4208dee873ba0bb8dfc Mon Sep 17 00:00:00 2001 From: weixing Date: Thu, 22 Mar 2018 13:02:09 +0800 Subject: [PATCH 146/314] Adjust some contents in write_docs_en.rst for Contribue Documentation (#9147) * Add some contents * Adjust the content of the English version * Fix some error, replace word generate with build * Replace document with documentation * Adjust contents * Make links more visible --- doc/v2/dev/write_docs_cn.rst | 9 +++-- doc/v2/dev/write_docs_en.rst | 78 +++++++++++++++++++++++++++--------- 2 files changed, 65 insertions(+), 22 deletions(-) diff --git a/doc/v2/dev/write_docs_cn.rst b/doc/v2/dev/write_docs_cn.rst index a055bb04c0..23615f8830 100644 --- a/doc/v2/dev/write_docs_cn.rst +++ b/doc/v2/dev/write_docs_cn.rst @@ -2,13 +2,14 @@ 如何贡献文档 ############# -PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成,也可以利用paddlepaddle.org工具来编译和预览文档。 +PaddlePaddle的文档包括中英文两个部分。文档都是通过 ``cmake`` 驱动 ``sphinx`` 编译生成的,PaddlePaddle.org工具可以帮助我们实现这一编译过程,并提供更好的预览效果。 如何构建文档 ============ PaddlePaddle的文档构建有两种方式,分别为使用paddlepaddle.org工具和不使用paddlepaddle.org工具,两种方式都有各自的优点,前者方便预览,后者方便开发者进行调试。这两种方式中又分别有使用docker和不使用docker的两种构建方法。 +我们建议使用PaddlePaddle.org工具来构建文档。 使用PaddlePaddle.org工具 ------------------------ @@ -31,7 +32,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest 注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令 -之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档 +之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档 编译后的文件将被存储在工作目录 /.ppo_workspace/content。 如果不想使用Docker,你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。 @@ -56,7 +57,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D python manage.py runserver 工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。 -之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。 +之后再用网页连到 http://localhost:8000 就可以在网页上生成需要的文档。 编译后的文件将被存储在工作目录 /.ppo_workspace/content。 想了解更多PaddlePaddle.org工具的详细信息,可以 `点击这里 `_ 。 @@ -96,7 +97,7 @@ PaddlePaddle.org工具可以配合Docker使用,需要在系统里先安装好D python -m SimpleHTTPServer 8088 -在浏览器中输入http://localhost:8088就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 +在浏览器中输入 http://localhost:8088 就可以看到编译生成的中/英文的文档页面和英文的API页面,下图为生成的英文文档首页示例。注意,示例中由于使用了sphinx的原始主题,所以页面的风格与官网并不一致,但这并不影响开发者进行调试。 .. image:: src/doc_en.png :align: center diff --git a/doc/v2/dev/write_docs_en.rst b/doc/v2/dev/write_docs_en.rst index f3408a8426..15ff0d34ad 100644 --- a/doc/v2/dev/write_docs_en.rst +++ b/doc/v2/dev/write_docs_en.rst @@ -2,21 +2,20 @@ Contribute Documentation ######################## -PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``. -Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories. -When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content +PaddlePaddle's documentation includes both Chinese and English versions. The documentation is built using the ``cmake`` command to drive the ``sphinx`` compiler. The PaddlePaddle.org tool helps us to implement this compilation process and provides better preview results. -How to Build Documentations -============ +How to build Documentation +=========================== -We recommend using PaddlePaddle.org tool to build documentation +PaddlePaddle's documentation is built in two ways: using the PaddlePaddle.org tool and without using it. Both methods have their own advantages. The former facilitates previewing, while the latter facilitates debugging by the developer. We could choose to build the documentation with Docker or without it in each of the above ways. +We recommend using PaddlePaddle.org tool to build documentation. -Use PaddlePaddle.org tool --------------- -This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser. +Using PaddlePaddle.org tool +----------------------------- +This is the recommended method to build documentation, because it can automatically compile the documentation and preview the documentation directly in a web page. Note that, although you can preview the documentation in other ways, its style may not be consistent with the official website. Compiling with the PaddlePaddle.org tool produces a preview that will be consistent with the official website documentation style. -The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool +The PaddlePaddle.org tool can be used with Docker and Docker needs to be installed first. Please refer to `Docker's official website `_ on how to install Docker. After installing Docker, you may use the following commands to activate the tool .. code-block:: bash @@ -32,8 +31,8 @@ The tool uses Docker, please install it on your system. Please check Docker offi # Please specify the working directory through -v docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest -Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command -Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run commands +Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation. The compiled documentations will be stored in /.ppo_workspace/content @@ -58,19 +57,62 @@ If you don't wish to use Docker, you can also activate the tool through Django. pip install -r requirements.txt python manage.py runserver -Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation +Specify the PaddlePaddle working directory for the environment variable CONTENT_DIR so that the tool could find where the working directory is. + +Use a web browser and navigate to http://localhost:8000. Click the buttons to compile the documentation The compiled documentations will be stored in /.ppo_workspace/content -If you want to learn more on the PaddlePaddle.org, please `click here `_ 。 +Please `click here `_ for more information about the PaddlePaddle.org tool. + + +Manually Building the Documentation +------------------------------------- + +Build PaddlePaddle's documentation with Docker,you need to install Docker first. Please refer to `Docker's official website `_ on how to install Docker. After Docker is installed, you could use the scripts in the source directory to build the documentation. + +[TBD] + +If you do not wish to use Docker, you can also use the following commands to directly build the PaddlePaddle documentation. + +.. code-block:: bash + + mkdir paddle + cd paddle + git clone https://github.com/PaddlePaddle/Paddle.git + mkdir -p build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON + + # If you only need to build documents, use the following commands + make -j $processors gen_proto_py + make -j $processors paddle_docs paddle_docs_cn + + # If you only need to build APIs, use the following commands + make -j $processors gen_proto_py framework_py_proto + make -j $processors copy_paddle_pybind + make -j $processors paddle_api_docs + +$processors indicates that as many processes as the CPU cores are started to compile in parallel. It should be set according to the number of CPU cores of your machine. + +After the compilation is complete, enter the ``doc/v2`` directory. If you chose to build documents, it will generate ``cn/html/`` and ``en/html`` subdirectories under this directory. If you chose to build APIs,it will generate``api/en/html`` subdirectory. Please enter these directories respectively and execute the following commands: + +.. code-block:: bash + + python -m SimpleHTTPServer 8088 + +Use a web browser and navigate to http://localhost:8000, you could see the compiled Chinese/English documents page and the English APIs page. The following figure is an example of the built English documents home page. Note that due to the sphinx's original theme used in the example, the style of the page is not consistent with the official website, but this does not affect the developer's debugging. -How to write Documentations -============ +.. image:: src/doc_en.png + :align: center + :scale: 60 % -PaddlePaddle uses `sphinx`_ to compile documentations,Please check sphinx official website for more detail. +How to write Documentation +=========================== +PaddlePaddle uses `sphinx`_ to compile documentation,Please check sphinx official website for more detail. How to update www.paddlepaddle.org -============================ +=================================== Please create PRs and submit them to github, please check `Contribute Code `_ 。 PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs `_ and From d4bb2ca71f72e31b78231e1bc0907330392ef759 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 22 Mar 2018 13:36:58 +0800 Subject: [PATCH 147/314] Follow comments and refine the python wrapper of reshape_op --- python/paddle/fluid/layers/nn.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b4e3e83e3a..d98e1bdfca 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3361,7 +3361,9 @@ def reshape(x, shape, act=None, inplace=True, name=None): Examples: .. code-block:: python - data = fluid.layers.data(name='data', shape=[2, 4, 6], dtype='float32') + data = fluid.layers.data( + name='data', shape=[2, 4, 6], dtype='float32' + ) reshaped = fluid.layers.reshape( x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True ) @@ -3371,6 +3373,21 @@ def reshape(x, shape, act=None, inplace=True, name=None): if not (isinstance(shape, list) or isinstance(shape, tuple)): raise ValueError("Input shape must be a python lsit or tuple.") + # Validate the shape + unk_dim_idx = -1 + for dim_idx, dim_size in enumerate(shape): + if dim_size == -1: + assert unk_dim_idx == -1, ( + "Only one dimension in shape can be unknown.") + unk_dim_idx = dim_idx + elif dim_size == 0: + assert dim_idx < len(x.shape), ( + "The indice of 0s in shape can not exceed Rank(X).") + else: + assert dim_size > 0, ( + "Each dimension size given in shape must not be negtive " + "except one unknown dimension.") + helper = LayerHelper("reshape", **locals()) reshaped = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( From 3c8bbd306f254841dd7c0af820739d945bf096d7 Mon Sep 17 00:00:00 2001 From: legend06hvl Date: Thu, 22 Mar 2018 15:10:04 +0800 Subject: [PATCH 148/314] Update index_en.rst (#9280) * Update index_en.rst * Update index_en.rst Update refer to commits --- doc/v2/howto/index_en.rst | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/doc/v2/howto/index_en.rst b/doc/v2/howto/index_en.rst index 2079be766f..bf2320a169 100644 --- a/doc/v2/howto/index_en.rst +++ b/doc/v2/howto/index_en.rst @@ -1,11 +1,37 @@ HOW TO -======= +======== + +PaddlePaddle provides the users the ability to flexibly set various command line parameters to control the model training and inference process. Please refer to the following instructions on using PaddlePaddle: + +.. toctree:: + :maxdepth: 1 + + cmd_parameter/index_cn.rst + +PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to: + +.. toctree:: + :maxdepth: 1 + + cluster/index_cn.rst + +PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API: + +.. toctree:: + :maxdepth: 1 + + capi/index_cn.rst + +PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to: + +.. toctree:: + :maxdepth: 1 + + rnn/index_cn.rst + +How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to: .. toctree:: :maxdepth: 1 - cmd_parameter/index_en.rst - cluster/index_en.rst - capi/index_en.rst - rnn/index_en.rst - optimization/gpu_profiling_en.rst + optimization/gpu_profiling_cn.rst From 13f1050ab0f5113fea223f47e99f7c6b4f9644a7 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 22 Mar 2018 15:15:02 +0800 Subject: [PATCH 149/314] "fix mixed_vector bug" (#9319) --- paddle/fluid/framework/mixed_vector.h | 2 +- paddle/fluid/framework/mixed_vector_test.cu | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index 6a6fa53871..d99a15547b 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -176,7 +176,7 @@ class Vector { // resize the vector void resize(size_t size) { - if (size + 1 < capacity()) { + if (size + 1 <= capacity()) { size_ = size; } else { MutableCPU(); diff --git a/paddle/fluid/framework/mixed_vector_test.cu b/paddle/fluid/framework/mixed_vector_test.cu index 4bf78499f2..d57f825108 100644 --- a/paddle/fluid/framework/mixed_vector_test.cu +++ b/paddle/fluid/framework/mixed_vector_test.cu @@ -104,3 +104,11 @@ TEST(mixed_vector, ForEach) { for (auto& v : tmp) { } } + +TEST(mixed_vector, Reserve) { + paddle::framework::Vector vec; + vec.reserve(1); + vec.push_back(0); + vec.push_back(0); + vec.push_back(0); +} From 466f28a6b18f56fe0b2686091a49802ea97334b7 Mon Sep 17 00:00:00 2001 From: legend06hvl Date: Thu, 22 Mar 2018 15:16:01 +0800 Subject: [PATCH 150/314] Update index_en.rst (#9286) * Update index_en.rst Update en version * Update index_en.rst Update refer to commits and thank you for the suggestion. --- doc/v2/howto/capi/index_en.rst | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/doc/v2/howto/capi/index_en.rst b/doc/v2/howto/capi/index_en.rst index 2cbbe362fd..4ec39c9d52 100644 --- a/doc/v2/howto/capi/index_en.rst +++ b/doc/v2/howto/capi/index_en.rst @@ -1,6 +1,23 @@ -C-API Prediction Library +C-API Inference Library ======================== +After we train a neural network, we use it to do inference. Inference is the process of preparing input data and propagating it through the model to produce the result. + +Compared with model training, prediction has the following features: + +#. Inference does not require backpropagation and parameter updates, as required during training. +#. Labels are not needed in prediction. +#. Most of the time, predictions need to be integrated with the user system. + +Therefore, the model prediction SDK needs to be designed separately and has the following features: + +#. The predictive SDK does not include backpropagation and parameter updates to reduce the size of the SDK. +#. The predictive SDK needs a simple user interface for ease of use. +#. Since the input data may have a variety of structures, the format of the input data is clearly and compactly packaged. +#. In order to be compatible with user's system, the SDK's interface must conform to the C-standard interface. + +PaddlePaddle provides C-API to solve the above problem. Following are the guidelines to use the C-API: + .. toctree:: :maxdepth: 1 From 5e6276edc1d92632322d6e748f281b9156251671 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 22 Mar 2018 15:17:18 +0800 Subject: [PATCH 151/314] fix transpiler bug --- paddle/fluid/operators/send_op.cc | 8 ++++---- python/paddle/fluid/distribute_transpiler.py | 7 +++++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 443f40e803..a77c38f633 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -68,7 +68,7 @@ class SendOp : public framework::OperatorBase { for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + VLOG(2) << "sending " << ins[i] << " to " << epmap[i]; rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; @@ -77,20 +77,20 @@ class SendOp : public framework::OperatorBase { PADDLE_ENFORCE(rpc_client->Wait()); for (auto& ep : endpoints) { - VLOG(3) << "batch barrier, ep: " << ep; + VLOG(2) << "batch barrier, ep: " << ep; rpc_client->AsyncSendBatchBarrier(ep); } PADDLE_ENFORCE(rpc_client->Wait()); if (outs.size() > 0) { for (size_t i = 0; i < outs.size(); i++) { - VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; + VLOG(2) << "getting " << outs[i] << " from " << epmap[i]; rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); } PADDLE_ENFORCE(rpc_client->Wait()); // tell pservers that current trainer have called fetch for (auto& ep : endpoints) { - VLOG(3) << "send fetch barrier, ep: " << ep; + VLOG(2) << "send fetch barrier, ep: " << ep; rpc_client->AsyncSendFetchBarrier(ep); } PADDLE_ENFORCE(rpc_client->Wait()); diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index ad655ee96c..4c3789b99e 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -563,6 +563,8 @@ class DistributeTranspiler: orig_var_name = "" if suff_idx >= 0: orig_var_name = varname[:suff_idx] + else: + orig_var_name = varname return orig_var_name def _append_pserver_ops(self, optimize_block, opt_op, endpoint, @@ -577,7 +579,8 @@ class DistributeTranspiler: grad_block = None for g in self.param_grad_ep_mapping[endpoint]["grads"]: if same_or_split_var( - self._orig_varname(g.name), opt_op.input(key)[0]): + self._orig_varname(g.name), + self._orig_varname(opt_op.input(key)[0])): grad_block = g break if not grad_block: @@ -748,7 +751,7 @@ class DistributeTranspiler: param_names = [ p.name for p in self.param_grad_ep_mapping[endpoint]["params"] ] - if op.input("Param") in param_names: + if op.input("Param")[0] in param_names: return True else: for n in param_names: From a88cc462219681cbc74d2beee022e8c67d8f0de6 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 22 Mar 2018 16:14:37 +0800 Subject: [PATCH 152/314] update --- paddle/fluid/operators/detail/bytebuffer_stream.h | 5 +++-- paddle/fluid/operators/detail/grpc_server.h | 10 +++------- paddle/fluid/operators/detail/test_serde.cc | 4 ++-- paddle/fluid/operators/detail/variable_response.h | 4 ++-- 4 files changed, 10 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h index 0cbe514d04..1791a48aab 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.h +++ b/paddle/fluid/operators/detail/bytebuffer_stream.h @@ -146,8 +146,9 @@ class GrpcByteBufferSource class GrpcByteBufferSourceWrapper : public Source { public: - GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) : source_(source) {} - virtual ::google::protobuf::io::ZeroCopyInputStream* contents() override { + explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) + : source_(source) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { return source_; } diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 9c21a07432..10e6dd45a9 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -21,15 +21,11 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/simple_block_queue.h" - +#include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" - -#include "paddle/fluid/operators/detail/grpc_service.h" - -//#include +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc index 4be5963794..494ac1d679 100644 --- a/paddle/fluid/operators/detail/test_serde.cc +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -81,7 +81,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); framework::Scope scope; scope.Var("myvar"); - operators::detail::TensorResponse resp(&scope, &ctx); + operators::detail::VariableResponse resp(&scope, &ctx); EXPECT_EQ(resp.Parse(msg), 0); framework::Variable* var2 = resp.GetVar(); @@ -166,7 +166,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { // deserialize zero-copy framework::Scope scope; scope.Var("myvar"); - operators::detail::TensorResponse resp(&scope, &ctx); + operators::detail::VariableResponse resp(&scope, &ctx); if (from_type == 0) { EXPECT_EQ(resp.Parse(msg), 0); } else { diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h index c7bc7a46e7..e121ed7bce 100644 --- a/paddle/fluid/operators/detail/variable_response.h +++ b/paddle/fluid/operators/detail/variable_response.h @@ -36,9 +36,9 @@ class VariableResponse { public: VariableResponse(const framework::Scope* scope, const platform::DeviceContext* dev_ctx) - : scope_(scope), dev_ctx_(dev_ctx){}; + : scope_(scope), dev_ctx_(dev_ctx) {} - virtual ~VariableResponse(){}; + virtual ~VariableResponse() {} // return: // 0:ok. From 8f8728635a028e5ef69498cae109366302a048ee Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Thu, 22 Mar 2018 17:00:06 +0800 Subject: [PATCH 153/314] Fix bug for backward tanspiler when using parallel_do operator. (#9282) * Temporarily fix bug for backward tanspiler when using parallel_do operator. * Fix bug for backward tanspiler when using parallel_do operator --- paddle/fluid/operators/box_coder_op.cc | 3 ++- paddle/fluid/operators/detection_map_op.cc | 4 ++-- paddle/fluid/operators/iou_similarity_op.cc | 5 +++-- paddle/fluid/operators/mine_hard_examples_op.cc | 5 +++-- paddle/fluid/operators/prior_box_op.cc | 4 +++- paddle/fluid/operators/target_assign_op.cc | 4 ++-- python/paddle/fluid/layers/detection.py | 7 +++++-- 7 files changed, 20 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/box_coder_op.cc b/paddle/fluid/operators/box_coder_op.cc index eccdd408a1..ec416f725e 100644 --- a/paddle/fluid/operators/box_coder_op.cc +++ b/paddle/fluid/operators/box_coder_op.cc @@ -126,6 +126,7 @@ width and height. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker); +REGISTER_OPERATOR(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker, + paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel, ops::BoxCoderKernel); diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 73c84c2fe0..93ef15b933 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -188,8 +188,8 @@ The general steps are as follows. First, calculate the true positive and } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(detection_map, ops::DetectionMAPOp, - ops::DetectionMAPOpMaker); +REGISTER_OPERATOR(detection_map, ops::DetectionMAPOp, ops::DetectionMAPOpMaker, + paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( detection_map, ops::DetectionMAPOpKernel, ops::DetectionMAPOpKernel); diff --git a/paddle/fluid/operators/iou_similarity_op.cc b/paddle/fluid/operators/iou_similarity_op.cc index ffbd7c7814..4b78ec510d 100755 --- a/paddle/fluid/operators/iou_similarity_op.cc +++ b/paddle/fluid/operators/iou_similarity_op.cc @@ -87,8 +87,9 @@ $$ } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(iou_similarity, ops::IOUSimilarityOp, - ops::IOUSimilarityOpMaker); +REGISTER_OPERATOR(iou_similarity, ops::IOUSimilarityOp, + ops::IOUSimilarityOpMaker, + paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( iou_similarity, diff --git a/paddle/fluid/operators/mine_hard_examples_op.cc b/paddle/fluid/operators/mine_hard_examples_op.cc index 0e81d60878..277901cff4 100644 --- a/paddle/fluid/operators/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/mine_hard_examples_op.cc @@ -324,8 +324,9 @@ MatchIndices elements with value -1. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp, - ops::MineHardExamplesOpMaker); +REGISTER_OPERATOR(mine_hard_examples, ops::MineHardExamplesOp, + ops::MineHardExamplesOpMaker, + paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( mine_hard_examples, diff --git a/paddle/fluid/operators/prior_box_op.cc b/paddle/fluid/operators/prior_box_op.cc index 7ba55437cb..c22a55bce2 100644 --- a/paddle/fluid/operators/prior_box_op.cc +++ b/paddle/fluid/operators/prior_box_op.cc @@ -168,7 +168,9 @@ https://arxiv.org/abs/1512.02325. } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_WITHOUT_GRADIENT(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker); +REGISTER_OPERATOR(prior_box, ops::PriorBoxOp, ops::PriorBoxOpMaker, + paddle::framework::EmptyGradOpMaker); + REGISTER_OP_CPU_KERNEL( prior_box, ops::PriorBoxOpKernel, ops::PriorBoxOpKernel); diff --git a/paddle/fluid/operators/target_assign_op.cc b/paddle/fluid/operators/target_assign_op.cc index a894b12fa3..33ff967e5e 100644 --- a/paddle/fluid/operators/target_assign_op.cc +++ b/paddle/fluid/operators/target_assign_op.cc @@ -153,8 +153,8 @@ template struct NegTargetAssignFunctor, diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index a889ab6bdc..cd519e1ee0 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -129,13 +129,11 @@ def detection_output(loc, prior_box_var=prior_box_var, target_box=loc, code_type='decode_center_size') - old_shape = scores.shape scores = ops.reshape(x=scores, shape=(-1, old_shape[-1])) scores = nn.softmax(input=scores) scores = ops.reshape(x=scores, shape=old_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) - nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) helper.append_op( type="multiclass_nms", @@ -475,6 +473,7 @@ def ssd_loss(location, # 2. Compute confidence for mining hard examples # 2.1. Get the target label based on matched indices gt_label = ops.reshape(x=gt_label, shape=gt_label.shape + (1, )) + gt_label.stop_gradient = True target_label, _ = target_assign( gt_label, matched_indices, mismatch_value=background_label) # 2.2. Compute confidence loss. @@ -482,10 +481,12 @@ def ssd_loss(location, confidence = __reshape_to_2d(confidence) target_label = tensor.cast(x=target_label, dtype='int64') target_label = __reshape_to_2d(target_label) + target_label.stop_gradient = True conf_loss = nn.softmax_with_cross_entropy(confidence, target_label) # 3. Mining hard examples conf_loss = ops.reshape(x=conf_loss, shape=(num, num_prior)) + conf_loss.stop_gradient = True neg_indices = helper.create_tmp_variable(dtype='int32') dtype = matched_indices.dtype updated_matched_indices = helper.create_tmp_variable(dtype=dtype) @@ -695,6 +696,8 @@ def multi_box_head(inputs, outputs={"Boxes": box, "Variances": var}, attrs=attrs, ) + box.stop_gradient = True + var.stop_gradient = True return box, var def _reshape_with_axis_(input, axis=1): From ee7f1ecd7cb79d34a7f14a45d4c34e4e6db9b7af Mon Sep 17 00:00:00 2001 From: Yancey Date: Thu, 22 Mar 2018 19:21:43 +0800 Subject: [PATCH 154/314] Fix dist compile error (#9320) --- .../operators/detail/bytebuffer_stream.h | 5 +++-- paddle/fluid/operators/detail/grpc_server.h | 2 -- paddle/fluid/operators/detail/test_serde.cc | 21 +++++++++---------- .../operators/detail/variable_response.h | 4 ++-- 4 files changed, 15 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h index 0cbe514d04..1791a48aab 100644 --- a/paddle/fluid/operators/detail/bytebuffer_stream.h +++ b/paddle/fluid/operators/detail/bytebuffer_stream.h @@ -146,8 +146,9 @@ class GrpcByteBufferSource class GrpcByteBufferSourceWrapper : public Source { public: - GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) : source_(source) {} - virtual ::google::protobuf::io::ZeroCopyInputStream* contents() override { + explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) + : source_(source) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { return source_; } diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 9c21a07432..5c278f0ed7 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -29,8 +29,6 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/grpc_service.h" -//#include - namespace paddle { namespace operators { namespace detail { diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc index 4be5963794..99c1577223 100644 --- a/paddle/fluid/operators/detail/test_serde.cc +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -81,7 +81,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); framework::Scope scope; scope.Var("myvar"); - operators::detail::TensorResponse resp(&scope, &ctx); + operators::detail::VariableResponse resp(&scope, &ctx); EXPECT_EQ(resp.Parse(msg), 0); framework::Variable* var2 = resp.GetVar(); @@ -166,7 +166,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { // deserialize zero-copy framework::Scope scope; scope.Var("myvar"); - operators::detail::TensorResponse resp(&scope, &ctx); + operators::detail::VariableResponse resp(&scope, &ctx); if (from_type == 0) { EXPECT_EQ(resp.Parse(msg), 0); } else { @@ -194,24 +194,23 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); } -TEST(LodTensor, GPU) { - platform::CUDAPlace place; +TEST(LodTensor, Run) { + platform::CPUPlace place; RunTestLodTensor(place); RunTestLodTensor(place, 1); -} - -TEST(LodTensor, CPU) { - platform::CPUPlace place; +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace place; RunTestLodTensor(place); RunTestLodTensor(place, 1); +#endif } -TEST(SelectedRows, CPU) { +TEST(SelectedRows, Run) { platform::CPUPlace place; RunSerdeTestSelectedRows(place); -} -TEST(SelectedRows, GPU) { +#ifdef PADDLE_WITH_CUDA platform::CUDAPlace place; RunSerdeTestSelectedRows(place); +#endif } diff --git a/paddle/fluid/operators/detail/variable_response.h b/paddle/fluid/operators/detail/variable_response.h index c7bc7a46e7..e121ed7bce 100644 --- a/paddle/fluid/operators/detail/variable_response.h +++ b/paddle/fluid/operators/detail/variable_response.h @@ -36,9 +36,9 @@ class VariableResponse { public: VariableResponse(const framework::Scope* scope, const platform::DeviceContext* dev_ctx) - : scope_(scope), dev_ctx_(dev_ctx){}; + : scope_(scope), dev_ctx_(dev_ctx) {} - virtual ~VariableResponse(){}; + virtual ~VariableResponse() {} // return: // 0:ok. From e33af2414b1ae92de4c1589e3829a6bcc515dd21 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 22 Mar 2018 04:34:16 -0700 Subject: [PATCH 155/314] "fast hack" --- paddle/fluid/operators/dropout_op.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index f6c85a2a53..94382739b5 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -33,6 +33,7 @@ __global__ void RandomGenerator(const size_t n, const int seed, int idx = blockDim.x * blockIdx.x + threadIdx.x; for (; idx < n; idx += blockDim.x * gridDim.x) { + rng.discard(idx); if (dist(rng) < dropout_prob) { mask_data[idx] = static_cast(0); } else { From ba9f4c787393c57e8f29477e01a3c6b3f43e3fa2 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 22 Mar 2018 20:07:26 +0800 Subject: [PATCH 156/314] fix test_recv_op --- python/paddle/fluid/layers/io.py | 17 ++++++++--------- .../fluid/tests/unittests/test_recv_op.py | 17 +++++++++-------- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index bc5e291ad8..bd7e9c30fe 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -113,9 +113,9 @@ class ListenAndServ(object): which can receive variables from clients and run a block. """ - def __init__(self, endpoint, fan_in=1, optimizer_mode=True): + def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True): self.helper = LayerHelper("listen_and_serv") - self.inputs = [] + self.inputs = inputs self.outputs = [] self.endpoint = endpoint self.fan_in = fan_in @@ -160,18 +160,13 @@ class ListenAndServ(object): current_block = main_program.current_block() parent_block = self.parent_block() - params, grads = self.get_params_and_grads() - param_names = [p.name for p in params] - grad_names = [g.name for g in grads] parent_block.append_op( type='listen_and_serv', - inputs={}, + inputs={"X": self.inputs}, outputs={}, attrs={ 'endpoint': self.endpoint, 'Fanin': self.fan_in, - 'ParamList': param_names, - 'GradList': grad_names, 'OptimizeBlock': current_block }) @@ -196,10 +191,14 @@ def Send(endpoints, send_vars, get_vars): endpoints = list(set(epmap)) helper = LayerHelper("Send", **locals()) + rpc_client_var = default_main_program().global_block().create_var( + name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW) + helper.append_op( type="send", inputs={"X": send_vars}, - outputs={"Out": get_vars}, + outputs={"Out": get_vars, + "RPCClient": rpc_client_var}, attrs={"endpoints": endpoints, "epmap": epmap}) diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py index 985d892c56..f8b7724039 100644 --- a/python/paddle/fluid/tests/unittests/test_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_recv_op.py @@ -32,20 +32,21 @@ class TestRecvOp(unittest.TestCase): time.sleep(1) self.init_client(place) # FIXME(typhoonzero): find a way to gracefully shutdown the server. - os.system("kill -9 %d" % p.pid) + # os.system("kill -9 %d" % p.pid) p.join() def init_serv(self, place): main = fluid.Program() with fluid.program_guard(main): - x = layers.data( - shape=[32, 32], - dtype='float32', - name="X", - append_batch_size=False) - fluid.initializer.Constant(value=1.0)(x, main.global_block()) - serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False) + serv = layers.ListenAndServ( + "127.0.0.1:6174", ["X"], optimizer_mode=False) with serv.do(): + x = layers.data( + shape=[32, 32], + dtype='float32', + name="X", + append_batch_size=False) + fluid.initializer.Constant(value=1.0)(x, main.global_block()) o = layers.scale(x=x, scale=10.0) main.global_block().create_var( name=o.name, psersistable=False, dtype=o.dtype, shape=o.shape) From 6cebbd7bcb9d9a88aa482efd38ecfc3a5d4e9fa9 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Thu, 22 Mar 2018 20:16:24 +0800 Subject: [PATCH 157/314] update --- python/paddle/fluid/tests/unittests/test_recv_op.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_recv_op.py b/python/paddle/fluid/tests/unittests/test_recv_op.py index f8b7724039..854238c627 100644 --- a/python/paddle/fluid/tests/unittests/test_recv_op.py +++ b/python/paddle/fluid/tests/unittests/test_recv_op.py @@ -32,7 +32,7 @@ class TestRecvOp(unittest.TestCase): time.sleep(1) self.init_client(place) # FIXME(typhoonzero): find a way to gracefully shutdown the server. - # os.system("kill -9 %d" % p.pid) + os.system("kill -9 %d" % p.pid) p.join() def init_serv(self, place): From 14ba67c0ef3bcff13d95788406518bb132fe4a28 Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Thu, 22 Mar 2018 08:46:20 -0400 Subject: [PATCH 158/314] Function for running MKLDNN primitive added. Unittest added for is_test attribute --- paddle/fluid/operators/lrn_mkldnn_op.cc | 23 +++++++++++-------- paddle/fluid/operators/lrn_op.cc | 2 +- .../fluid/tests/unittests/test_lrn_op.py | 19 +++++++++++++++ 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/lrn_mkldnn_op.cc b/paddle/fluid/operators/lrn_mkldnn_op.cc index 3bead16ce4..0a18882e81 100644 --- a/paddle/fluid/operators/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/lrn_mkldnn_op.cc @@ -36,6 +36,14 @@ std::shared_ptr insert_to_context(const std::string& key, return p; } + +template +void run_primitive(Args&&... args) { + auto forward_op = mkldnn::lrn_forward{args...}; + + std::vector pipeline = {forward_op}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} } // namespace template @@ -87,8 +95,6 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine}, static_cast(output_data)}; - std::unique_ptr forward_op = nullptr; - if (!is_test) { const std::string key = ctx.op().Output("Out"); const std::string key_src_memory = key + "@lrn_src_memory"; @@ -108,9 +114,7 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { key_workspace_memory, dev_ctx, forward_pd->workspace_primitive_desc()); - forward_op.reset(new mkldnn::lrn_forward{*forward_pd, *src_memory, - *workspace_memory, dst_memory}); - + run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory); } else { auto forward_pd = mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine}; @@ -119,12 +123,8 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { auto workspace_memory = mkldnn::memory{forward_pd.workspace_primitive_desc()}; - forward_op.reset(new mkldnn::lrn_forward{forward_pd, src_memory, - workspace_memory, dst_memory}); + run_primitive(forward_pd, src_memory, workspace_memory, dst_memory); } - - std::vector pipeline = {*forward_op}; - mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); } }; @@ -136,6 +136,9 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { "MKLDNN LRN must use float data."); PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "MKLDNN LRN must use CPUPlace."); + PADDLE_ENFORCE( + !ctx.Attr("is_test"), + "is_test attribute should be set to False in training phase."); auto x = ctx.Input("X"); diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index 2b1947a187..b36b5c3a33 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -155,8 +155,8 @@ class LRNOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'rank of LRNOp should be 4."); ctx->SetOutputDim("Out", x_dim); - ctx->SetOutputDim("MidOut", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); + ctx->SetOutputDim("MidOut", x_dim); } framework::OpKernelType GetExpectedKernelType( diff --git a/python/paddle/fluid/tests/unittests/test_lrn_op.py b/python/paddle/fluid/tests/unittests/test_lrn_op.py index 2268eafdbd..8fa480b9bc 100644 --- a/python/paddle/fluid/tests/unittests/test_lrn_op.py +++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py @@ -97,5 +97,24 @@ class TestLRNMKLDNNOp(TestLRNOp): self.check_output(atol=0.002) +class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp): + def get_attrs(self): + attrs = TestLRNMKLDNNOp.get_attrs(self) + attrs['is_test'] = True + return attrs + + def test_check_grad_normal(self): + def check_raise_is_test(): + try: + self.check_grad(['X'], 'Out', max_relative_error=0.01) + except Exception as e: + t = \ + "is_test attribute should be set to False in training phase." + if t in str(e): + raise AttributeError + + self.assertRaises(AttributeError, check_raise_is_test) + + if __name__ == "__main__": unittest.main() From ac94242ea993948e8e6bb54d961d36794c918864 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Thu, 22 Mar 2018 22:55:21 +0800 Subject: [PATCH 159/314] change boost download url to speed up download --- cmake/external/boost.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake index d9cd264b49..10662fc967 100644 --- a/cmake/external/boost.cmake +++ b/cmake/external/boost.cmake @@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost") # So we use 1.41.0 here. set(BOOST_VER "1.41.0") set(BOOST_TAR "boost_1_41_0") -set(BOOST_URL "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz") +set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz") set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost) set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}") set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE) From 76ae540f8ef3dc5463da6127556fc48a343698c9 Mon Sep 17 00:00:00 2001 From: Varun Arora Date: Thu, 22 Mar 2018 10:44:43 -0700 Subject: [PATCH 160/314] Move Select to concurrency.py; incorporate outputs (#9136) * Move Select to concurrency.py; incorporate outputs * CLang formatting for concurrency * Remove extra bracket - formatting fix - 3 * Comment fix --- paddle/fluid/framework/concurrency_test.cc | 10 +- paddle/fluid/operators/select_op.cc | 5 + python/paddle/fluid/concurrency.py | 182 +++++++++++++++++++- python/paddle/fluid/layers/control_flow.py | 183 +-------------------- 4 files changed, 192 insertions(+), 188 deletions(-) diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc index 25152054eb..e98e9d94bf 100644 --- a/paddle/fluid/framework/concurrency_test.cc +++ b/paddle/fluid/framework/concurrency_test.cc @@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program, // Select block AddOp("select", {{"X", {dataChanName, quitChanName}}, {"case_to_execute", {"caseToExecute"}}}, - {}, {{"sub_block", casesBlock}, - {"cases", std::vector{case0Config, case1Config}}}, + {{"Out", {}}}, + {{"sub_block", casesBlock}, + {"cases", std::vector{case0Config, case1Config}}}, whileBlock); scope->Var("stepScopes"); @@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) { executor.Run(program, &scope, 0, true, true); - // After we call executor.run, the Go operator should do a channel_send to set - // the - // "result" variable to 99 + // After we call executor.run, the Go operator should do a channel_send to + // set the "result" variable to 99. auto *finalData = tensor.data(); EXPECT_EQ(finalData[0], 99); } diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc index 8344a239df..c0bf0ff927 100644 --- a/paddle/fluid/operators/select_op.cc +++ b/paddle/fluid/operators/select_op.cc @@ -27,6 +27,7 @@ namespace operators { static constexpr char kX[] = "X"; static constexpr char kCaseToExecute[] = "case_to_execute"; +static constexpr char kOutputs[] = "Out"; static constexpr char kCases[] = "cases"; static constexpr char kCasesBlock[] = "sub_block"; @@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker { "(Int) The variable the sets the index of the case to execute, " "after evaluating the channels being sent to and received from") .AsDuplicable(); + AddOutput(kOutputs, + "A set of variables, which will be assigned with values " + "generated by the operators inside the cases of Select Op.") + .AsDuplicable(); AddAttr>(kCases, "(String vector) Serialized list of" "all cases in the select op. Each" diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index 3e4292d235..d65e1a6858 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from layers.control_flow import BlockGuard, Select +from layers.control_flow import BlockGuard, equal +from .framework import Operator from layer_helper import LayerHelper, unique_name from layers import fill_constant import core @@ -75,6 +76,185 @@ class Go(BlockGuard): attrs={'sub_block': go_block}) +class SelectCase(object): + DEFAULT = 0 + SEND = 1 + RECEIVE = 2 + + def __init__(self, + case_idx, + case_to_execute, + channel_action_fn=None, + channel=None, + value=None): + self.helper = LayerHelper('conditional_block') + self.main_program = self.helper.main_program + self.is_scalar_condition = True + + self.case_to_execute = case_to_execute + self.idx = case_idx + + # Since we aren't going to use the `channel_send` or `channel_recv` + # functions directly, we just need to capture the name. + self.action = (self.SEND + if channel_action_fn.__name__ == ('channel_send') else + self.RECEIVE) if channel_action_fn else self.DEFAULT + self.value = value + self.channel = channel + + def __enter__(self): + self.block = self.main_program.create_block() + + def construct_op(self): + main_program = self.helper.main_program + cases_block = main_program.current_block() + + inner_outputs = set() + input_set = set() + params = set() + + for op in self.block.ops: + # Iterate over all operators, get all the inputs + # and add as input to the SelectCase operator. + for iname in op.input_names: + for in_var_name in op.input(iname): + if in_var_name not in inner_outputs: + input_set.add(in_var_name) + + for oname in op.output_names: + for out_var_name in op.output(oname): + inner_outputs.add(out_var_name) + + param_list = [ + cases_block.var(each_name) for each_name in params + if each_name not in input_set + ] + + # Iterate over all operators, get all the outputs + # add to the output list of SelectCase operator only if + # they exist in the parent block. + out_vars = [] + for inner_out_name in inner_outputs: + if inner_out_name in cases_block.vars: + out_vars.append(cases_block.var(inner_out_name)) + + # First, create an op that will determine whether or not this is the + # conditional variable to execute. + should_execute_block = equal( + fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx), + self.case_to_execute) + + step_scope = cases_block.create_var( + type=core.VarDesc.VarType.STEP_SCOPES) + + cases_block.append_op( + type='conditional_block', + inputs={'X': [should_execute_block], + 'Params': param_list}, + outputs={'Out': out_vars, + 'Scope': [step_scope]}, + attrs={ + 'sub_block': self.block, + 'is_scalar_condition': self.is_scalar_condition + }) + + return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name + if self.channel else '', self.value.name + if self.value else '') + + def __exit__(self, exc_type, exc_val, exc_tb): + self.main_program.rollback() + if exc_type is not None: + return False # re-raise exception + return True + + +class Select(BlockGuard): + def __init__(self, name=None): + self.helper = LayerHelper('select', name=name) + self.cases = [] + + super(Select, self).__init__(self.helper.main_program) + self.case_to_execute = fill_constant( + shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1) + + def __enter__(self): + super(Select, self).__enter__() + return self + + def case(self, channel_action_fn, channel, value): + """Create a new block for this condition. + """ + select_case = SelectCase( + len(self.cases), self.case_to_execute, channel_action_fn, channel, + value) + + self.cases.append(select_case) + + return select_case + + def default(self): + """Create a default case block for this condition. + """ + default_case = SelectCase(len(self.cases), self.case_to_execute) + + self.cases.append(default_case) + + return default_case + + def __exit__(self, exc_type, exc_val, exc_tb): + if exc_type is not None: + return False + + # Create a select op and another block to wrap its + # case blocks. + select_block = self.helper.main_program.current_block() + parent_block = self.helper.main_program.block(select_block.parent_idx) + + # Construct each case op, inside the newly created select block. + serialized_cases = [] + for case in self.cases: + serialized_cases.append(case.construct_op()) + + intermediate = set() + params = set() + + for case_block in select_block.ops: + if case_block.attrs and 'sub_block' in case_block.attrs: + for each_op in case_block.attrs['sub_block'].ops: + assert isinstance(each_op, Operator) + for iname in each_op.input_names: + for in_var_name in each_op.input(iname): + if in_var_name not in intermediate: + params.add(in_var_name) + + for oname in each_op.output_names: + for out_var_name in each_op.output(oname): + intermediate.add(out_var_name) + + out_list = [ + parent_block.var(var_name) for var_name in parent_block.vars + if var_name in intermediate + ] + + X = [select_block.var_recursive(x_name) for x_name in params] + + # Needs to be used by `equal` inside the cases block. + X.append(self.case_to_execute) + + # Construct the select op. + parent_block.append_op( + type='select', + inputs={'X': X, + 'case_to_execute': self.case_to_execute}, + attrs={'sub_block': select_block, + 'cases': serialized_cases}, + outputs={'Out': out_list}) + + return super(Select, self).__exit__(exc_type, exc_val, exc_tb) + + def make_channel(dtype, capacity=0): """ Helps implementation of a concurrent program by creating a "channel" of diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 02cd0a05a1..1bb1aa30ee 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -16,7 +16,7 @@ import contextlib from layer_function_generator import autodoc from tensor import assign, fill_constant from .. import core -from ..framework import Program, Variable, Operator, Block +from ..framework import Program, Variable, Operator from ..layer_helper import LayerHelper, unique_name from ops import logical_and, logical_not, logical_or @@ -29,7 +29,6 @@ __all__ = [ 'WhileGuard', 'While', 'Switch', - 'Select', 'lod_rank_table', 'max_sequence_len', 'topk', @@ -1212,186 +1211,6 @@ class Switch(object): return True -class SelectCase(object): - DEFAULT = 0 - SEND = 1 - RECEIVE = 2 - - def __init__(self, - case_idx, - case_to_execute, - channel_action_fn=None, - channel=None, - value=None): - self.helper = LayerHelper('conditional_block') - self.main_program = self.helper.main_program - self.is_scalar_condition = True - - self.case_to_execute = case_to_execute - self.idx = case_idx - - # Since we aren't going to use the `channel_send` or `channel_recv` - # functions directly, we just need to capture the name. - self.action = (self.SEND - if channel_action_fn.__name__ == ('channel_send') else - self.RECEIVE) if channel_action_fn else (self.DEFAULT) - self.value = value - self.channel = channel - - def __enter__(self): - self.block = self.main_program.create_block() - - def construct_op(self): - main_program = self.helper.main_program - cases_block = main_program.current_block() - - inner_outputs = set() - input_set = set() - params = set() - - for op in self.block.ops: - # Iterate over all operators, get all the inputs - # and add as input to the SelectCase operator. - for iname in op.input_names: - for in_var_name in op.input(iname): - if in_var_name not in inner_outputs: - input_set.add(in_var_name) - - for oname in op.output_names: - for out_var_name in op.output(oname): - inner_outputs.add(out_var_name) - - param_list = [ - cases_block.var(each_name) for each_name in params - if each_name not in input_set - ] - - # Iterate over all operators, get all the outputs - # add to the output list of SelectCase operator only if - # they exist in the parent block. - out_vars = [] - for inner_out_name in inner_outputs: - if inner_out_name in cases_block.vars: - out_vars.append(cases_block.var(inner_out_name)) - - # First, create an op that will determine whether or not this is the - # conditional variable to execute. - should_execute_block = equal( - fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx), - self.case_to_execute) - - step_scope = cases_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) - - cases_block.append_op( - type='conditional_block', - inputs={'X': [should_execute_block], - 'Params': param_list}, - outputs={'Out': out_vars, - 'Scope': [step_scope]}, - attrs={ - 'sub_block': self.block, - 'is_scalar_condition': self.is_scalar_condition - }) - - return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name - if self.channel else '', self.value.name - if self.value else '') - - def __exit__(self, exc_type, exc_val, exc_tb): - self.main_program.rollback() - if exc_type is not None: - return False # re-raise exception - return True - - -class Select(BlockGuard): - def __init__(self, name=None): - self.helper = LayerHelper('select', name=name) - self.cases = [] - - super(Select, self).__init__(self.helper.main_program) - self.case_to_execute = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1) - - def __enter__(self): - super(Select, self).__enter__() - return self - - def case(self, channel_action_fn, channel, value): - """Create a new block for this condition. - """ - select_case = SelectCase( - len(self.cases), self.case_to_execute, channel_action_fn, channel, - value) - - self.cases.append(select_case) - - return select_case - - def default(self): - """Create a default case block for this condition. - """ - default_case = SelectCase(len(self.cases), self.case_to_execute) - - self.cases.append(default_case) - - return default_case - - def __exit__(self, exc_type, exc_val, exc_tb): - if exc_type is not None: - return False - - # Create a select op and another block to wrap its - # case blocks. - select_block = self.helper.main_program.current_block() - parent_block = self.helper.main_program.block(select_block.parent_idx) - - # Construct each case op, inside the newly created select block. - serialized_cases = [] - for case in self.cases: - serialized_cases.append(case.construct_op()) - - intermediate = set() - params = set() - - for case_block in select_block.ops: - if case_block.attrs and 'sub_block' in case_block.attrs: - for each_op in case_block.attrs['sub_block'].ops: - assert isinstance(each_op, Operator) - for iname in each_op.input_names: - for in_var_name in each_op.input(iname): - if in_var_name not in intermediate: - params.add(in_var_name) - - for oname in each_op.output_names: - for out_var_name in each_op.output(oname): - intermediate.add(out_var_name) - - # TODO(varunarora): Figure out if defining output is needed. - out_list = [ - parent_block.var(var_name) for var_name in parent_block.vars - if var_name in intermediate - ] - - X = [select_block.var_recursive(x_name) for x_name in params] - - # Needs to be used by `equal` inside the cases block. - X.append(self.case_to_execute) - - # Construct the select op. - parent_block.append_op( - type='select', - inputs={'X': X, - 'case_to_execute': self.case_to_execute}, - attrs={'sub_block': select_block, - 'cases': serialized_cases}, - outputs={}) - - return super(Select, self).__exit__(exc_type, exc_val, exc_tb) - - class IfElseBlockGuard(object): def __init__(self, is_true, ifelse): if not isinstance(ifelse, IfElse): From a9a228ad8dc30e2341e0e64b6cb053dc116578e6 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Fri, 23 Mar 2018 18:40:22 +0800 Subject: [PATCH 161/314] fix dist compile --- paddle/fluid/operators/detail/grpc_server.h | 2 ++ paddle/fluid/operators/detail/test_serde.cc | 10 ++++----- paddle/fluid/operators/listen_and_serv_op.cc | 22 +++++++------------- 3 files changed, 15 insertions(+), 19 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index f891c75dbc..787e1506e2 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -25,6 +25,8 @@ limitations under the License. */ #include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc index 99c1577223..e646c894d1 100644 --- a/paddle/fluid/operators/detail/test_serde.cc +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -199,9 +199,9 @@ TEST(LodTensor, Run) { RunTestLodTensor(place); RunTestLodTensor(place, 1); #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace place; - RunTestLodTensor(place); - RunTestLodTensor(place, 1); + platform::CUDAPlace gpu(0); + RunTestLodTensor(gpu); + RunTestLodTensor(gpu, 1); #endif } @@ -210,7 +210,7 @@ TEST(SelectedRows, Run) { RunSerdeTestSelectedRows(place); #ifdef PADDLE_WITH_CUDA - platform::CUDAPlace place; - RunSerdeTestSelectedRows(place); + platform::CUDAPlace gpu; + RunSerdeTestSelectedRows(gpu); #endif } diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index d8a3c45ac5..9c788108e2 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -93,12 +93,6 @@ class ListenAndServOp : public framework::OperatorBase { "server program should have at least 2 blocks"); framework::Executor executor(dev_place); - std::vector blk_ctx_list; - blk_ctx_list.push_back(nullptr); // block0 is not used. - for (int blkid = 1; blkid < num_blocks; ++blkid) { - auto *exe_ctx = executor.Prepare(*program, blkid); - blk_ctx_list.push_back(exe_ctx); - } // TODO(typhoonzero): change this to a while_op for every cluster-batch. bool exit_flag = false; @@ -150,11 +144,11 @@ class ListenAndServOp : public framework::OperatorBase { // block0 contains only listen_and_serv op, start run from block1. for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { fs.push_back(framework::Async( - [&executor, &program, &recv_scope, &blk_ctx_list, blkid]() { + [&executor, &program, &recv_scope, blkid]() { int run_block = blkid; // thread local try { - executor.RunPreparedContext(blk_ctx_list[run_block], - &recv_scope, false, false); + executor.Run(*program, &recv_scope, run_block, + false, false); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -164,8 +158,8 @@ class ListenAndServOp : public framework::OperatorBase { // Run global block at final step, or block1 if there are only 2 blocks if (num_blocks >= 2) { try { - executor.RunPreparedContext(blk_ctx_list[num_blocks - 1], &recv_scope, - false, false); + executor.Run(*program, &recv_scope, num_blocks - 1, + false, false); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -185,9 +179,9 @@ class ListenAndServOp : public framework::OperatorBase { sparse_vars.clear(); } // while(true) - for (int i = 0; i < num_blocks; ++i) { - delete blk_ctx_list[i]; - } + // for (int i = 0; i < num_blocks; ++i) { + // delete blk_ctx_list[i]; + // } } protected: From bb815d4364eaaf6c4053fc6c2259ebfa559bca90 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Fri, 23 Mar 2018 19:13:25 +0800 Subject: [PATCH 162/314] update --- .clang_format.hook | 2 +- paddle/fluid/operators/detail/grpc_server.h | 3 +-- paddle/fluid/operators/listen_and_serv_op.cc | 10 ++++------ 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.clang_format.hook b/.clang_format.hook index 1d92821686..edec286b77 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/bin/bash set -e -readonly VERSION="3.8" +readonly VERSION="7.0" version=$(clang-format -version) diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 787e1506e2..10e6dd45a9 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -22,11 +22,10 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/detail/grpc_service.h" -#include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h" -#include "paddle/fluid/operators/detail/simple_block_queue.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/detail/simple_block_queue.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 9c788108e2..08b83375dd 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -143,12 +143,11 @@ class ListenAndServOp : public framework::OperatorBase { std::vector> fs; // block0 contains only listen_and_serv op, start run from block1. for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { - fs.push_back(framework::Async( - [&executor, &program, &recv_scope, blkid]() { + fs.push_back( + framework::Async([&executor, &program, &recv_scope, blkid]() { int run_block = blkid; // thread local try { - executor.Run(*program, &recv_scope, run_block, - false, false); + executor.Run(*program, &recv_scope, run_block, false, false); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } @@ -158,8 +157,7 @@ class ListenAndServOp : public framework::OperatorBase { // Run global block at final step, or block1 if there are only 2 blocks if (num_blocks >= 2) { try { - executor.Run(*program, &recv_scope, num_blocks - 1, - false, false); + executor.Run(*program, &recv_scope, num_blocks - 1, false, false); } catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what(); } From bf66ce04940477375d8d605dcd8ece45ae2a4b61 Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Fri, 23 Mar 2018 19:15:05 +0800 Subject: [PATCH 163/314] update --- .clang_format.hook | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.clang_format.hook b/.clang_format.hook index edec286b77..1d92821686 100755 --- a/.clang_format.hook +++ b/.clang_format.hook @@ -1,7 +1,7 @@ #!/bin/bash set -e -readonly VERSION="7.0" +readonly VERSION="3.8" version=$(clang-format -version) From 043f47b27fa827cd87df93027124dce6d1d22d7e Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 23 Mar 2018 18:29:15 +0800 Subject: [PATCH 164/314] fix concat op --- paddle/fluid/operators/math/concat.cu | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index 60b266f08f..aede380006 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -70,9 +70,8 @@ __global__ void KernelConcat(T** inputs, const int input_col, const int output_rows, const int output_cols, T* output) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - double inv_input_col = 1.0 / input_col; for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * inv_input_col; + int split = tid_x * 1.0 / input_col; int in_offset = tid_x - split * input_col; T* input_ptr = inputs[split]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; @@ -110,17 +109,16 @@ __global__ void KernelConcatGrad(const T* input, const int input_row, template __global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int output_cols, + const int input_col, const int output_col, T** outputs) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - double inv_input_col = 1.0 / input_col; for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * inv_input_col; - int in_offset = tid_x - split * input_col; + int split = tid_x / output_col; + int in_offset = tid_x - split * output_col; T* output_ptr = outputs[split]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * output_cols + in_offset] = + output_ptr[tid_y * output_col + in_offset] = input[tid_y * input_col + tid_x]; } } From 9075049a2921051f1ae3d685adcd562c76f4f247 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 23 Mar 2018 20:32:48 +0800 Subject: [PATCH 165/314] add unit test --- .../fluid/tests/unittests/test_concat_op.py | 30 ++++++++++++++----- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 558f3a4dcb..1e00d67d54 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -20,19 +20,35 @@ from op_test import OpTest class TestConcatOp(OpTest): def setUp(self): self.op_type = "concat" - x0 = np.random.random((2, 1, 4, 5)).astype('float32') - x1 = np.random.random((2, 2, 4, 5)).astype('float32') - x2 = np.random.random((2, 3, 4, 5)).astype('float32') - axis = 1 - self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]} - self.attrs = {'axis': axis} - self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)} + self.init_test_data() + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = {'axis': self.axis} + self.outputs = { + 'Out': np.concatenate( + (self.x0, self.x1, self.x2), axis=self.axis) + } def test_check_output(self): self.check_output() def test_check_grad(self): self.check_grad(['x0'], 'Out') + self.check_grad(['x1'], 'Out') + self.check_grad(['x2'], 'Out') + + def init_test_data(self): + self.x0 = np.random.random((2, 1, 4, 5)).astype('float32') + self.x1 = np.random.random((2, 2, 4, 5)).astype('float32') + self.x2 = np.random.random((2, 3, 4, 5)).astype('float32') + self.axis = 1 + + +class TestConcatOp2(OpTest): + def init_test_data(self): + self.x0 = np.random.random((2, 3, 4, 5)).astype('float32') + self.x1 = np.random.random((2, 3, 4, 5)).astype('float32') + self.x2 = np.random.random((2, 3, 4, 5)).astype('float32') + self.axis = 1 if __name__ == '__main__': From 750aff10cebd03c3a52bec28508cc5a6195ef937 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Fri, 23 Mar 2018 21:00:24 +0800 Subject: [PATCH 166/314] code refine --- paddle/fluid/operators/math/concat.cu | 148 +++++++++++++------------- 1 file changed, 74 insertions(+), 74 deletions(-) diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu index aede380006..1b637446a0 100644 --- a/paddle/fluid/operators/math/concat.cu +++ b/paddle/fluid/operators/math/concat.cu @@ -66,60 +66,60 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, } template -__global__ void KernelConcat(T** inputs, const int input_col, - const int output_rows, const int output_cols, - T* output) { +__global__ void KernelConcat(T** inputs_data, const int fixed_in_col, + const int out_rows, const int out_cols, + T* output_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { - int split = tid_x * 1.0 / input_col; - int in_offset = tid_x - split * input_col; - T* input_ptr = inputs[split]; + for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * 1.0 / fixed_in_col; + int in_offset = tid_x - split * fixed_in_col; + T* input_ptr = inputs_data[split]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { - output[tid_y * output_cols + tid_x] = - input_ptr[tid_y * input_col + in_offset]; + for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) { + output_data[tid_y * out_cols + tid_x] = + input_ptr[tid_y * fixed_in_col + in_offset]; } } } template -__global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int* output_cols, - int col_size, T** outputs) { +__global__ void KernelConcatGrad(const T* input_data, const int in_row, + const int in_col, const int* out_cols, + int out_cols_size, T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - int segment = upper_bound(output_cols, col_size, tid_x) - 1; - int curr_offset = output_cols[segment]; + int segment = upper_bound(out_cols, out_cols_size, tid_x) - 1; + int curr_offset = out_cols[segment]; int curr_segment = segment; - for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { T curr_col_offset; - while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { + while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) { curr_offset = curr_col_offset; ++curr_segment; } int local_col = tid_x - curr_offset; int segment_width = curr_col_offset - curr_offset; - T* output_ptr = outputs[curr_segment]; + T* output_ptr = outputs_data[curr_segment]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) output_ptr[tid_y * segment_width + local_col] = - input[tid_y * input_col + tid_x]; + input_data[tid_y * in_col + tid_x]; } } template -__global__ void KernelConcatGrad(const T* input, const int input_row, - const int input_col, const int output_col, - T** outputs) { +__global__ void KernelConcatGrad(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { int tid_x = blockIdx.x * blockDim.x + threadIdx.x; - for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { - int split = tid_x / output_col; - int in_offset = tid_x - split * output_col; - T* output_ptr = outputs[split]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; int tid_y = blockIdx.y * blockDim.y + threadIdx.y; - for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) - output_ptr[tid_y * output_col + in_offset] = - input[tid_y * input_col + tid_x]; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; } } @@ -134,41 +134,40 @@ class ConcatFunctor { const std::vector& input, const int axis, framework::Tensor* output) { // TODO(zcd): Add input data validity checking - int num = input.size(); - int rows = 1; + int in_num = input.size(); + int in_row = 1; auto dim_0 = input[0].dims(); for (int i = 0; i < axis; ++i) { - rows *= dim_0[i]; + in_row *= dim_0[i]; } - int cols = input[0].numel() / rows; - int out_rows = rows, out_cols = 0; + int in_col = input[0].numel() / in_row; + int out_row = in_row, out_col = 0; - framework::Vector inputs_data(num * sizeof(T*) / 2); - framework::Vector inputs_cols(num + 1); - inputs_cols[0] = 0; + framework::Vector inputs_data(in_num * sizeof(T*) / 2); + framework::Vector inputs_col(in_num + 1); T** inputs_ptr = reinterpret_cast(inputs_data.data()); + inputs_col[0] = 0; bool sameShape = true; - for (int i = 0; i < num; ++i) { - int t_cols = input[i].numel() / rows; + for (int i = 0; i < in_num; ++i) { + int t_cols = input[i].numel() / in_row; if (sameShape) { - if (t_cols != cols) sameShape = false; + if (t_cols != in_col) sameShape = false; } - out_cols += t_cols; - inputs_cols[i + 1] = out_cols; + out_col += t_cols; + inputs_col[i + 1] = out_col; inputs_ptr[i] = const_cast(input[i].data()); } - T** ins_gpu = + T** dev_ins_data = reinterpret_cast(inputs_data.CUDAMutableData(context.GetPlace())); - const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace()); // computation // set the thread block and grid according to CurrentDeviceId const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; - if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. - block_cols = ((out_cols + 31) >> 5) << 5; + if (out_col < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_col + 31) >> 5) << 5; } int block_rows = kThreadsPerBlock / block_cols; dim3 block_size = dim3(block_cols, block_rows, 1); @@ -177,18 +176,19 @@ class ConcatFunctor { int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int grid_cols = - std::min((out_cols + block_cols - 1) / block_cols, max_blocks); + std::min((out_col + block_cols - 1) / block_cols, max_blocks); int grid_rows = - std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); + std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); dim3 grid_size = dim3(grid_cols, grid_rows, 1); if (sameShape) { KernelConcat<<>>( - ins_gpu, cols, out_rows, out_cols, output->data()); + dev_ins_data, in_col, out_row, out_col, output->data()); } else { + const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace()); KernelConcat<<>>( - ins_gpu, ins_col_gpu, static_cast(inputs_cols.size()), out_rows, - out_cols, output->data()); + dev_ins_data, dev_ins_col_data, static_cast(inputs_col.size()), + out_row, out_col, output->data()); } } }; @@ -204,41 +204,40 @@ class ConcatGradFunctor { const framework::Tensor& input, const int axis, std::vector& outputs) { // TODO(zcd): Add input data validity checking - int num = outputs.size(); - int input_row = 1; + int o_num = outputs.size(); + int out_row = 1; auto dim_0 = outputs[0].dims(); for (int i = 0; i < axis; ++i) { - input_row *= dim_0[i]; + out_row *= dim_0[i]; } - int output_col_0 = outputs[0].numel() / input_row; - int input_col = 0; + int out_col = outputs[0].numel() / out_row; + int in_col = 0, in_row = out_row; bool sameShape = true; - framework::Vector outputs_data(num * sizeof(T*) / 2); - framework::Vector outputs_cols(num + 1); - outputs_cols[0] = 0; + framework::Vector outputs_data(o_num * sizeof(T*) / 2); + framework::Vector outputs_cols(o_num + 1); T** outputs_ptr = reinterpret_cast(outputs_data.data()); - for (int i = 0; i < num; ++i) { - int t_col = outputs[i].numel() / input_row; + outputs_cols[0] = 0; + for (int i = 0; i < o_num; ++i) { + int t_col = outputs[i].numel() / out_row; if (sameShape) { - if (t_col != output_col_0) sameShape = false; + if (t_col != out_col) sameShape = false; } - input_col += t_col; - outputs_cols[i + 1] = input_col; + in_col += t_col; + outputs_cols[i + 1] = in_col; outputs_ptr[i] = outputs[i].data(); } - T** outs_gpu = + T** dev_out_gpu_data = reinterpret_cast(outputs_data.CUDAMutableData(context.GetPlace())); - const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace()); // computation const int kThreadsPerBlock = 1024; int block_cols = kThreadsPerBlock; - if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. - block_cols = ((input_col + 31) >> 5) << 5; + if (in_col < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((in_col + 31) >> 5) << 5; } int block_rows = kThreadsPerBlock / block_cols; dim3 block_size = dim3(block_cols, block_rows, 1); @@ -247,18 +246,19 @@ class ConcatGradFunctor { int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int grid_cols = - std::min((input_col + block_cols - 1) / block_cols, max_blocks); + std::min((in_col + block_cols - 1) / block_cols, max_blocks); int grid_rows = - std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); + std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1)); dim3 grid_size = dim3(grid_cols, grid_rows, 1); if (sameShape) { KernelConcatGrad<<>>( - input.data(), input_row, input_col, output_col_0, outs_gpu); + input.data(), in_row, in_col, out_col, dev_out_gpu_data); } else { + const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); KernelConcatGrad<<>>( - input.data(), input_row, input_col, outs_col_gpu, - static_cast(outputs_cols.size()), outs_gpu); + input.data(), in_row, in_col, dev_outs_col_data, + static_cast(outputs_cols.size()), dev_out_gpu_data); } } }; From 4466f0bec8c23558536959d06b45a1b4c2daab70 Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Wed, 14 Mar 2018 16:10:54 +0100 Subject: [PATCH 167/314] MKLDNN Relu Tanh Sqrt Abs activations added --- paddle/fluid/framework/operator.h | 8 + paddle/fluid/operators/CMakeLists.txt | 5 + .../fluid/operators/activation_mkldnn_op.cc | 192 ++++++++++++++++++ paddle/fluid/operators/activation_op.cc | 52 ++++- paddle/fluid/operators/activation_op.h | 65 +++++- paddle/fluid/platform/mkldnn_helper.h | 1 + python/paddle/fluid/layer_helper.py | 2 + .../paddle/fluid/tests/unittests/op_test.py | 12 +- .../tests/unittests/test_activation_op.py | 67 ++++++ 9 files changed, 401 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/activation_mkldnn_op.cc diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 41214b41cb..d354714d0e 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -84,6 +84,10 @@ class OperatorBase { return boost::get(attrs_.at(name)); } + inline bool HasAttr(const std::string& name) const { + return attrs_.count(name) != 0; + } + /// if scope is not null, also show dimensions of arguments virtual std::string DebugStringEx(const Scope* scope) const; @@ -195,6 +199,10 @@ class ExecutionContext { return op_.Attr(name); } + inline bool HasAttr(const std::string& name) const { + return op_.HasAttr(name); + } + size_t InputSize(const std::string& name) const { return op_.Inputs(name).size(); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index c0245379ac..9c367dd145 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -153,7 +153,12 @@ function(op_library TARGET) # pybind USE_OP_DEVICE_KERNEL for MKLDNN if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) + # Append first implemented MKLDNN activation operator + if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") + else() file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") + endif() endif() # pybind USE_OP diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc new file mode 100644 index 0000000000..65cf2fceb7 --- /dev/null +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "mkldnn.hpp" +#include "paddle/fluid/operators/activation_op.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; + +namespace { +template +void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm, + const T alpha = 0, const T beta = 0) { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); + + // get buffers + const auto *src = ctx.template Input("X"); + const auto *src_data = src->template data(); + + auto *dst = ctx.template Output("Out"); + const T *dst_data = dst->template mutable_data(ctx.GetPlace()); + + // get memory dim + PADDLE_ENFORCE(src->dims().size() == 4, + "Input dim must be with 4, i.e. NCHW"); + std::vector src_tz = framework::vectorize2int(src->dims()); + + // create memory description + // TODO(kbinias-intel): support more formats + auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + // create memory primitives + auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data); + auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data); + + auto forward_desc = mkldnn::eltwise_forward::desc( + mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta); + + // save prim desc into global device context to be referred in backward path + const std::string key = ctx.op().Output("Out"); + const std::string key_eltwise_pd = key + "@eltwise_pd"; + auto forward_pd = std::make_shared( + forward_desc, mkldnn_engine); + dev_ctx.SetBlob(key_eltwise_pd, forward_pd); + + auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline = {eltwise}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} + +template +void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm, + const T alpha = 0, const T beta = 0) { + auto &dev_ctx = ctx.template device_context(); + const auto &mkldnn_engine = dev_ctx.GetEngine(); + + // get buffers + const auto *x = ctx.template Input("X"); + const auto *src = x->template data(); + + auto *dout = ctx.template Input(framework::GradVarName("Out")); + const auto *diff_dst = dout->template data(); + + auto *dx = + ctx.template Output(framework::GradVarName("X")); + const T *diff_src = dx->template mutable_data(ctx.GetPlace()); + + // get memory dim + std::vector src_tz = framework::vectorize2int(x->dims()); + + // create memory description + auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + // create memory primitives + auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src); + auto diff_src_memory = + mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src); + auto diff_dst_memory = + mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst); + + auto backward_desc = + mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta); + + // retrieve eltwise primitive desc from device context + const std::string key = ctx.op().Input("Out"); + const std::string key_eltwise_pd = key + "@eltwise_pd"; + const std::shared_ptr forward_pd = dev_ctx.GetBlob(key_eltwise_pd); + PADDLE_ENFORCE(forward_pd != nullptr, + "Fail to find eltwise_pd in device context"); + auto *p_forward_pd = + static_cast(forward_pd.get()); + + auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc( + backward_desc, mkldnn_engine, *p_forward_pd); + + auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory, + diff_dst_memory, diff_src_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline = {eltwise_bwd}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); +} +} // anonymous namespace + +template +struct MKLDNNActivationFunc : public BaseActivationFunctor { + template + void operator()(const ExecContext &ctx) const { + eltwise_forward(ctx, algorithm); + } +}; + +template +struct MKLDNNActivationGradFunc : public BaseActivationFunctor { + template + void operator()(const ExecContext &ctx) const { + eltwise_grad(ctx, algorithm); + } +}; + +template +using ReluMkldnnFunctor = + MKLDNNActivationFunc; + +template +using TanhMkldnnFunctor = + MKLDNNActivationFunc; + +template +using SqrtMkldnnFunctor = + MKLDNNActivationFunc; + +template +using AbsMkldnnFunctor = + MKLDNNActivationFunc; + +template +using ReluMkldnnGradFunctor = + MKLDNNActivationGradFunc; + +template +using TanhMkldnnGradFunctor = + MKLDNNActivationGradFunc; + +template +using SqrtMkldnnGradFunctor = + MKLDNNActivationGradFunc; + +template +using AbsMkldnnGradFunctor = + MKLDNNActivationGradFunc; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \ + REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \ + ops::MKLDNNActivationKernel>); \ + REGISTER_OP_KERNEL( \ + act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \ + ops::MKLDNNActivationGradKernel>); + +#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ + __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor) \ + __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor) \ + __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor) \ + __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor); + +FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index ec637658c0..ae9ca9d4ff 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,6 +25,11 @@ class ActivationOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return ActivationHelper().GetKernelType(ctx, *this); + } }; class ActivationOpGrad : public framework::OperatorWithKernel { @@ -34,6 +39,11 @@ class ActivationOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return ActivationHelper().GetKernelType(ctx, *this); + } }; class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { @@ -87,6 +97,16 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Relu operator"); AddOutput("Out", "Output of Relu operator"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Relu Activation Operator. @@ -140,6 +160,16 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Tanh operator"); AddOutput("Out", "Output of Tanh operator"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Tanh Activation Operator. @@ -193,6 +223,16 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Sqrt operator"); AddOutput("Out", "Output of Sqrt operator"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Sqrt Activation Operator. @@ -208,6 +248,16 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { : framework::OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "Input of Abs operator"); AddOutput("Out", "Output of Abs operator"); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); AddComment(R"DOC( Abs Activation Operator. diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index b95e793586..084b6bace7 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -17,9 +17,36 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detail/safe_ref.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + namespace paddle { namespace operators { +class ActivationHelper { + public: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx, + const framework::OperatorWithKernel& oper) const { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + } +#endif + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + if (ctx.HasAttr("data_format")) { + std::string data_format = ctx.Attr("data_format"); + layout = framework::StringToDataLayout(data_format); + } + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace(), layout, library); + } +}; + template class ActivationKernel : public framework::OpKernel { @@ -49,6 +76,27 @@ class ActivationKernel } }; +template +class MKLDNNActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(!context.HasAttr("X"), + "Cannot find input tensor X, variable name = %s", + context.op().Input("X")); + PADDLE_ENFORCE(!context.HasAttr("Out"), + "Cannot find output tensor Out, variable name = %s", + context.op().Output("Out")); + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + template class ActivationGradKernel : public framework::OpKernel { @@ -77,6 +125,21 @@ class ActivationGradKernel } }; +template +class MKLDNNActivationGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + template struct BaseActivationFunctor { using ELEMENT_TYPE = T; diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 90b78142b8..281d38cb8a 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -42,6 +42,7 @@ inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, } inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) { + if (!ctx.HasAttr("use_mkldnn")) return false; bool use_mkldnn = ctx.Attr("use_mkldnn"); return use_mkldnn && platform::is_cpu_place(ctx.GetPlace()); } diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 58b6682271..d771837fc5 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -403,6 +403,8 @@ class LayerHelper(object): if 'use_mkldnn' in self.kwargs: act['use_mkldnn'] = self.kwargs.get('use_mkldnn') act_type = act.pop('type') + if 'use_mkldnn' in self.kwargs: + act['use_mkldnn'] = self.kwargs.get('use_mkldnn') self.append_op( type=act_type, inputs={"X": [input_var]}, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 8393f7827b..2b10f16688 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -215,7 +215,8 @@ class OpTest(unittest.TestCase): '''Fix random seeds to remove randomness from tests''' cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() - + cls.use_mkldnn = False + cls.data_format = 'AnyLayout' np.random.seed(123) random.seed(124) @@ -340,7 +341,14 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has different lod at " + str(place)) + def fill_attrs(self): + attrs = self.attrs if hasattr(self, "attrs") else dict() + attrs["use_mkldnn"] = self.use_mkldnn + attrs["data_format"] = self.data_format + return attrs + def check_output(self, atol=1e-5): + self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -348,6 +356,7 @@ class OpTest(unittest.TestCase): self.check_output_with_place(place, atol) def check_output_customized(self, checker): + self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -383,6 +392,7 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): + self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 1e3decfbaf..c6c86a5969 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -506,5 +506,72 @@ class TestSwish(OpTest): self.check_grad(['X'], 'Out', max_relative_error=0.008) +#--------------------test MKLDNN-------------------- +class TestMKLDNNRelu(OpTest): + def setUp(self): + self.op_type = "relu" + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + self.inputs = {'X': x} + self.outputs = {'Out': np.maximum(self.inputs['X'], 0)} + self.use_mkldnn = True + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestMKLDNNTanh(OpTest): + def setUp(self): + self.op_type = "tanh" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") + } + self.outputs = {'Out': np.tanh(self.inputs['X'])} + self.use_mkldnn = True + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestMKLDNNSqrt(OpTest): + def setUp(self): + self.op_type = "sqrt" + self.inputs = { + 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") + } + self.outputs = {'Out': np.sqrt(self.inputs['X'])} + self.use_mkldnn = True + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestMKLDNNAbs(OpTest): + def setUp(self): + self.op_type = "abs" + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") + # The same reason with TestAbs + x[np.abs(x) < 0.005] = 0.02 + self.inputs = {'X': x} + self.outputs = {'Out': np.abs(self.inputs['X'])} + self.use_mkldnn = True + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + if __name__ == "__main__": unittest.main() From a64b312e3a922ea1e0520d59950e81189748c7f4 Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Tue, 20 Mar 2018 11:22:12 +0100 Subject: [PATCH 168/314] Correcting for PR comments --- paddle/fluid/framework/operator.h | 8 --- .../fluid/operators/activation_mkldnn_op.cc | 11 ++-- paddle/fluid/operators/activation_op.cc | 28 -------- paddle/fluid/operators/activation_op.h | 40 ------------ paddle/fluid/operators/mkldnn_activation_op.h | 64 +++++++++++++++++++ paddle/fluid/platform/mkldnn_helper.h | 1 - .../paddle/fluid/tests/unittests/op_test.py | 12 +--- .../tests/unittests/test_activation_op.py | 8 +-- 8 files changed, 75 insertions(+), 97 deletions(-) create mode 100644 paddle/fluid/operators/mkldnn_activation_op.h diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index d354714d0e..41214b41cb 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -84,10 +84,6 @@ class OperatorBase { return boost::get(attrs_.at(name)); } - inline bool HasAttr(const std::string& name) const { - return attrs_.count(name) != 0; - } - /// if scope is not null, also show dimensions of arguments virtual std::string DebugStringEx(const Scope* scope) const; @@ -199,10 +195,6 @@ class ExecutionContext { return op_.Attr(name); } - inline bool HasAttr(const std::string& name) const { - return op_.HasAttr(name); - } - size_t InputSize(const std::string& name) const { return op_.Inputs(name).size(); } diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index 65cf2fceb7..6ff363d766 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -13,6 +13,7 @@ limitations under the License. */ #include "mkldnn.hpp" +#include "mkldnn_activation_op.h" #include "paddle/fluid/operators/activation_op.h" namespace paddle { @@ -183,10 +184,10 @@ namespace ops = paddle::operators; act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \ ops::MKLDNNActivationGradKernel>); -#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor) \ - __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor) \ - __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor) \ - __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor); +#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ + __macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \ + __macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \ + __macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \ + __macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index ae9ca9d4ff..043ffb01fc 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -100,13 +100,6 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); AddComment(R"DOC( Relu Activation Operator. @@ -163,13 +156,6 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); AddComment(R"DOC( Tanh Activation Operator. @@ -226,13 +212,6 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); AddComment(R"DOC( Sqrt Activation Operator. @@ -251,13 +230,6 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); AddComment(R"DOC( Abs Activation Operator. diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 084b6bace7..e607a5554f 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -37,10 +37,6 @@ class ActivationHelper { } #endif framework::DataLayout layout = framework::DataLayout::kAnyLayout; - if (ctx.HasAttr("data_format")) { - std::string data_format = ctx.Attr("data_format"); - layout = framework::StringToDataLayout(data_format); - } return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), layout, library); @@ -76,27 +72,6 @@ class ActivationKernel } }; -template -class MKLDNNActivationKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(!context.HasAttr("X"), - "Cannot find input tensor X, variable name = %s", - context.op().Input("X")); - PADDLE_ENFORCE(!context.HasAttr("Out"), - "Cannot find output tensor Out, variable name = %s", - context.op().Output("Out")); - Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); - } - functor(context); - } -}; - template class ActivationGradKernel : public framework::OpKernel { @@ -125,21 +100,6 @@ class ActivationGradKernel } }; -template -class MKLDNNActivationGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - Functor functor; - - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); - } - functor(context); - } -}; - template struct BaseActivationFunctor { using ELEMENT_TYPE = T; diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h new file mode 100644 index 0000000000..976e362911 --- /dev/null +++ b/paddle/fluid/operators/mkldnn_activation_op.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/safe_ref.h" + +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class MKLDNNActivationKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE(context.Input("X") != nullptr, + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + PADDLE_ENFORCE(context.Output("Out") != nullptr, + "Cannot find output tensor Out, variable name = %s", + context.op().Output("Out")); + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + +template +class MKLDNNActivationGradKernel + : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + Functor functor; + + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = context.Attr(attr.first); + } + functor(context); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 281d38cb8a..90b78142b8 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -42,7 +42,6 @@ inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, } inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) { - if (!ctx.HasAttr("use_mkldnn")) return false; bool use_mkldnn = ctx.Attr("use_mkldnn"); return use_mkldnn && platform::is_cpu_place(ctx.GetPlace()); } diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 2b10f16688..8393f7827b 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -215,8 +215,7 @@ class OpTest(unittest.TestCase): '''Fix random seeds to remove randomness from tests''' cls._np_rand_state = np.random.get_state() cls._py_rand_state = random.getstate() - cls.use_mkldnn = False - cls.data_format = 'AnyLayout' + np.random.seed(123) random.seed(124) @@ -341,14 +340,7 @@ class OpTest(unittest.TestCase): "Output (" + out_name + ") has different lod at " + str(place)) - def fill_attrs(self): - attrs = self.attrs if hasattr(self, "attrs") else dict() - attrs["use_mkldnn"] = self.use_mkldnn - attrs["data_format"] = self.data_format - return attrs - def check_output(self, atol=1e-5): - self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -356,7 +348,6 @@ class OpTest(unittest.TestCase): self.check_output_with_place(place, atol) def check_output_customized(self, checker): - self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) @@ -392,7 +383,6 @@ class OpTest(unittest.TestCase): in_place=False, max_relative_error=0.005, user_defined_grads=None): - self.attrs = self.fill_attrs() places = [core.CPUPlace()] if core.is_compiled_with_cuda() and core.op_support_gpu(self.op_type): places.append(core.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index c6c86a5969..1d53737ac1 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -515,7 +515,7 @@ class TestMKLDNNRelu(OpTest): x[np.abs(x) < 0.005] = 0.02 self.inputs = {'X': x} self.outputs = {'Out': np.maximum(self.inputs['X'], 0)} - self.use_mkldnn = True + self.attrs = {"use_mkldnn": True} def test_check_output(self): self.check_output() @@ -531,7 +531,7 @@ class TestMKLDNNTanh(OpTest): 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.tanh(self.inputs['X'])} - self.use_mkldnn = True + self.attrs = {"use_mkldnn": True} def test_check_output(self): self.check_output() @@ -547,7 +547,7 @@ class TestMKLDNNSqrt(OpTest): 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.sqrt(self.inputs['X'])} - self.use_mkldnn = True + self.attrs = {"use_mkldnn": True} def test_check_output(self): self.check_output() @@ -564,7 +564,7 @@ class TestMKLDNNAbs(OpTest): x[np.abs(x) < 0.005] = 0.02 self.inputs = {'X': x} self.outputs = {'Out': np.abs(self.inputs['X'])} - self.use_mkldnn = True + self.attrs = {"use_mkldnn": True} def test_check_output(self): self.check_output() From d8bd436fc16497e1f29de2b1f4c2d6f59abb80de Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Wed, 21 Mar 2018 15:48:26 +0100 Subject: [PATCH 169/314] Fixed tests --- paddle/fluid/operators/activation_op.cc | 27 ++++------- paddle/fluid/operators/activation_op.h | 19 -------- paddle/fluid/operators/mkldnn_activation_op.h | 47 +++++++++++++++++++ 3 files changed, 56 insertions(+), 37 deletions(-) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 043ffb01fc..979115eee0 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/mkldnn_activation_op.h" namespace paddle { namespace operators { @@ -25,11 +26,6 @@ class ActivationOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", /*->*/ "Out"); } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return ActivationHelper().GetKernelType(ctx, *this); - } }; class ActivationOpGrad : public framework::OperatorWithKernel { @@ -39,11 +35,6 @@ class ActivationOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext *ctx) const override { ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); } - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext &ctx) const override { - return ActivationHelper().GetKernelType(ctx, *this); - } }; class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { @@ -546,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, ops::ActivationOpGrad); -REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, - ops::ActivationOpGrad); +REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad, + ops::ActivationWithMKLDNNOpGrad); -REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, - ops::ActivationOpGrad); +REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad, + ops::ActivationWithMKLDNNOpGrad); REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, tanh_shrink_grad, ops::ActivationOpGrad); @@ -558,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, softshrink_grad, ops::ActivationOpGrad); -REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, - ops::ActivationOpGrad); +REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad, + ops::ActivationWithMKLDNNOpGrad); -REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, - ops::ActivationOpGrad); +REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad, + ops::ActivationWithMKLDNNOpGrad); REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, ops::ActivationOpGrad); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index e607a5554f..4c575b4a7b 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -24,25 +24,6 @@ limitations under the License. */ namespace paddle { namespace operators { -class ActivationHelper { - public: - framework::OpKernelType GetKernelType( - const framework::ExecutionContext& ctx, - const framework::OperatorWithKernel& oper) const { - framework::LibraryType library{framework::LibraryType::kPlain}; -#ifdef PADDLE_WITH_MKLDNN - if (library == framework::LibraryType::kPlain && - platform::CanMKLDNNBeUsed(ctx)) { - library = framework::LibraryType::kMKLDNN; - } -#endif - framework::DataLayout layout = framework::DataLayout::kAnyLayout; - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.GetPlace(), layout, library); - } -}; - template class ActivationKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn_activation_op.h index 976e362911..083d03ebe6 100644 --- a/paddle/fluid/operators/mkldnn_activation_op.h +++ b/paddle/fluid/operators/mkldnn_activation_op.h @@ -60,5 +60,52 @@ class MKLDNNActivationGradKernel } }; +namespace { +framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx, + const framework::OperatorWithKernel& oper) { + framework::LibraryType library{framework::LibraryType::kPlain}; +#ifdef PADDLE_WITH_MKLDNN + if (library == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library = framework::LibraryType::kMKLDNN; + } +#endif + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.GetPlace(), layout, library); +} +} // anonymous namespace + +class ActivationWithMKLDNNOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this); + } +}; + +class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this); + } +}; + } // namespace operators } // namespace paddle From 6461e800a5404762e6105a4080625bee64b1c2b0 Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Thu, 22 Mar 2018 15:47:02 +0100 Subject: [PATCH 170/314] Inheritance added for MKLDNN tests --- .../tests/unittests/test_activation_op.py | 50 ++++++------------- 1 file changed, 16 insertions(+), 34 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 1d53737ac1..4a2b35322d 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -507,58 +507,46 @@ class TestSwish(OpTest): #--------------------test MKLDNN-------------------- -class TestMKLDNNRelu(OpTest): +class TestMKLDNNRelu(TestRelu): def setUp(self): - self.op_type = "relu" + super(TestMKLDNNRelu, self).setUp() + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") # The same reason with TestAbs x[np.abs(x) < 0.005] = 0.02 - self.inputs = {'X': x} - self.outputs = {'Out': np.maximum(self.inputs['X'], 0)} - self.attrs = {"use_mkldnn": True} - - def test_check_output(self): - self.check_output() + out = np.maximum(x, 0) - def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.007) + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + self.attrs = {"use_mkldnn": True} -class TestMKLDNNTanh(OpTest): +class TestMKLDNNTanh(TestTanh): def setUp(self): - self.op_type = "tanh" + super(TestMKLDNNTanh, self).setUp() + self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.tanh(self.inputs['X'])} self.attrs = {"use_mkldnn": True} - def test_check_output(self): - self.check_output() - def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.007) - - -class TestMKLDNNSqrt(OpTest): +class TestMKLDNNSqrt(TestSqrt): def setUp(self): - self.op_type = "sqrt" + super(TestMKLDNNSqrt, self).setUp() + self.inputs = { 'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32") } self.outputs = {'Out': np.sqrt(self.inputs['X'])} self.attrs = {"use_mkldnn": True} - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.007) - -class TestMKLDNNAbs(OpTest): +class TestMKLDNNAbs(TestAbs): def setUp(self): - self.op_type = "abs" + super(TestMKLDNNAbs, self).setUp() + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32") # The same reason with TestAbs x[np.abs(x) < 0.005] = 0.02 @@ -566,12 +554,6 @@ class TestMKLDNNAbs(OpTest): self.outputs = {'Out': np.abs(self.inputs['X'])} self.attrs = {"use_mkldnn": True} - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.007) - if __name__ == "__main__": unittest.main() From 30c750ebb99cd5fda477457679f3b3b39fd04f84 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Fri, 23 Mar 2018 10:27:36 -0700 Subject: [PATCH 171/314] Fix links to english docs --- doc/v2/howto/index_en.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/v2/howto/index_en.rst b/doc/v2/howto/index_en.rst index bf2320a169..35ef197f58 100644 --- a/doc/v2/howto/index_en.rst +++ b/doc/v2/howto/index_en.rst @@ -6,32 +6,32 @@ PaddlePaddle provides the users the ability to flexibly set various command line .. toctree:: :maxdepth: 1 - cmd_parameter/index_cn.rst + cmd_parameter/index_en.rst PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to: .. toctree:: :maxdepth: 1 - cluster/index_cn.rst + cluster/index_en.rst PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API: .. toctree:: :maxdepth: 1 - capi/index_cn.rst + capi/index_en.rst PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to: .. toctree:: :maxdepth: 1 - rnn/index_cn.rst + rnn/index_en.rst How to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to: .. toctree:: :maxdepth: 1 - optimization/gpu_profiling_cn.rst + optimization/gpu_profiling_en.rst From b123e43bf99fa84b68c91e16d92a8aac5508e88e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 24 Mar 2018 12:28:14 +0800 Subject: [PATCH 172/314] extract multi devices graph builder --- paddle/fluid/framework/CMakeLists.txt | 9 +- paddle/fluid/framework/details/CMakeLists.txt | 3 + .../details/multi_devices_graph_builder.cc | 140 ++++++++++ .../details/multi_devices_graph_builder.h | 46 ++++ .../framework/details/ssa_graph_builder.cc | 88 ++++++ .../framework/details/ssa_graph_builder.h | 56 ++++ paddle/fluid/framework/parallel_executor.cc | 254 ++---------------- 7 files changed, 354 insertions(+), 242 deletions(-) create mode 100644 paddle/fluid/framework/details/multi_devices_graph_builder.cc create mode 100644 paddle/fluid/framework/details/multi_devices_graph_builder.h create mode 100644 paddle/fluid/framework/details/ssa_graph_builder.cc create mode 100644 paddle/fluid/framework/details/ssa_graph_builder.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index f1d19efa97..d3f69ee9d8 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -88,14 +88,9 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) -if(WITH_GPU) - set(parallel_executor_cuda_deps nccl_all_reduce_op_handle) -else() - set(parallel_executor_cuda_deps) -endif() + cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope - backward glog lod_rank_table simple_threadpool scale_loss_grad_op_handle - fetch_op_handle computation_op_handle ssa_graph ${parallel_executor_cuda_deps}) + backward glog lod_rank_table simple_threadpool multi_devices_graph_builder fetch_op_handle) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 9ed41ab94c..4432bc0245 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -7,3 +7,6 @@ nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_h cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry) cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) +cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) +cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle + nccl_all_reduce_op_handle scale_loss_grad_op_handle) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc new file mode 100644 index 0000000000..3fab6adf0f --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/nccl_helper.h" + +namespace paddle { +namespace framework { +namespace details { +MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( + const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶ms, + const std::vector &local_scopes, + platform::NCCLContextMap *nccl_ctxs) + : loss_var_name_(loss_var_name), + places_(places), + local_scopes_(local_scopes), + nccl_ctxs_(nccl_ctxs) { + for (auto &p : params) { + grad_names_.insert(GradVarName(p)); + } +} + +void MultiDevSSAGraphBuilder::Build(const ProgramDesc &program, + SSAGraph *graph) const { + SSAGraph &result = *graph; + result.vars_.resize(places_.size()); + + bool is_forwarding = true; + for (auto *op : program.Block(0).AllOps()) { + bool change_forward = false; + if (!is_forwarding) { + // FIXME(yy): Do not hard code like this + if (op->OutputArgumentNames().size() == 1 && + op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) { + continue; // Drop fill 1. for backward coeff; + } + } + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + auto *s = local_scopes_[i]; + + result.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); + auto *op_handle = result.ops_.back().get(); + op_handle->dev_ctx_[p] = const_cast( + platform::DeviceContextPool::Instance().Get(p)); + + auto var_names = op->InputArgumentNames(); + + for (auto &each_var_name : var_names) { + VarHandle *var = + CreateOrGetLatestVarHandle(&result, each_var_name, p, i); + op_handle->AddInput(var); + } + var_names = op->OutputArgumentNames(); + + for (auto &each_var_name : var_names) { + CreateOpOutput(&result, op_handle, each_var_name, p, i); + } + + if (is_forwarding) { + if (var_names.size() == 1 && var_names[0] == loss_var_name_) { + // Insert ScaleCost OpHandle + op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p, + nccl_ctxs_->DevCtx(p)); + result.ops_.emplace_back(op_handle); + + // FIXME: Currently ScaleLossGradOp only use device_count as scale + // factor. So it does not depend on any other operators. + // VarHandle *loss = GetVarHandle(loss_var_name, place); + // loss->pending_ops_.emplace_back(op_handle); + // op_handle->inputs_.emplace_back(loss); + + CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i); + change_forward = true; + } + } + } + + if (change_forward) { + is_forwarding = false; + } + + if (!is_forwarding) { + auto var_names = op->OutputArgumentNames(); + for (auto &og : var_names) { + if (grad_names_.count(og) != 0) { // is param grad + // Insert NCCL AllReduce Op + result.ops_.emplace_back( + new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); + auto *op_handle = result.ops_.back().get(); + + for (size_t i = 0; i < places_.size(); ++i) { + auto &p = places_[i]; + auto &vars = result.vars_[i][og]; + + if (vars.empty()) { // This device has no data. continue. + continue; + } + auto *prev_grad = &vars[vars.size() - 1]; + op_handle->AddInput(prev_grad); + + auto &var = vars[vars.size()]; + var.place_ = p; + var.name_ = og; + var.version_ = vars.size() - 1; + + op_handle->AddOutput(&var); + } + } + } + } + } + + /* + Dependency graph has been constructed. However, there are still data + harzaeds need to be handled. + */ + PolishGraphToSupportDataHazards(&result); +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h new file mode 100644 index 0000000000..510f85bc87 --- /dev/null +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -0,0 +1,46 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + +namespace paddle { +namespace platform { +class NCCLContextMap; +} + +namespace framework { +class Scope; +namespace details { +class MultiDevSSAGraphBuilder : public SSAGraphBuilder { + public: + MultiDevSSAGraphBuilder(const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶ms, + const std::vector &local_scopes, + platform::NCCLContextMap *nccl_ctxs); + + void Build(const ProgramDesc &program, SSAGraph *graph) const override; + + private: + std::string loss_var_name_; + const std::vector &places_; + const std::vector &local_scopes_; + platform::NCCLContextMap *nccl_ctxs_; + std::unordered_set grad_names_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc new file mode 100644 index 0000000000..7a80a4b1e7 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph_builder.h" + +namespace paddle { +namespace framework { +namespace details { +void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { + for (auto &var_map : graph->vars_) { + for (auto &name_pair : var_map) { + if (name_pair.second.size() <= 1) { + return; + } + auto it_new = name_pair.second.rbegin(); + auto it_old = name_pair.second.rbegin(); + ++it_old; + for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { + auto *write_op = it_new->second.generated_op_; + auto &read_ops = it_old->second.pending_ops_; + auto *ex_write_op = it_old->second.generated_op_; + + if (ex_write_op == nullptr) { // Nobody write this var. + continue; + } + + for (auto *read_op : read_ops) { + // Manually add a dependency var from read_op to write_op; + if (read_op == write_op) { + // Read Write is the same op. + continue; + } + + auto *dep_var = new DummyVarHandle(); + read_op->AddOutput(dep_var); + write_op->AddInput(dep_var); + graph->dep_vars_.emplace(dep_var); + } + } + } + } +} + +VarHandle *SSAGraphBuilder::CreateOrGetLatestVarHandle( + SSAGraph *graph, const std::string &each_var_name, + const platform::Place &place, size_t place_offset) { + auto &var_holders = graph->vars_[place_offset]; + auto &var_holder = var_holders[each_var_name]; + VarHandle *var = nullptr; + if (var_holder.empty()) { + auto &init_var = var_holder[0]; + init_var.place_ = place; + init_var.name_ = each_var_name; + init_var.generated_op_ = nullptr; + init_var.version_ = 0; + var = &init_var; + } else { + var = &var_holder.rbegin()->second; + } + return var; +} + +void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, + const std::string &each_var_name, + const platform::Place &place, + size_t place_offset) { + auto &vars = graph->vars_[place_offset][each_var_name]; + size_t version = vars.size(); + auto &var = vars[version]; + var.version_ = version; + var.name_ = each_var_name; + var.place_ = place; + op_handle->AddOutput(&var); +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h new file mode 100644 index 0000000000..848b90293a --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/details/ssa_graph.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/platform/place.h" + +#include + +namespace paddle { +namespace framework { +namespace details { + +class SSAGraphBuilder { + public: + SSAGraphBuilder() {} + virtual ~SSAGraphBuilder() {} + virtual void Build(const ProgramDesc &program, SSAGraph *graph) const = 0; + + DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); + + protected: + /** + * We only handle write after read(WAR), since it should not have a write + * after write in program. If there are write after write operators, we need + * prune them. + * + * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) + */ + static void PolishGraphToSupportDataHazards(SSAGraph *graph); + + static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph, + const std::string &each_var_name, + const platform::Place &place, + size_t place_offset); + + static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, + const std::string &each_var_name, + const platform::Place &place, size_t place_offset); +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 5c10595db9..4ebb89181c 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -16,231 +16,14 @@ limitations under the License. */ #include "ThreadPool.h" #include "lod_tensor.h" #include "op_registry.h" -#include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" -#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" -#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" +#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/ssa_graph.h" +#include "paddle/fluid/platform/nccl_helper.h" namespace paddle { namespace framework { -using details::ComputationOpHandle; -using details::DummyVarHandle; -using details::FetchOpHandle; -using details::NCCLAllReduceOpHandle; -using details::OpHandleBase; -using details::ScaleLossGradOpHandle; -using details::SSAGraph; -using details::VarHandle; -using details::VarHandleBase; - -class SSAGraphBuilder { - public: - virtual ~SSAGraphBuilder() {} - virtual void Build(const ProgramDesc &program, SSAGraph *graph) const = 0; - - protected: - /** - * We only handle write after read(WAR), since it should not have a write - * after write in program. If there are write after write operators, we need - * prune them. - * - * https://en.wikipedia.org/wiki/Hazard_(computer_architecture)#Write_after_read_(WAR) - */ - static void PolishGraphToSupportDataHazards(SSAGraph *graph) { - for (auto &var_map : graph->vars_) { - for (auto &name_pair : var_map) { - if (name_pair.second.size() <= 1) { - return; - } - auto it_new = name_pair.second.rbegin(); - auto it_old = name_pair.second.rbegin(); - ++it_old; - for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { - auto *write_op = it_new->second.generated_op_; - auto &read_ops = it_old->second.pending_ops_; - auto *ex_write_op = it_old->second.generated_op_; - - if (ex_write_op == nullptr) { // Nobody write this var. - continue; - } - - for (auto *read_op : read_ops) { - // Manually add a dependency var from read_op to write_op; - if (read_op == write_op) { - // Read Write is the same op. - continue; - } - - auto *dep_var = new DummyVarHandle(); - read_op->AddOutput(dep_var); - write_op->AddInput(dep_var); - graph->dep_vars_.emplace(dep_var); - } - } - } - } - } - - static VarHandle *CreateOrGetLatestVarHandle(SSAGraph *graph, - const std::string &each_var_name, - const platform::Place &place, - size_t place_offset) { - auto &var_holders = graph->vars_[place_offset]; - auto &var_holder = var_holders[each_var_name]; - VarHandle *var = nullptr; - if (var_holder.empty()) { - auto &init_var = var_holder[0]; - init_var.place_ = place; - init_var.name_ = each_var_name; - init_var.generated_op_ = nullptr; - init_var.version_ = 0; - var = &init_var; - } else { - var = &var_holder.rbegin()->second; - } - return var; - } - - static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, - const std::string &each_var_name, - const platform::Place &place, - size_t place_offset) { - auto &vars = graph->vars_[place_offset][each_var_name]; - size_t version = vars.size(); - auto &var = vars[version]; - var.version_ = version; - var.name_ = each_var_name; - var.place_ = place; - op_handle->AddOutput(&var); - } -}; - -class MultiDevSSAGraphBuilder : public SSAGraphBuilder { - public: - MultiDevSSAGraphBuilder(const std::vector &places, - const std::string &loss_var_name, - const std::unordered_set ¶ms, - const std::vector &local_scopes, - platform::NCCLContextMap *nccl_ctxs) - : loss_var_name_(loss_var_name), - places_(places), - local_scopes_(local_scopes), - nccl_ctxs_(nccl_ctxs) { - for (auto &p : params) { - grad_names_.insert(GradVarName(p)); - } - } - - void Build(const ProgramDesc &program, SSAGraph *graph) const override { - SSAGraph &result = *graph; - result.vars_.resize(places_.size()); - - bool is_forwarding = true; - for (auto *op : program.Block(0).AllOps()) { - bool change_forward = false; - if (!is_forwarding) { - // FIXME(yy): Do not hard code like this - if (op->OutputArgumentNames().size() == 1 && - op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) { - continue; // Drop fill 1. for backward coeff; - } - } - - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - auto *s = local_scopes_[i]; - - result.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); - auto *op_handle = result.ops_.back().get(); - op_handle->dev_ctx_[p] = const_cast( - platform::DeviceContextPool::Instance().Get(p)); - - auto var_names = op->InputArgumentNames(); - - for (auto &each_var_name : var_names) { - VarHandle *var = - CreateOrGetLatestVarHandle(&result, each_var_name, p, i); - op_handle->AddInput(var); - } - var_names = op->OutputArgumentNames(); - - for (auto &each_var_name : var_names) { - CreateOpOutput(&result, op_handle, each_var_name, p, i); - } - - if (is_forwarding) { - if (var_names.size() == 1 && var_names[0] == loss_var_name_) { - // Insert ScaleCost OpHandle - op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p, - nccl_ctxs_->DevCtx(p)); - result.ops_.emplace_back(op_handle); - - // FIXME: Currently ScaleLossGradOp only use device_count as scale - // factor. So it does not depend on any other operators. - // VarHandle *loss = GetVarHandle(loss_var_name, place); - // loss->pending_ops_.emplace_back(op_handle); - // op_handle->inputs_.emplace_back(loss); - - CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, - i); - change_forward = true; - } - } - } - - if (change_forward) { - is_forwarding = false; - } - - if (!is_forwarding) { - auto var_names = op->OutputArgumentNames(); - for (auto &og : var_names) { - if (grad_names_.count(og) != 0) { // is param grad - // Insert NCCL AllReduce Op - result.ops_.emplace_back( - new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); - auto *op_handle = result.ops_.back().get(); - - for (size_t i = 0; i < places_.size(); ++i) { - auto &p = places_[i]; - auto &vars = result.vars_[i][og]; - - if (vars.empty()) { // This device has no data. continue. - continue; - } - auto *prev_grad = &vars[vars.size() - 1]; - op_handle->AddInput(prev_grad); - - auto &var = vars[vars.size()]; - var.place_ = p; - var.name_ = og; - var.version_ = vars.size() - 1; - - op_handle->AddOutput(&var); - } - } - } - } - } - - /* - Dependency graph has been constructed. However, there are still data - harzaeds need to be handled. - */ - PolishGraphToSupportDataHazards(&result); - } - - private: - std::string loss_var_name_; - const std::vector &places_; - const std::vector &local_scopes_; - platform::NCCLContextMap *nccl_ctxs_; - - std::unordered_set grad_names_; -}; - class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(size_t num_threads, @@ -256,17 +39,17 @@ class ParallelExecutorPrivate { std::unique_ptr nccl_ctxs_; - SSAGraph graph_; + details::SSAGraph graph_; // Use a simpler thread pool, might be faster. std::unique_ptr pool_; std::unique_ptr exception_; - void RunOp( - bool use_event, - std::unordered_map> &pending_vars, - OpHandleBase *op) { + void RunOp(bool use_event, + std::unordered_map> + &pending_vars, + details::OpHandleBase *op) { std::vector *> *ready_buffer = new std::vector *>(); for (auto *var : op->outputs_) { @@ -321,9 +104,9 @@ ParallelExecutor::ParallelExecutor( // Step 2. Convert main_program to SSA form and dependency graph. Also, insert // ncclOp - MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, params, - member_->local_scopes_, - member_->nccl_ctxs_.get()); + details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, + params, member_->local_scopes_, + member_->nccl_ctxs_.get()); builder.Build(main_program, &member_->graph_); // Step 3. Create vars in each scope; @@ -389,9 +172,9 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, FeedFetchList fetched_data(fetch_tensors.size()); // Version --> VarHandle member_->exception_.reset(); - std::unordered_map> pending_vars; - std::unordered_map pending_ops; - std::vector dummy_vars; + std::unordered_map> pending_vars; + std::unordered_map pending_ops; + std::vector dummy_vars; for (auto &var_map : member_->graph_.vars_) { for (auto &name_pair : var_map) { @@ -406,7 +189,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, pending_vars[var.get()] = var->generated_op_ == nullptr; } - std::vector to_run; + std::vector to_run; for (auto &op : member_->graph_.ops_) { if (op->inputs_.empty()) { // Special case, Op has no input. @@ -416,7 +199,8 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } } - std::unordered_map> fetched_vars; + std::unordered_map> + fetched_vars; for (auto &fetch_var_name : fetch_tensors) { for (auto &var_map : member_->graph_.vars_) { @@ -427,13 +211,13 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } } - std::vector fetch_ops; + std::vector fetch_ops; for (size_t i = 0; i < fetch_tensors.size(); ++i) { auto &var_name = fetch_tensors[i]; auto &vars = fetched_vars[var_name]; fetch_ops.emplace_back(&fetched_data, i, &member_->local_scopes_); - FetchOpHandle *op = &fetch_ops.back(); + details::FetchOpHandle *op = &fetch_ops.back(); // FIXME: Use new device context for (auto &p : member_->places_) { @@ -457,7 +241,7 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } while (!pending_vars.empty()) { - VarHandleBase *ready_var = nullptr; + details::VarHandleBase *ready_var = nullptr; for (auto &pair : pending_vars) { if (pair.second.load(std::memory_order_acquire)) { ready_var = pair.first; From 4c3361cda826f9ca2e5c96637b1481211f2bba63 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 24 Mar 2018 13:39:57 +0800 Subject: [PATCH 173/314] Extract GraphExecutor --- paddle/fluid/framework/parallel_executor.cc | 323 ++++++++++++-------- 1 file changed, 194 insertions(+), 129 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4ebb89181c..78ef66be51 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -24,42 +24,184 @@ limitations under the License. */ namespace paddle { namespace framework { -class ParallelExecutorPrivate { +using details::DummyVarHandle; +using details::FetchOpHandle; +using details::OpHandleBase; +using details::SSAGraph; +using details::VarHandleBase; + +class SSAGraphExecutor { + DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); + public: - explicit ParallelExecutorPrivate(size_t num_threads, - const std::vector &places) - : places_(places), - fetch_dev_ctxs_(places), - pool_(num_threads <= 1 ? nullptr : new ThreadPool(num_threads)) {} + explicit SSAGraphExecutor(SSAGraph *graph) : graph_(*graph) {} - std::vector places_; - platform::DeviceContextPool fetch_dev_ctxs_; - std::vector local_scopes_; - Scope *global_scope_; + virtual ~SSAGraphExecutor() {} - std::unique_ptr nccl_ctxs_; + virtual void Run(Scope *global_scope, + const std::vector &fetch_tensors, + const std::string &fetch_list_name) = 0; - details::SSAGraph graph_; + protected: + SSAGraph &graph_; +}; - // Use a simpler thread pool, might be faster. - std::unique_ptr pool_; +class ThreadedSSAGraphExecutor : public SSAGraphExecutor { + public: + ThreadedSSAGraphExecutor(size_t num_threads, bool use_event, + const std::vector &local_scopes, + const std::vector &places, + SSAGraph *graph) + : SSAGraphExecutor(graph), + pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr), + local_scopes_(local_scopes), + places_(places), + fetch_ctxs_(places), + use_event_(use_event) {} + + void Run(Scope *global_scope, const std::vector &fetch_tensors, + const std::string &fetch_list_name) override { + std::unordered_map pending_ops; + std::unordered_map> pending_vars; + std::unordered_set ready_ops; + + auto InsertPendingVar = [&pending_vars](VarHandleBase &var) { + pending_vars[&var] = var.generated_op_ == nullptr; + }; - std::unique_ptr exception_; + auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) { + pending_ops.insert({&op_instance, op_instance.inputs_.size()}); + }; + + // Transform SSAGraph to pending_ops & pending_vars + for (auto &var_map : graph_.vars_) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + InsertPendingVar(version_pair.second); + } + } + } + for (auto &var : graph_.dep_vars_) { + InsertPendingVar(*var); + } + + for (auto &op : graph_.ops_) { + if (op->inputs_.empty()) { // Special case, Op has no input. + ready_ops.insert(op.get()); + } else { + InsertPendingOp(*op); + } + } + + // Step 2. Insert FetchOps + std::vector fetch_ops; + std::vector dummy_vars; + FeedFetchList fetch_data(fetch_tensors.size()); + + std::unordered_map> fetched_vars; + + for (auto &fetch_var_name : fetch_tensors) { + for (auto &var_map : graph_.vars_) { + auto it = var_map.find(fetch_var_name); + if (it != var_map.end()) { + fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); + } + } + } - void RunOp(bool use_event, - std::unordered_map> - &pending_vars, - details::OpHandleBase *op) { + for (size_t i = 0; i < fetch_tensors.size(); ++i) { + auto &var_name = fetch_tensors[i]; + auto &vars = fetched_vars[var_name]; + fetch_ops.emplace_back(&fetch_data, i, &local_scopes_); + details::FetchOpHandle *op = &fetch_ops.back(); + + // FIXME: Use new device context + for (auto &p : places_) { + op->dev_ctx_[p] = fetch_ctxs_.Get(p); + } + + for (auto *var : vars) { + op->AddInput(var); + } + + dummy_vars.emplace_back(); + auto *var = &dummy_vars.back(); + var->generated_op_ = nullptr; + op->AddOutput(var); + InsertPendingVar(*var); + InsertPendingOp(*op); + } + + auto run_all_ready_ops = [&] { + for (auto *op : ready_ops) { + RunOp(pending_vars, op); + } + ready_ops.clear(); + }; + + // Step 3. Execution + while (!pending_vars.empty()) { + // 1. Run All Ready ops + run_all_ready_ops(); + + // 2. Find ready variable + VarHandleBase *ready_var = nullptr; + for (auto &pair : pending_vars) { + if (pair.second.load(std::memory_order_acquire)) { + ready_var = pair.first; + break; + } + } + + // if there is no variable ready + if (ready_var == nullptr) { + // FIXME use conditional var instead of busy wait. + // if there is an exception, throw it + if (exception_) { + throw * exception_; + } + // keep waiting the ready variables + continue; + } + + // 3. Remove the dependency of ready_var. + // Find the ready_ops after the ready_var. + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = pending_ops[op]; + --deps; + if (deps == 0) { + ready_ops.insert(op); + } + } + // Keep loop until all vars are ready. + } + + // Wait FetchOps. + for (auto &fetch_op : fetch_ops) { + fetch_op.WaitAndMergeCPUTensors(); + } + + *global_scope->Var(fetch_list_name)->GetMutable() = + fetch_data; + } + + ~ThreadedSSAGraphExecutor() {} + + private: + void RunOp( + std::unordered_map> &pending_vars, + details::OpHandleBase *op) { std::vector *> *ready_buffer = new std::vector *>(); for (auto *var : op->outputs_) { ready_buffer->emplace_back(&pending_vars[var]); } - auto op_run = [ready_buffer, op, this, use_event] { + auto op_run = [ready_buffer, op, this] { try { VLOG(10) << op->DebugString(); - op->Run(use_event); + op->Run(use_event_); for (auto *ready : *ready_buffer) { ready->store(true, std::memory_order_release); } @@ -76,6 +218,31 @@ class ParallelExecutorPrivate { op_run(); } } + + private: + std::unique_ptr<::ThreadPool> pool_; + std::vector local_scopes_; + std::vector places_; + platform::DeviceContextPool fetch_ctxs_; + const bool use_event_; + std::unique_ptr exception_; +}; + +class ParallelExecutorPrivate { + public: + explicit ParallelExecutorPrivate(const std::vector &places) + : places_(places), fetch_dev_ctxs_(places) {} + + std::vector places_; + platform::DeviceContextPool fetch_dev_ctxs_; + std::vector local_scopes_; + Scope *global_scope_; + + std::unique_ptr nccl_ctxs_; + + details::SSAGraph graph_; + + std::unique_ptr executor_; }; ParallelExecutor::ParallelExecutor( @@ -83,7 +250,7 @@ ParallelExecutor::ParallelExecutor( const std::unordered_set ¶ms, const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope) - : member_(new ParallelExecutorPrivate(num_threads, places)) { + : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; // Step 1. RunStartupProgram and Bcast the params to devs. @@ -109,6 +276,9 @@ ParallelExecutor::ParallelExecutor( member_->nccl_ctxs_.get()); builder.Build(main_program, &member_->graph_); + member_->executor_.reset(new ThreadedSSAGraphExecutor( + num_threads, true, member_->local_scopes_, places, &member_->graph_)); + // Step 3. Create vars in each scope; for (auto *scope : member_->local_scopes_) { for (auto *var : main_program.Block(0).AllVars()) { @@ -168,113 +338,8 @@ void ParallelExecutor::BuildNCCLCommunicator() const { void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { - bool use_event = true; - FeedFetchList fetched_data(fetch_tensors.size()); - // Version --> VarHandle - member_->exception_.reset(); - std::unordered_map> pending_vars; - std::unordered_map pending_ops; - std::vector dummy_vars; - - for (auto &var_map : member_->graph_.vars_) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - pending_vars[&version_pair.second] = - version_pair.second.generated_op_ == nullptr; - } - } - } - - for (auto &var : member_->graph_.dep_vars_) { - pending_vars[var.get()] = var->generated_op_ == nullptr; - } - - std::vector to_run; - - for (auto &op : member_->graph_.ops_) { - if (op->inputs_.empty()) { // Special case, Op has no input. - to_run.emplace_back(op.get()); - } else { - pending_ops.insert({op.get(), op->inputs_.size()}); - } - } - - std::unordered_map> - fetched_vars; - - for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : member_->graph_.vars_) { - auto it = var_map.find(fetch_var_name); - if (it != var_map.end()) { - fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); - } - } - } - - std::vector fetch_ops; - - for (size_t i = 0; i < fetch_tensors.size(); ++i) { - auto &var_name = fetch_tensors[i]; - auto &vars = fetched_vars[var_name]; - fetch_ops.emplace_back(&fetched_data, i, &member_->local_scopes_); - details::FetchOpHandle *op = &fetch_ops.back(); - - // FIXME: Use new device context - for (auto &p : member_->places_) { - op->dev_ctx_[p] = member_->fetch_dev_ctxs_.Get(p); - } - - for (auto *var : vars) { - op->AddInput(var); - } - - dummy_vars.emplace_back(); - auto *var = &dummy_vars.back(); - op->AddOutput(var); - pending_vars[var] = false; - - pending_ops.insert({op, op->inputs_.size()}); - } - - for (auto *op : to_run) { - member_->RunOp(use_event, pending_vars, op); - } - - while (!pending_vars.empty()) { - details::VarHandleBase *ready_var = nullptr; - for (auto &pair : pending_vars) { - if (pair.second.load(std::memory_order_acquire)) { - ready_var = pair.first; - } - } - if (ready_var == nullptr) { - // FIXME use conditional var instead of busy wait. - if (member_->exception_) { - throw * member_->exception_; - } - continue; - } - pending_vars.erase(ready_var); - to_run.clear(); - for (auto *op : ready_var->pending_ops_) { - auto &deps = pending_ops[op]; - --deps; - if (deps == 0) { - to_run.emplace_back(op); - } - } - for (auto *op : to_run) { - pending_ops.erase(op); - member_->RunOp(use_event, pending_vars, op); - } - } - - for (auto &fetch_op : fetch_ops) { - fetch_op.WaitAndMergeCPUTensors(); - } - - *member_->global_scope_->Var(fetched_var_name)->GetMutable() = - fetched_data; + member_->executor_->Run(member_->global_scope_, fetch_tensors, + fetched_var_name); } } // namespace framework From c70b60dd70d41a349a6ed4d5aad9a60facc49c60 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 24 Mar 2018 13:56:52 +0800 Subject: [PATCH 174/314] Make executor steal graph inside --- .../details/multi_devices_graph_builder.cc | 7 +++- .../details/multi_devices_graph_builder.h | 2 +- .../framework/details/ssa_graph_builder.h | 3 +- paddle/fluid/framework/parallel_executor.cc | 41 +++++++++---------- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 3fab6adf0f..b27647a8ee 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -37,8 +37,9 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( } } -void MultiDevSSAGraphBuilder::Build(const ProgramDesc &program, - SSAGraph *graph) const { +std::unique_ptr MultiDevSSAGraphBuilder::Build( + const ProgramDesc &program) const { + auto graph = new SSAGraph(); SSAGraph &result = *graph; result.vars_.resize(places_.size()); @@ -134,6 +135,8 @@ void MultiDevSSAGraphBuilder::Build(const ProgramDesc &program, harzaeds need to be handled. */ PolishGraphToSupportDataHazards(&result); + + return std::unique_ptr(graph); } } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 510f85bc87..17959a94d6 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -32,7 +32,7 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { const std::vector &local_scopes, platform::NCCLContextMap *nccl_ctxs); - void Build(const ProgramDesc &program, SSAGraph *graph) const override; + std::unique_ptr Build(const ProgramDesc &program) const override; private: std::string loss_var_name_; diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index 848b90293a..df05bb7394 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -18,6 +18,7 @@ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/place.h" +#include #include namespace paddle { @@ -28,7 +29,7 @@ class SSAGraphBuilder { public: SSAGraphBuilder() {} virtual ~SSAGraphBuilder() {} - virtual void Build(const ProgramDesc &program, SSAGraph *graph) const = 0; + virtual std::unique_ptr Build(const ProgramDesc &program) const = 0; DISABLE_COPY_AND_ASSIGN(SSAGraphBuilder); diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 78ef66be51..88070a06a2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -34,16 +34,16 @@ class SSAGraphExecutor { DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); public: - explicit SSAGraphExecutor(SSAGraph *graph) : graph_(*graph) {} + // Steal graph inside + explicit SSAGraphExecutor(std::unique_ptr &&graph) + : graph_(std::move(graph)) {} virtual ~SSAGraphExecutor() {} - virtual void Run(Scope *global_scope, - const std::vector &fetch_tensors, - const std::string &fetch_list_name) = 0; + virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; protected: - SSAGraph &graph_; + std::unique_ptr graph_; }; class ThreadedSSAGraphExecutor : public SSAGraphExecutor { @@ -51,16 +51,17 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ThreadedSSAGraphExecutor(size_t num_threads, bool use_event, const std::vector &local_scopes, const std::vector &places, - SSAGraph *graph) - : SSAGraphExecutor(graph), + std::unique_ptr &&graph) + : SSAGraphExecutor(std::move(graph)), pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr), local_scopes_(local_scopes), places_(places), fetch_ctxs_(places), use_event_(use_event) {} - void Run(Scope *global_scope, const std::vector &fetch_tensors, - const std::string &fetch_list_name) override { + // Run a SSAGraph by a thread pool + // Use topological sort algorithm + FeedFetchList Run(const std::vector &fetch_tensors) override { std::unordered_map pending_ops; std::unordered_map> pending_vars; std::unordered_set ready_ops; @@ -74,18 +75,18 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { }; // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_.vars_) { + for (auto &var_map : graph_->vars_) { for (auto &name_pair : var_map) { for (auto &version_pair : name_pair.second) { InsertPendingVar(version_pair.second); } } } - for (auto &var : graph_.dep_vars_) { + for (auto &var : graph_->dep_vars_) { InsertPendingVar(*var); } - for (auto &op : graph_.ops_) { + for (auto &op : graph_->ops_) { if (op->inputs_.empty()) { // Special case, Op has no input. ready_ops.insert(op.get()); } else { @@ -101,7 +102,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { std::unordered_map> fetched_vars; for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_.vars_) { + for (auto &var_map : graph_->vars_) { auto it = var_map.find(fetch_var_name); if (it != var_map.end()) { fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); @@ -182,8 +183,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { fetch_op.WaitAndMergeCPUTensors(); } - *global_scope->Var(fetch_list_name)->GetMutable() = - fetch_data; + return fetch_data; } ~ThreadedSSAGraphExecutor() {} @@ -240,8 +240,6 @@ class ParallelExecutorPrivate { std::unique_ptr nccl_ctxs_; - details::SSAGraph graph_; - std::unique_ptr executor_; }; @@ -274,10 +272,10 @@ ParallelExecutor::ParallelExecutor( details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, params, member_->local_scopes_, member_->nccl_ctxs_.get()); - builder.Build(main_program, &member_->graph_); + auto graph = builder.Build(main_program); member_->executor_.reset(new ThreadedSSAGraphExecutor( - num_threads, true, member_->local_scopes_, places, &member_->graph_)); + num_threads, true, member_->local_scopes_, places, std::move(graph))); // Step 3. Create vars in each scope; for (auto *scope : member_->local_scopes_) { @@ -338,8 +336,9 @@ void ParallelExecutor::BuildNCCLCommunicator() const { void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { - member_->executor_->Run(member_->global_scope_, fetch_tensors, - fetched_var_name); + auto fetch_data = member_->executor_->Run(fetch_tensors); + *member_->global_scope_->Var(fetched_var_name)->GetMutable() = + fetch_data; } } // namespace framework From e3144393e3b6e0d74506f8b996c8b2931eb9641e Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 24 Mar 2018 14:15:20 +0800 Subject: [PATCH 175/314] Extract Executors to indie modules --- paddle/fluid/framework/CMakeLists.txt | 3 +- paddle/fluid/framework/details/CMakeLists.txt | 3 + .../framework/details/ssa_graph_executor.cc | 28 +++ .../framework/details/ssa_graph_executor.h | 41 ++++ .../details/threaded_ssa_graph_executor.cc | 192 +++++++++++++++ .../details/threaded_ssa_graph_executor.h | 55 +++++ paddle/fluid/framework/parallel_executor.cc | 219 +----------------- 7 files changed, 327 insertions(+), 214 deletions(-) create mode 100644 paddle/fluid/framework/details/ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/ssa_graph_executor.h create mode 100644 paddle/fluid/framework/details/threaded_ssa_graph_executor.cc create mode 100644 paddle/fluid/framework/details/threaded_ssa_graph_executor.h diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index d3f69ee9d8..c425c71160 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -89,8 +89,7 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog lod_rank_table feed_fetch_method) -cc_library(parallel_executor SRCS parallel_executor.cc DEPS op_registry device_context scope - backward glog lod_rank_table simple_threadpool multi_devices_graph_builder fetch_op_handle) +cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 4432bc0245..f13ac276fc 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -10,3 +10,6 @@ cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle nccl_all_reduce_op_handle scale_loss_grad_op_handle) +cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph) +cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope + simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/ssa_graph_executor.cc b/paddle/fluid/framework/details/ssa_graph_executor.cc new file mode 100644 index 0000000000..8da6ca889b --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_executor.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/ssa_graph_executor.h" + +namespace paddle { +namespace framework { +namespace details { + +SSAGraphExecutor::SSAGraphExecutor(std::unique_ptr &&graph) + : graph_(std::move(graph)) {} + +SSAGraphExecutor::~SSAGraphExecutor() {} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_executor.h b/paddle/fluid/framework/details/ssa_graph_executor.h new file mode 100644 index 0000000000..3b818b1a45 --- /dev/null +++ b/paddle/fluid/framework/details/ssa_graph_executor.h @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/details/ssa_graph.h" +#include "paddle/fluid/framework/feed_fetch_type.h" + +namespace paddle { +namespace framework { +namespace details { + +class SSAGraphExecutor { + DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); + + public: + // Steal graph inside + explicit SSAGraphExecutor(std::unique_ptr &&graph); + + virtual ~SSAGraphExecutor(); + + virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; + + protected: + std::unique_ptr graph_; +}; +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc new file mode 100644 index 0000000000..86e880ed72 --- /dev/null +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -0,0 +1,192 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + +#include "paddle/fluid/framework/details/fetch_op_handle.h" +#include "paddle/fluid/framework/scope.h" + +namespace paddle { +namespace framework { +namespace details { +ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( + size_t num_threads, bool use_event, + const std::vector &local_scopes, + const std::vector &places, + std::unique_ptr &&graph) + : SSAGraphExecutor(std::move(graph)), + pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr), + local_scopes_(local_scopes), + places_(places), + fetch_ctxs_(places), + use_event_(use_event) {} + +FeedFetchList ThreadedSSAGraphExecutor::Run( + const std::vector &fetch_tensors) { + std::unordered_map pending_ops; + std::unordered_map> pending_vars; + std::unordered_set ready_ops; + + auto InsertPendingVar = [&pending_vars](VarHandleBase &var) { + pending_vars[&var] = var.generated_op_ == nullptr; + }; + + auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) { + pending_ops.insert({&op_instance, op_instance.inputs_.size()}); + }; + + // Transform SSAGraph to pending_ops & pending_vars + for (auto &var_map : graph_->vars_) { + for (auto &name_pair : var_map) { + for (auto &version_pair : name_pair.second) { + InsertPendingVar(version_pair.second); + } + } + } + for (auto &var : graph_->dep_vars_) { + InsertPendingVar(*var); + } + + for (auto &op : graph_->ops_) { + if (op->inputs_.empty()) { // Special case, Op has no input. + ready_ops.insert(op.get()); + } else { + InsertPendingOp(*op); + } + } + + // Step 2. Insert FetchOps + std::vector fetch_ops; + std::vector dummy_vars; + FeedFetchList fetch_data(fetch_tensors.size()); + + std::unordered_map> fetched_vars; + + for (auto &fetch_var_name : fetch_tensors) { + for (auto &var_map : graph_->vars_) { + auto it = var_map.find(fetch_var_name); + if (it != var_map.end()) { + fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); + } + } + } + + for (size_t i = 0; i < fetch_tensors.size(); ++i) { + auto &var_name = fetch_tensors[i]; + auto &vars = fetched_vars[var_name]; + fetch_ops.emplace_back(&fetch_data, i, &local_scopes_); + details::FetchOpHandle *op = &fetch_ops.back(); + + // FIXME: Use new device context + for (auto &p : places_) { + op->dev_ctx_[p] = fetch_ctxs_.Get(p); + } + + for (auto *var : vars) { + op->AddInput(var); + } + + dummy_vars.emplace_back(); + auto *var = &dummy_vars.back(); + var->generated_op_ = nullptr; + op->AddOutput(var); + InsertPendingVar(*var); + InsertPendingOp(*op); + } + + auto run_all_ready_ops = [&] { + for (auto *op : ready_ops) { + RunOp(pending_vars, op); + } + ready_ops.clear(); + }; + + // Step 3. Execution + while (!pending_vars.empty()) { + // 1. Run All Ready ops + run_all_ready_ops(); + + // 2. Find ready variable + VarHandleBase *ready_var = nullptr; + for (auto &pair : pending_vars) { + if (pair.second.load(std::memory_order_acquire)) { + ready_var = pair.first; + break; + } + } + + // if there is no variable ready + if (ready_var == nullptr) { + // FIXME use conditional var instead of busy wait. + // if there is an exception, throw it + if (exception_) { + throw * exception_; + } + // keep waiting the ready variables + continue; + } + + // 3. Remove the dependency of ready_var. + // Find the ready_ops after the ready_var. + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = pending_ops[op]; + --deps; + if (deps == 0) { + ready_ops.insert(op); + } + } + // Keep loop until all vars are ready. + } + + // Wait FetchOps. + for (auto &fetch_op : fetch_ops) { + fetch_op.WaitAndMergeCPUTensors(); + } + + return fetch_data; +} + +void ThreadedSSAGraphExecutor::RunOp( + std::unordered_map> &pending_vars, + details::OpHandleBase *op) { + std::vector *> *ready_buffer = + new std::vector *>(); + for (auto *var : op->outputs_) { + ready_buffer->emplace_back(&pending_vars[var]); + } + + auto op_run = [ready_buffer, op, this] { + try { + VLOG(10) << op->DebugString(); + op->Run(use_event_); + for (auto *ready : *ready_buffer) { + ready->store(true, std::memory_order_release); + } + delete ready_buffer; + } catch (platform::EnforceNotMet ex) { + exception_.reset(new platform::EnforceNotMet(ex)); + } catch (...) { + LOG(FATAL) << "Unknown exception catched"; + } + }; + if (pool_) { + pool_->enqueue(op_run); + } else { + op_run(); + } +} +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h new file mode 100644 index 0000000000..5b099c18c9 --- /dev/null +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -0,0 +1,55 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "ThreadPool.h" // ThreadPool in thrird party +#include "paddle/fluid/framework/details/ssa_graph_executor.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace details { + +class ThreadedSSAGraphExecutor : public SSAGraphExecutor { + public: + ThreadedSSAGraphExecutor(size_t num_threads, bool use_event, + const std::vector &local_scopes, + const std::vector &places, + std::unique_ptr &&graph); + + // Run a SSAGraph by a thread pool + // Use topological sort algorithm + FeedFetchList Run(const std::vector &fetch_tensors) override; + + ~ThreadedSSAGraphExecutor() {} + + private: + void RunOp( + std::unordered_map> &pending_vars, + details::OpHandleBase *op); + + private: + std::unique_ptr<::ThreadPool> pool_; + std::vector local_scopes_; + std::vector places_; + platform::DeviceContextPool fetch_ctxs_; + const bool use_event_; + std::unique_ptr exception_; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 88070a06a2..78963fd568 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,221 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" + #include "ThreadPool.h" -#include "lod_tensor.h" -#include "op_registry.h" -#include "paddle/fluid/framework/details/fetch_op_handle.h" -#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" -#include "paddle/fluid/framework/details/ssa_graph.h" + #include "paddle/fluid/platform/nccl_helper.h" +#include "paddle/fluid/framework/details/multi_devices_graph_builder.h" +#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" + namespace paddle { namespace framework { -using details::DummyVarHandle; -using details::FetchOpHandle; -using details::OpHandleBase; -using details::SSAGraph; -using details::VarHandleBase; - -class SSAGraphExecutor { - DISABLE_COPY_AND_ASSIGN(SSAGraphExecutor); - - public: - // Steal graph inside - explicit SSAGraphExecutor(std::unique_ptr &&graph) - : graph_(std::move(graph)) {} - - virtual ~SSAGraphExecutor() {} - - virtual FeedFetchList Run(const std::vector &fetch_tensors) = 0; - - protected: - std::unique_ptr graph_; -}; - -class ThreadedSSAGraphExecutor : public SSAGraphExecutor { - public: - ThreadedSSAGraphExecutor(size_t num_threads, bool use_event, - const std::vector &local_scopes, - const std::vector &places, - std::unique_ptr &&graph) - : SSAGraphExecutor(std::move(graph)), - pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr), - local_scopes_(local_scopes), - places_(places), - fetch_ctxs_(places), - use_event_(use_event) {} - - // Run a SSAGraph by a thread pool - // Use topological sort algorithm - FeedFetchList Run(const std::vector &fetch_tensors) override { - std::unordered_map pending_ops; - std::unordered_map> pending_vars; - std::unordered_set ready_ops; - - auto InsertPendingVar = [&pending_vars](VarHandleBase &var) { - pending_vars[&var] = var.generated_op_ == nullptr; - }; - - auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) { - pending_ops.insert({&op_instance, op_instance.inputs_.size()}); - }; - - // Transform SSAGraph to pending_ops & pending_vars - for (auto &var_map : graph_->vars_) { - for (auto &name_pair : var_map) { - for (auto &version_pair : name_pair.second) { - InsertPendingVar(version_pair.second); - } - } - } - for (auto &var : graph_->dep_vars_) { - InsertPendingVar(*var); - } - - for (auto &op : graph_->ops_) { - if (op->inputs_.empty()) { // Special case, Op has no input. - ready_ops.insert(op.get()); - } else { - InsertPendingOp(*op); - } - } - - // Step 2. Insert FetchOps - std::vector fetch_ops; - std::vector dummy_vars; - FeedFetchList fetch_data(fetch_tensors.size()); - - std::unordered_map> fetched_vars; - - for (auto &fetch_var_name : fetch_tensors) { - for (auto &var_map : graph_->vars_) { - auto it = var_map.find(fetch_var_name); - if (it != var_map.end()) { - fetched_vars[fetch_var_name].push_back(&it->second.rbegin()->second); - } - } - } - - for (size_t i = 0; i < fetch_tensors.size(); ++i) { - auto &var_name = fetch_tensors[i]; - auto &vars = fetched_vars[var_name]; - fetch_ops.emplace_back(&fetch_data, i, &local_scopes_); - details::FetchOpHandle *op = &fetch_ops.back(); - - // FIXME: Use new device context - for (auto &p : places_) { - op->dev_ctx_[p] = fetch_ctxs_.Get(p); - } - - for (auto *var : vars) { - op->AddInput(var); - } - - dummy_vars.emplace_back(); - auto *var = &dummy_vars.back(); - var->generated_op_ = nullptr; - op->AddOutput(var); - InsertPendingVar(*var); - InsertPendingOp(*op); - } - - auto run_all_ready_ops = [&] { - for (auto *op : ready_ops) { - RunOp(pending_vars, op); - } - ready_ops.clear(); - }; - - // Step 3. Execution - while (!pending_vars.empty()) { - // 1. Run All Ready ops - run_all_ready_ops(); - - // 2. Find ready variable - VarHandleBase *ready_var = nullptr; - for (auto &pair : pending_vars) { - if (pair.second.load(std::memory_order_acquire)) { - ready_var = pair.first; - break; - } - } - - // if there is no variable ready - if (ready_var == nullptr) { - // FIXME use conditional var instead of busy wait. - // if there is an exception, throw it - if (exception_) { - throw * exception_; - } - // keep waiting the ready variables - continue; - } - - // 3. Remove the dependency of ready_var. - // Find the ready_ops after the ready_var. - pending_vars.erase(ready_var); - for (auto *op : ready_var->pending_ops_) { - auto &deps = pending_ops[op]; - --deps; - if (deps == 0) { - ready_ops.insert(op); - } - } - // Keep loop until all vars are ready. - } - - // Wait FetchOps. - for (auto &fetch_op : fetch_ops) { - fetch_op.WaitAndMergeCPUTensors(); - } - - return fetch_data; - } - - ~ThreadedSSAGraphExecutor() {} - - private: - void RunOp( - std::unordered_map> &pending_vars, - details::OpHandleBase *op) { - std::vector *> *ready_buffer = - new std::vector *>(); - for (auto *var : op->outputs_) { - ready_buffer->emplace_back(&pending_vars[var]); - } - - auto op_run = [ready_buffer, op, this] { - try { - VLOG(10) << op->DebugString(); - op->Run(use_event_); - for (auto *ready : *ready_buffer) { - ready->store(true, std::memory_order_release); - } - delete ready_buffer; - } catch (platform::EnforceNotMet ex) { - exception_.reset(new platform::EnforceNotMet(ex)); - } catch (...) { - LOG(FATAL) << "Unknown exception catched"; - } - }; - if (pool_) { - pool_->enqueue(op_run); - } else { - op_run(); - } - } - - private: - std::unique_ptr<::ThreadPool> pool_; - std::vector local_scopes_; - std::vector places_; - platform::DeviceContextPool fetch_ctxs_; - const bool use_event_; - std::unique_ptr exception_; -}; - class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) @@ -239,8 +35,7 @@ class ParallelExecutorPrivate { Scope *global_scope_; std::unique_ptr nccl_ctxs_; - - std::unique_ptr executor_; + std::unique_ptr executor_; }; ParallelExecutor::ParallelExecutor( @@ -274,7 +69,7 @@ ParallelExecutor::ParallelExecutor( member_->nccl_ctxs_.get()); auto graph = builder.Build(main_program); - member_->executor_.reset(new ThreadedSSAGraphExecutor( + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( num_threads, true, member_->local_scopes_, places, std::move(graph))); // Step 3. Create vars in each scope; From a7b0d5bd26c03cc79deb1c36e061b91fafdd9897 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 24 Mar 2018 14:23:03 +0800 Subject: [PATCH 176/314] Clean code --- paddle/fluid/framework/parallel_executor.cc | 19 ++++++++----------- paddle/fluid/framework/parallel_executor.h | 4 ++-- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 78963fd568..dc17f6a21f 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -27,15 +27,16 @@ namespace framework { class ParallelExecutorPrivate { public: explicit ParallelExecutorPrivate(const std::vector &places) - : places_(places), fetch_dev_ctxs_(places) {} + : places_(places) {} std::vector places_; - platform::DeviceContextPool fetch_dev_ctxs_; std::vector local_scopes_; Scope *global_scope_; + std::unique_ptr executor_; +#ifdef PADDLE_WITH_CUDA std::unique_ptr nccl_ctxs_; - std::unique_ptr executor_; +#endif }; ParallelExecutor::ParallelExecutor( @@ -54,8 +55,10 @@ ParallelExecutor::ParallelExecutor( member_->local_scopes_.push_back(&scope->NewScope()); } - // Bcast Parameters to all GPUs - BuildNCCLCommunicator(); +// Bcast Parameters to all GPUs +#ifdef PADDLE_WITH_CUDA + member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); +#endif if (platform::is_gpu_place(places[0]) && member_->local_scopes_.size() != 1) { // Is CUDA BCastParamsToGPUs(startup_program); @@ -123,12 +126,6 @@ void ParallelExecutor::BCastParamsToGPUs( #endif } -void ParallelExecutor::BuildNCCLCommunicator() const { -#ifdef PADDLE_WITH_CUDA - member_->nccl_ctxs_.reset(new platform::NCCLContextMap(member_->places_)); -#endif -} - void ParallelExecutor::Run(const std::vector &fetch_tensors, const std::string &fetched_var_name) { auto fetch_data = member_->executor_->Run(fetch_tensors); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 39a1c51b9e..14489a18c3 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -31,6 +31,8 @@ namespace framework { class ParallelExecutorPrivate; class ParallelExecutor { + DISABLE_COPY_AND_ASSIGN(ParallelExecutor); + public: explicit ParallelExecutor(size_t num_threads, const std::vector& places, @@ -46,8 +48,6 @@ class ParallelExecutor { ParallelExecutorPrivate* member_; void BCastParamsToGPUs(const ProgramDesc& startup_program) const; - - void BuildNCCLCommunicator() const; }; } // namespace framework From edfd741e3aac8ebaf6a6bad2204c66c67512818b Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Sat, 24 Mar 2018 15:00:43 +0800 Subject: [PATCH 177/314] Add simple python wrapper for ParallelExecutor --- paddle/fluid/framework/parallel_executor.cc | 6 +- paddle/fluid/framework/parallel_executor.h | 2 +- paddle/fluid/pybind/pybind.cc | 8 +- python/paddle/fluid/__init__.py | 2 + python/paddle/fluid/parallel_executor.py | 62 +++++++++++ .../tests/unittests/test_parallel_executor.py | 105 +++++++++++------- 6 files changed, 137 insertions(+), 48 deletions(-) create mode 100644 python/paddle/fluid/parallel_executor.py diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index dc17f6a21f..d1e1f0ed23 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -40,7 +40,8 @@ class ParallelExecutorPrivate { }; ParallelExecutor::ParallelExecutor( - size_t num_threads, const std::vector &places, + size_t num_threads, bool use_event, + const std::vector &places, const std::unordered_set ¶ms, const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope) @@ -73,7 +74,8 @@ ParallelExecutor::ParallelExecutor( auto graph = builder.Build(main_program); member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - num_threads, true, member_->local_scopes_, places, std::move(graph))); + num_threads, use_event, member_->local_scopes_, places, + std::move(graph))); // Step 3. Create vars in each scope; for (auto *scope : member_->local_scopes_) { diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 14489a18c3..8bc09c5798 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -34,7 +34,7 @@ class ParallelExecutor { DISABLE_COPY_AND_ASSIGN(ParallelExecutor); public: - explicit ParallelExecutor(size_t num_threads, + explicit ParallelExecutor(size_t num_threads, bool use_event, const std::vector& places, const std::unordered_set& params, const ProgramDesc& startup_program, diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 60662244cc..e1b1bbec97 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -499,15 +499,15 @@ All parameter, weight, gradient are variables in Paddle. py::class_(m, "ParallelExecutor") .def("__init__", - [](ParallelExecutor &self, size_t num_threads, + [](ParallelExecutor &self, size_t num_threads, bool use_event, const std::vector &places, const std::unordered_set ¶ms, const ProgramDesc &startup_program, const ProgramDesc &main_program, const std::string &loss_var_name, Scope *scope) { - new (&self) - ParallelExecutor(num_threads, places, params, startup_program, - main_program, loss_var_name, scope); + new (&self) ParallelExecutor(num_threads, use_event, places, + params, startup_program, main_program, + loss_var_name, scope); }) .def("run", &ParallelExecutor::Run); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index fcea282204..5ea4d977f4 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -41,6 +41,7 @@ from memory_optimization_transpiler import memory_optimize, release_memory import profiler import unique_name import recordio_writer +from parallel_executor import ParallelExecutor Tensor = LoDTensor @@ -68,6 +69,7 @@ __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + [ 'profiler', 'unique_name', 'recordio_writer', + 'ParallelExecutor', ] diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py new file mode 100644 index 0000000000..5e0588fa73 --- /dev/null +++ b/python/paddle/fluid/parallel_executor.py @@ -0,0 +1,62 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import core +import multiprocessing +import framework +import executor + +__all__ = ['ParallelExecutor'] + + +class ParallelExecutor(object): + def __init__(self, loss_name, use_cuda, num_threads=None): + places = [] + if use_cuda: + for i in xrange(core.get_cuda_device_count()): + p = core.Place() + p.set_place(core.CUDAPlace(i)) + places.append(p) + else: + for i in xrange(multiprocessing.cpu_count()): + p = core.Place() + p.set_place(core.CPUPlace()) + places.append(p) + + if num_threads is None: + num_threads = min(len(places) * 2, multiprocessing.cpu_count()) + + startup = framework.default_startup_program() + main = framework.default_main_program() + scope = executor.global_scope() + + self.executor = core.ParallelExecutor( + num_threads, + True if use_cuda else False, # use_event + places, + set([ + p.name for p in main.global_block().iter_parameters() + if not p.stop_gradient + ]), + startup.desc, + main.desc, + loss_name, + scope) + self.scope = scope + + def run(self, fetch_list): + fetch_var_name = '@FETCHED_VAR_NAME@' + self.executor.run(fetch_list, fetch_var_name) + arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() + return [arr[i] for i in range(len(arr))] diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index cabb8e769d..2ebdbaaca6 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -19,8 +19,54 @@ import paddle.v2.dataset.mnist as mnist import numpy +def simple_fc_net(): + reader = fluid.layers.open_recordio_file( + filename='./mnist.recordio', + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + img, label = fluid.layers.read_file(reader) + hidden = img + for _ in xrange(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + +def fc_with_batchnorm(): + reader = fluid.layers.open_recordio_file( + filename='./mnist.recordio', + shapes=[[-1, 784], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + img, label = fluid.layers.read_file(reader) + hidden = img + for _ in xrange(4): + hidden = fluid.layers.fc( + hidden, + size=200, + act='tanh', + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1.0))) + + hidden = fluid.layers.batch_norm(input=hidden) + + prediction = fluid.layers.fc(hidden, size=10, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + class ParallelExecutor(unittest.TestCase): - def setUp(self): + @classmethod + def setUpClass(cls): # Convert mnist to recordio file with fluid.program_guard(fluid.Program(), fluid.Program()): reader = paddle.batch(mnist.train(), batch_size=32) @@ -35,51 +81,28 @@ class ParallelExecutor(unittest.TestCase): fluid.recordio_writer.convert_reader_to_recordio_file( './mnist.recordio', reader, feeder) - def test_main(self): + def test_simple_fc(self): + self.check_network_convergence(simple_fc_net) + + def test_batchnorm_fc(self): + self.check_network_convergence(fc_with_batchnorm) + + def check_network_convergence(self, method): main = fluid.Program() startup = fluid.Program() - with fluid.program_guard(main, startup): - reader = fluid.layers.open_recordio_file( - filename='./mnist.recordio', - shapes=[[-1, 784], [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64']) - img, label = fluid.layers.read_file(reader) - hidden = img - for _ in xrange(4): - hidden = fluid.layers.fc( - hidden, - size=200, - act='tanh', - bias_attr=fluid.ParamAttr( - initializer=fluid.initializer.Constant(value=1.0))) - prediction = fluid.layers.fc(hidden, size=10, act='softmax') - loss = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.mean(loss) + loss = method() adam = fluid.optimizer.Adam() adam.minimize(loss) - act_places = [] - for each in [fluid.CUDAPlace(0)]: - p = fluid.core.Place() - p.set_place(each) - act_places.append(p) - - exe = fluid.core.ParallelExecutor( - act_places, - set([p.name for p in main.global_block().iter_parameters()]), - startup.desc, main.desc, loss.name, fluid.global_scope()) - exe.run([loss.name], 'fetched_var') + exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) + first_loss, = exe.run([loss.name]) + first_loss = numpy.array(first_loss) - first_loss = numpy.array(fluid.global_scope().find_var('fetched_var') - .get_lod_tensor_array()[0]) - print first_loss + for i in xrange(10): + exe.run([]) - for i in xrange(10): - exe.run([], 'fetched_var') - exe.run([loss.name], 'fetched_var') - last_loss = numpy.array(fluid.global_scope().find_var('fetched_var') - .get_lod_tensor_array()[0]) + last_loss, = exe.run([loss.name]) + last_loss = numpy.array(last_loss) - print first_loss, last_loss - self.assertGreater(first_loss[0], last_loss[0]) + print first_loss, last_loss + self.assertGreater(first_loss[0], last_loss[0]) From 8090eb627273d88aad55966755c138dcde2feb93 Mon Sep 17 00:00:00 2001 From: Darcy Date: Sat, 24 Mar 2018 02:51:45 -0700 Subject: [PATCH 178/314] added proto_desc to device_tracer's dep list (#9342) --- paddle/fluid/platform/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 7eec6ab657..686c088914 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_ nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) -cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS}) +cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) From 85404d4cb987f25dd897af2a035f5ec6b8e73c49 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sat, 24 Mar 2018 22:54:43 +0800 Subject: [PATCH 179/314] update cpp reader doc --- doc/fluid/design/concepts/cpp_data_feeding.md | 35 +++++++++++++++++-- 1 file changed, 33 insertions(+), 2 deletions(-) diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md index 8607b40ccb..6ed3f604dc 100644 --- a/doc/fluid/design/concepts/cpp_data_feeding.md +++ b/doc/fluid/design/concepts/cpp_data_feeding.md @@ -113,7 +113,7 @@ To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an e To create and invoke readers, some new ops are introduced: -### CreateReaderOp +### Operators That Creates Readers Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers. @@ -153,13 +153,17 @@ double_buffer_reader = create_double_buffer_op(batch_reader) The forwarding ops of the corresponding `main_program` would be like this: ``` -while_op { +not_completed = true +pass_count = 0 +while_op(not_completed) { has_next = has_next_op(double_buffer_reader) if_else_op(has_next) { batch_data = read_op(double_buffer_reader) ... (subsequent training ops) } else { reset_op(double_buffer_reader) + increase_op(pass_count) + not_completed = less_than_op(pass_count, reqiured_pass_num) } } ``` @@ -169,3 +173,30 @@ Two important considerations for these programs are as follows: 1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader. 2. All readers exist in both `startup_program` and `main_program`. And they are persistable. + +### Simplify Configuration by MultiPassReader + +The Program configuration mentioned above is somehow complicated. Users need to be very similar to concepts of Program and Block to prevent making mistakes in their code. To make the usage of C++ readers more friendly to beginning users, we introduce `MultiPassReader`. + +`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several pass training. It takes the number of passes to run as one of its attributes('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches the EOF, the multi-pass reader checks whether it has completed the training of given number of pass. If not, the underlying reader will be re-initialized and starts a new pass automatically. Before completing the whole training, the return of MultiPassReader's `HasNext()` will always be `true`. + +With `MultiPassReader`, the startup program would be like this: + +``` +multiple_reader = open_files_op(...) +batch_reader = create_batch_reader_op(multiple_reader) +double_buffer_reader = create_double_buffer_op(batch_reader) +multi_pass_reader = create_multi_pass_reader_op(double_buffer_reader) +... (other initializers) +``` + +The forwarding part of the corresponding `main_program` would be like this: + +``` +not_completed = true +while_op(not_completed) { + batch_data = read_op(multi_pass_reader) + ... (subsequent training ops) + not_completed = has_next_op(multi_pass_reader) +} +``` From cffe1a91124b2b8aa45463ddbe8445c23023ece3 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Sat, 24 Mar 2018 22:55:28 +0800 Subject: [PATCH 180/314] Profiler can get elapsed time of `sendop` (#9345) --- paddle/fluid/operators/send_op.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index a77c38f633..fdf3c06ef0 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { @@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); + // For profiling + platform::RecordEvent record_event(Type(), &ctx); + auto client_var_name = Output("RPCClient"); PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), "Can not find variable '%s' in the scope.", From dd532e2086bc2e05e02b65d4459d2f12de46793a Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sat, 24 Mar 2018 22:59:53 +0800 Subject: [PATCH 181/314] refine MultiPassReader's doc string --- .../fluid/operators/reader/create_multi_pass_reader_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc index 4d4e9fb909..47d9989bc8 100644 --- a/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc +++ b/paddle/fluid/operators/reader/create_multi_pass_reader_op.cc @@ -81,10 +81,10 @@ class CreateMultiPassReaderOpMaker : public DecoratedReaderMakerBase { This operator creates a multi-pass reader. A multi-pass reader is used to yield data for several pass training continuously. - It takes the the number of pass to run as one of its attributes + It takes the number of passes to run as one of its attributes ('pass_num'), and maintains a pass counter to record how many - passes it has completed. When the underlying reader reach the EOF, - the multi-pass reader checks whether it has completed training + passes it has completed. When the underlying reader reaches the + EOF, the multi-pass reader checks whether it has completed training of the given number of pass. If not, the underlying reader will be re-initialized and starts a new pass automatically. )DOC"); From 081b7824349f5a38e0437aae218392014f9f20c0 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Sun, 25 Mar 2018 11:18:49 +0800 Subject: [PATCH 182/314] update by comment --- paddle/fluid/operators/send_vars_op.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc index af791bc8e2..523e9e2780 100644 --- a/paddle/fluid/operators/send_vars_op.cc +++ b/paddle/fluid/operators/send_vars_op.cc @@ -53,7 +53,7 @@ class SendVarsOp : public framework::OperatorBase { auto ins = Inputs("X"); std::vector epmap = Attr>("epmap"); - int flag_wait = Attr("wait"); + int sync_send = Attr("sync_sent"); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); @@ -68,12 +68,14 @@ class SendVarsOp : public framework::OperatorBase { for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; + // TODO(Yancey1989): we need to use an IO threadpool which has + // a larger number of threads than the computing threadpool. rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; } } - if (flag_wait) { + if (sync_send) { rpc_client->Wait(); } } @@ -86,16 +88,16 @@ class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor, SelectedRows) Input variables to be sent") .AsDuplicable(); AddOutput("RPCClient", - "(RPCClient) The RPC client object which is" + "(RPCClient) The RPC client object which will be" "initialized at most once."); AddComment(R"DOC( Send operator This operator will send variables to listen_and_serve op at the parameter server. )DOC"); - AddAttr("wait", + AddAttr("ync_send", "(int, default 0)" - "whether watting for all send request have been sent.") + "sync send or async send.") .SetDefault(0); AddAttr>("epmap", "(string vector, default 127.0.0.1:6164)" From 904fa05f4692eebdcebd8b3966a09c162ccd1da4 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 25 Mar 2018 02:29:02 -0700 Subject: [PATCH 183/314] Improve layer_norm speed transfomer on a single device step time reduces from 0.157 to 0.125 --- paddle/fluid/operators/layer_norm_op.h | 137 +++++++++++++++++++++---- 1 file changed, 116 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index 605b5c258c..63561aaa31 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -22,6 +22,99 @@ limitations under the License. */ namespace paddle { namespace operators { +// Wrap RowwiseMean and ColwiseMean. +// Reuse the cpu codes and replace the gpu codes with cublas_gemv, which is +// significantly faster. Unlike the RowwiseMean and ColwiseMean, the +// implementation only considers 2D. +template +struct RowwiseMean2D { + RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx); + + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, framework::Tensor* vec); +}; + +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) + : left_(left), right_(right) { + framework::DDim ones_dim({right_}); + divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); + math::set_constant(dev_ctx, &divisor_, 1.0 / right); + } + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + math::gemv( + context, false, left_, right_, 1., input.data(), divisor_.data(), + 0., out->data()); + } + + private: + int left_; + int right_; + framework::Tensor divisor_; +}; + +template +class RowwiseMean2D { + public: + RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {} + + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + row_mean_(context, input, out); + } + + private: + math::RowwiseMean row_mean_; +}; + +template +struct ColwiseSum2D { + ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx); + + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, framework::Tensor* vec); +}; + +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) + : left_(left), right_(right) { + framework::DDim ones_dim({left_}); + divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); + math::set_constant(dev_ctx, &divisor_, 1.0); + } + + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + math::gemv( + context, true, left_, right_, 1., input.data(), divisor_.data(), + 0., out->data()); + } + + private: + int left_; + int right_; + framework::Tensor divisor_; +}; + +template +class ColwiseSum2D { + public: + ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {} + + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, framework::Tensor* out) { + col_wise_(context, input, out); + } + + private: + math::ColwiseSum col_wise_; +}; + template struct SubAndSquareFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } @@ -67,15 +160,15 @@ using DataLayout = framework::DataLayout; template class LayerNormKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext &ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { const float epsilon = ctx.Attr("epsilon"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); auto x = *ctx.Input("X"); - auto *y = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); const auto x_dims = x.dims(); @@ -94,8 +187,8 @@ class LayerNormKernel : public framework::OpKernel { out.ShareDataWith(*y); out.Resize(matrix_shape); - auto &dev_ctx = ctx.template device_context(); - math::RowwiseMean row_mean; + auto& dev_ctx = ctx.template device_context(); + RowwiseMean2D row_mean(left, right, ctx.device_context()); // get mean row_mean(dev_ctx, x, mean); @@ -126,31 +219,32 @@ class LayerNormKernel : public framework::OpKernel { template class LayerNormGradKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext &ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { const float epsilon = ctx.Attr("epsilon"); auto x = *ctx.Input("X"); - auto *y = ctx.Input("Y"); - auto *mean = ctx.Input("Mean"); - auto *var = ctx.Input("Variance"); - auto *scale = ctx.Input("Scale"); - auto *bias = ctx.Input("Bias"); + auto* y = ctx.Input("Y"); + auto* mean = ctx.Input("Mean"); + auto* var = ctx.Input("Variance"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); auto d_y = *ctx.Input(framework::GradVarName("Y")); const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); - const auto &x_dims = x.dims(); + const auto& x_dims = x.dims(); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); int left = static_cast(matrix_dim[0]); int right = static_cast(matrix_dim[1]); framework::DDim matrix_shape({left, right}); d_y.Resize(matrix_shape); - auto &dev_ctx = ctx.template device_context(); - math::ColwiseSum colwise_sum; + auto& dev_ctx = ctx.template device_context(); + ColwiseSum2D colwise_sum(left, right, + ctx.device_context()); Tensor temp; Tensor temp_norm; @@ -190,7 +284,8 @@ class LayerNormGradKernel : public framework::OpKernel { Tensor temp_vec; temp_vec.mutable_data(vec_shape, ctx.GetPlace()); - math::RowwiseMean row_mean; + RowwiseMean2D row_mean(left, right, + ctx.device_context()); if (d_scale) { // dy_dx From 1a4be55a476e2d02dc35fc945220f9aa9c205808 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sun, 25 Mar 2018 02:46:59 -0700 Subject: [PATCH 184/314] Pass cpu build --- paddle/fluid/operators/layer_norm_op.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index 63561aaa31..7b84ba0a7d 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -34,6 +34,7 @@ struct RowwiseMean2D { const framework::Tensor& input, framework::Tensor* vec); }; +#ifdef PADDLE_WITH_CUDA template class RowwiseMean2D { public: @@ -55,6 +56,7 @@ class RowwiseMean2D { int right_; framework::Tensor divisor_; }; +#endif template class RowwiseMean2D { @@ -78,6 +80,7 @@ struct ColwiseSum2D { const framework::Tensor& input, framework::Tensor* vec); }; +#ifdef PADDLE_WITH_CUDA template class ColwiseSum2D { public: @@ -100,6 +103,7 @@ class ColwiseSum2D { int right_; framework::Tensor divisor_; }; +#endif template class ColwiseSum2D { From efd7ee8521986e7789ea88ec0e9a2c7ff5c83ca9 Mon Sep 17 00:00:00 2001 From: m3ngyang Date: Sun, 25 Mar 2018 19:35:20 +0800 Subject: [PATCH 185/314] translate Cluster Training and Prediction --- doc/v2/faq/cluster/index_en.rst | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/doc/v2/faq/cluster/index_en.rst b/doc/v2/faq/cluster/index_en.rst index 855b7e8e53..7cbcaeefcb 100644 --- a/doc/v2/faq/cluster/index_en.rst +++ b/doc/v2/faq/cluster/index_en.rst @@ -2,4 +2,15 @@ Cluster Training and Prediction ############################### -TBD +.. contents:: + +1. Network connection errors in the log during muliti-node cluster training +------------------------------------------------ +The errors in the log belong to network connection during mulilti-node cluster training, for example, :code:`Connection reset by peer`. +This kind of error is usually caused by the abnormal exit of the training process in some node, and the others cannot connect with this node any longer. Steps to troubleshoot the problem as follows: + +* Find the first error in the :code:`train.log`, :code:`server.log`, check whether other fault casued the problem, such as FPE, lacking of memory or disk. + +* If network connection gave rise to the first error in the log, this may be caused by the port conflict of the non-exclusive execution. Connect with the operator to check if the current MPI cluster supports jobs submitted with parameter :code:`resource=full`. If so, change the port of job. + +* If the currnet MPI cluster does not support exclusive pattern, ask the operator to replace or update the current cluster. From f96f2860f9ca88a9967c73179c7d3f198ea778a7 Mon Sep 17 00:00:00 2001 From: wanglun Date: Mon, 26 Mar 2018 09:42:07 +0800 Subject: [PATCH 186/314] Fix typo of Softmax document --- python/paddle/trainer_config_helpers/activations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py index 00efc01c05..3683968262 100644 --- a/python/paddle/trainer_config_helpers/activations.py +++ b/python/paddle/trainer_config_helpers/activations.py @@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation): .. math:: - P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_j} } + P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} } """ def __init__(self): From 30f1bd6a6497f05e6e966bdca9af3569e08c0f68 Mon Sep 17 00:00:00 2001 From: Burness Duan Date: Mon, 26 Mar 2018 10:05:15 +0800 Subject: [PATCH 187/314] add the recordio in creator.py and change the " to \' (#9358) --- python/paddle/v2/reader/creator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 421f6c933d..fda5246d74 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could be used in user program. """ -__all__ = ['np_array', 'text_file', "cloud_reader"] +__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader'] def np_array(x): From 8ccc61f33490ae2136d234b16c8e64578f9efeee Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 26 Mar 2018 10:05:38 +0800 Subject: [PATCH 188/314] support empty tensor (#9338) * support empty tensor --- paddle/fluid/framework/tensor_impl.h | 8 ++++---- paddle/fluid/memory/memory_test.cc | 4 ++-- .../fluid/tests/unittests/test_tensor.py | 20 ++++++++++++++++++- 3 files changed, 25 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 638bd0db9d..7a48390440 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -117,10 +117,10 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { if (holder_ != nullptr) { holder_->set_type(type); } - PADDLE_ENFORCE_GT( - numel(), 0, - "When calling this method, the Tensor's numel must be larger than zero. " - "Please check Tensor::Resize has been called first."); + PADDLE_ENFORCE_GE(numel(), 0, + "When calling this method, the Tensor's numel must be " + "equal or larger than zero. " + "Please check Tensor::Resize has been called first."); int64_t size = numel() * SizeOfType(type); /* some versions of boost::variant don't have operator!= */ if (holder_ == nullptr || !(holder_->place() == place) || diff --git a/paddle/fluid/memory/memory_test.cc b/paddle/fluid/memory/memory_test.cc index ae98d0d525..eb27a52b25 100644 --- a/paddle/fluid/memory/memory_test.cc +++ b/paddle/fluid/memory/memory_test.cc @@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { EXPECT_EQ(total_size, 0UL); for (auto size : - {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { ps[paddle::memory::Alloc(cpu, size)] = size; // Buddy Allocator doesn't manage too large memory chunk @@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) { EXPECT_EQ(total_size, 0UL); for (auto size : - {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { ps[paddle::memory::Alloc(gpu, size)] = size; // Buddy Allocator doesn't manage too large memory chunk diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index a369783245..379081c328 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase): def test_lod_tensor_gpu_init(self): if not core.is_compiled_with_cuda(): return - scope = core.Scope() place = core.CUDAPlace(0) lod_py = [[0, 2, 5], [0, 2, 4, 5]] lod_tensor = core.LoDTensor() @@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase): self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) self.assertListEqual(lod_py, lod_tensor.lod()) + def test_empty_tensor(self): + place = core.CPUPlace() + scope = core.Scope() + var = scope.var("test_tensor") + + tensor = var.get_tensor() + + tensor.set_dims([0, 1]) + tensor.alloc_float(place) + + tensor_array = numpy.array(tensor) + self.assertEqual((0, 1), tensor_array.shape) + + if core.is_compiled_with_cuda(): + gpu_place = core.CUDAPlace(0) + tensor.alloc_float(gpu_place) + tensor_array = numpy.array(tensor) + self.assertEqual((0, 1), tensor_array.shape) + if __name__ == '__main__': unittest.main() From ebbb428db99ab68dca496ec908442d26a47d2dfd Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 26 Mar 2018 10:46:01 +0800 Subject: [PATCH 189/314] fix ci --- paddle/fluid/operators/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9a8f52b232..035ecd0948 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -188,7 +188,7 @@ if(WITH_DISTRIBUTE) set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) else() - set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) + set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op send_vars_op send_barrier_op) endif() op_library(cond_op DEPS framework_proto tensor net_op) From ce84af638bc6204c30272f3163e7f7b3026bcfec Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 26 Mar 2018 10:48:54 +0800 Subject: [PATCH 190/314] update --- doc/fluid/design/concepts/cpp_data_feeding.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md index 6ed3f604dc..9c44dec4b9 100644 --- a/doc/fluid/design/concepts/cpp_data_feeding.md +++ b/doc/fluid/design/concepts/cpp_data_feeding.md @@ -185,8 +185,8 @@ With `MultiPassReader`, the startup program would be like this: ``` multiple_reader = open_files_op(...) batch_reader = create_batch_reader_op(multiple_reader) -double_buffer_reader = create_double_buffer_op(batch_reader) -multi_pass_reader = create_multi_pass_reader_op(double_buffer_reader) +multi_pass_reader = create_multi_pass_reader_op(batch_reader) +double_buffer_reader = create_double_buffer_op(multi_pass_reader) ... (other initializers) ``` @@ -195,8 +195,8 @@ The forwarding part of the corresponding `main_program` would be like this: ``` not_completed = true while_op(not_completed) { - batch_data = read_op(multi_pass_reader) + batch_data = read_op(double_buffer_reader) ... (subsequent training ops) - not_completed = has_next_op(multi_pass_reader) + not_completed = has_next_op(double_buffer_reader) } ``` From 4f522fa8d543715d9fcc633e79714302f496439c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 26 Mar 2018 11:38:06 +0800 Subject: [PATCH 191/314] fix compile send_op on mac (#9360) --- paddle/fluid/operators/detail/grpc_client.cc | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index eb19685aa6..e73bbe7537 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -49,9 +49,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, s->Prepare(var_h, time_out); s->response_call_back_ = NULL; - auto call = std::move(s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, - &cq_)); + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); call->StartCall(); call->Finish(&s->reply_, &s->status_, (void*)s); }); @@ -107,8 +106,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ::grpc::ByteBuffer buf; RequestToByteBuffer(req, &buf); - auto call = std::move(s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_)); + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); call->StartCall(); call->Finish(&s->reply_, &s->status_, (void*)s); }); From 5c7a523326b98b9c4fee1eca0c0c74e3112bc19a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Mar 2018 11:50:52 +0800 Subject: [PATCH 192/314] Add Graphviz output --- .../details/computation_op_handle.cc | 2 + .../framework/details/computation_op_handle.h | 2 + .../framework/details/fetch_op_handle.cc | 2 + .../fluid/framework/details/fetch_op_handle.h | 2 + .../details/multi_devices_graph_builder.cc | 6 ++ .../details/nccl_all_reduce_op_handle.cc | 2 + .../details/nccl_all_reduce_op_handle.h | 2 + .../fluid/framework/details/op_handle_base.h | 2 + .../details/scale_loss_grad_op_handle.cc | 2 + .../details/scale_loss_grad_op_handle.h | 2 + .../framework/details/ssa_graph_builder.cc | 58 +++++++++++++++++++ .../framework/details/ssa_graph_builder.h | 2 + .../details/threaded_ssa_graph_executor.cc | 6 ++ .../tests/unittests/test_parallel_executor.py | 2 +- 14 files changed, 91 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 5867f8fc55..348b944cf9 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -35,6 +35,8 @@ void ComputationOpHandle::RunImpl() { op_->Run(*scope_, place_); } + +std::string ComputationOpHandle::Name() const { return op_->Type(); } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 1fbfd4eabe..d6d2d731ca 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -31,6 +31,8 @@ struct ComputationOpHandle : public OpHandleBase { ComputationOpHandle(const OpDesc &op_desc, Scope *scope, platform::Place place); + std::string Name() const override; + protected: void RunImpl() override; }; diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index ab552081a4..c697a1c937 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -72,6 +72,8 @@ void FetchOpHandle::RunImpl() { } } +std::string FetchOpHandle::Name() const { return "Fetch"; } + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/fetch_op_handle.h b/paddle/fluid/framework/details/fetch_op_handle.h index 3123f7ba23..904b2d669f 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.h +++ b/paddle/fluid/framework/details/fetch_op_handle.h @@ -38,6 +38,8 @@ struct FetchOpHandle : public OpHandleBase { void WaitAndMergeCPUTensors() const; + std::string Name() const override; + protected: void RunImpl() override; }; diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index b27647a8ee..cb02d36714 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -136,6 +136,12 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( */ PolishGraphToSupportDataHazards(&result); + if (VLOG_IS_ON(10)) { + std::ostringstream sout; + PrintGraphviz(*graph, sout); + VLOG(10) << sout.str(); + } + return std::unique_ptr(graph); } } // namespace details diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index a79c61f359..f2303ff4ca 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -69,6 +69,8 @@ void NCCLAllReduceOpHandle::RunImpl() { } } } + +std::string NCCLAllReduceOpHandle::Name() const { return "NCCL AllReduce"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h index 7152d1a587..045070bb6a 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.h @@ -32,6 +32,8 @@ struct NCCLAllReduceOpHandle : public OpHandleBase { const std::vector &places, const platform::NCCLContextMap &ctxs); + std::string Name() const override; + protected: void RunImpl() override; }; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 5178b51d8d..99d8968486 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -33,6 +33,8 @@ struct OpHandleBase { std::string DebugString() const; + virtual std::string Name() const = 0; + virtual ~OpHandleBase(); void Run(bool use_event); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 2e69f1e5e8..a6a67c9b14 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -45,6 +45,8 @@ void ScaleLossGradOpHandle::RunImpl() { #endif } } + +std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 3a35574919..ab7353a4fc 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -32,6 +32,8 @@ struct ScaleLossGradOpHandle : public OpHandleBase { ~ScaleLossGradOpHandle() final; + std::string Name() const override; + protected: void RunImpl() override; }; diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index 7a80a4b1e7..e0209fce76 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -83,6 +83,64 @@ void SSAGraphBuilder::CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, var.place_ = place; op_handle->AddOutput(&var); } + +template +void IterAllVar(const SSAGraph &graph, Callback callback) { + for (auto &each : graph.vars_) { + for (auto &pair1 : each) { + for (auto &pair2 : pair1.second) { + callback(pair2.second); + } + } + } + + for (auto &var : graph.dep_vars_) { + callback(*var); + } +} + +void SSAGraphBuilder::PrintGraphviz(const SSAGraph &graph, std::ostream &sout) { + size_t var_id = 0; + std::unordered_map vars; + + sout << "digraph G {\n"; + + IterAllVar(graph, [&](const VarHandleBase &var) { + auto *var_ptr = &var; + auto *var_handle_ptr = dynamic_cast(var_ptr); + auto *dummy_ptr = dynamic_cast(var_ptr); + + size_t cur_var_id = var_id++; + vars[var_ptr] = cur_var_id; + + if (var_handle_ptr) { + sout << "var_" << cur_var_id << " [label=\"" << var_handle_ptr->name_ + << "\\n" + << var_handle_ptr->place_ << "\\n" + << var_handle_ptr->version_ << "\"]" << std::endl; + } else if (dummy_ptr) { + sout << "var_" << cur_var_id << " [label=\"dummy\"]" << std::endl; + } + }); + + size_t op_id = 0; + for (auto &op : graph.ops_) { + std::string op_name = "op_" + std::to_string(op_id++); + sout << op_name << " [label=\"" << op->Name() << "\", shape=rect]" + << std::endl; + for (auto in : op->inputs_) { + std::string var_name = "var_" + std::to_string(vars[in]); + sout << var_name << " -> " << op_name << std::endl; + } + + for (auto out : op->outputs_) { + std::string var_name = "var_" + std::to_string(vars[out]); + sout << op_name << " -> " << var_name << std::endl; + } + } + + sout << "}\n"; +} } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/ssa_graph_builder.h b/paddle/fluid/framework/details/ssa_graph_builder.h index df05bb7394..bf20e7164a 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.h +++ b/paddle/fluid/framework/details/ssa_graph_builder.h @@ -51,6 +51,8 @@ class SSAGraphBuilder { static void CreateOpOutput(SSAGraph *graph, OpHandleBase *op_handle, const std::string &each_var_name, const platform::Place &place, size_t place_offset); + + static void PrintGraphviz(const SSAGraph &graph, std::ostream &sout); }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 86e880ed72..f609395d40 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -133,6 +133,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( if (exception_) { throw * exception_; } + + VLOG(10) << "============================="; + for (auto &op : pending_ops) { + VLOG(10) << op.first->DebugString(); + } + // keep waiting the ready variables continue; } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 2ebdbaaca6..dd6e70eadb 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -48,7 +48,7 @@ def fc_with_batchnorm(): dtypes=['float32', 'int64']) img, label = fluid.layers.read_file(reader) hidden = img - for _ in xrange(4): + for _ in xrange(1): hidden = fluid.layers.fc( hidden, size=200, From d573195dde9dfe64724b536654760e2f954f42b3 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 26 Mar 2018 12:46:50 +0800 Subject: [PATCH 193/314] rm libmklml_gnu.so --- cmake/inference_lib.cmake | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index fb81498fd6..0323cd9698 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -69,11 +69,11 @@ if(NOT CBLAS_FOUND) SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include DSTS ${dst_dir} ${dst_dir} ) -else() +elseif (WITH_MKLML) set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml") copy(mklml_lib - SRCS ${MKLML_LIB_DIR} ${MKLML_INC_DIR} - DSTS ${dst_dir} ${dst_dir} + SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR} + DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir} ) endif() From 54bd17fe7b537a20b88e09a39d0e16416d446b41 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Mar 2018 13:01:51 +0800 Subject: [PATCH 194/314] Complete Flowers --- .../fluid/framework/details/op_handle_base.cc | 8 +- .../framework/details/ssa_graph_builder.cc | 2 +- .../paddle/fluid/tests/unittests/.gitignore | 1 + .../tests/unittests/test_parallel_executor.py | 137 +++++++++++++++++- 4 files changed, 144 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index ca354a63c6..ea97aa5fb2 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -31,7 +31,13 @@ std::string OpHandleBase::DebugString() const { return ss.str(); } -OpHandleBase::~OpHandleBase() {} +OpHandleBase::~OpHandleBase() { +#ifdef PADDLE_WITH_CUDA + for (auto &ev : events_) { + cudaEventDestroy(ev.second); + } +#endif +} void OpHandleBase::Run(bool use_event) { #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index e0209fce76..a853da6fba 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -21,7 +21,7 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { for (auto &var_map : graph->vars_) { for (auto &name_pair : var_map) { if (name_pair.second.size() <= 1) { - return; + continue; } auto it_new = name_pair.second.rbegin(); auto it_old = name_pair.second.rbegin(); diff --git a/python/paddle/fluid/tests/unittests/.gitignore b/python/paddle/fluid/tests/unittests/.gitignore index ad02bdecf4..51b1da4c84 100644 --- a/python/paddle/fluid/tests/unittests/.gitignore +++ b/python/paddle/fluid/tests/unittests/.gitignore @@ -2,3 +2,4 @@ mnist.recordio mnist_0.recordio mnist_1.recordio mnist_2.recordio +flowers.recordio diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index dd6e70eadb..d5d2275e4d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -16,6 +16,7 @@ import unittest import paddle.fluid as fluid import paddle.v2 as paddle import paddle.v2.dataset.mnist as mnist +import paddle.v2.dataset.flowers as flowers import numpy @@ -64,6 +65,119 @@ def fc_with_batchnorm(): return loss +def squeeze_excitation(input, num_channels, reduction_ratio): + # pool = fluid.layers.pool2d( + # input=input, pool_size=0, pool_type='avg', global_pooling=True) + conv = input + shape = conv.shape + reshape = fluid.layers.reshape( + x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) + pool = fluid.layers.reduce_mean(input=reshape, dim=2) + + squeeze = fluid.layers.fc(input=pool, + size=num_channels / reduction_ratio, + act='relu') + excitation = fluid.layers.fc(input=squeeze, + size=num_channels, + act='sigmoid') + scale = fluid.layers.elementwise_mul(x=input, y=excitation, axis=0) + return scale + + +def conv_bn_layer(input, num_filters, filter_size, stride=1, groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) / 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act, momentum=0.1) + + +def shortcut(input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out: + if stride == 1: + filter_size = 1 + else: + filter_size = 3 + return conv_bn_layer(input, ch_out, filter_size, stride) + else: + return input + + +def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): + # The number of first 1x1 convolutional channels for each bottleneck build block + # was halved to reduce the compution cost. + conv0 = conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = conv_bn_layer( + input=conv0, + num_filters=num_filters * 2, + filter_size=3, + stride=stride, + groups=cardinality, + act='relu') + conv2 = conv_bn_layer( + input=conv1, num_filters=num_filters * 2, filter_size=1, act=None) + scale = squeeze_excitation( + input=conv2, + num_channels=num_filters * 2, + reduction_ratio=reduction_ratio) + + short = shortcut(input, num_filters * 2, stride) + + return fluid.layers.elementwise_add(x=short, y=scale, act='relu') + + +def SE_ResNeXt152(): + reader = fluid.layers.open_recordio_file( + filename='./flowers.recordio', + shapes=[[-1, 3, 224, 224], [-1, 1]], + lod_levels=[0, 0], + dtypes=['float32', 'int64']) + + img, label = fluid.layers.read_file(reader) + + conv = conv_bn_layer( + input=img, num_filters=64, filter_size=3, stride=2, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=64, filter_size=3, stride=1, act='relu') + conv = conv_bn_layer( + input=conv, num_filters=128, filter_size=3, stride=1, act='relu') + conv = fluid.layers.pool2d( + input=conv, pool_size=3, pool_stride=2, pool_padding=1, pool_type='max') + + cardinality = 64 + reduction_ratio = 16 + depth = [3, 8, 36, 3] + num_filters = [128, 256, 512, 1024] + + for block in range(len(depth)): + for i in range(depth[block]): + conv = bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + cardinality=cardinality, + reduction_ratio=reduction_ratio) + + shape = conv.shape + reshape = fluid.layers.reshape( + x=conv, shape=[-1, shape[1], shape[2] * shape[3]]) + pool = fluid.layers.reduce_mean(input=reshape, dim=2) + dropout = fluid.layers.dropout(x=pool, dropout_prob=0.2) + # Classifier layer: + prediction = fluid.layers.fc(input=dropout, size=1000, act='softmax') + loss = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.mean(loss) + return loss + + class ParallelExecutor(unittest.TestCase): @classmethod def setUpClass(cls): @@ -81,24 +195,40 @@ class ParallelExecutor(unittest.TestCase): fluid.recordio_writer.convert_reader_to_recordio_file( './mnist.recordio', reader, feeder) + with fluid.program_guard(fluid.Program(), fluid.Program()): + reader = paddle.batch(flowers.train(), batch_size=4) + feeder = fluid.DataFeeder( + feed_list=[ + fluid.layers.data( + name='image', shape=[3, 224, 224]), + fluid.layers.data( + name='label', shape=[1], dtype='int64'), + ], + place=fluid.CPUPlace()) + fluid.recordio_writer.convert_reader_to_recordio_file( + "./flowers.recordio", reader, feeder) + def test_simple_fc(self): self.check_network_convergence(simple_fc_net) def test_batchnorm_fc(self): self.check_network_convergence(fc_with_batchnorm) - def check_network_convergence(self, method): + def check_network_convergence(self, method, memory_opt=True, iter=10): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): loss = method() adam = fluid.optimizer.Adam() adam.minimize(loss) + if memory_opt: + fluid.memory_optimize(main) + exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) first_loss, = exe.run([loss.name]) first_loss = numpy.array(first_loss) - for i in xrange(10): + for i in xrange(iter): exe.run([]) last_loss, = exe.run([loss.name]) @@ -106,3 +236,6 @@ class ParallelExecutor(unittest.TestCase): print first_loss, last_loss self.assertGreater(first_loss[0], last_loss[0]) + + def test_resnet(self): + self.check_network_convergence(SE_ResNeXt152, iter=20) From 54a85b7bfd1836585ed6f257ed67651e0d516557 Mon Sep 17 00:00:00 2001 From: dragonwarrior Date: Mon, 26 Mar 2018 13:24:10 +0800 Subject: [PATCH 195/314] Add lrn layer (#9157) * add LRN layer for fluid * add LRN layer for fluid * add documentation for LRN layer * add paper reference for LRN layer * add seperate documentation for LRN layer * rm lrn.py in doc/fluid/dev/src * change code style in lrn * fix style of comments in lrn --- python/paddle/fluid/layers/nn.py | 71 +++++++++++++++++++ .../fluid/tests/unittests/test_layers.py | 7 ++ 2 files changed, 78 insertions(+) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 679de6ce2a..2db4e5d27d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -74,6 +74,7 @@ __all__ = [ 'one_hot', 'autoincreased_step_counter', 'lod_reset', + 'lrn', ] @@ -3410,3 +3411,73 @@ def lod_reset(x, y=None, target_lod=None): raise ValueError("y and target_lod should not be both None.") return out + + +def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None): + """ + Local Response Normalization Layer. This layer performs a type of + "lateral inhibition" by normalizing over local input regions. + + The formula is as follows: + + .. math:: + + Output(i, x, y) = Input(i, x, y) / \left( + k + \alpha \sum\limits^{\min(C, c + n/2)}_{j = \max(0, c - n/2)} + (Input(j, x, y))^2 \right)^{\beta} + + In the above equation: + + * :math:`n`: The number of channels to sum over. + * :math:`k`: The offset (avoid being divided by 0). + * :math:`alpha`: The scaling parameter. + * :math:`beta`: The exponent parameter. + + Refer to `ImageNet Classification with Deep Convolutional Neural Networks + `_ + + Args: + input (Variable): The input tensor of this layer, and the dimension of input tensor must be 4. + n (int, default 5): The number of channels to sum over. + k (float, default 1.0): An offset (usually positive to avoid dividing by 0). + alpha (float, default 1e-4): The scaling parameter. + beta (float, default 0.75): The exponent. + name (str, default None): A name for this operation. + + Raises: + ValueError: If rank of the input tensor is not 4. + + Returns: + A tensor variable storing the transformation result. + + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32") + lrn = fluid.layers.lrn(input=data) + """ + helper = LayerHelper('lrn', **locals()) + dtype = helper.input_dtype() + input_shape = input.shape + dims = len(input_shape) + + if dims != 4: + raise ValueError( + "dims of input must be 4(not %d), and it's order must be NCHW" % + (dims)) + + mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) + lrn_out = helper.create_tmp_variable(dtype) + helper.append_op( + type="lrn", + inputs={"X": input}, + outputs={ + "Out": lrn_out, + "MidOut": mid_out, + }, + attrs={"n": n, + "k": k, + "alpha": alpha, + "beta": beta}) + + return lrn_out diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index b5fd59cf3a..2179826d81 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -231,6 +231,13 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(layers.softmax(hid)) print(str(program)) + def test_lrn(self): + program = Program() + with program_guard(program): + data = layers.data(name='data', shape=[6, 2, 2], dtype='float32') + self.assertIsNotNone(layers.lrn(data)) + print(str(program)) + def test_get_places(self): program = Program() with program_guard(program): From 02aaecca35632eae93ca2b5d5ca07db61e4087a3 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Mar 2018 13:24:16 +0800 Subject: [PATCH 196/314] Fix CPU compile --- paddle/fluid/framework/details/CMakeLists.txt | 8 +++- .../details/multi_devices_graph_builder.cc | 37 ++++++++++++++++--- .../details/multi_devices_graph_builder.h | 12 +++++- paddle/fluid/framework/parallel_executor.cc | 14 +++++-- paddle/fluid/framework/parallel_executor.h | 2 - .../reader/create_recordio_file_reader_op.cc | 2 + 6 files changed, 62 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index f13ac276fc..bf1a705ef5 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -8,8 +8,14 @@ cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_pr cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base) cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph) + +if(WITH_GPU) + set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle) +else() + set(multi_devices_graph_builder_deps) +endif() cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle - nccl_all_reduce_op_handle scale_loss_grad_op_handle) + scale_loss_grad_op_handle ${multi_devices_graph_builder_deps}) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph) cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context) diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index cb02d36714..6798776076 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -14,14 +14,18 @@ #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/computation_op_handle.h" -#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" #include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/platform/nccl_helper.h" + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h" +#endif namespace paddle { namespace framework { namespace details { + +#ifdef PADDLE_WITH_CUDA MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( const std::vector &places, const std::string &loss_var_name, @@ -32,6 +36,16 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( places_(places), local_scopes_(local_scopes), nccl_ctxs_(nccl_ctxs) { +#else +MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder( + const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶ms, + const std::vector &local_scopes) + : loss_var_name_(loss_var_name), + places_(places), + local_scopes_(local_scopes) { +#endif for (auto &p : params) { grad_names_.insert(GradVarName(p)); } @@ -78,9 +92,16 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( if (is_forwarding) { if (var_names.size() == 1 && var_names[0] == loss_var_name_) { - // Insert ScaleCost OpHandle +// Insert ScaleCost OpHandle +#ifdef PADDLE_WITH_CUDA + auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p); +#else + auto *communication_dev_ctx = + platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); +#endif + op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p, - nccl_ctxs_->DevCtx(p)); + communication_dev_ctx); result.ops_.emplace_back(op_handle); // FIXME: Currently ScaleLossGradOp only use device_count as scale @@ -103,7 +124,8 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( auto var_names = op->OutputArgumentNames(); for (auto &og : var_names) { if (grad_names_.count(og) != 0) { // is param grad - // Insert NCCL AllReduce Op + // Insert NCCL AllReduce Op +#ifdef PADDLE_WITH_CUDA result.ops_.emplace_back( new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_)); auto *op_handle = result.ops_.back().get(); @@ -125,6 +147,9 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( op_handle->AddOutput(&var); } +#else + PADDLE_ENFORCE("Not implemented"); +#endif } } } @@ -143,7 +168,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( } return std::unique_ptr(graph); -} +} // namespace details } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h index 17959a94d6..d3c8e582cf 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.h +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h @@ -26,11 +26,18 @@ class Scope; namespace details { class MultiDevSSAGraphBuilder : public SSAGraphBuilder { public: +#ifdef PADDLE_WITH_CUDA MultiDevSSAGraphBuilder(const std::vector &places, const std::string &loss_var_name, const std::unordered_set ¶ms, const std::vector &local_scopes, platform::NCCLContextMap *nccl_ctxs); +#else + MultiDevSSAGraphBuilder(const std::vector &places, + const std::string &loss_var_name, + const std::unordered_set ¶ms, + const std::vector &local_scopes); +#endif std::unique_ptr Build(const ProgramDesc &program) const override; @@ -38,8 +45,11 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder { std::string loss_var_name_; const std::vector &places_; const std::vector &local_scopes_; - platform::NCCLContextMap *nccl_ctxs_; std::unordered_set grad_names_; + +#ifdef PADDLE_WITH_CUDA + platform::NCCLContextMap *nccl_ctxs_; +#endif }; } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d1e1f0ed23..4936b8b656 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -16,7 +16,9 @@ limitations under the License. */ #include "ThreadPool.h" +#ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" +#endif #include "paddle/fluid/framework/details/multi_devices_graph_builder.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" @@ -64,13 +66,18 @@ ParallelExecutor::ParallelExecutor( member_->local_scopes_.size() != 1) { // Is CUDA BCastParamsToGPUs(startup_program); } - // Startup Program has been run. All local scopes has correct parameters. +// Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. Also, insert - // ncclOp +// Step 2. Convert main_program to SSA form and dependency graph. Also, insert +// ncclOp +#ifdef PADDLE_WITH_CUDA details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, params, member_->local_scopes_, member_->nccl_ctxs_.get()); +#else + details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name, + params, member_->local_scopes_); +#endif auto graph = builder.Build(main_program); member_->executor_.reset(new details::ThreadedSSAGraphExecutor( @@ -137,3 +144,4 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } // namespace framework } // namespace paddle +A \ No newline at end of file diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 8bc09c5798..503efa2e44 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -21,8 +21,6 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" - -#include "paddle/fluid/operators/nccl/nccl_gpu_common.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { diff --git a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc index 0e00f218f9..adaa0b9e5f 100644 --- a/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc +++ b/paddle/fluid/operators/reader/create_recordio_file_reader_op.cc @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include "paddle/fluid/operators/reader/reader_op_registry.h" #include "paddle/fluid/recordio/scanner.h" From 3aa2a8ffcfd55eb6c18ff08744a5d4a2432077ad Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Mar 2018 13:29:53 +0800 Subject: [PATCH 197/314] Follow comments --- paddle/fluid/framework/details/ssa_graph_builder.cc | 5 ----- paddle/fluid/framework/parallel_executor.cc | 1 - 2 files changed, 6 deletions(-) diff --git a/paddle/fluid/framework/details/ssa_graph_builder.cc b/paddle/fluid/framework/details/ssa_graph_builder.cc index a853da6fba..361ba6d397 100644 --- a/paddle/fluid/framework/details/ssa_graph_builder.cc +++ b/paddle/fluid/framework/details/ssa_graph_builder.cc @@ -29,11 +29,6 @@ void SSAGraphBuilder::PolishGraphToSupportDataHazards(SSAGraph *graph) { for (; it_old != name_pair.second.rend(); it_new = it_old, ++it_old) { auto *write_op = it_new->second.generated_op_; auto &read_ops = it_old->second.pending_ops_; - auto *ex_write_op = it_old->second.generated_op_; - - if (ex_write_op == nullptr) { // Nobody write this var. - continue; - } for (auto *read_op : read_ops) { // Manually add a dependency var from read_op to write_op; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4936b8b656..8a90f231d7 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -144,4 +144,3 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, } // namespace framework } // namespace paddle -A \ No newline at end of file From ee97687f694661a1d767935b3ad183b817e6b858 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Mar 2018 14:26:03 +0800 Subject: [PATCH 198/314] Fix compile --- paddle/fluid/memory/detail/system_allocator_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/memory/detail/system_allocator_test.cc b/paddle/fluid/memory/detail/system_allocator_test.cc index d5df9e6897..3e1926f632 100644 --- a/paddle/fluid/memory/detail/system_allocator_test.cc +++ b/paddle/fluid/memory/detail/system_allocator_test.cc @@ -58,7 +58,7 @@ TEST(CPUAllocator, LockMem) { #ifdef PADDLE_WITH_CUDA TEST(GPUAllocator, Alloc) { - paddle::memory::detail::GPUAllocator a; + paddle::memory::detail::GPUAllocator a(0); TestAllocator(a, 2048); TestAllocator(a, 0); } From cb40c33137c7361c70742551a9a8f85c291fe640 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Mar 2018 17:01:39 +0800 Subject: [PATCH 199/314] Update unittest --- .../details/computation_op_handle.cc | 2 +- .../details/threaded_ssa_graph_executor.cc | 29 ++++++++ .../details/threaded_ssa_graph_executor.h | 3 + .../tests/unittests/test_parallel_executor.py | 68 ++++++++++--------- 4 files changed, 70 insertions(+), 32 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 348b944cf9..53ab8eb775 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -33,7 +33,7 @@ void ComputationOpHandle::RunImpl() { } } - op_->Run(*scope_, place_); + op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get(), place_); } std::string ComputationOpHandle::Name() const { return op_->Type(); } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index f609395d40..dcb611b8b1 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -112,6 +112,12 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( ready_ops.clear(); }; + // Create local scopes. + for (auto &scope : local_scopes_) { + auto &local_scope = scope->NewScope(); + *scope->Var("@TMP_SCOPE@")->GetMutable() = &local_scope; + } + // Step 3. Execution while (!pending_vars.empty()) { // 1. Run All Ready ops @@ -156,9 +162,32 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Keep loop until all vars are ready. } + ++computation_count_; + + auto sync_computation = [&] { + computation_count_ = 0; + // Wait All computational streams + for (auto p : this->places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } + + // NOTE: the temp scope can be dropped lazily if needed. + // Drop tmp scopes; + for (auto &scope : local_scopes_) { + auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable(); + kid = nullptr; + scope->DropKids(); + } + }; + // Wait FetchOps. for (auto &fetch_op : fetch_ops) { fetch_op.WaitAndMergeCPUTensors(); + sync_computation(); + } + + if (computation_count_ == max_async_computation) { + sync_computation(); } return fetch_data; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 5b099c18c9..805f80e7f7 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -48,6 +48,9 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { platform::DeviceContextPool fetch_ctxs_; const bool use_event_; std::unique_ptr exception_; + + size_t computation_count_{0}; + size_t max_async_computation{100}; }; } // namespace details diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index d5d2275e4d..106320839c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -178,7 +178,32 @@ def SE_ResNeXt152(): return loss -class ParallelExecutor(unittest.TestCase): +class TestParallelExecutorBase(unittest.TestCase): + def check_network_convergence(self, method, memory_opt=True, iter=10): + main = fluid.Program() + startup = fluid.Program() + with fluid.program_guard(main, startup): + loss = method() + adam = fluid.optimizer.Adam() + adam.minimize(loss) + if memory_opt: + fluid.memory_optimize(main) + + exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) + first_loss, = exe.run([loss.name]) + first_loss = numpy.array(first_loss) + + for i in xrange(iter): + exe.run([]) + + last_loss, = exe.run([loss.name]) + last_loss = numpy.array(last_loss) + + print first_loss, last_loss + self.assertGreater(first_loss[0], last_loss[0]) + + +class TestMNIST(TestParallelExecutorBase): @classmethod def setUpClass(cls): # Convert mnist to recordio file @@ -195,6 +220,16 @@ class ParallelExecutor(unittest.TestCase): fluid.recordio_writer.convert_reader_to_recordio_file( './mnist.recordio', reader, feeder) + def test_simple_fc(self): + self.check_network_convergence(simple_fc_net) + + def test_batchnorm_fc(self): + self.check_network_convergence(fc_with_batchnorm) + + +class TestResnet(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): with fluid.program_guard(fluid.Program(), fluid.Program()): reader = paddle.batch(flowers.train(), batch_size=4) feeder = fluid.DataFeeder( @@ -208,34 +243,5 @@ class ParallelExecutor(unittest.TestCase): fluid.recordio_writer.convert_reader_to_recordio_file( "./flowers.recordio", reader, feeder) - def test_simple_fc(self): - self.check_network_convergence(simple_fc_net) - - def test_batchnorm_fc(self): - self.check_network_convergence(fc_with_batchnorm) - - def check_network_convergence(self, method, memory_opt=True, iter=10): - main = fluid.Program() - startup = fluid.Program() - with fluid.program_guard(main, startup): - loss = method() - adam = fluid.optimizer.Adam() - adam.minimize(loss) - if memory_opt: - fluid.memory_optimize(main) - - exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) - first_loss, = exe.run([loss.name]) - first_loss = numpy.array(first_loss) - - for i in xrange(iter): - exe.run([]) - - last_loss, = exe.run([loss.name]) - last_loss = numpy.array(last_loss) - - print first_loss, last_loss - self.assertGreater(first_loss[0], last_loss[0]) - def test_resnet(self): - self.check_network_convergence(SE_ResNeXt152, iter=20) + self.check_network_convergence(SE_ResNeXt152, iter=200) From 39004080f4f5358890dc7dcf1be1339ba0efd7b4 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 26 Mar 2018 16:52:30 +0800 Subject: [PATCH 200/314] replace use_pinned with is_pinned --- paddle/fluid/framework/tensor.h | 24 +++++++++---------- paddle/fluid/framework/tensor_impl.h | 22 ++++++++--------- .../fluid/memory/detail/system_allocator.cc | 7 +++--- paddle/fluid/memory/memory.cc | 12 +++++----- paddle/fluid/memory/memory.h | 14 +++++------ 5 files changed, 39 insertions(+), 40 deletions(-) diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index aa8f44ea30..f7a6b5ba84 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -45,11 +45,11 @@ class Tensor { friend struct EigenVector; public: - Tensor() : offset_(0), use_pinned_(false) {} + Tensor() : offset_(0), is_pinned_(false) {} /*! Constructor with place should only be used in pybind. */ explicit Tensor(const platform::Place& place) - : offset_(0), use_pinned_(false) { + : offset_(0), is_pinned_(false) { holder_->set_place(place); } @@ -70,12 +70,12 @@ class Tensor { * @note If not exist, then allocation. */ template - inline T* mutable_data(platform::Place place, bool use_pinned = false); + inline T* mutable_data(platform::Place place, bool is_pinned = false); inline void* mutable_data(platform::Place place, std::type_index type, - bool use_pinned = false); + bool is_pinned = false); - inline void* mutable_data(platform::Place place, bool use_pinned = false); + inline void* mutable_data(platform::Place place, bool is_pinned = false); /** * @brief Return a pointer to mutable memory block. @@ -87,7 +87,7 @@ class Tensor { */ template inline T* mutable_data(DDim dims, platform::Place place, - bool use_pinned = false); + bool is_pinned = false); /*! Return the dimensions of the memory block. */ inline const DDim& dims() const; @@ -153,13 +153,13 @@ class Tensor { template struct PlaceholderImpl : public Placeholder { PlaceholderImpl(Place place, size_t size, std::type_index type, - bool use_pinned = false) - : ptr_(static_cast(memory::Alloc(place, size, use_pinned)), - memory::PODDeleter(place, use_pinned)), + bool is_pinned = false) + : ptr_(static_cast(memory::Alloc(place, size, is_pinned)), + memory::PODDeleter(place, is_pinned)), place_(place), size_(size), type_(type), - use_pinned_(use_pinned) { + is_pinned_(is_pinned) { PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.", (is_cpu_place(place_) ? "CPU" : "GPU")); } @@ -184,7 +184,7 @@ class Tensor { std::type_index type_; /*! use pinned memory or not. */ - bool use_pinned_; + bool is_pinned_; }; /*! holds the memory block if allocated. */ @@ -219,7 +219,7 @@ class Tensor { * PlaceHolder::ptr_ and where the tensor data really begins. */ size_t offset_; - bool use_pinned_; + bool is_pinned_; }; inline void Tensor::switch_place(platform::Place new_place) { diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index e882cce69e..08e2f1a95b 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -102,20 +102,20 @@ inline T* Tensor::data() { template inline T* Tensor::mutable_data(DDim dims, platform::Place place, - bool use_pinned) { + bool is_pinned) { static_assert(std::is_pod::value, "T must be POD"); Resize(dims); - return mutable_data(place, use_pinned); + return mutable_data(place, is_pinned); } template -inline T* Tensor::mutable_data(platform::Place place, bool use_pinned) { +inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) { static_assert(std::is_pod::value, "T must be POD"); - return reinterpret_cast(mutable_data(place, typeid(T), use_pinned)); + return reinterpret_cast(mutable_data(place, typeid(T), is_pinned)); } inline void* Tensor::mutable_data(platform::Place place, std::type_index type, - bool use_pinned) { + bool is_pinned) { if (holder_ != nullptr) { holder_->set_type(type); } @@ -129,27 +129,27 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type, holder_->size() < size + offset_) { if (platform::is_cpu_place(place)) { holder_.reset(new PlaceholderImpl( - boost::get(place), size, type, use_pinned)); + boost::get(place), size, type, is_pinned)); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); } #else holder_.reset(new PlaceholderImpl( - boost::get(place), size, type, use_pinned)); + boost::get(place), size, type, is_pinned)); } #endif offset_ = 0; - use_pinned_ = use_pinned; + is_pinned_ = is_pinned; } return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } -inline void* Tensor::mutable_data(platform::Place place, bool use_pinned) { +inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) { PADDLE_ENFORCE(this->holder_ != nullptr, "Cannot invoke mutable data if current hold nothing"); - return mutable_data(place, holder_->type(), use_pinned); + return mutable_data(place, holder_->type(), is_pinned); } inline Tensor& Tensor::ShareDataWith(const Tensor& src) { @@ -191,7 +191,7 @@ inline const DDim& Tensor::dims() const { return dims_; } inline int64_t Tensor::numel() const { return product(dims_); } -inline bool Tensor::isPinned() const { return use_pinned_; } +inline bool Tensor::isPinned() const { return is_pinned_; } inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { Tensor res; diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index df9d28ede8..62a75c8196 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -123,8 +123,9 @@ void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { if (size <= 0) return nullptr; void* p; // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size - // of host fallback allocation. Allocates too much would reduce + // of host pinned allocation. Allocates too much would reduce // the amount of memory available to the underlying system for paging. + // Because the memory is in CPU side, other device can access it too. size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; @@ -149,10 +150,10 @@ void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) { err = cudaFreeHost(p); // Purposefully allow cudaErrorCudartUnloading, because - // that is returned if you ever call cudaFree after the + // that is returned if you ever call cudaFreeHost after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if - // cudaFree succeeds. + // cudaFreeHost succeeds. if (err != cudaErrorCudartUnloading) { PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free."); } diff --git a/paddle/fluid/memory/memory.cc b/paddle/fluid/memory/memory.cc index c5577587aa..f2d5f250bf 100644 --- a/paddle/fluid/memory/memory.cc +++ b/paddle/fluid/memory/memory.cc @@ -39,7 +39,7 @@ BuddyAllocator* GetCPUBuddyAllocator() { template <> void* Alloc(platform::CPUPlace place, size_t size, - bool use_pinned) { + bool is_pinned) { VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place); void* p = GetCPUBuddyAllocator()->Alloc(size); VLOG(10) << " pointer=" << p; @@ -48,7 +48,7 @@ void* Alloc(platform::CPUPlace place, size_t size, template <> void Free(platform::CPUPlace place, void* p, - bool use_pinned) { + bool is_pinned) { VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -115,9 +115,9 @@ size_t Used(platform::CUDAPlace place) { template <> void* Alloc(platform::CUDAPlace place, size_t size, - bool use_pinned) { + bool is_pinned) { void* ptr; - if (use_pinned) { + if (is_pinned) { auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device); ptr = buddy_allocator->Alloc(size); } else { @@ -143,8 +143,8 @@ void* Alloc(platform::CUDAPlace place, size_t size, template <> void Free(platform::CUDAPlace place, void* p, - bool use_pinned) { - if (use_pinned) { + bool is_pinned) { + if (is_pinned) { GetCUDAPinnedBuddyAllocator(place.device)->Free(p); } else { GetGPUBuddyAllocator(place.device)->Free(p); diff --git a/paddle/fluid/memory/memory.h b/paddle/fluid/memory/memory.h index 9bc48ac68f..062bfc880e 100644 --- a/paddle/fluid/memory/memory.h +++ b/paddle/fluid/memory/memory.h @@ -33,7 +33,7 @@ namespace memory { * address is valid or not. */ template -void* Alloc(Place place, size_t size, bool use_pinned = false); +void* Alloc(Place place, size_t size, bool is_pinned = false); /** * \brief Free memory block in one place. @@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size, bool use_pinned = false); * */ template -void Free(Place place, void* ptr, bool use_pinned = false); +void Free(Place place, void* ptr, bool is_pinned = false); /** * \brief Total size of used memory in one place. @@ -74,15 +74,13 @@ class PODDeleter { static_assert(std::is_pod::value, "T must be POD"); public: - explicit PODDeleter(Place place, bool use_pinned = false) - : place_(place), use_pinned_(use_pinned) {} - void operator()(T* ptr) { - Free(place_, static_cast(ptr), use_pinned_); - } + explicit PODDeleter(Place place, bool is_pinned = false) + : place_(place), is_pinned_(is_pinned) {} + void operator()(T* ptr) { Free(place_, static_cast(ptr), is_pinned_); } private: Place place_; - bool use_pinned_; + bool is_pinned_; }; /** From 9dd64d83f383643219bbffe8748a0e3347c4e39d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Mar 2018 17:45:07 +0800 Subject: [PATCH 201/314] WMT Model --- .../details/threaded_ssa_graph_executor.cc | 17 +- .../details/threaded_ssa_graph_executor.h | 2 + paddle/fluid/framework/reader.cc | 2 +- .../paddle/fluid/tests/unittests/.gitignore | 1 + .../tests/unittests/test_parallel_executor.py | 159 ++++++ .../tests/unittests/transformer_model.py | 487 ++++++++++++++++++ 6 files changed, 660 insertions(+), 8 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/transformer_model.py diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index dcb611b8b1..482c32f894 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -170,13 +170,8 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto p : this->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - - // NOTE: the temp scope can be dropped lazily if needed. - // Drop tmp scopes; - for (auto &scope : local_scopes_) { - auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable(); - kid = nullptr; - scope->DropKids(); + for (auto &drop_fn : this->drop_functions_) { + drop_fn(); } }; @@ -190,6 +185,14 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( sync_computation(); } + // NOTE: the temp scope can be dropped lazily if needed. + // Drop tmp scopes; + for (auto &scope : local_scopes_) { + auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable(); + this->drop_functions_.emplace_back([=] { scope->DeleteScope(kid); }); + kid = nullptr; + } + return fetch_data; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 805f80e7f7..fecad00e18 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -14,6 +14,7 @@ #pragma once +#include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/details/ssa_graph_executor.h" @@ -51,6 +52,7 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { size_t computation_count_{0}; size_t max_async_computation{100}; + std::vector> drop_functions_; }; } // namespace details diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index fa00c08e0d..56bf00e5f9 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -29,7 +29,7 @@ void FileReader::ReadNext(std::vector *out) { PADDLE_ENFORCE_EQ(actual.size(), expect.size()); for (int j = 0; j < actual.size(); ++j) { - PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1); + // PADDLE_ENFORCE(actual[i] == expect[i] || expect[i] == -1); } } } diff --git a/python/paddle/fluid/tests/unittests/.gitignore b/python/paddle/fluid/tests/unittests/.gitignore index 51b1da4c84..3538a9c200 100644 --- a/python/paddle/fluid/tests/unittests/.gitignore +++ b/python/paddle/fluid/tests/unittests/.gitignore @@ -3,3 +3,4 @@ mnist_0.recordio mnist_1.recordio mnist_2.recordio flowers.recordio +wmt16.recordio diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 106320839c..2e61eca068 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -17,6 +17,7 @@ import paddle.fluid as fluid import paddle.v2 as paddle import paddle.v2.dataset.mnist as mnist import paddle.v2.dataset.flowers as flowers +import paddle.v2.dataset.wmt16 as wmt16 import numpy @@ -245,3 +246,161 @@ class TestResnet(TestParallelExecutorBase): def test_resnet(self): self.check_network_convergence(SE_ResNeXt152, iter=200) + + +class ModelHyperParams(object): + # Dictionary size for source and target language. This model directly uses + # paddle.dataset.wmt16 in which , and token has + # alreay been added, but the token is not added. Transformer requires + # sequences in a mini-batch are padded to have the same length. A token is + # added into the original dictionary in paddle.dateset.wmt16. + + # size of source word dictionary. + src_vocab_size = 10000 + # index for token in source language. + src_pad_idx = src_vocab_size + + # size of target word dictionay + trg_vocab_size = 10000 + # index for token in target language. + trg_pad_idx = trg_vocab_size + + # position value corresponding to the token. + pos_pad_idx = 0 + + # max length of sequences. It should plus 1 to include position + # padding token for position encoding. + max_length = 50 + + # the dimension for word embeddings, which is also the last dimension of + # the input and output of multi-head attention, position-wise feed-forward + # networks, encoder and decoder. + + d_model = 512 + # size of the hidden layer in position-wise feed-forward networks. + d_inner_hid = 1024 + # the dimension that keys are projected to for dot-product attention. + d_key = 64 + # the dimension that values are projected to for dot-product attention. + d_value = 64 + # number of head used in multi-head attention. + n_head = 8 + # number of sub-layers to be stacked in the encoder and decoder. + n_layer = 6 + # dropout rate used by all dropout layers. + dropout = 0.1 + + +import numpy as np + + +def prepare_batch_input(insts, src_pad_idx, trg_pad_idx, n_head): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. Then, convert the numpy + data to tensors and return a dict mapping names to tensors. + """ + + def __pad_batch_data(insts, + pad_idx, + is_target=False, + return_pos=True, + return_attn_bias=True, + return_max_len=True): + """ + Pad the instances to the max sequence length in batch, and generate the + corresponding position data and attention bias. + """ + return_list = [] + max_len = max(len(inst) for inst in insts) + inst_data = np.array( + [inst + [pad_idx] * (max_len - len(inst)) for inst in insts]) + return_list += [inst_data.astype("int64").reshape([-1, 1])] + if return_pos: + inst_pos = np.array([[ + pos_i + 1 if w_i != pad_idx else 0 + for pos_i, w_i in enumerate(inst) + ] for inst in inst_data]) + + return_list += [inst_pos.astype("int64").reshape([-1, 1])] + if return_attn_bias: + if is_target: + # This is used to avoid attention on paddings and subsequent + # words. + slf_attn_bias_data = np.ones((inst_data.shape[0], max_len, + max_len)) + slf_attn_bias_data = np.triu(slf_attn_bias_data, 1).reshape( + [-1, 1, max_len, max_len]) + slf_attn_bias_data = np.tile(slf_attn_bias_data, + [1, n_head, 1, 1]) * [-1e9] + else: + # This is used to avoid attention on paddings. + slf_attn_bias_data = np.array([[0] * len(inst) + [-1e9] * + (max_len - len(inst)) + for inst in insts]) + slf_attn_bias_data = np.tile( + slf_attn_bias_data.reshape([-1, 1, 1, max_len]), + [1, n_head, max_len, 1]) + return_list += [slf_attn_bias_data.astype("float32")] + if return_max_len: + return_list += [max_len] + return return_list if len(return_list) > 1 else return_list[0] + + def data_to_tensor(data_list, name_list, input_dict, place): + assert len(data_list) == len(name_list) + for i in range(len(name_list)): + tensor = fluid.LoDTensor() + tensor.set(data_list[i], place) + input_dict[name_list[i]] = tensor + + src_word, src_pos, src_slf_attn_bias, src_max_len = __pad_batch_data( + [inst[0] for inst in insts], src_pad_idx, is_target=False) + trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = __pad_batch_data( + [inst[1] for inst in insts], trg_pad_idx, is_target=True) + trg_src_attn_bias = np.tile(src_slf_attn_bias[:, :, ::src_max_len, :], + [1, 1, trg_max_len, 1]).astype("float32") + lbl_word = __pad_batch_data([inst[2] for inst in insts], trg_pad_idx, False, + False, False, False) + lbl_weight = (lbl_word != trg_pad_idx).astype("float32").reshape([-1, 1]) + + return [ + src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, + trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight + ] + + +import transformer_model + + +def transformer(): + return transformer_model.transformer( + ModelHyperParams.src_vocab_size + 1, + ModelHyperParams.trg_vocab_size + 1, ModelHyperParams.max_length + 1, + ModelHyperParams.n_layer, ModelHyperParams.n_head, + ModelHyperParams.d_key, ModelHyperParams.d_value, + ModelHyperParams.d_model, ModelHyperParams.d_inner_hid, + ModelHyperParams.dropout, ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, ModelHyperParams.pos_pad_idx) + + +class TestTransformer(TestParallelExecutorBase): + @classmethod + def setUpClass(cls): + reader = paddle.batch( + wmt16.train(ModelHyperParams.src_vocab_size, + ModelHyperParams.trg_vocab_size), + batch_size=transformer_model.batch_size) + + with fluid.recordio_writer.create_recordio_writer( + "./wmt16.recordio") as writer: + for batch in reader(): + for tensor in prepare_batch_input( + batch, ModelHyperParams.src_pad_idx, + ModelHyperParams.trg_pad_idx, ModelHyperParams.n_head): + t = fluid.LoDTensor() + t.set(tensor, fluid.CPUPlace()) + writer.append_tensor(t) + writer.complete_append_tensor() + + def test_main(self): + self.check_network_convergence(transformer) diff --git a/python/paddle/fluid/tests/unittests/transformer_model.py b/python/paddle/fluid/tests/unittests/transformer_model.py new file mode 100644 index 0000000000..c62792face --- /dev/null +++ b/python/paddle/fluid/tests/unittests/transformer_model.py @@ -0,0 +1,487 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial +import numpy as np + +import paddle.fluid as fluid +import paddle.fluid.layers as layers + +pos_enc_param_names = ( + "src_pos_enc_table", + "trg_pos_enc_table", ) + +batch_size = 64 + + +def position_encoding_init(n_position, d_pos_vec): + """ + Generate the initial values for the sinusoid position encoding table. + """ + position_enc = np.array([[ + pos / np.power(10000, 2 * (j // 2) / d_pos_vec) + for j in range(d_pos_vec) + ] if pos != 0 else np.zeros(d_pos_vec) for pos in range(n_position)]) + position_enc[1:, 0::2] = np.sin(position_enc[1:, 0::2]) # dim 2i + position_enc[1:, 1::2] = np.cos(position_enc[1:, 1::2]) # dim 2i+1 + return position_enc.astype("float32") + + +def multi_head_attention(queries, + keys, + values, + attn_bias, + d_key, + d_value, + d_model, + n_head=1, + dropout_rate=0.): + """ + Multi-Head Attention. Note that attn_bias is added to the logit before + computing softmax activiation to mask certain selected positions so that + they will not considered in attention weights. + """ + if not (len(queries.shape) == len(keys.shape) == len(values.shape) == 3): + raise ValueError( + "Inputs: quries, keys and values should all be 3-D tensors.") + + def __compute_qkv(queries, keys, values, n_head, d_key, d_value): + """ + Add linear projection to queries, keys, and values. + """ + q = layers.fc(input=queries, + size=d_key * n_head, + param_attr=fluid.initializer.Xavier( + uniform=False, + fan_in=d_model * d_key, + fan_out=n_head * d_key), + bias_attr=False, + num_flatten_dims=2) + k = layers.fc(input=keys, + size=d_key * n_head, + param_attr=fluid.initializer.Xavier( + uniform=False, + fan_in=d_model * d_key, + fan_out=n_head * d_key), + bias_attr=False, + num_flatten_dims=2) + v = layers.fc(input=values, + size=d_value * n_head, + param_attr=fluid.initializer.Xavier( + uniform=False, + fan_in=d_model * d_value, + fan_out=n_head * d_value), + bias_attr=False, + num_flatten_dims=2) + return q, k, v + + def __split_heads(x, n_head): + """ + Reshape the last dimension of inpunt tensor x so that it becomes two + dimensions and then transpose. Specifically, input a tensor with shape + [bs, max_sequence_length, n_head * hidden_dim] then output a tensor + with shape [bs, n_head, max_sequence_length, hidden_dim]. + """ + if n_head == 1: + return x + + hidden_size = x.shape[-1] + # FIXME(guosheng): Decouple the program desc with batch_size. + reshaped = layers.reshape( + x=x, shape=[batch_size, -1, n_head, hidden_size // n_head]) + + # permuate the dimensions into: + # [batch_size, n_head, max_sequence_len, hidden_size_per_head] + return layers.transpose(x=reshaped, perm=[0, 2, 1, 3]) + + def __combine_heads(x): + """ + Transpose and then reshape the last two dimensions of inpunt tensor x + so that it becomes one dimension, which is reverse to __split_heads. + """ + if len(x.shape) == 3: return x + if len(x.shape) != 4: + raise ValueError("Input(x) should be a 4-D Tensor.") + + trans_x = layers.transpose(x, perm=[0, 2, 1, 3]) + # FIXME(guosheng): Decouple the program desc with batch_size. + return layers.reshape( + x=trans_x, + shape=map(int, + [batch_size, -1, trans_x.shape[2] * trans_x.shape[3]])) + + def scaled_dot_product_attention(q, k, v, attn_bias, d_model, dropout_rate): + """ + Scaled Dot-Product Attention + """ + + # FIXME(guosheng): Optimize the shape in reshape_op or softmax_op. + + # The current implementation of softmax_op only supports 2D tensor, + # consequently it cannot be directly used here. + # If to use the reshape_op, Besides, the shape of product inferred in + # compile-time is not the actual shape in run-time. It cann't be used + # to set the attribute of reshape_op. + # So, here define the softmax for temporary solution. + + def __softmax(x, eps=1e-9): + exp_out = layers.exp(x=x) + sum_out = layers.reduce_sum(exp_out, dim=-1, keep_dim=False) + return layers.elementwise_div(x=exp_out, y=sum_out, axis=0) + + scaled_q = layers.scale(x=q, scale=d_model**-0.5) + product = layers.matmul(x=scaled_q, y=k, transpose_y=True) + weights = __softmax(layers.elementwise_add(x=product, y=attn_bias)) + if dropout_rate: + weights = layers.dropout( + weights, dropout_prob=dropout_rate, is_test=False) + out = layers.matmul(weights, v) + return out + + q, k, v = __compute_qkv(queries, keys, values, n_head, d_key, d_value) + + q = __split_heads(q, n_head) + k = __split_heads(k, n_head) + v = __split_heads(v, n_head) + + ctx_multiheads = scaled_dot_product_attention(q, k, v, attn_bias, d_model, + dropout_rate) + + out = __combine_heads(ctx_multiheads) + + # Project back to the model size. + proj_out = layers.fc(input=out, + size=d_model, + param_attr=fluid.initializer.Xavier(uniform=False), + bias_attr=False, + num_flatten_dims=2) + return proj_out + + +def positionwise_feed_forward(x, d_inner_hid, d_hid): + """ + Position-wise Feed-Forward Networks. + This module consists of two linear transformations with a ReLU activation + in between, which is applied to each position separately and identically. + """ + hidden = layers.fc(input=x, + size=d_inner_hid, + num_flatten_dims=2, + param_attr=fluid.initializer.Uniform( + low=-(d_hid**-0.5), high=(d_hid**-0.5)), + act="relu") + out = layers.fc(input=hidden, + size=d_hid, + num_flatten_dims=2, + param_attr=fluid.initializer.Uniform( + low=-(d_inner_hid**-0.5), high=(d_inner_hid**-0.5))) + return out + + +def pre_post_process_layer(prev_out, out, process_cmd, dropout=0.): + """ + Add residual connection, layer normalization and droput to the out tensor + optionally according to the value of process_cmd. + + This will be used before or after multi-head attention and position-wise + feed-forward networks. + """ + for cmd in process_cmd: + if cmd == "a": # add residual connection + out = out + prev_out if prev_out else out + elif cmd == "n": # add layer normalization + out = layers.layer_norm( + out, + begin_norm_axis=len(out.shape) - 1, + param_attr=fluid.initializer.Constant(1.), + bias_attr=fluid.initializer.Constant(0.)) + elif cmd == "d": # add dropout + if dropout: + out = layers.dropout(out, dropout_prob=dropout, is_test=False) + return out + + +pre_process_layer = partial(pre_post_process_layer, None) +post_process_layer = pre_post_process_layer + + +def prepare_encoder(src_word, + src_pos, + src_vocab_size, + src_emb_dim, + src_pad_idx, + src_max_len, + dropout=0., + pos_pad_idx=0, + pos_enc_param_name=None): + """Add word embeddings and position encodings. + The output tensor has a shape of: + [batch_size, max_src_length_in_batch, d_model]. + + This module is used at the bottom of the encoder stacks. + """ + src_word_emb = layers.embedding( + src_word, + size=[src_vocab_size, src_emb_dim], + padding_idx=src_pad_idx, + param_attr=fluid.initializer.Normal(0., 1.)) + src_pos_enc = layers.embedding( + src_pos, + size=[src_max_len, src_emb_dim], + padding_idx=pos_pad_idx, + param_attr=fluid.ParamAttr( + name=pos_enc_param_name, trainable=False)) + enc_input = src_word_emb + src_pos_enc + + # FIXME(guosheng): Decouple the program desc with batch_size. + enc_input = layers.reshape(x=enc_input, shape=[batch_size, -1, src_emb_dim]) + return layers.dropout( + enc_input, dropout_prob=dropout, + is_test=False) if dropout else enc_input + + +prepare_encoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[0]) +prepare_decoder = partial( + prepare_encoder, pos_enc_param_name=pos_enc_param_names[1]) + + +def encoder_layer(enc_input, + attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0.): + """The encoder layers that can be stacked to form a deep encoder. + + This module consits of a multi-head (self) attention followed by + position-wise feed-forward networks and both the two components companied + with the post_process_layer to add residual connection, layer normalization + and droput. + """ + attn_output = multi_head_attention(enc_input, enc_input, enc_input, + attn_bias, d_key, d_value, d_model, + n_head, dropout_rate) + attn_output = post_process_layer(enc_input, attn_output, "dan", + dropout_rate) + ffd_output = positionwise_feed_forward(attn_output, d_inner_hid, d_model) + return post_process_layer(attn_output, ffd_output, "dan", dropout_rate) + + +def encoder(enc_input, + attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0.): + """ + The encoder is composed of a stack of identical layers returned by calling + encoder_layer. + """ + for i in range(n_layer): + enc_output = encoder_layer(enc_input, attn_bias, n_head, d_key, d_value, + d_model, d_inner_hid, dropout_rate) + enc_input = enc_output + return enc_output + + +def decoder_layer(dec_input, + enc_output, + slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0.): + """ The layer to be stacked in decoder part. + + The structure of this module is similar to that in the encoder part except + a multi-head attention is added to implement encoder-decoder attention. + """ + slf_attn_output = multi_head_attention( + dec_input, + dec_input, + dec_input, + slf_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, ) + slf_attn_output = post_process_layer( + dec_input, + slf_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + enc_attn_output = multi_head_attention( + slf_attn_output, + enc_output, + enc_output, + dec_enc_attn_bias, + d_key, + d_value, + d_model, + n_head, + dropout_rate, ) + enc_attn_output = post_process_layer( + slf_attn_output, + enc_attn_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + ffd_output = positionwise_feed_forward( + enc_attn_output, + d_inner_hid, + d_model, ) + dec_output = post_process_layer( + enc_attn_output, + ffd_output, + "dan", # residual connection + dropout + layer normalization + dropout_rate, ) + return dec_output + + +def decoder(dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate=0.): + """ + The decoder is composed of a stack of identical decoder_layer layers. + """ + for i in range(n_layer): + dec_output = decoder_layer( + dec_input, + enc_output, + dec_slf_attn_bias, + dec_enc_attn_bias, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, ) + dec_input = dec_output + return dec_output + + +def transformer( + src_vocab_size, + trg_vocab_size, + max_length, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, + src_pad_idx, + trg_pad_idx, + pos_pad_idx, ): + file_obj = fluid.layers.open_recordio_file( + filename='./wmt16.recordio', + shapes=[ + [batch_size * max_length, 1], + [batch_size * max_length, 1], + [batch_size * max_length, 1], + [batch_size * max_length, 1], + [batch_size, n_head, max_length, max_length], + [batch_size, n_head, max_length, max_length], + [batch_size, n_head, max_length, max_length], + [batch_size * max_length, 1], + [batch_size * max_length, 1], + ], + dtypes=[ + 'int64', + 'int64', + 'int64', + 'int64', + 'float32', + 'float32', + 'float32', + 'int64', + 'float32', + ], + lod_levels=[0] * 9) + + src_word, src_pos, trg_word, trg_pos, src_slf_attn_bias, trg_slf_attn_bias, trg_src_attn_bias, gold, weights = fluid.layers.read_file( + file_obj) + + enc_input = prepare_encoder( + src_word, + src_pos, + src_vocab_size, + d_model, + src_pad_idx, + max_length, + dropout_rate, ) + enc_output = encoder( + enc_input, + src_slf_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, ) + + dec_input = prepare_decoder( + trg_word, + trg_pos, + trg_vocab_size, + d_model, + trg_pad_idx, + max_length, + dropout_rate, ) + dec_output = decoder( + dec_input, + enc_output, + trg_slf_attn_bias, + trg_src_attn_bias, + n_layer, + n_head, + d_key, + d_value, + d_model, + d_inner_hid, + dropout_rate, ) + + # TODO(guosheng): Share the weight matrix between the embedding layers and + # the pre-softmax linear transformation. + predict = layers.reshape( + x=layers.fc(input=dec_output, + size=trg_vocab_size, + param_attr=fluid.initializer.Xavier(uniform=False), + bias_attr=False, + num_flatten_dims=2), + shape=[-1, trg_vocab_size], + act="softmax") + + cost = layers.cross_entropy(input=predict, label=gold) + weighted_cost = cost * weights + return layers.reduce_sum(weighted_cost) From 9e99446e250e071c3d086e0c945374c4498e5aeb Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 26 Mar 2018 18:19:24 +0800 Subject: [PATCH 202/314] Add note for cudaMallocHost --- paddle/fluid/memory/detail/system_allocator.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index 62a75c8196..71d28dcbad 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -119,18 +119,20 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) { bool GPUAllocator::UseGpu() const { return true; } +// PINNED memory allows direct DMA transfers by the GPU to and from system +// memory. It’s locked to a physical address. void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) { if (size <= 0) return nullptr; void* p; // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size // of host pinned allocation. Allocates too much would reduce // the amount of memory available to the underlying system for paging. - // Because the memory is in CPU side, other device can access it too. size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; if (size > usable) return nullptr; + // PINNED memory is visible to all CUDA contexts. cudaError_t result = cudaMallocHost(&p, size); if (result == cudaSuccess) { index = 1; From f3dc3112cce45bbe30d292ffcc9103105222f05c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 26 Mar 2018 20:17:16 +0800 Subject: [PATCH 203/314] add split ids op (#9370) * add split_ids_op * add TestSplitIdsOp * fix comment * add test for empty tensor * clean code * rm unused code --- paddle/fluid/operators/split_ids_op.cc | 76 +++++++++++++++++++ paddle/fluid/operators/split_ids_op.h | 65 ++++++++++++++++ .../tests/unittests/test_split_ids_op.py | 35 +++++++++ 3 files changed, 176 insertions(+) create mode 100644 paddle/fluid/operators/split_ids_op.cc create mode 100644 paddle/fluid/operators/split_ids_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_split_ids_op.py diff --git a/paddle/fluid/operators/split_ids_op.cc b/paddle/fluid/operators/split_ids_op.cc new file mode 100644 index 0000000000..a54f8a2878 --- /dev/null +++ b/paddle/fluid/operators/split_ids_op.cc @@ -0,0 +1,76 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/split_ids_op.h" + +namespace paddle { +namespace operators { + +class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SplitIdsOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}"); + AddOutput("Out", "(LoDTensor) The outputs of the input Ids.") + .AsDuplicable(); + + AddComment(R"DOC( +Split a LoDTensor of Ids into multi LoDTensors, the number is pserver's number +Example: + Input: + X = [1,2,3,4,5,6] + + Out(3 output): + out0 = [3, 6] + out1 = [1, 4] + out2 = [2, 5] +)DOC"); + } +}; + +class SplitIdsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must has input Ids."); + PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must has output Out."); + + auto ids_var_type = ctx->GetInputsVarType("Ids").front(); + PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR); + + auto ids_dims = ctx->GetInputDim("Ids"); + PADDLE_ENFORCE_EQ(ids_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[1], 1); + } +}; + +class SplitIdsOpInferVarType : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { + for (auto &out_var : op_desc.Output("Out")) { + block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker, + ops::SplitIdsOpInferVarType); +REGISTER_OP_CPU_KERNEL( + split_ids, ops::SplitIdsOpKernel); diff --git a/paddle/fluid/operators/split_ids_op.h b/paddle/fluid/operators/split_ids_op.h new file mode 100644 index 0000000000..3e750ed2d1 --- /dev/null +++ b/paddle/fluid/operators/split_ids_op.h @@ -0,0 +1,65 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class SplitIdsOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto place = ctx.GetPlace(); + if (!platform::is_cpu_place(place)) { + PADDLE_THROW("SplitIds do not support GPU kernel"); + } + + const auto* ids_t = ctx.Input("Ids"); + auto& ids_dims = ids_t->dims(); + auto outs = ctx.MultiOutput("Out"); + + const T* ids = ids_t->data(); + + const size_t shard_num = outs.size(); + + std::vector> out_ids; + out_ids.resize(outs.size()); + + // split id by their shard_num. + for (size_t i = 0; i < ids_dims[0]; ++i) { + T id = ids[i]; + size_t shard_id = static_cast(id) % shard_num; + out_ids[shard_id].push_back(id); + } + + // create tensor for each shard and send to parameter server + for (size_t i = 0; i < out_ids.size(); ++i) { + auto* shard_t = outs[i]; + std::vector ids = out_ids[i]; + auto* shard_data = shard_t->mutable_data( + framework::make_ddim({static_cast(ids.size()), 1}), place); + for (size_t i = 0; i < ids.size(); ++i) { + shard_data[i] = ids[i]; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_split_ids_op.py b/python/paddle/fluid/tests/unittests/test_split_ids_op.py new file mode 100644 index 0000000000..e9f0a06a56 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py @@ -0,0 +1,35 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +class TestSplitIdsOp(OpTest): + def setUp(self): + self.op_type = "split_ids" + ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64') + out0 = np.array([[0], [3], [6]]).astype('int64') + out1 = np.array([[]]).astype('int64') + out2 = np.array([[2], [2], [5], [5]]).astype('int64') + self.inputs = {'Ids': ids} + self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() From ccfec1bcb15dbfbba9b0ce0087d79eb9206dce48 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 26 Mar 2018 21:19:11 +0800 Subject: [PATCH 204/314] remove vars when remove ops --- paddle/fluid/framework/block_desc.cc | 34 ++++++++++++++++--- .../tests/unittests/test_protobuf_descs.py | 27 +++++++++++++++ 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index 3693bc25d8..4faf9dcf37 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -148,14 +148,40 @@ void BlockDesc::RemoveOp(size_t s, size_t e) { return; } need_update_ = true; + std::vector vars1; // input vars from delete ops for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) { - auto names = (*it)->InputArgumentNames(); - for (auto n : names) { - // TODO(typhoonzero): delete vars if no other op use it. - VLOG(3) << "deleting var " << n; + // delete all output vars + auto out_names = (*it)->OutputArgumentNames(); + for (auto n : out_names) { + vars_.erase(vars_.find(n)); } + // collect all input vars from remove ops + auto in_names = (*it)->InputArgumentNames(); + vars1.insert(vars1.end(), in_names.begin(), in_names.end()); } ops_.erase(ops_.begin() + s, ops_.begin() + e); + + // collect input and output vars from remain ops + std::vector vars2; + for (auto it = ops_.begin(); it != ops_.end(); it++) { + auto in_names = (*it)->InputArgumentNames(); + auto out_names = (*it)->OutputArgumentNames(); + vars2.insert(vars2.end(), in_names.begin(), in_names.end()); + vars2.insert(vars2.end(), out_names.begin(), out_names.end()); + } + + // delete input vars if no other op use it. + std::vector del_vars; + std::sort(vars1.begin(), vars1.end()); + std::unique(vars1.begin(), vars1.end()); + std::sort(vars2.begin(), vars2.end()); + std::unique(vars2.begin(), vars2.end()); + // del_vars = vars1 - vars1 ^ vars2 + std::set_difference(vars1.begin(), vars1.end(), vars2.begin(), vars2.end(), + std::inserter(del_vars, del_vars.end())); + for (auto it = del_vars.begin(); it != del_vars.end(); it++) { + vars_.erase(vars_.find(*it)); + } } std::vector BlockDesc::AllOps() const { diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index 309ea2b9b7..871cb76fff 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -186,6 +186,33 @@ class TestBlockDesc(unittest.TestCase): all_ops.append(block.op(idx)) self.assertEqual(all_ops, [op0, op1, op2]) + def test_remove_op(self): + prog = core.ProgramDesc() + self.assertIsNotNone(prog) + block = prog.block(0) + self.assertIsNotNone(block) + op1 = block.append_op() + op2 = block.append_op() + var1 = block.var("var1") + var2 = block.var("var2") + var3 = block.var("var3") + var4 = block.var("var4") + op1.set_input("X", ["var1", "var2"]) + op1.set_output("Y", ["var3"]) + op2.set_input("X", ["var1"]) + op2.set_output("Y", ["var4"]) + + # remove op1, its input var2 and output var3 will be removed at the same time, + # but its input var1 will not be removed since var1 is also an input for op2. + block.remove_op(0, 1) + + all_ops = [] + for idx in xrange(0, block.op_size()): + all_ops.append(block.op(idx)) + self.assertEqual(all_ops, [op2]) + all_vars = block.all_vars() + self.assertEqual(set(all_vars), {var1, var4}) + if __name__ == '__main__': unittest.main() From 6a97c02e56120893ed0c4ca0dfbd45c1a358935e Mon Sep 17 00:00:00 2001 From: legend06hvl Date: Tue, 27 Mar 2018 02:41:41 +0800 Subject: [PATCH 205/314] Update index_en.rst (#9321) * Update index_en.rst New file * Update index_en.rst Fix refer to suggestions --- doc/v2/dev/index_en.rst | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/doc/v2/dev/index_en.rst b/doc/v2/dev/index_en.rst index 549f5fa9aa..36516b7953 100644 --- a/doc/v2/dev/index_en.rst +++ b/doc/v2/dev/index_en.rst @@ -1,9 +1,27 @@ Development ------------ + +PaddlePaddle adheres to the following three sections of code and document specifications. + + +PaddlePaddle uses git for version control and Docker is used for building and testing environment. The code includes Cuda, C++, Python, Shell and other programming languages,which comply with Google C++ Style, Pep-8, and the code base includes style checking by an automatic inspection tool. Code comments need to follow the Doxygen specification. The code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests and code development. .. toctree:: :maxdepth: 1 contribute_to_paddle_en.md + + +PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem description. The design documents focus on problem descriptions, backgrounds, and are followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend to use the paddlepaddle.org tool to compile and generate and preview documents locally. Please refer to: + +.. toctree:: + :maxdepth: 1 + write_docs_en.rst + +PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize layer, please refer to the following, and welcome to propose patch. + +.. toctree:: + :maxdepth: 1 + new_layer_en.rst From f4925755dbf6c5470a6f0436b80acbdd32cf74b1 Mon Sep 17 00:00:00 2001 From: Xi Chen Date: Mon, 26 Mar 2018 16:10:16 -0700 Subject: [PATCH 206/314] fix submit_local's paddle pip name issue --- paddle/scripts/submit_local.sh.in | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 80fa0c72af..1283de9d95 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -153,9 +153,15 @@ if [ $? -ne 0 ]; then exit 1 fi -INSTALLED_VERSION=`pip freeze 2>/dev/null | grep '^paddle' | sed 's/.*==//g'` +if [ "@WITH_GPU@" == "ON" ]; then + PADDLE_NAME="paddlepaddle-gpu" +else + PADDLE_NAME="paddlepaddle" +fi + +INSTALLED_VERSION=`pip freeze 2>/dev/null | grep "^${PADDLE_NAME}==" | sed 's/.*==//g'` -if [ -z ${INSTALLED_VERSION} ]; then +if [ -z "${INSTALLED_VERSION}" ]; then INSTALLED_VERSION="0.0.0" # not installed fi cat < Date: Mon, 26 Mar 2018 17:17:40 -0700 Subject: [PATCH 207/314] Create go_op design doc (#9389) * Create go_op design doc --- doc/fluid/design/concurrent/go_op.md | 231 +++++++++++++++++++++++++++ 1 file changed, 231 insertions(+) create mode 100644 doc/fluid/design/concurrent/go_op.md diff --git a/doc/fluid/design/concurrent/go_op.md b/doc/fluid/design/concurrent/go_op.md new file mode 100644 index 0000000000..c18b788e80 --- /dev/null +++ b/doc/fluid/design/concurrent/go_op.md @@ -0,0 +1,231 @@ +# go_op Design + +## Introduction + +The **go_op** allows user's of PaddlePaddle to run program blocks on a detached +thread. It works in conjuction with CSP operators (channel_send, +channel_receive, channel_open, channel_close, and select) to allow users to +concurrently process data and communicate easily between different threads. + +## How to use it + +``` +channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) + +with fluid.Go(): + # Send a tensor of value 99 to "channel" on a detached thread + tensor = fill_constant(shape=[1], dtype='int', value=99) + tensor.stop_gradient = True + fluid.channel_send(channel, tensor) + +# Receive sent tensor from "channel" on the main thread +result = fill_constant(shape=[1], dtype='int', value=-1) +fluid.channel_recv(ch, result) +``` + +The go operator can be accessed by using the fluid.Go() control flow. This +will create a new sub block, where the user can add additional operators +to be ran on the thread. + +**Note:** Since back propegation is currently not support in the go_op, users +should ensure that operators in the go block does not require gradient +calculations. + +## How it Works + +Similar to other control blocks, go_op will create a sub block and add it +as a child to the current block. Operators and variables defined in this +block will be added to the go sub_block. + +In addition, the go operator will create a new child scope whose parent is +the global scope. Please refer to [block captures](#block-captures) for more +information. + +When Paddle executor runs go_op, go_op will take the sub_block and pass it to +the executor.run method (along with a newly created local scope) on a detached +thread. + +An example of the generated program description is shown below. Take note of +the **go_op** in particular. It is added as an operator in the current +block (in this example, block0). The **go_op** contains a `sub_block` +attribute, which points to the id of the block that will be executed in a +detached thread. + +``` +blocks { + idx: 0 + parent_idx: -1 + vars { + name: "return_value" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: INT64 + } + } + } + } + vars { + name: "status_recv" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: BOOL + } + } + } + } + ... + ops { + outputs { + parameter: "Out" + arguments: "channel" + } + type: "channel_create" + attrs { + name: "data_type" + type: INT + i: 7 + } + attrs { + name: "capacity" + type: INT + i: 0 + } + } + ops { + inputs { + parameter: "X" + arguments: "channel" + } + type: "go" + attrs { + name: "sub_block" + type: BLOCK + block_idx: 1 + } + } + ops { + inputs { + parameter: "Channel" + arguments: "channel" + } + outputs { + parameter: "Out" + arguments: "return_value" + } + outputs { + parameter: "Status" + arguments: "status_recv" + } + type: "channel_recv" + } + ... +} + +blocks { + idx: 1 + parent_idx: 0 + vars { + name: "status" + type { + type: LOD_TENSOR + lod_tensor { + tensor { + data_type: BOOL + } + } + } + } + ... + + ops { + outputs { + parameter: "Out" + arguments: "fill_constant_1.tmp_0" + } + type: "fill_constant" + attrs { + name: "force_cpu" + type: BOOLEAN + b: false + } + attrs { + name: "value" + type: FLOAT + f: 99.0 + } + attrs { + name: "shape" + type: INTS + ints: 1 + } + attrs { + name: "dtype" + type: INT + i: 3 + } + } + ops { + inputs { + parameter: "Channel" + arguments: "channel" + } + inputs { + parameter: "X" + arguments: "fill_constant_1.tmp_0" + } + outputs { + parameter: "Status" + arguments: "status" + } + type: "channel_send" + attrs { + name: "copy" + type: BOOLEAN + b: false + } + } +``` + +## Current Limitations + +#### Scopes and block captures: + +Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a +block. When a block is executed, a new local scope is created from the parent +scope (ie: scope derived from the parent block) and associated with the new +child block. After the block finishes executing, then the local scope and +all associated variables in the scope is deleted. + +This works well in a single threaded scenario, however with introduction of +go_op, a child block may continue to execute even after the parent block has +exited. If the go_op tries to access variables located in the parent block's +scope, it may receive a segmentation fault because the parent scope may have +been deleted. + +We need to implement block closures in order to prevent access to parent +scope variables from causing a segmentation fault. As a temporary workaround, +please ensure that all variables accessed in the go block is not destructed +before it is being accessed. Currently, the go_op will explicitly enforce +this requirement and raise an exception if a variable could not be found in +the scope. + +Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502) +for more details. + +#### Green Threads + +Golang utilizes `green threads`, which is a mechnism for the runtime library to +manage multiple threads (instead of natively by the OS). Green threads usually +allows for faster thread creation and switching, as there is less overhead +when spawning these threads. For the first version of CSP, we only support +OS threads. + + +#### Backward Propegation: + +go_op currently does not support backwards propagation. Please use go_op with +non training operators. From 65534c47625239ce68b5e5c02ae72c3bb1532214 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Mon, 26 Mar 2018 19:11:54 -0700 Subject: [PATCH 208/314] Fluid channels should match the semantics of Go Channels (#9265) * Fluid Channel should match Go Channel in Semantics * Fix Python channel_send * Address code rveiew feedback * Fix open_files_op.cc * Add description to Channel Asserts --- paddle/fluid/framework/channel.h | 93 +++++++++++-------- paddle/fluid/framework/channel_impl.h | 35 ++++--- paddle/fluid/framework/channel_test.cc | 93 +++++++++++++++---- paddle/fluid/operators/channel_send_op.cc | 25 +---- .../operators/concurrency/channel_util.cc | 14 +-- .../operators/concurrency/channel_util.h | 2 +- .../reader/create_double_buffer_reader_op.cc | 4 +- .../fluid/operators/reader/open_files_op.cc | 9 +- python/paddle/fluid/concurrency.py | 15 +-- 9 files changed, 172 insertions(+), 118 deletions(-) diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index adfaba26ac..019bea600f 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -34,7 +34,7 @@ class Channel { public: virtual bool CanSend() = 0; virtual bool CanReceive() = 0; - virtual bool Send(T*) = 0; + virtual void Send(T*) = 0; virtual bool Receive(T*) = 0; virtual size_t Cap() = 0; virtual void Lock() = 0; @@ -84,69 +84,81 @@ class ChannelHolder { } template - bool Send(T* data) { - if (!IsInitialized()) return false; - PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T))); + void Send(T* data) { + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + PADDLE_ENFORCE_EQ( + holder_->Type(), std::type_index(typeid(T)), + "Channel type is not same as the type of the data being sent"); // Static cast should be safe because we have ensured that types are same Channel* channel = static_cast*>(holder_->Ptr()); - return channel != nullptr ? channel->Send(data) : false; + PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null."); + channel->Send(data); } template bool Receive(T* data) { - if (!IsInitialized()) return false; - PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T))); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + PADDLE_ENFORCE_EQ( + holder_->Type(), std::type_index(typeid(T)), + "Channel type is not same as the type of the data being sent"); Channel* channel = static_cast*>(holder_->Ptr()); - return channel != nullptr ? channel->Receive(data) : false; + PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null."); + return channel->Receive(data); } bool IsClosed() { - if (IsInitialized()) { - return holder_->IsClosed(); - } - return false; + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + return holder_->IsClosed(); } bool CanSend() { - if (IsInitialized()) { - return holder_->CanSend(); - } - return false; + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + return holder_->CanSend(); } bool CanReceive() { - if (IsInitialized()) { - return holder_->CanReceive(); - } - return false; + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + return holder_->CanReceive(); } void close() { - if (IsInitialized()) holder_->Close(); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->Close(); } size_t Cap() { - if (IsInitialized()) return holder_->Cap(); - return -1; + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + return holder_->Cap(); } void Lock() { - if (IsInitialized()) holder_->Lock(); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->Lock(); } void Unlock() { - if (IsInitialized()) holder_->Unlock(); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->Unlock(); } template void AddToSendQ(const void* referrer, T* data, std::shared_ptr cond, std::function cb) { - if (IsInitialized()) { - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToSendQ(referrer, data, cond, cb); - } + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + Channel* channel = static_cast*>(holder_->Ptr()); + if (channel != nullptr) { + channel->AddToSendQ(referrer, data, cond, cb); } } @@ -154,26 +166,31 @@ class ChannelHolder { void AddToReceiveQ(const void* referrer, T* data, std::shared_ptr cond, std::function cb) { - if (IsInitialized()) { - Channel* channel = static_cast*>(holder_->Ptr()); - if (channel != nullptr) { - channel->AddToReceiveQ(referrer, data, cond, cb); - } + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + Channel* channel = static_cast*>(holder_->Ptr()); + if (channel != nullptr) { + channel->AddToReceiveQ(referrer, data, cond, cb); } } void RemoveFromSendQ(const void* referrer) { - if (IsInitialized()) holder_->RemoveFromSendQ(referrer); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->RemoveFromSendQ(referrer); } void RemoveFromReceiveQ(const void* referrer) { - if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); + holder_->RemoveFromReceiveQ(referrer); } inline bool IsInitialized() const { return holder_ != nullptr; } inline const std::type_index Type() { - PADDLE_ENFORCE_EQ(IsInitialized(), true); + PADDLE_ENFORCE_EQ(IsInitialized(), true, + "The Channel hasn't been initialized"); return holder_->Type(); } diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h index 457abbf373..378a0bab1c 100644 --- a/paddle/fluid/framework/channel_impl.h +++ b/paddle/fluid/framework/channel_impl.h @@ -31,7 +31,7 @@ class ChannelImpl : public paddle::framework::Channel { public: virtual bool CanSend(); virtual bool CanReceive(); - virtual bool Send(T *); + virtual void Send(T *); virtual bool Receive(T *); virtual size_t Cap() { return cap_; } virtual void Lock(); @@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel { } }; - bool send_return(bool value) { + void send_return() { send_ctr--; destructor_cond_.notify_all(); - return value; } bool recv_return(bool value) { @@ -118,15 +117,15 @@ bool ChannelImpl::CanReceive() { } template -bool ChannelImpl::Send(T *item) { +void ChannelImpl::Send(T *item) { send_ctr++; std::unique_lock lock{mu_}; - // If channel is closed, do nothing + // If channel is closed, throw exception if (closed_) { lock.unlock(); - // TODO(abhinavarora) Should panic on closed channel - return send_return(false); + send_return(); + PADDLE_THROW("Cannot send on closed channel"); } // If there is a receiver, directly pass the value we want @@ -143,7 +142,7 @@ bool ChannelImpl::Send(T *item) { if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND); if (do_send) *(m->data) = std::move(*item); - else + else { // We cannot do the data transfer because // this QueueMessage was added by Select // and some other case was executed. @@ -151,12 +150,17 @@ bool ChannelImpl::Send(T *item) { // We do not care about notifying other // because they would have been notified // by the executed select case. - return send_return(Send(item)); + lock.unlock(); + Send(item); + send_return(); + return; + } // Wake up the blocked process and unlock m->Notify(); lock.unlock(); - return send_return(true); + send_return(); + return; } // Unbuffered channel will always bypass this @@ -167,7 +171,8 @@ bool ChannelImpl::Send(T *item) { buf_.push_back(std::move(*item)); // Release lock and return true lock.unlock(); - return send_return(true); + send_return(); + return; } // Block on channel, because some receiver will complete @@ -175,8 +180,12 @@ bool ChannelImpl::Send(T *item) { auto m = std::make_shared(item); sendq.push_back(m); m->Wait(lock); - // TODO(abhinavarora) Should panic on closed channel - return send_return(!m->chan_closed); + if (m->chan_closed) { + lock.unlock(); + send_return(); + PADDLE_THROW("Cannot send on closed channel"); + } + send_return(); } template diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc index 73be5cdbe2..e2380bb54b 100644 --- a/paddle/fluid/framework/channel_test.cc +++ b/paddle/fluid/framework/channel_test.cc @@ -16,7 +16,6 @@ limitations under the License. */ #include #include - #include "gtest/gtest.h" using paddle::framework::Channel; @@ -41,7 +40,7 @@ void RecevingOrderEqualToSendingOrder(Channel *ch) { unsigned sum_send = 0; std::thread t([&]() { for (int i = 0; i < 5; i++) { - EXPECT_EQ(ch->Send(&i), true); + ch->Send(&i); sum_send += i; } }); @@ -61,7 +60,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) { const size_t buffer_size = 10; auto ch = MakeChannel(buffer_size); for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Send(&i), true); // should not block + ch->Send(&i); } size_t out; @@ -82,7 +81,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { const size_t data = 5; std::thread send_thread{[&]() { size_t i = data; - EXPECT_EQ(ch->Send(&i), true); // should not block + ch->Send(&i); // should not block }}; std::thread recv_thread{[&]() { @@ -94,12 +93,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel *ch) { send_thread.join(); recv_thread.join(); - // After closing send should return false. Receive should - // also return false as there is no data in queue. + // After closing send should panic. Receive should + // also false as there is no data in queue. CloseChannel(ch); send_thread = std::thread{[&]() { size_t i = data; - EXPECT_EQ(ch->Send(&i), false); // should return false + bool is_exception = false; + try { + ch->Send(&i); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + EXPECT_EQ(is_exception, true); }}; recv_thread = std::thread{[&]() { size_t i; @@ -129,7 +134,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { auto ch = MakeChannel(buffer_size); for (size_t i = 0; i < buffer_size; ++i) { - EXPECT_EQ(ch->Send(&i), true); // sending should not block + ch->Send(&i); // sending should not block } size_t out; @@ -160,9 +165,16 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { // Try to write more than buffer size. for (size_t i = 0; i < 2 * buffer_size; ++i) { if (i < buffer_size) - EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations - else - EXPECT_EQ(ch->Send(&i), false); + ch->Send(&i); // should block after 10 iterations + else { + bool is_exception = false; + try { + ch->Send(&i); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + EXPECT_EQ(is_exception, true); + } } }); std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec @@ -231,7 +243,13 @@ void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) { t[i] = std::thread( [&](bool *ended, bool *success) { int data = 10; - *success = ch->Send(&data); + bool is_exception = false; + try { + ch->Send(&data); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + *success = !is_exception; *ended = true; }, &thread_ended[i], &send_success[i]); @@ -316,8 +334,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) { // Try to send more number of times // than receivers for (int i = 0; i < 4; i++) { - ch->Send(&i); - sum_send += i; + try { + ch->Send(&i); + sum_send += i; + } catch (paddle::platform::EnforceNotMet e) { + } } }); for (int i = 0; i < 3; i++) { @@ -382,7 +403,13 @@ void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) { t[i] = std::thread( [&](bool *ended, bool *success) { int data = 10; - *success = ch->Send(&data); + bool is_exception = false; + try { + ch->Send(&data); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + *success = !is_exception; *ended = true; }, &thread_ended[i], &send_success[i]); @@ -508,7 +535,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) { unsigned sum_send = 0; std::thread t([&]() { for (int i = 0; i < 5; i++) { - EXPECT_EQ(ch->Send(&i), true); + ch->Send(&i); sum_send += i; } }); @@ -541,8 +568,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) { ChannelHolder *ch = new ChannelHolder(); EXPECT_EQ(ch->IsInitialized(), false); int i = 10; - EXPECT_EQ(ch->Send(&i), false); - EXPECT_EQ(ch->Receive(&i), false); + bool send_exception = false; + try { + ch->Send(&i); + } catch (paddle::platform::EnforceNotMet e) { + send_exception = true; + } + EXPECT_EQ(send_exception, true); + + bool recv_exception = false; + try { + ch->Receive(&i); + } catch (paddle::platform::EnforceNotMet e) { + recv_exception = true; + } + EXPECT_EQ(recv_exception, true); + bool is_exception = false; try { ch->Type(); @@ -669,7 +710,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { t[i] = std::thread( [&](bool *ended, bool *success) { int data = 10; - *success = ch->Send(&data); + bool is_exception = false; + try { + ch->Send(&data); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + *success = !is_exception; *ended = true; }, &thread_ended[i], &send_success[i]); @@ -760,7 +807,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { t[i] = std::thread( [&](bool *ended, bool *success) { int data = 10; - *success = ch->Send(&data); + bool is_exception = false; + try { + ch->Send(&data); + } catch (paddle::platform::EnforceNotMet e) { + is_exception = true; + } + *success = !is_exception; *ended = true; }, &thread_ended[i], &send_success[i]); diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc index 47cf7d7efc..66d33617ed 100644 --- a/paddle/fluid/operators/channel_send_op.cc +++ b/paddle/fluid/operators/channel_send_op.cc @@ -23,21 +23,10 @@ limitations under the License. */ static constexpr char Channel[] = "Channel"; static constexpr char X[] = "X"; -static constexpr char Status[] = "Status"; -static constexpr char copy[] = "copy"; namespace paddle { namespace operators { -void SetSendStatus(const platform::Place &dev_place, - framework::Variable &status_var, bool status) { - auto cpu = platform::CPUPlace(); - auto status_tensor = - status_var.GetMutable()->mutable_data({1}, - cpu); - status_tensor[0] = status; -} - class ChannelSendOp : public framework::OperatorBase { public: ChannelSendOp(const std::string &type, @@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase { "Input(Channel) of ChannelSendOp should not be null."); PADDLE_ENFORCE(ctx->HasInput(X), "Input(X) of ChannelSendOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput(Status), - "Output(Status) of ChannelSendOp should not be null."); - ctx->SetOutputDim("Status", {1}); } private: @@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase { auto input_var = scope.FindVar(Input(X)); // Send the input data through the channel. - bool ok = concurrency::ChannelSend(ch, input_var); - - // Set the status output of the `ChannelSend` call. - SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok); + concurrency::ChannelSend(ch, input_var); } }; @@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddInput(X, "(Variable) The value which gets sent by the channel.") .AsDuplicable(); - AddOutput(Status, - "(Tensor) An LoD Tensor that returns a boolean status of the" - "result of the send operation.") - .AsDuplicable(); - AddAttr(copy, "(bool, default false) Should copy before send") - .SetDefault(false); AddComment(R"DOC( )DOC"); } diff --git a/paddle/fluid/operators/concurrency/channel_util.cc b/paddle/fluid/operators/concurrency/channel_util.cc index a483af7aff..246c99489c 100644 --- a/paddle/fluid/operators/concurrency/channel_util.cc +++ b/paddle/fluid/operators/concurrency/channel_util.cc @@ -17,20 +17,20 @@ limitations under the License. */ namespace poc = paddle::operators::concurrency; -bool poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { +void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { auto type = framework::ToVarType(var->Type()); if (type == framework::proto::VarType_Type_LOD_TENSOR) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_SELECTED_ROWS) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_READER) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else if (type == framework::proto::VarType_Type_CHANNEL) - return ch->Send(var->GetMutable()); + ch->Send(var->GetMutable()); else PADDLE_THROW("ChannelSend:Unsupported type"); } diff --git a/paddle/fluid/operators/concurrency/channel_util.h b/paddle/fluid/operators/concurrency/channel_util.h index c3674bd981..cd18ca78c6 100644 --- a/paddle/fluid/operators/concurrency/channel_util.h +++ b/paddle/fluid/operators/concurrency/channel_util.h @@ -21,7 +21,7 @@ namespace paddle { namespace operators { namespace concurrency { -bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); +void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 76cdb794cc..141a3eb935 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -166,7 +166,9 @@ void DoubleBufferReader::PrefetchThreadFunc() { std::swap(gpu_batch, batch.payloads_); } - if (!buffer_->Send(&batch)) { + try { + buffer_->Send(&batch); + } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The double buffer channel has been closed. The " "prefetch thread will terminate."; break; diff --git a/paddle/fluid/operators/reader/open_files_op.cc b/paddle/fluid/operators/reader/open_files_op.cc index 414c76fea0..b6ac7b21d5 100644 --- a/paddle/fluid/operators/reader/open_files_op.cc +++ b/paddle/fluid/operators/reader/open_files_op.cc @@ -146,14 +146,19 @@ void MultipleReader::PrefetchThreadFunc(std::string file_name, while (reader->HasNext()) { std::vector ins; reader->ReadNext(&ins); - if (!buffer_->Send(&ins)) { + try { + buffer_->Send(&ins); + } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch " "thread of file '" << file_name << "' will terminate."; break; } } - if (!available_thread_idx_->Send(&thread_idx)) { + + try { + available_thread_idx_->Send(&thread_idx); + } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. " "Fail to send thread_idx."; } diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index d65e1a6858..a0f5ef2329 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -339,11 +339,6 @@ def channel_send(channel, value, is_copy=False): main_program = helper.main_program channel_send_block = main_program.current_block() - status = helper.create_variable( - name=unique_name.generate('status'), - type=core.VarDesc.VarType.LOD_TENSOR, - dtype=core.VarDesc.VarType.BOOL) - X = value if is_copy is True: @@ -359,15 +354,11 @@ def channel_send(channel, value, is_copy=False): type="assign_op", inputs={"X": value}, outputs={"Out": copied_X}) X = copied_X - channel_send_op = channel_send_block.append_op( - type="channel_send", - inputs={ + channel_send_block.append_op( + type="channel_send", inputs={ "Channel": channel, "X": X, - }, - outputs={"Status": status}) - - return status + }) def channel_recv(channel, return_value): From c7bf77d0e14ca1ec8caac53badb4f80adb8b02d1 Mon Sep 17 00:00:00 2001 From: Thuan Nguyen Date: Mon, 26 Mar 2018 19:18:21 -0700 Subject: [PATCH 209/314] Add in is_copy attribute to SelectCase. (#9393) This is a temporary solution to allowing for variables to be copied during a channel send operations. Also fixed issue with is_copy for "channel_send" method, and also updated unit tests. --- python/paddle/fluid/concurrency.py | 41 ++++++++++++++----- python/paddle/fluid/tests/test_concurrency.py | 23 ++--------- 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py index a0f5ef2329..470dd0df52 100644 --- a/python/paddle/fluid/concurrency.py +++ b/python/paddle/fluid/concurrency.py @@ -82,11 +82,14 @@ class SelectCase(object): RECEIVE = 2 def __init__(self, + select, case_idx, case_to_execute, channel_action_fn=None, channel=None, - value=None): + value=None, + is_copy=False): + self.select = select self.helper = LayerHelper('conditional_block') self.main_program = self.helper.main_program self.is_scalar_condition = True @@ -99,7 +102,24 @@ class SelectCase(object): self.action = (self.SEND if channel_action_fn.__name__ == ('channel_send') else self.RECEIVE) if channel_action_fn else self.DEFAULT - self.value = value + + X = value + if self.action == self.SEND and is_copy: + # We create of copy of the data we want to send + copied_X = self.select.parent_block.create_var( + name=unique_name.generate(value.name + '_copy'), + type=value.type, + dtype=value.dtype, + shape=value.shape, + lod_level=value.lod_level, + capacity=value.capacity + if hasattr(value, 'capacity') else None, ) + + self.select.parent_block.append_op( + type="assign", inputs={"X": value}, outputs={"Out": copied_X}) + X = copied_X + + self.value = X self.channel = channel def __enter__(self): @@ -173,6 +193,7 @@ class SelectCase(object): class Select(BlockGuard): def __init__(self, name=None): self.helper = LayerHelper('select', name=name) + self.parent_block = self.helper.main_program.current_block() self.cases = [] super(Select, self).__init__(self.helper.main_program) @@ -183,12 +204,12 @@ class Select(BlockGuard): super(Select, self).__enter__() return self - def case(self, channel_action_fn, channel, value): + def case(self, channel_action_fn, channel, value, is_copy=False): """Create a new block for this condition. """ - select_case = SelectCase( - len(self.cases), self.case_to_execute, channel_action_fn, channel, - value) + select_case = SelectCase(self, + len(self.cases), self.case_to_execute, + channel_action_fn, channel, value, is_copy) self.cases.append(select_case) @@ -197,7 +218,7 @@ class Select(BlockGuard): def default(self): """Create a default case block for this condition. """ - default_case = SelectCase(len(self.cases), self.case_to_execute) + default_case = SelectCase(self, len(self.cases), self.case_to_execute) self.cases.append(default_case) @@ -341,17 +362,17 @@ def channel_send(channel, value, is_copy=False): X = value - if is_copy is True: + if is_copy: copied_X = helper.create_variable( name=unique_name.generate(value.name + '_copy'), type=value.type, dtype=value.dtype, shape=value.shape, lod_level=value.lod_level, - capacity=value.capacity) + capacity=value.capacity if hasattr(value, 'capacity') else None) assign_op = channel_send_block.append_op( - type="assign_op", inputs={"X": value}, outputs={"Out": copied_X}) + type="assign", inputs={"X": value}, outputs={"Out": copied_X}) X = copied_X channel_send_block.append_op( diff --git a/python/paddle/fluid/tests/test_concurrency.py b/python/paddle/fluid/tests/test_concurrency.py index 924895a9af..e8f6cfb4a9 100644 --- a/python/paddle/fluid/tests/test_concurrency.py +++ b/python/paddle/fluid/tests/test_concurrency.py @@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase): with while_op.block(): result2 = fill_constant( shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - x_to_send_tmp = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) - - # TODO(abhinav): Need to perform copy when doing a channel send. - # Once this is complete, we can remove these lines - assign(input=x, output=x_to_send_tmp) with fluid.Select() as select: - with select.case(fluid.channel_send, channel, - x_to_send_tmp): + with select.case( + fluid.channel_send, channel, x, is_copy=True): assign(input=x, output=x_tmp) assign(input=y, output=x) assign(elementwise_add(x=x_tmp, y=y), output=y) @@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase): core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.FP64) - pong_result = self._create_tensor('pong_return_value', - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.FP64) - def ping(ch, message): - message_to_send_tmp = fill_constant( - shape=[1], dtype=core.VarDesc.VarType.FP64, value=0) - - assign(input=message, output=message_to_send_tmp) - fluid.channel_send(ch, message_to_send_tmp) + fluid.channel_send(ch, message, is_copy=True) def pong(ch1, ch2): fluid.channel_recv(ch1, ping_result) - assign(input=ping_result, output=pong_result) - fluid.channel_send(ch2, pong_result) + fluid.channel_send(ch2, ping_result, is_copy=True) pings = fluid.make_channel( dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) From e0b5691e41f8dd28bdbf8d4ca7140824f918bec8 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Tue, 27 Mar 2018 11:10:53 +0800 Subject: [PATCH 210/314] Add drop_out_op unit test (#9364) --- paddle/fluid/operators/CMakeLists.txt | 1 + paddle/fluid/operators/dropout_op.cu | 5 +- paddle/fluid/operators/dropout_op_test.cc | 96 +++++++++++++++++++++++ 3 files changed, 99 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/dropout_op_test.cc diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9a11e1be70..8341170d68 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -264,3 +264,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) +nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) diff --git a/paddle/fluid/operators/dropout_op.cu b/paddle/fluid/operators/dropout_op.cu index 94382739b5..184c095e48 100644 --- a/paddle/fluid/operators/dropout_op.cu +++ b/paddle/fluid/operators/dropout_op.cu @@ -55,9 +55,6 @@ class GPUDropoutKernel : public framework::OpKernel { y->mutable_data(context.GetPlace()); float dropout_prob = context.Attr("dropout_prob"); - auto X = EigenMatrix::Reshape(*x, 1); - auto Y = EigenMatrix::Reshape(*y, 1); - auto& place = *context.template device_context().eigen_device(); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); @@ -76,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel { T><<>>( size, seed, dropout_prob, x_data, mask_data, y_data); } else { + auto X = EigenMatrix::Reshape(*x, 1); + auto Y = EigenMatrix::Reshape(*y, 1); Y.device(place) = X * static_cast(1.0f - dropout_prob); } } diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc new file mode 100644 index 0000000000..db97ba4f64 --- /dev/null +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(dropout); + +void Compare(f::Scope& scope, p::DeviceContext& ctx) { + // init + auto var = scope.Var("X"); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + + std::vector init; + for (int64_t i = 0; i < 10 * 10; ++i) { + init.push_back(1.0); + } + + TensorFromVector(init, ctx, tensor); + + auto place = ctx.GetPlace(); + auto out_var = scope.Var("Out"); + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({10, 10}); + out_tensor->mutable_data(place); // allocate + + auto mask_var = scope.Var("Mask"); + auto mask_tensor = mask_var->GetMutable(); + mask_tensor->Resize({10, 10}); + mask_tensor->mutable_data(place); // allocate + + // run + f::AttributeMap attrs; + float dropout_prob = 0.5; + attrs.insert({"fix_seed", 1}); + attrs.insert({"seed", 3}); + attrs.insert({"dropout_prob", dropout_prob}); + auto dropout_op = f::OpRegistry::CreateOp( + "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); + + dropout_op->Run(scope, place); + + std::vector out_vec; + TensorToVector(*out_tensor, ctx, &out_vec); + + std::vector std_out = { + 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, + 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, + 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, + 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, + 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1}; + + EXPECT_EQ(out_vec.size(), std_out.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], std_out[i]); + } +} + +TEST(Dropout, CPUDense) { + f::Scope scope; + p::CPUPlace place; + p::CPUDeviceContext ctx(place); + Compare(scope, ctx); +} + +TEST(Dropout, GPUDense) { + f::Scope scope; + p::CUDAPlace place; + p::CUDADeviceContext ctx(place); + Compare(scope, ctx); +} From 123cf165fb031e8e0e9170c17ba59deb95e9dc76 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Tue, 27 Mar 2018 11:11:24 +0800 Subject: [PATCH 211/314] Set stop_gradient=True for some variables in SSD API. (#9396) --- python/paddle/fluid/layers/detection.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index cd519e1ee0..3e649dc5fd 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -134,6 +134,7 @@ def detection_output(loc, scores = nn.softmax(input=scores) scores = ops.reshape(x=scores, shape=old_shape) scores = nn.transpose(scores, perm=[0, 2, 1]) + scores.stop_gradient = True nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) helper.append_op( type="multiclass_nms", @@ -148,6 +149,7 @@ def detection_output(loc, 'score_threshold': score_threshold, 'nms_eta': 1.0 }) + nmsed_outs.stop_gradient = True return nmsed_outs @@ -837,4 +839,6 @@ def multi_box_head(inputs, mbox_locs_concat = tensor.concat(mbox_locs, axis=1) mbox_confs_concat = tensor.concat(mbox_confs, axis=1) + box.stop_gradient = True + var.stop_gradient = True return mbox_locs_concat, mbox_confs_concat, box, var From aba46f077baf028530d92621afb26fcf2382258a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 11:23:28 +0800 Subject: [PATCH 212/314] Disable P2P --- paddle/fluid/framework/init.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc index 3c0d93642a..c30bf9037b 100644 --- a/paddle/fluid/framework/init.cc +++ b/paddle/fluid/framework/init.cc @@ -85,7 +85,7 @@ void InitDevices() { for (int i = 0; i < count; ++i) { places.emplace_back(platform::CUDAPlace(i)); } - InitP2P(count); + // InitP2P(count); platform::DeviceContextPool::Init(places); } From 833e522d1661624662ec39da2acd1a0f8704fc70 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 12:12:20 +0800 Subject: [PATCH 213/314] Enhance drop kids --- .../fluid/framework/details/threaded_ssa_graph_executor.cc | 5 ++--- paddle/fluid/framework/details/threaded_ssa_graph_executor.h | 1 - 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 482c32f894..d9b855503b 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -170,8 +170,8 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto p : this->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - for (auto &drop_fn : this->drop_functions_) { - drop_fn(); + for (auto &scope : local_scopes_) { + scope->DropKids(); } }; @@ -189,7 +189,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Drop tmp scopes; for (auto &scope : local_scopes_) { auto &kid = *scope->Var("@TMP_SCOPE@")->GetMutable(); - this->drop_functions_.emplace_back([=] { scope->DeleteScope(kid); }); kid = nullptr; } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index fecad00e18..14b10cd0eb 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -52,7 +52,6 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { size_t computation_count_{0}; size_t max_async_computation{100}; - std::vector> drop_functions_; }; } // namespace details From 68c199432b67049e39be585979c0af35c9f06c10 Mon Sep 17 00:00:00 2001 From: m3ngyang Date: Tue, 27 Mar 2018 12:31:02 +0800 Subject: [PATCH 214/314] fix typo --- doc/v2/faq/cluster/index_en.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/doc/v2/faq/cluster/index_en.rst b/doc/v2/faq/cluster/index_en.rst index 7cbcaeefcb..fa942a0962 100644 --- a/doc/v2/faq/cluster/index_en.rst +++ b/doc/v2/faq/cluster/index_en.rst @@ -4,13 +4,13 @@ Cluster Training and Prediction .. contents:: -1. Network connection errors in the log during muliti-node cluster training +1. Network connection errors in the log during multi-node cluster training ------------------------------------------------ -The errors in the log belong to network connection during mulilti-node cluster training, for example, :code:`Connection reset by peer`. -This kind of error is usually caused by the abnormal exit of the training process in some node, and the others cannot connect with this node any longer. Steps to troubleshoot the problem as follows: +There are maybe some errors in the log belonging to network connection problem during multi-node cluster training, for example, :code:`Connection reset by peer`. +This kind of error is usually caused by the abnormal exit of a training process in some node, and the other nodes cannot connect with this node any longer. Steps to troubleshoot the problem are as follows: * Find the first error in the :code:`train.log`, :code:`server.log`, check whether other fault casued the problem, such as FPE, lacking of memory or disk. -* If network connection gave rise to the first error in the log, this may be caused by the port conflict of the non-exclusive execution. Connect with the operator to check if the current MPI cluster supports jobs submitted with parameter :code:`resource=full`. If so, change the port of job. +* If the first error in server.log says "Address already used", this may be caused by the port conflict of the non-exclusive execution. Connect the sys-admin to check if the current MPI cluster supports jobs submitted with parameter :code:`resource=full`. If the current MPI cluster does not support this parameter, change the server port and try agian. -* If the currnet MPI cluster does not support exclusive pattern, ask the operator to replace or update the current cluster. +* If the current MPI cluster does not support exclusive pattern which allows a process to occupy the whole node, ask the administrator to replace or update the this cluster. From f385228f059f77a450e4c7252359f973cc6d6321 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 13:35:55 +0800 Subject: [PATCH 215/314] Add Paddle Enforce --- paddle/fluid/framework/details/op_handle_base.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index ea97aa5fb2..63affb7054 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -34,7 +34,7 @@ std::string OpHandleBase::DebugString() const { OpHandleBase::~OpHandleBase() { #ifdef PADDLE_WITH_CUDA for (auto &ev : events_) { - cudaEventDestroy(ev.second); + PADDLE_ENFORCE(cudaEventDestroy(ev.second)); } #endif } @@ -44,8 +44,9 @@ void OpHandleBase::Run(bool use_event) { if (events_.empty() && use_event) { for (auto &p : dev_ctx_) { int dev_id = boost::get(p.first).device; - cudaSetDevice(dev_id); - cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming); + PADDLE_ENFORCE(cudaSetDevice(dev_id)); + PADDLE_ENFORCE( + cudaEventCreateWithFlags(&events_[dev_id], cudaEventDisableTiming)); } } #else @@ -60,7 +61,7 @@ void OpHandleBase::Run(bool use_event) { int dev_id = boost::get(p.first).device; auto stream = static_cast(p.second)->stream(); - cudaEventRecord(events_.at(dev_id), stream); + PADDLE_ENFORCE(cudaEventRecord(events_.at(dev_id), stream)); } } #endif From 5a02739ce9c564c728e4631c731137cd0eb99bf7 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 13:41:42 +0800 Subject: [PATCH 216/314] Throw error --- .../fluid/framework/details/threaded_ssa_graph_executor.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index d9b855503b..501e1dfad7 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -208,6 +208,11 @@ void ThreadedSSAGraphExecutor::RunOp( try { VLOG(10) << op->DebugString(); op->Run(use_event_); + + for (auto &dev_ctx : op->dev_ctx_) { + dev_ctx.second->Wait(); // Sync error + } + for (auto *ready : *ready_buffer) { ready->store(true, std::memory_order_release); } From 55e2cc3d878237b026b301a0e46c816d43703bbb Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 13:49:45 +0800 Subject: [PATCH 217/314] FetchOp Force sync --- paddle/fluid/framework/details/fetch_op_handle.cc | 4 +++- paddle/fluid/framework/details/threaded_ssa_graph_executor.cc | 4 ---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index c697a1c937..03323e3da7 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -47,9 +47,11 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const { } void FetchOpHandle::RunImpl() { + auto cpu_ctx = + platform::DeviceContextPool::Instance().Get(platform::CPUPlace()); for (auto *input : inputs_) { auto *var = static_cast(input); - var->generated_op_->Wait(this->dev_ctx_[var->place_]); + var->generated_op_->Wait(cpu_ctx); } tensors_.resize(inputs_.size()); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 501e1dfad7..7d1f7e46b8 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -209,10 +209,6 @@ void ThreadedSSAGraphExecutor::RunOp( VLOG(10) << op->DebugString(); op->Run(use_event_); - for (auto &dev_ctx : op->dev_ctx_) { - dev_ctx.second->Wait(); // Sync error - } - for (auto *ready : *ready_buffer) { ready->store(true, std::memory_order_release); } From b6ca3711b4efad23afb13d5d3ca72d462550d7b0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 13:52:16 +0800 Subject: [PATCH 218/314] Get error --- paddle/fluid/framework/details/op_handle_base.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 63affb7054..07a4b89217 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -33,6 +33,9 @@ std::string OpHandleBase::DebugString() const { OpHandleBase::~OpHandleBase() { #ifdef PADDLE_WITH_CUDA + for (auto &ctx : dev_ctx_) { + ctx.second->Wait(); + } for (auto &ev : events_) { PADDLE_ENFORCE(cudaEventDestroy(ev.second)); } From 76570c2e969df26fff28f22e1d6e8fe18cf5e45c Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 13:56:14 +0800 Subject: [PATCH 219/314] Wait fetch op --- paddle/fluid/framework/details/fetch_op_handle.cc | 1 + paddle/fluid/framework/details/op_handle_base.cc | 3 --- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 03323e3da7..26c09eb8eb 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -66,6 +66,7 @@ void FetchOpHandle::RunImpl() { if (platform::is_gpu_place(var->place_)) { #ifdef PADDLE_WITH_CUDA TensorCopy(t, cpu, *dev_ctx_[t.place()], &tensors_[i]); + dev_ctx_[t.place()]->Wait(); #endif } else { tensors_[i].ShareDataWith(t); diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 07a4b89217..63affb7054 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -33,9 +33,6 @@ std::string OpHandleBase::DebugString() const { OpHandleBase::~OpHandleBase() { #ifdef PADDLE_WITH_CUDA - for (auto &ctx : dev_ctx_) { - ctx.second->Wait(); - } for (auto &ev : events_) { PADDLE_ENFORCE(cudaEventDestroy(ev.second)); } From 222763296f31ff723260155ad0b0169c285212cd Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 14:02:16 +0800 Subject: [PATCH 220/314] Change fetch op --- paddle/fluid/framework/details/fetch_op_handle.cc | 7 ++----- .../framework/details/threaded_ssa_graph_executor.cc | 9 +-------- 2 files changed, 3 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 26c09eb8eb..9ed974151f 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -33,11 +33,6 @@ void FetchOpHandle::Wait(platform::DeviceContext *waited_dev) { } void FetchOpHandle::WaitAndMergeCPUTensors() const { - // Wait fetch stream done. - for (auto &ctx : dev_ctx_) { - ctx.second->Wait(); - } - std::vector tensors_ptr; tensors_ptr.reserve(tensors_.size()); for (auto &t : tensors_) { @@ -72,6 +67,8 @@ void FetchOpHandle::RunImpl() { tensors_[i].ShareDataWith(t); tensors_[i].set_lod(t.lod()); } + + this->WaitAndMergeCPUTensors(); } } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 7d1f7e46b8..7cfd668379 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -96,12 +96,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto *var : vars) { op->AddInput(var); } - - dummy_vars.emplace_back(); - auto *var = &dummy_vars.back(); - var->generated_op_ = nullptr; - op->AddOutput(var); - InsertPendingVar(*var); InsertPendingOp(*op); } @@ -176,8 +170,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( }; // Wait FetchOps. - for (auto &fetch_op : fetch_ops) { - fetch_op.WaitAndMergeCPUTensors(); + if (!fetch_ops.empty()) { sync_computation(); } From 9af870854e99c4eba22506b085cdb1b521f70f20 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 14:30:58 +0800 Subject: [PATCH 221/314] Use heap variables --- paddle/fluid/framework/details/op_handle_base.h | 10 +++++++++- .../framework/details/threaded_ssa_graph_executor.cc | 9 ++++----- .../fluid/tests/unittests/test_parallel_executor.py | 3 +++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 99d8968486..78f566c035 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -16,11 +16,17 @@ #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/macros.h" + namespace paddle { namespace framework { namespace details { -struct OpHandleBase { +class OpHandleBase { + private: + DISABLE_COPY_AND_ASSIGN(OpHandleBase); + + public: std::vector inputs_; std::vector outputs_; std::unordered_map events_; #endif + OpHandleBase() {} + std::string DebugString() const; virtual std::string Name() const = 0; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 7cfd668379..41034e9f05 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -67,7 +67,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } // Step 2. Insert FetchOps - std::vector fetch_ops; + std::vector> fetch_ops; std::vector dummy_vars; FeedFetchList fetch_data(fetch_tensors.size()); @@ -84,9 +84,9 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (size_t i = 0; i < fetch_tensors.size(); ++i) { auto &var_name = fetch_tensors[i]; - auto &vars = fetched_vars[var_name]; - fetch_ops.emplace_back(&fetch_data, i, &local_scopes_); - details::FetchOpHandle *op = &fetch_ops.back(); + auto &vars = fetched_vars.at(var_name); + auto *op = new FetchOpHandle(&fetch_data, i, &local_scopes_); + fetch_ops.emplace_back(op); // FIXME: Use new device context for (auto &p : places_) { @@ -138,7 +138,6 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( for (auto &op : pending_ops) { VLOG(10) << op.first->DebugString(); } - // keep waiting the ready variables continue; } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 2e61eca068..a5eea30f87 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -231,6 +231,9 @@ class TestMNIST(TestParallelExecutorBase): class TestResnet(TestParallelExecutorBase): @classmethod def setUpClass(cls): + import os + if os.path.exists('./flowers.recordio'): + return with fluid.program_guard(fluid.Program(), fluid.Program()): reader = paddle.batch(flowers.train(), batch_size=4) feeder = fluid.DataFeeder( From dfb8680018a4b7f34f4585f82ac62815cce5f660 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 14:39:37 +0800 Subject: [PATCH 222/314] Early drop fetch op --- paddle/fluid/framework/details/threaded_ssa_graph_executor.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 41034e9f05..13789667b8 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -170,6 +170,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // Wait FetchOps. if (!fetch_ops.empty()) { + fetch_ops.clear(); sync_computation(); } From 52dd8ff09a73b37c6b1275a672b8dc8269530e8d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 14:50:05 +0800 Subject: [PATCH 223/314] Force sync dev --- paddle/fluid/framework/details/threaded_ssa_graph_executor.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 13789667b8..50c24d3afa 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -199,7 +199,7 @@ void ThreadedSSAGraphExecutor::RunOp( auto op_run = [ready_buffer, op, this] { try { - VLOG(10) << op->DebugString(); + VLOG(10) << op->Name() << " : " << op->DebugString(); op->Run(use_event_); for (auto *ready : *ready_buffer) { @@ -211,6 +211,7 @@ void ThreadedSSAGraphExecutor::RunOp( } catch (...) { LOG(FATAL) << "Unknown exception catched"; } + PADDLE_ENFORCE(cudaDeviceSynchronize()); }; if (pool_) { pool_->enqueue(op_run); From 5b92dd4026ac1afb5904646688a3a8ada6b29c65 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 15:06:07 +0800 Subject: [PATCH 224/314] Remove dev sync --- paddle/fluid/framework/details/threaded_ssa_graph_executor.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 50c24d3afa..c1a28f1d1d 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -211,7 +211,6 @@ void ThreadedSSAGraphExecutor::RunOp( } catch (...) { LOG(FATAL) << "Unknown exception catched"; } - PADDLE_ENFORCE(cudaDeviceSynchronize()); }; if (pool_) { pool_->enqueue(op_run); From c42c4a6718599126bd9e7ba7f0407db18618c9e0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 15:26:58 +0800 Subject: [PATCH 225/314] Add performance tests --- .../tests/unittests/test_parallel_executor.py | 73 ++++++++++++------- 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index a5eea30f87..727dc6a56c 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -135,14 +135,11 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -def SE_ResNeXt152(): - reader = fluid.layers.open_recordio_file( - filename='./flowers.recordio', - shapes=[[-1, 3, 224, 224], [-1, 1]], - lod_levels=[0, 0], - dtypes=['float32', 'int64']) - - img, label = fluid.layers.read_file(reader) +def SE_ResNeXt152(batch_size=4): + img = fluid.layers.fill_constant( + shape=[batch_size, 3, 224, 224], dtype='float32', value=0.0) + label = fluid.layers.fill_constant( + shape=[batch_size, 1], dtype='int64', value=0.0) conv = conv_bn_layer( input=img, num_filters=64, filter_size=3, stride=2, act='relu') @@ -179,8 +176,15 @@ def SE_ResNeXt152(): return loss +import time + + class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence(self, method, memory_opt=True, iter=10): + def check_network_convergence(self, + method, + memory_opt=True, + iter=10, + batch_size=None): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): @@ -191,6 +195,9 @@ class TestParallelExecutorBase(unittest.TestCase): fluid.memory_optimize(main) exe = fluid.ParallelExecutor(loss_name=loss.name, use_cuda=True) + if batch_size is not None: + batch_size *= fluid.core.get_cuda_device_count() + begin = time.time() first_loss, = exe.run([loss.name]) first_loss = numpy.array(first_loss) @@ -198,6 +205,12 @@ class TestParallelExecutorBase(unittest.TestCase): exe.run([]) last_loss, = exe.run([loss.name]) + end = time.time() + + if batch_size is not None: + print "%.4f Instance per second" % ( + (batch_size * iter + 2) / (end - begin)) + last_loss = numpy.array(last_loss) print first_loss, last_loss @@ -229,26 +242,32 @@ class TestMNIST(TestParallelExecutorBase): class TestResnet(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - import os - if os.path.exists('./flowers.recordio'): - return - with fluid.program_guard(fluid.Program(), fluid.Program()): - reader = paddle.batch(flowers.train(), batch_size=4) - feeder = fluid.DataFeeder( - feed_list=[ - fluid.layers.data( - name='image', shape=[3, 224, 224]), - fluid.layers.data( - name='label', shape=[1], dtype='int64'), - ], - place=fluid.CPUPlace()) - fluid.recordio_writer.convert_reader_to_recordio_file( - "./flowers.recordio", reader, feeder) + # @classmethod + # def setUpClass(cls): + # # import os + # # if os.path.exists('./flowers.recordio'): + # # return + # with fluid.program_guard(fluid.Program(), fluid.Program()): + # reader = paddle.batch(flowers.train(), batch_size=4) + # feeder = fluid.DataFeeder( + # feed_list=[ + # fluid.layers.data( + # name='image', shape=[3, 224, 224]), + # fluid.layers.data( + # name='label', shape=[1], dtype='int64'), + # ], + # place=fluid.CPUPlace()) + # fluid.recordio_writer.convert_reader_to_recordio_file( + # "./flowers.recordio", reader, feeder, compressor=fluid.core.RecordIOWriter.Compressor.NoCompress) def test_resnet(self): - self.check_network_convergence(SE_ResNeXt152, iter=200) + import functools + batch_size = 4 + self.check_network_convergence( + functools.partial( + SE_ResNeXt152, batch_size=batch_size), + iter=20, + batch_size=batch_size) class ModelHyperParams(object): From 3f88fad08ce6d7800356372e7cb20a3b70cd3208 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 15:30:57 +0800 Subject: [PATCH 226/314] Fix merge op --- paddle/fluid/framework/details/fetch_op_handle.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 9ed974151f..4fc05b3248 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -67,9 +67,9 @@ void FetchOpHandle::RunImpl() { tensors_[i].ShareDataWith(t); tensors_[i].set_lod(t.lod()); } - - this->WaitAndMergeCPUTensors(); } + + this->WaitAndMergeCPUTensors(); } std::string FetchOpHandle::Name() const { return "Fetch"; } From c0c2e15920fefb95010c86aa9654f2868d1b29fd Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 15:49:13 +0800 Subject: [PATCH 227/314] NCCL AllReduce --- paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc | 4 ---- paddle/fluid/platform/nccl_helper.h | 6 ++---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index f2303ff4ca..116b13d330 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -50,10 +50,6 @@ void NCCLAllReduceOpHandle::RunImpl() { auto &lod_tensor = s->FindVar(var_name)->Get(); void *buffer = const_cast(lod_tensor.data()); - uintptr_t buf = reinterpret_cast(buffer); - if (buf % sizeof(float) != 0) { - VLOG(3) << "Buffer is not aligned " << buf; - } if (dtype == -1) { dtype = platform::ToNCCLDataType(lod_tensor.type()); diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 2999004320..ecdd98987d 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -36,12 +36,10 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { class NCCLGroupGuard { public: - inline NCCLGroupGuard() { - mutex().lock(); - PADDLE_ENFORCE(dynload::ncclGroupStart()); - } + inline NCCLGroupGuard() { PADDLE_ENFORCE(dynload::ncclGroupStart()); } inline ~NCCLGroupGuard() { + mutex().lock(); PADDLE_ENFORCE(dynload::ncclGroupEnd()); mutex().unlock(); } From 7dcb217e3147642221b65fd20820010ebe78d316 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 15:54:12 +0800 Subject: [PATCH 228/314] Refine allreduce op --- .../details/nccl_all_reduce_op_handle.cc | 18 ++++++++++++++---- paddle/fluid/platform/nccl_helper.h | 6 ++++-- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index 116b13d330..f77a4b55a1 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -41,7 +41,7 @@ void NCCLAllReduceOpHandle::RunImpl() { int dtype = -1; size_t numel = 0; - platform::NCCLGroupGuard guard; + std::vector> all_reduce_calls; for (size_t i = 0; i < local_scopes_.size(); ++i) { auto &p = places_[i]; @@ -58,10 +58,20 @@ void NCCLAllReduceOpHandle::RunImpl() { if (numel == 0) { numel = static_cast(lod_tensor.numel()); } + auto &nccl_ctx = nccl_ctxs_.at(dev_id); - PADDLE_ENFORCE(platform::dynload::ncclAllReduce( - buffer, buffer, numel, static_cast(dtype), ncclSum, - nccl_ctx.comm_, nccl_ctx.stream())); + auto stream = nccl_ctx.stream(); + auto comm = nccl_ctx.comm_; + all_reduce_calls.emplace_back([=] { + PADDLE_ENFORCE(platform::dynload::ncclAllReduce( + buffer, buffer, numel, static_cast(dtype), ncclSum, + comm, stream)); + }); + } + + platform::NCCLGroupGuard guard; + for (auto &call : all_reduce_calls) { + call(); } } } diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index ecdd98987d..2999004320 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -36,10 +36,12 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) { class NCCLGroupGuard { public: - inline NCCLGroupGuard() { PADDLE_ENFORCE(dynload::ncclGroupStart()); } + inline NCCLGroupGuard() { + mutex().lock(); + PADDLE_ENFORCE(dynload::ncclGroupStart()); + } inline ~NCCLGroupGuard() { - mutex().lock(); PADDLE_ENFORCE(dynload::ncclGroupEnd()); mutex().unlock(); } From 25317bd312124cb3f26a2248c04215591d4e8446 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Tue, 27 Mar 2018 16:32:31 +0800 Subject: [PATCH 229/314] Make the first device share data with the global scope in parallel_do_op. (#9398) --- paddle/fluid/operators/parallel_do_op.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/parallel_do_op.cc b/paddle/fluid/operators/parallel_do_op.cc index 4001b9a130..b28c16b13f 100644 --- a/paddle/fluid/operators/parallel_do_op.cc +++ b/paddle/fluid/operators/parallel_do_op.cc @@ -144,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase { PADDLE_ENFORCE(scope.FindVar(param)->IsType(), "Only support parameter type as LoDTensor"); auto &src = scope.FindVar(param)->Get(); - for (size_t i = 0; i < sub_scopes.size(); ++i) { + + auto *sub_scope0 = sub_scopes[0]; + auto *dst0 = sub_scope0->Var(param)->GetMutable(); + dst0->ShareDataWith(src); + + for (size_t i = 1; i < sub_scopes.size(); ++i) { auto &place = places[i]; auto *sub_scope = sub_scopes[i]; auto *dst = sub_scope->Var(param)->GetMutable(); From 50f71f50057c3c28e110da65cec7251a7d91e86a Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 18:30:11 +0800 Subject: [PATCH 230/314] Using blocking queue --- .../details/threaded_ssa_graph_executor.cc | 54 ++++++------------- .../details/threaded_ssa_graph_executor.h | 32 +++++++++-- 2 files changed, 44 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index c1a28f1d1d..0bf05c3c11 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -35,11 +35,17 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( FeedFetchList ThreadedSSAGraphExecutor::Run( const std::vector &fetch_tensors) { std::unordered_map pending_ops; - std::unordered_map> pending_vars; + std::unordered_set pending_vars; + + BlockingQueue ready_vars; + std::unordered_set ready_ops; - auto InsertPendingVar = [&pending_vars](VarHandleBase &var) { - pending_vars[&var] = var.generated_op_ == nullptr; + auto InsertPendingVar = [&pending_vars, &ready_vars](VarHandleBase &var) { + pending_vars.insert(&var); + if (var.generated_op_ == nullptr) { + ready_vars.Push(&var); + } }; auto InsertPendingOp = [&pending_ops](OpHandleBase &op_instance) { @@ -101,7 +107,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( auto run_all_ready_ops = [&] { for (auto *op : ready_ops) { - RunOp(pending_vars, op); + RunOp(ready_vars, op); } ready_ops.clear(); }; @@ -118,29 +124,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( run_all_ready_ops(); // 2. Find ready variable - VarHandleBase *ready_var = nullptr; - for (auto &pair : pending_vars) { - if (pair.second.load(std::memory_order_acquire)) { - ready_var = pair.first; - break; - } - } - - // if there is no variable ready - if (ready_var == nullptr) { - // FIXME use conditional var instead of busy wait. - // if there is an exception, throw it - if (exception_) { - throw * exception_; - } - - VLOG(10) << "============================="; - for (auto &op : pending_ops) { - VLOG(10) << op.first->DebugString(); - } - // keep waiting the ready variables - continue; - } + VarHandleBase *ready_var = ready_vars.Pop(); // 3. Remove the dependency of ready_var. // Find the ready_ops after the ready_var. @@ -189,23 +173,15 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( } void ThreadedSSAGraphExecutor::RunOp( - std::unordered_map> &pending_vars, - details::OpHandleBase *op) { - std::vector *> *ready_buffer = - new std::vector *>(); - for (auto *var : op->outputs_) { - ready_buffer->emplace_back(&pending_vars[var]); - } - - auto op_run = [ready_buffer, op, this] { + BlockingQueue &ready_var_q, details::OpHandleBase *op) { + auto op_run = [&ready_var_q, op, this] { try { VLOG(10) << op->Name() << " : " << op->DebugString(); op->Run(use_event_); - for (auto *ready : *ready_buffer) { - ready->store(true, std::memory_order_release); + for (auto &each : op->outputs_) { + ready_var_q.Push(each); } - delete ready_buffer; } catch (platform::EnforceNotMet ex) { exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 14b10cd0eb..26ff147863 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -24,6 +24,33 @@ class Scope; namespace details { +template +class BlockingQueue { + public: + void Push(const T &v) { + { + std::lock_guard g(mutex_); + q_.emplace_back(v); + } + cv_.notify_one(); + } + + T Pop() { + std::unique_lock lock(mutex_); + while (q_.empty()) { + cv_.wait(lock); + } + T v = q_.front(); + q_.pop_front(); + return v; + } + + private: + std::mutex mutex_; + std::condition_variable cv_; + std::deque q_; +}; + class ThreadedSSAGraphExecutor : public SSAGraphExecutor { public: ThreadedSSAGraphExecutor(size_t num_threads, bool use_event, @@ -38,9 +65,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor { ~ThreadedSSAGraphExecutor() {} private: - void RunOp( - std::unordered_map> &pending_vars, - details::OpHandleBase *op); + void RunOp(BlockingQueue &ready_var_q, + details::OpHandleBase *op); private: std::unique_ptr<::ThreadPool> pool_; From dcf7bd2d92482927ab9ae2d3ad88d5b06e4961cf Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 18:42:29 +0800 Subject: [PATCH 231/314] Add initP2P --- paddle/fluid/framework/init.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/init.cc b/paddle/fluid/framework/init.cc index c30bf9037b..3c0d93642a 100644 --- a/paddle/fluid/framework/init.cc +++ b/paddle/fluid/framework/init.cc @@ -85,7 +85,7 @@ void InitDevices() { for (int i = 0; i < count; ++i) { places.emplace_back(platform::CUDAPlace(i)); } - // InitP2P(count); + InitP2P(count); platform::DeviceContextPool::Init(places); } From 201f79d03985114de6e49adbaad7887fed8939b6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Mar 2018 18:53:54 +0800 Subject: [PATCH 232/314] Use Extend method --- .../framework/details/threaded_ssa_graph_executor.cc | 5 +---- .../framework/details/threaded_ssa_graph_executor.h | 11 +++++++++++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 0bf05c3c11..fc84031556 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -178,10 +178,7 @@ void ThreadedSSAGraphExecutor::RunOp( try { VLOG(10) << op->Name() << " : " << op->DebugString(); op->Run(use_event_); - - for (auto &each : op->outputs_) { - ready_var_q.Push(each); - } + ready_var_q.Extend(op->outputs_); } catch (platform::EnforceNotMet ex) { exception_.reset(new platform::EnforceNotMet(ex)); } catch (...) { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 26ff147863..8392170311 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -35,6 +35,17 @@ class BlockingQueue { cv_.notify_one(); } + template + void Extend(const U &items) { + { + std::lock_guard g(mutex_); + for (auto &item : items) { + q_.emplace_back(item); + } + } + cv_.notify_all(); + } + T Pop() { std::unique_lock lock(mutex_); while (q_.empty()) { From 7f4012247e09aec9c9d912a806bdf6b5dfabe97a Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 27 Mar 2018 18:55:32 +0800 Subject: [PATCH 233/314] adjust remove rule for variables --- paddle/fluid/framework/block_desc.cc | 73 +++++++++++-------- paddle/fluid/framework/block_desc.h | 5 ++ .../tests/unittests/test_protobuf_descs.py | 9 ++- 3 files changed, 52 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/framework/block_desc.cc b/paddle/fluid/framework/block_desc.cc index 4faf9dcf37..fbe08349c3 100644 --- a/paddle/fluid/framework/block_desc.cc +++ b/paddle/fluid/framework/block_desc.cc @@ -147,40 +147,51 @@ void BlockDesc::RemoveOp(size_t s, size_t e) { if (ops_.begin() + s == ops_.end() || ops_.begin() + e == ops_.end()) { return; } + auto get_vars = [](std::deque>::iterator &op, + std::vector &v) { + auto in_names = (*op)->InputArgumentNames(); + v.insert(v.end(), in_names.begin(), in_names.end()); + auto out_names = (*op)->OutputArgumentNames(); + v.insert(v.end(), out_names.begin(), out_names.end()); + std::sort(v.begin(), v.end()); + auto last = std::unique(v.begin(), v.end()); + v.erase(last, v.end()); + }; need_update_ = true; - std::vector vars1; // input vars from delete ops - for (auto it = ops_.begin() + s; it != ops_.begin() + e; it++) { - // delete all output vars - auto out_names = (*it)->OutputArgumentNames(); - for (auto n : out_names) { - vars_.erase(vars_.find(n)); + + for (size_t i = s; i < e; i++) { + // since remove op one by one, every time remove the first op. + auto op = ops_.begin() + s; + + // collect input and output variables from current delete op + std::vector cur_vars; + get_vars(op, cur_vars); + + // remove current op + ops_.erase(ops_.begin() + s); + + // collect input and output variables from other ops + std::vector other_vars; + for (auto it = ops_.begin(); it != ops_.end(); it++) { + get_vars(it, other_vars); } - // collect all input vars from remove ops - auto in_names = (*it)->InputArgumentNames(); - vars1.insert(vars1.end(), in_names.begin(), in_names.end()); - } - ops_.erase(ops_.begin() + s, ops_.begin() + e); - - // collect input and output vars from remain ops - std::vector vars2; - for (auto it = ops_.begin(); it != ops_.end(); it++) { - auto in_names = (*it)->InputArgumentNames(); - auto out_names = (*it)->OutputArgumentNames(); - vars2.insert(vars2.end(), in_names.begin(), in_names.end()); - vars2.insert(vars2.end(), out_names.begin(), out_names.end()); - } - // delete input vars if no other op use it. - std::vector del_vars; - std::sort(vars1.begin(), vars1.end()); - std::unique(vars1.begin(), vars1.end()); - std::sort(vars2.begin(), vars2.end()); - std::unique(vars2.begin(), vars2.end()); - // del_vars = vars1 - vars1 ^ vars2 - std::set_difference(vars1.begin(), vars1.end(), vars2.begin(), vars2.end(), - std::inserter(del_vars, del_vars.end())); - for (auto it = del_vars.begin(); it != del_vars.end(); it++) { - vars_.erase(vars_.find(*it)); + // variables should be deleted + std::vector delete_vars; + // delete_vars = cur_vars - cur_vars ^ other_input_vars + std::set_difference(cur_vars.begin(), cur_vars.end(), other_vars.begin(), + other_vars.end(), + std::inserter(delete_vars, delete_vars.end())); + // remove variables + for (size_t i = 0; i < delete_vars.size(); i++) { + auto name = delete_vars[i]; + auto it = vars_.find(name); + PADDLE_ENFORCE(it != vars_.end(), + "%s is not in variable list, it should not be deleted", + name); + vars_.erase(it); + VLOG(3) << "deleting variable " << name; + } } } diff --git a/paddle/fluid/framework/block_desc.h b/paddle/fluid/framework/block_desc.h index 185f018ac1..468423e0e8 100644 --- a/paddle/fluid/framework/block_desc.h +++ b/paddle/fluid/framework/block_desc.h @@ -89,6 +89,11 @@ class BlockDesc { OpDesc *InsertOp(size_t index); + /* + * Remove Op and its input/output variables. + * Note that for either input or ouput variable, if it is also an input or + * output variable of other ops, we should remain it. + */ void RemoveOp(size_t s, size_t e); std::vector AllOps() const; diff --git a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py index 871cb76fff..da85786d0c 100644 --- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py +++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py @@ -197,13 +197,14 @@ class TestBlockDesc(unittest.TestCase): var2 = block.var("var2") var3 = block.var("var3") var4 = block.var("var4") + var5 = block.var("var5") op1.set_input("X", ["var1", "var2"]) - op1.set_output("Y", ["var3"]) + op1.set_output("Y", ["var3", "var4"]) op2.set_input("X", ["var1"]) - op2.set_output("Y", ["var4"]) + op2.set_output("Y", ["var4", "var5"]) # remove op1, its input var2 and output var3 will be removed at the same time, - # but its input var1 will not be removed since var1 is also an input for op2. + # but its input var1 and output var4 will not be removed since they are used for op2. block.remove_op(0, 1) all_ops = [] @@ -211,7 +212,7 @@ class TestBlockDesc(unittest.TestCase): all_ops.append(block.op(idx)) self.assertEqual(all_ops, [op2]) all_vars = block.all_vars() - self.assertEqual(set(all_vars), {var1, var4}) + self.assertEqual(set(all_vars), {var1, var4, var5}) if __name__ == '__main__': From 587781153eb21ad69e571d012002dd97b93d9a88 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 27 Mar 2018 20:41:21 +0800 Subject: [PATCH 234/314] fix slr deser --- paddle/fluid/operators/detail/variable_response.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index 12e8eb0b4d..d0f103c455 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -153,6 +153,7 @@ bool VariableResponse::CopySelectRowsData( const platform::DeviceContext& ctx, int length) { auto var = scope_->FindVar(meta_.varname()); auto* slr = var->GetMutable(); + slr->mutable_rows()->resize(length / 8); int64_t* rows_data = slr->mutable_rows()->data(); // copy rows CPU data, GPU data will be copied lazily. From 094d5096899344206892cc2f82b85bfe2bae2bac Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Tue, 27 Mar 2018 20:41:33 +0800 Subject: [PATCH 235/314] fix slr deser --- paddle/fluid/operators/detail/variable_response.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index d0f103c455..3787b139a5 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -153,7 +153,7 @@ bool VariableResponse::CopySelectRowsData( const platform::DeviceContext& ctx, int length) { auto var = scope_->FindVar(meta_.varname()); auto* slr = var->GetMutable(); - slr->mutable_rows()->resize(length / 8); + slr->mutable_rows()->resize(length / 8); // int64 int64_t* rows_data = slr->mutable_rows()->data(); // copy rows CPU data, GPU data will be copied lazily. From cc1c6afbbf6df880b2954b61cf1afdc9c368597d Mon Sep 17 00:00:00 2001 From: "yi.wu" Date: Tue, 27 Mar 2018 23:17:30 +0800 Subject: [PATCH 236/314] fix slr serde --- .../operators/detail/variable_response.cc | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index 3787b139a5..bdda570343 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -48,6 +48,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, void* dest, int size) { const void* data = NULL; int size_to_write = 0; + int length = size; + int total_written = 0; if (platform::is_gpu_place(place)) { #ifdef PADDLE_WITH_CUDA @@ -56,16 +58,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, platform::CPUPlace cpu; char* p = reinterpret_cast(dest); - while (size > 0) { + while (total_written < length) { if (!input->GetDirectBufferPointer(&data, &size_to_write)) { return false; } - + // NOTE: if raw buffer is large and have two neighbor fields of raw + // buffers GetDirectBufferPointer can get all of them, use length to + // truncate it. + if (total_written + size_to_write > length) { + size_to_write = length - total_written; + } memory::Copy(boost::get(place), reinterpret_cast(p), cpu, data, size_to_write, gpu_dev_ctx.stream()); p += size_to_write; - size -= size_to_write; + total_written += size_to_write; input->Skip(size_to_write); } @@ -77,16 +84,21 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, } char* p = reinterpret_cast(dest); - while (size > 0) { + while (total_written < length) { if (!input->GetDirectBufferPointer(&data, &size_to_write)) { return false; } + // NOTE: if raw buffer is large and have two neighbor fields of raw buffers + // GetDirectBufferPointer can get all of them, use length to truncate it. + if (total_written + size_to_write > length) { + size_to_write = length - total_written; + } // TODO(gongwb): can we avoid copy? platform::CPUPlace cpu; memory::Copy(cpu, reinterpret_cast(p), cpu, data, size_to_write); p += size_to_write; - size -= size_to_write; + total_written += size_to_write; input->Skip(size_to_write); } @@ -234,7 +246,6 @@ int VariableResponse::Parse(Source* source) { if (tag != 0) { return -1; } - return 0; } From c078ed4608c9dd4b43a73f21c6030097aeb1ae1c Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 28 Mar 2018 02:57:54 +0800 Subject: [PATCH 237/314] Enhance reshape_op by adding Input(Shape) --- paddle/fluid/operators/reshape_op.cc | 101 ++++------------- paddle/fluid/operators/reshape_op.h | 106 +++++++++++++++++- python/paddle/fluid/layers/nn.py | 63 ++++++----- .../fluid/tests/unittests/test_reshape_op.py | 22 ++++ 4 files changed, 184 insertions(+), 108 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index c817b35693..4b1aaf5849 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -17,88 +17,18 @@ limitations under the License. */ namespace paddle { namespace operators { -class ReshapeOp : public framework::OperatorWithKernel { - public: - ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorWithKernel(type, inputs, outputs, attrs) {} - - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), - "Input(X) of ReshapeOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of ReshapeOp should not be null."); - - const std::vector &shape = ctx->Attrs().Get>("shape"); - PADDLE_ENFORCE(!shape.empty(), - "The shape information must be set by Attr(shape)."); - - std::vector output_shape; - auto x_dims = ctx->GetInputDim("X"); - auto out_dims = ValidateShape(shape, x_dims); - ctx->SetOutputDim("Out", out_dims); - // NOTE: Reshape op cannot reshape an input sequence batch into an - // output sequence batch that has a different number of time steps. Here - // output always shares the LoD information with input. But if - // Attr(shape) contains 0 or -1, the actual output shape can only be - // determined during runtime. The check for wheather it is a valid - // output sequence batch is performed in runtime. - ctx->ShareLoD("X", /*->*/ "Out"); - } - - private: - framework::DDim ValidateShape(const std::vector shape, - const framework::DDim &in_dims) const { - const int64_t in_size = framework::product(in_dims); - // only one dimension canbe set to -1, whose size will be automatically - // infered. - const int64_t unk_dim_val = -1; - const int64_t copy_dim_val = 0; - - std::vector output_shape(shape.size(), 0); - int64_t capacity = 1; - int unk_dim_idx = -1; - for (size_t i = 0; i < shape.size(); ++i) { - if (shape[i] == unk_dim_val) { - PADDLE_ENFORCE( - unk_dim_idx == -1, - "Only one input dimension of Attr(shape) can be unknown."); - unk_dim_idx = i; - } else if (shape[i] == copy_dim_val) { - PADDLE_ENFORCE( - static_cast(i) < in_dims.size(), - "The index of dimension to copy from input shape must be less " - "than the size of input shape."); - } else { - PADDLE_ENFORCE( - shape[i] > 0, - "Each input dimension of Attr(shape) must not be negtive except " - "one unknown dimension."); - } - - capacity *= (shape[i] ? shape[i] : in_dims[i]); - output_shape[i] = - (shape[i] ? static_cast(shape[i]) : in_dims[i]); - } - - if (unk_dim_idx != -1) { - output_shape[unk_dim_idx] = -in_size / capacity; - PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, - "Invalid shape is given."); - } else { - PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); - } - return framework::make_ddim(output_shape); - } -}; - class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { public: ReshapeOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "The input tensor of reshape operator."); - AddOutput("Out", "The output tensor of reshape operator."); + AddInput("X", "(Tensor). The input tensor of reshape operator."); + AddInput("Shape", + "(Tensor, optional). If provided, reshape according to " + "this given shape. That is to say it has a higher priority than " + "the shape attribute, while the shape attribute still should be " + "set correctly to gurantee shape inference in compile time.") + .AsDispensable(); + AddOutput("Out", "(Tensor). The output tensor of reshape operator."); AddAttr>( "shape", "(std::vector) Target shape of reshape operator."); AddAttr("inplace", @@ -110,8 +40,8 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Reshape Operator. -Reshape Input(X) into the shape specified by Attr(shape). The data in Input(X) -are unchanged. +Reshape Input(X) into the shape specified by Attr(shape) or Input(Shape). The +data in Input(X) are unchanged. Examples: @@ -141,6 +71,9 @@ Input(X) and remaining dimensions. dimension value will be copied from Input(X) at runtime. Note that the index of 0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. +1. Input(Shape) has a higher priority than Attr(shape) if it is provided, while +Attr(shape) still should be set correctly to gurantee shape inference in +compile-time. )DOC"); } @@ -160,6 +93,14 @@ class ReshapeGradOp : public framework::OperatorWithKernel { "Input(Out@GRAD) shouldn't be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } }; } // namespace operators diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 59adb5e87c..3a9a769229 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -20,15 +20,115 @@ limitations under the License. */ namespace paddle { namespace operators { +class ReshapeOp : public framework::OperatorWithKernel { + public: + ReshapeOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of ReshapeOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of ReshapeOp should not be null."); + + const std::vector &shape = ctx->Attrs().Get>("shape"); + PADDLE_ENFORCE(!shape.empty(), + "The shape information must be set by Attr(shape)."); + + if (ctx->HasInput("Shape") && ctx->IsRuntime()) { + // If true, set the shape of Output(Out) according to Input(Shape) in + // ReshapeKernel with ExecutionContext. Also check LoD in ReshapeKernel. + ctx->ShareLoD("X", /*->*/ "Out"); + return; + } + + auto x_dims = ctx->GetInputDim("X"); + auto out_dims = ValidateShape(shape, x_dims); + ctx->SetOutputDim("Out", out_dims); + if (x_dims[0] == out_dims[0]) { + // Only pass LoD when the first dimension of output and Input(X) + // are the same. + ctx->ShareLoD("X", /*->*/ "Out"); + } + } + + static framework::DDim ValidateShape(const std::vector shape, + const framework::DDim &in_dims) { + const int64_t in_size = framework::product(in_dims); + // only one dimension canbe set to -1, whose size will be automatically + // infered. + const int64_t unk_dim_val = -1; + const int64_t copy_dim_val = 0; + + std::vector output_shape(shape.size(), 0); + int64_t capacity = 1; + int unk_dim_idx = -1; + for (size_t i = 0; i < shape.size(); ++i) { + // std::cout<< shape[i] << "haha"; + if (shape[i] == unk_dim_val) { + PADDLE_ENFORCE( + unk_dim_idx == -1, + "Only one input dimension of Attr(shape) can be unknown."); + unk_dim_idx = i; + } else if (shape[i] == copy_dim_val) { + PADDLE_ENFORCE( + static_cast(i) < in_dims.size(), + "The index of dimension to copy from input shape must be less " + "than the size of input shape."); + } else { + PADDLE_ENFORCE( + shape[i] > 0, + "Each input dimension of Attr(shape) must not be negtive except " + "one unknown dimension."); + } + + capacity *= (shape[i] ? shape[i] : in_dims[i]); + output_shape[i] = + (shape[i] ? static_cast(shape[i]) : in_dims[i]); + } + + if (unk_dim_idx != -1) { + output_shape[unk_dim_idx] = -in_size / capacity; + PADDLE_ENFORCE_EQ(output_shape[unk_dim_idx] * capacity, -in_size, + "Invalid shape is given."); + } else { + PADDLE_ENFORCE_EQ(capacity, in_size, "Invalid shape is given."); + } + return framework::make_ddim(output_shape); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + ctx.device_context()); + } +}; + template class ReshapeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const { auto *out = ctx.Output("Out"); auto *in = ctx.Input("X"); + auto *shape_tensor = ctx.Input("Shape"); - auto out_dims = out->dims(); - + framework::DDim out_dims = out->dims(); + if (shape_tensor) { + auto *shape_data = shape_tensor->data(); + if (platform::is_gpu_place(ctx.GetPlace())) { + framework::Tensor cpu_shape_tensor; + TensorCopy(*shape_tensor, platform::CPUPlace(), ctx.device_context(), + &cpu_shape_tensor); + shape_data = cpu_shape_tensor.data(); + } + auto shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + out_dims = ReshapeOp::ValidateShape(shape, in->dims()); + } if (!in->lod().empty()) { PADDLE_ENFORCE_EQ( out_dims[0], in->dims()[0], @@ -39,9 +139,11 @@ class ReshapeKernel : public framework::OpKernel { } bool inplace = ctx.Attr("inplace"); + out->Resize(out_dims); if (!inplace) { out->mutable_data(ctx.GetPlace()); framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); + // TensorCopy will resize to in_dims. out->Resize(out_dims); } else { out->ShareDataWith(*in); diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 0e8354a4a0..098a629c89 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3320,42 +3320,54 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): return counter -def reshape(x, shape, act=None, inplace=True, name=None): +def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): """ Gives a new shape to the input Tensor without changing its data. - This layer takes a tensor and the attribute shape which specifies the - new shape as its inputs. The shape attribute must be given. It cannot be - empty. One and only one dimension of shape can be -1. More than one - dimension of shape can be 0. + The target shape can be given by :attr:`shape` or :attr:`actual_shape`. + :attr:`shape` is a list of integer while :attr:`actual_shape` is a tensor + variable. :attr:`actual_shape` has a higher priority than :attr:`shape` + if it is provided, while :attr:`shape` still should be set correctly to + gurantee shape inference in compile-time. - -1 means the value of this dimension is inferred from the total element - number of x and remaining dimensions. + Some tricks exist when specifying the target shape. - 0 means the actual dimension value is going to be copied from the - corresponding dimension of x. + 1. -1 means the value of this dimension is inferred from the total element + number of x and remaining dimensions. Thus one and only one dimension can + be set -1. + + 1. 0 means the actual dimension value is going to be copied from the + corresponding dimension of x. The indice of 0s in shape can not exceed + Rank(X). + + Here are some examples to explain it. 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - specified by Attr(shape) is [6, 8], the reshape operator will transform x - into a 2-D tensor with shape [6, 8] and leaving x's data unchanged. + is [6, 8], the reshape operator will transform x into a 2-D tensor with + shape [6, 8] and leaving x's data unchanged. 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will - transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data - unchanged. In this case, one and only dimension of Attr(shape) can be set - to -1, the value of this dimension is inferred from the total element number - of x and remaining dimensions. + specified is [2, 3, -1, 2], the reshape operator will transform x into a + 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this + case, one dimension of the target shape is set to -1, the value of this + dimension is inferred from the total element number of x and remaining + dimensions. 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape - specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will - transform x into a 4-D tensor with shape [2, 4, 3, 2] and leaving x's data - unchanged. In this case, besides -1, 0 means the actual dimension value is - going to be copied from the corresponding dimension of x during runtime. + is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor + with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case, + besides -1, 0 means the actual dimension value is going to be copied from + the corresponding dimension of x. Args: input(variable): The input tensor. shape(list): The new shape. At most one dimension of the new shape can be -1. + actual_shape(variable): An optional input. If provided, reshape + according to this given shape rather than + :attr:`shape` specifying shape. That is to + say :attr:`actual_shape` has a higher priority + than :attr:`shape`. act (str): The non-linear activation to be applied to output variable. inplace(bool): If this flag is set true, a new output tensor is created whose data is copied from input x, otherwise the output @@ -3366,12 +3378,9 @@ def reshape(x, shape, act=None, inplace=True, name=None): Examples: .. code-block:: python data = fluid.layers.data( - name='data', shape=[2, 4, 6], dtype='float32' - ) + name='data', shape=[2, 4, 6], dtype='float32') reshaped = fluid.layers.reshape( - x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True - ) - + x=data, shape=[-1, 0, 3, 2], act='tanh', inplace=True) """ if not (isinstance(shape, list) or isinstance(shape, tuple)): @@ -3396,7 +3405,9 @@ def reshape(x, shape, act=None, inplace=True, name=None): reshaped = helper.create_tmp_variable(dtype=x.dtype) helper.append_op( type="reshape", - inputs={"X": x}, + inputs={"X": x, + "Shape": actual_shape} + if isinstance(actual_shape, Variable) else {"X": x}, attrs={"shape": shape, "inplace": inplace}, outputs={"Out": reshaped}) diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 1a54427ab5..88c9933da3 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -122,5 +122,27 @@ class TestReshapeOpDimInferInplace2(OpTest): self.check_grad(["X"], "Out") +class TestReshapeOpWithInputShape(OpTest): + def setUp(self): + ori_shape = (6, 5) + new_shape = (0, -1, 5) + actual_shape = (2, 3, 5) + + self.op_type = "reshape" + self.inputs = { + "X": np.random.random(ori_shape).astype("float32"), + "Shape": np.array( + actual_shape, dtype="int32") + } + self.attrs = {"shape": new_shape} + self.outputs = {"Out": self.inputs["X"].reshape(actual_shape)} + + def test_check_output(self): + self.check_output() + + # def test_check_grad(self): + # self.check_grad(["X"], "Out") + + if __name__ == "__main__": unittest.main() From 0e7413938a109285e41f3a55650c6a338279c355 Mon Sep 17 00:00:00 2001 From: Xi Chen Date: Tue, 27 Mar 2018 14:32:42 -0700 Subject: [PATCH 238/314] added missing *.pb.h *.pb.cc generation to fix distribute build issue --- cmake/generic.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c749c97f13..c0808ac06c 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -597,6 +597,9 @@ function(grpc_library TARGET_NAME) COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --cpp_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + "${ABS_PROTO}" DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it From 54a8c04fab9310ef78f0b000ae411fd7ae706ee7 Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 27 Mar 2018 22:09:43 +0000 Subject: [PATCH 239/314] add inplace attr to bn --- python/paddle/fluid/layers/nn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2db4e5d27d..0332556f62 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1483,6 +1483,7 @@ def batch_norm(input, param_attr=None, bias_attr=None, data_layout='NCHW', + in_place=False, name=None, moving_mean_name=None, moving_variance_name=None): @@ -1538,7 +1539,7 @@ def batch_norm(input, saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True) - batch_norm_out = helper.create_tmp_variable(dtype) + batch_norm_out = input if in_place else helper.create_tmp_variable(dtype) helper.append_op( type="batch_norm", From f34f2d40267ce7334af6092242c7eef83e3f33aa Mon Sep 17 00:00:00 2001 From: Yang Yang Date: Tue, 27 Mar 2018 22:10:32 +0000 Subject: [PATCH 240/314] make bn inplace in img_conv_group by default --- python/paddle/fluid/nets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 3b2e1a3073..bbedf6fde0 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -98,7 +98,7 @@ def img_conv_group(input, use_mkldnn=use_mkldnn) if conv_with_batchnorm[i]: - tmp = layers.batch_norm(input=tmp, act=conv_act) + tmp = layers.batch_norm(input=tmp, act=conv_act, in_place=True) drop_rate = conv_batchnorm_drop_rate[i] if abs(drop_rate) > 1e-5: tmp = layers.dropout(x=tmp, dropout_prob=drop_rate) From d4f49355309f257f33ce08c4d680c712ee5cf2a0 Mon Sep 17 00:00:00 2001 From: Xi Chen Date: Tue, 27 Mar 2018 18:56:04 -0700 Subject: [PATCH 241/314] test removal of redundant line --- cmake/generic.cmake | 1 - 1 file changed, 1 deletion(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c0808ac06c..981da16a45 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -587,7 +587,6 @@ function(grpc_library TARGET_NAME) get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) - protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") From 06aaea8a64c59467d45f2cf2e4eea3d0e91d946a Mon Sep 17 00:00:00 2001 From: Xi Chen Date: Tue, 27 Mar 2018 19:10:04 -0700 Subject: [PATCH 242/314] Revert "test removal of redundant line" This reverts commit d4f49355309f257f33ce08c4d680c712ee5cf2a0. --- cmake/generic.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 981da16a45..c0808ac06c 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -587,6 +587,7 @@ function(grpc_library TARGET_NAME) get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") From 1daa96579cd5df393b8f848c72ea9974a8d25b62 Mon Sep 17 00:00:00 2001 From: Xi Chen Date: Tue, 27 Mar 2018 20:14:34 -0700 Subject: [PATCH 243/314] adding comments for this fix --- cmake/generic.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c0808ac06c..3fe750f47e 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -587,6 +587,9 @@ function(grpc_library TARGET_NAME) get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + #FIXME(putcn): the follwoing line is supposed to generate *.pb.h and cc, but + # somehow it didn't. line 602 to 604 is to patching this. Leaving this here + # for now to enable dist CI. protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") From 0ce558f19e49cb29db299cf5b50ce5c13e36590c Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 28 Mar 2018 11:17:30 +0800 Subject: [PATCH 244/314] kernels of increment op --- paddle/fluid/operators/increment_op.cc | 89 +++++++++----------------- paddle/fluid/operators/increment_op.cu | 21 ++++++ paddle/fluid/operators/increment_op.h | 39 +++++++++++ 3 files changed, 90 insertions(+), 59 deletions(-) create mode 100644 paddle/fluid/operators/increment_op.cu create mode 100644 paddle/fluid/operators/increment_op.h diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 6b5c3db13c..2893ab7127 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -1,71 +1,37 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/increment_op.h" namespace paddle { namespace operators { -class IncrementInferShape : public framework::InferShapeBase { +class IncrementOp : public framework::OperatorWithKernel { public: - void operator()(framework::InferShapeContext *ctx) const override { + IncrementOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of IncrementOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of IncrementOp should not be null."); PADDLE_ENFORCE_EQ(1, framework::product(ctx->GetInputDim("X"))); ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - } -}; - -struct IncrementFunctor { - IncrementFunctor(const framework::LoDTensor &x, framework::LoDTensor *out, - float value) - : x_(x), out_(out), value_(value) {} - - template - void operator()() const { - *out_->data() = *x_.data() + static_cast(value_); - } - - const framework::LoDTensor &x_; - framework::LoDTensor *out_; - float value_; -}; - -class IncrementOp : public framework::OperatorBase { - public: - IncrementOp(const std::string &type, const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto &x = scope.FindVar(Input("X"))->Get(); - auto &out = - *scope.FindVar(Output("Out"))->GetMutable(); - - PADDLE_ENFORCE(platform::is_cpu_place(x.place())); - out.Resize(x.dims()); - out.mutable_data(x.place(), x.type()); - float value = Attr("step"); - VLOG(10) << Output("Out") << " increase " << Input("X") << " with " - << value; - framework::VisitDataType(framework::ToDataType(out.type()), - IncrementFunctor(x, &out, value)); + ctx->ShareLoD("X", "Out"); } }; @@ -108,5 +74,10 @@ class IncrementGradOpMaker : public framework::SingleGradOpDescMaker { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementInferShape, - ops::IncrementOpMaker, ops::IncrementGradOpMaker); +REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker, + ops::IncrementGradOpMaker); +REGISTER_OP_CPU_KERNEL( + increment, ops::IncrementKernel, + ops::IncrementKernel, + ops::IncrementKernel, + ops::IncrementKernel) diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/operators/increment_op.cu new file mode 100644 index 0000000000..0b6cb1fc85 --- /dev/null +++ b/paddle/fluid/operators/increment_op.cu @@ -0,0 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/minus_op.h" + +REGISTER_OP_CUDA_KERNEL( + increment, ops::IncrementKernel, + ops::IncrementKernel, + ops::IncrementKernel, + ops::IncrementKernel) \ No newline at end of file diff --git a/paddle/fluid/operators/increment_op.h b/paddle/fluid/operators/increment_op.h new file mode 100644 index 0000000000..d0e8c66255 --- /dev/null +++ b/paddle/fluid/operators/increment_op.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class IncrementKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x_tensor = context.Input("X"); + auto* out_tensor = context.Output("Out"); + float step = context.Attr("step"); + + out_tensor->mutable_data(context.GetPlace()); + auto& dev = + *context.template device_context().eigen_device(); + framework::EigenScalar::From(*out_tensor).device(dev) = + framework::EigenScalar::From(*x_tensor) + static_cast(step); + } +}; + +} // namespace operators +} // namespace paddle From e9370fe59fc0c630b1d7665e3b392ce574c0ba1c Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 28 Mar 2018 11:51:38 +0800 Subject: [PATCH 245/314] fix compile bugs --- paddle/fluid/operators/increment_op.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/operators/increment_op.cu index 0b6cb1fc85..7ef688ca1d 100644 --- a/paddle/fluid/operators/increment_op.cu +++ b/paddle/fluid/operators/increment_op.cu @@ -12,8 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/minus_op.h" +#include "paddle/fluid/operators/increment_op.h" +namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( increment, ops::IncrementKernel, ops::IncrementKernel, From 6dfc33c226a3fcb7c0d96c179c3dbbc687d9570f Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 28 Mar 2018 12:47:18 +0800 Subject: [PATCH 246/314] fix compile errors --- paddle/fluid/operators/increment_op.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/increment_op.cu b/paddle/fluid/operators/increment_op.cu index 7ef688ca1d..7fb6425fe9 100644 --- a/paddle/fluid/operators/increment_op.cu +++ b/paddle/fluid/operators/increment_op.cu @@ -19,4 +19,4 @@ REGISTER_OP_CUDA_KERNEL( increment, ops::IncrementKernel, ops::IncrementKernel, ops::IncrementKernel, - ops::IncrementKernel) \ No newline at end of file + ops::IncrementKernel) From 5408854090230b0bb47315c66abcf4e364d26c06 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 28 Mar 2018 13:23:39 +0800 Subject: [PATCH 247/314] Disable model evaluation in unittests --- .../paddle/fluid/tests/unittests/test_parallel_executor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index 727dc6a56c..cb16ce26c6 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy import unittest + import paddle.fluid as fluid import paddle.v2 as paddle import paddle.v2.dataset.mnist as mnist -import paddle.v2.dataset.flowers as flowers import paddle.v2.dataset.wmt16 as wmt16 -import numpy def simple_fc_net(): @@ -214,7 +214,7 @@ class TestParallelExecutorBase(unittest.TestCase): last_loss = numpy.array(last_loss) print first_loss, last_loss - self.assertGreater(first_loss[0], last_loss[0]) + # self.assertGreater(first_loss[0], last_loss[0]) class TestMNIST(TestParallelExecutorBase): From 09743b61170718c7de8681cef813e93d816e53af Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 28 Mar 2018 13:36:59 +0800 Subject: [PATCH 248/314] Refine test_reshape_op --- python/paddle/fluid/tests/unittests/test_reshape_op.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py index 88c9933da3..f51b5a7e99 100644 --- a/python/paddle/fluid/tests/unittests/test_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py @@ -140,8 +140,8 @@ class TestReshapeOpWithInputShape(OpTest): def test_check_output(self): self.check_output() - # def test_check_grad(self): - # self.check_grad(["X"], "Out") + def test_check_grad(self): + self.check_grad(["X"], "Out") if __name__ == "__main__": From 9f4a98f39729d1f6c6019e5d95cd6c3b6721259f Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 28 Mar 2018 15:13:33 +0800 Subject: [PATCH 249/314] Add design doc --- .../images/parallel_executor_overview.dot | 83 ++++++++++++++ .../images/parallel_executor_overview.png | Bin 0 -> 179321 bytes doc/design/parallel_executor.md | 104 ++++++++++++++++++ 3 files changed, 187 insertions(+) create mode 100644 doc/design/images/parallel_executor_overview.dot create mode 100644 doc/design/images/parallel_executor_overview.png create mode 100644 doc/design/parallel_executor.md diff --git a/doc/design/images/parallel_executor_overview.dot b/doc/design/images/parallel_executor_overview.dot new file mode 100644 index 0000000000..40753cb140 --- /dev/null +++ b/doc/design/images/parallel_executor_overview.dot @@ -0,0 +1,83 @@ +digraph G { + subgraph cluster_init { + label="Initialization" + startup_program [label="startup", shape=box] + node_w_g0 [label="W\nGPU0"] + startup_program -> node_w_g0 [label="Initialize"] + node_w_g1 [label="W\nGPU1"] + node_w_g0 -> node_w_g1 [label="broadcast"] + } + + subgraph cluster_train { + label="forward_backward" + + subgraph cluster_gpu0 { + label="GPU0" + fc_0 [label="fc\nGPU0", shape=box] + hidden_0 [label="hidden\nGPU0"] + node_w_g0 -> fc_0 + fc_0 -> hidden_0 + loss0 [label="loss\nGPU0"] + hidden_0 -> loss0 [label="many ops omitted"] + scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box] + loss_g0 [label="loss_grad\nGPU0"] + scale_loss_0->loss_g0 + + fc_g_0 [label="w_grad\nGPU0", shape=box] + loss0 -> fc_g_0 + loss_g0 -> fc_g_0 + hidden_0 -> fc_g_0 + } + + subgraph cluster_gpu1 { + label="GPU1" + fc_1 [label="fc\nGPU1", shape=box] + hidden_1 [label="hidden\nGPU1"] + node_w_g1 -> fc_1 + fc_1 -> hidden_1 + loss1 [label="loss\nGPU1"] + hidden_1 -> loss1 [label="many ops omitted"] + scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box] + loss_g1 [label="loss_grad\nGPU1"] + scale_loss_1->loss_g1 + + fc_g_1 [label="w_grad\nGPU1", shape=box] + loss1 -> fc_g_1 + loss_g1 -> fc_g_1 + hidden_1 -> fc_g_1 + } + } + + all_reduce_w [label="Merge Gradients(AllReduce)", shape=box] + fc_g_0 -> all_reduce_w + fc_g_1 -> all_reduce_w + + fc_g_0_merged [label="w_grad\nMerged\nGPU0"] + fc_g_1_merged [label="w_grad\nMerged\nGPU1"] + all_reduce_w -> fc_g_0_merged + all_reduce_w -> fc_g_1_merged + + subgraph cluster_optimization { + label="Optimization" + subgraph cluster_opt_gpu0 { + label="GPU0" + sgd_0 [label="SGD Op\nGPU0", shape=box] + + fc_g_0_merged -> sgd_0 + node_w_g0 -> sgd_0 + optimized_w_0 [label="Optimized W\nGPU0"] + sgd_0 -> optimized_w_0 + } + subgraph cluster_opt_gpu1 { + label="GPU1" + sgd_1 [label="SGD Op\nGPU1", shape=box] + + fc_g_1_merged -> sgd_1 + node_w_g1 -> sgd_1 + optimized_w_1 [label="Optimized W\nGPU0"] + sgd_1 -> optimized_w_1 + } + } + + +} diff --git a/doc/design/images/parallel_executor_overview.png b/doc/design/images/parallel_executor_overview.png new file mode 100644 index 0000000000000000000000000000000000000000..d890c0ffee3b38dc7cb74a2b56c2ab4831532211 GIT binary patch literal 179321 zcmd?RWmuGJ^gaqBqX>#1r64eZN(rcxfWROr2+|z}(jhG%X@DT5gwi1(t#k^uf&&a) z0|pHPN{7IIy@Px2-|u`o=Q>}`b-uW-tuymJv7WW=b>H{%KGf7uK1jtvMMg$;@REvx zHW?WTPe!(%nsP7v&1w7Ca`+Fqo3`>rvf|cbQ)FaVvP%kby0^(^3HzQheCYZ3TfX&@ zR;&$YSRjTkm@0I?7J~xCWgfn(uRq$ilJD`sk;hqHR6G!P^nD-?4<*kjO--%Xf#(v(M+I0;Jo{3riEayBm!Yge!6jR7XddvM_;UD9XQoU}Mm#Ox{;K70Adb zXt96);KtJ84x#@23>nIWidNy%{*QD2d<(u$tFS2iKYu3~MS*t-ekJukzD0rmbKrkG zi#C@`&OQ0+Y~+7WCzBJA-Sa6tGUA z6QR(!!tOVF>$VKz?Vl0ih3ZLSGLCGxi^vM9=mugWwd5j|F_%#C4-UM_QBOLRCzk)E z99w3EJ4c{sjY1|AeUaQXQzuld`@i}3!}L>9wwm>h|2zNZn($lqt92v(U4!TpSn2-$ z5>v*1ml{ijru@J0Q&>I^WR>#mRkHO?1;})9F#78v8iZ~ z`QZk;-;x&VnXA|(D}D_#Z@$ofN7r7MD7&42IHi{kRZDpd-a2Pc8Og@{sltp`V_d;<3OP>(*oT!}T?| z6sfAu;YE`HgKVCCZ{me4oYWEoD_BJB-V;uvM?~#=zPcr&+B!9)1llSMH^=&qe@tSR z_US9P?W{aHC8%L`HovXR_RHgwVtyO$QqwQ8I0t(Rjc%M&i+8sW<7Dmpb1}Rq`Od-@ z&#mR&-1V=?G|2J5u%R9)*Y`6VuxF?=tMgp>$m-lz@4Yw_YLlfD!Fd|uuHwrRZj-U*xFW~F%DJa= zgu1$>)!&~uss<|T2VZ?+w_jgftN>NV~`4XF{m9NVywcMpsq>&;y{ON4kjc!rb zQ579ba^q}Sn@mrRI%`!;x|?I)%QqX#GuHDBtEt5L#nxitCpNESV(m=vO0Pvgf0Fa- zsfE{BGv*a`aS_kiOke|3*<9Y!&GirFzVIL3zCb#$_Y4AKCV#C+8>HSuwZsd! zy(vV=k4dHZt@N+-Tc_2q&c+HCl^*l%y2<*Z8O8|O6QSbhFK6H~RaXxGzs$KVVkrVB`yrbS!CD8L-mz8~^&OIOS$@?5o9AskZF5JY#QL zQUvoYs2$b zg2d@&Tu|h(0a&!;T8|l};Tgrk+6RX?*!w47%iNZS+}fmL9Pg*JNUgrS_Kx$@q@t6C zr$&)+1?Qlva*)m67_tm$)U#$DH&_YbVOH-;QSaZVlvR0d(VV$VX zXRi@I>%{&Owa2_iV=7&KDzzMZOBm1GQ0#qUPa484s%Uz)R{x27`4j2MH{*t!|J*nC z*uE;#L*87aPgZ2a&EU?LYpf{I_Ca7G)wfuUT8kxYdR3M5^KlxPna6%jy8e2zudb>& zM)1Y^p9Y$24$4So{VWr-vt=LW*ZYz2cpx?zbswFT@3j4AT`<6VXKU5V`_n-dk_@law*y5}WO}h&>}Fk_V=BznVNH`hw@De5R)#!y^4GG2!6WQt za&+b^tLD#*ixJ|(HZ|)#Jw+zst6H2~?KR%rp1(gZ*NM;z`t&Thy(mtR6yS*fINN<5tvoQky0J7RcH`p#sik&#d7@_fhqg4? zk#&Y8iNRfvp%M=tEj^L9FGyrRK!lG-4$mn=&xmj-N0!6E;WmuOsUQKhAO;M#WHdo3 zYxj!?L^+4Rwo87k|6w#*6-X;sPu2(}WE>P#ViByv^U=0YrKk(73gBn%jRUnL-;9&i|JN6?lk3>Cusxa{bdr zrSDH^BtI?RctN?l42>|m%?fK9W|S@M+OKzqh|>in8qLot9r~~PZLU%kYsh z7`CYmmAy4|e&7Ka>LK-?6%4-f;t?%lbex|~HUr2*Rp2Pvq z*;k)x>HVQ#h60EWi-@Ap*X0i;>A$?nu2WWUe{$oSg|l6{DPNvl?eecL>T zy{!!<$oCz)ns#U5q2oaL7aG|=w+4OZ8we8Ct;ti<)A+2A?h=cU2J>>8n1=0*88Mr7 zOwtAK(uS>-LH%3}HmRAyvcik^sbbs~QGcfkoz!MFx6M8jFRkJ0J2f7}f$mf4iKD@I zoh#DaDT;FL5iFvGis4K-fi*5aKO*R7wSQ}M#2L_1*&&&=Cja#@=9#X%Cj#d6edy|U zA&0!)EPjk&b)4xbTItfc@+|lV4Vz@2mNMJTlKO=(YJa^Er#hk)RQkIq?j4t$WYPG1 z`{r0)&vysK4WU0VrIq$svZ?zPOzp6>vAlv4A9*_6vp!e9sKF}X`rHrJP(SDLGlogc zle>ZZff$Gk}X!6tc_&dK|&yjJ>!m-2+*L#;y$2V?o*l(^3IumR>1}hy2G5};Ss%5sFIsH)aQmg}pcYDrn`^bZBWS3Bm zSd1cy$|jE8rI|PPsnf1)Ce>>wZy95F`eBIj13j)Nyk%E2>_77xH4hvBG0U+|dW+FzcukDX+>F z!Md-hzH<@@hqk^7w;9&Bjx8kf1hf~JT1Jpr-2Q&wDb~olOO^0T^~>X(t%~o`zJm=D zf0n%mfBF?0zRK(_GU-L{{AO?2-c13|jp1Z>zwL^`?nN_94#U{zlp4&u2;Hq07BAP$ z6xS`3BR?)e6`^3?m@4g?*n$1QyFL>>(KqQQ?$GzmH7l-B{qJFMhUDTuLpN|2dl(&S zQK)5!i~J28OaHYHHYu{2&7*9x18$dHSBSdkyZOOqC3s?TBQ`T5 zGimQn0l@Rmq>7;z`OV(^&nQh{lwrZxw?4=$np@l%XsoyC;cwRLo*EPmK7=h3ShpK9NUUMY9YZOD8kBhpc1JHJ%eFCil7N37@_nC&ks@?ISGT*QeF z29obFEiMaQt-zOpWDHABfOPCMhiK7*z~fj(}NXQBT32P)^dd#CKtG~5Q>8?-WJzKo`3B?%{*LXup-Y{cDgQLO3nE>lA{Kp|oXO-1?S))L zXnD;wO7-bZzj#kon6<}-%^;S{c++;&sLXd%C^6jlP~2N4jXZ-A^X+%uBw8adaGpYC z;+I)MJBH#d_e)QraaCZrQa|tR^eN;ZP$(M|Sm{1rCl}b^}y;s!e-3?BCpX;zGb)UBenkyAC{K*|7=*Q zn^m^RsIKq718hxCgYT*#$4v(l4UNbttx`*?#sneL>N7X?CLcitjxmKXf1h?P7W;-G z4pyy>G1f2lnCMqInb`w0QbGt0VSB*QN`p;bNa*G5aiH5G{m1#r6t1?X%jJZOG#?K+ zb{*-*;W&MER?m{&UCHMPn6dz0^)Dy2cu|zkO|FKpdA_{a5}&OB2C{yX zO3xK7jS{u%dYP@3VAY0pIl23N22L!k86_w;liAjnZcH>C zIU4^+!TMh66HbKuvQ&Xhmj9K^6y$JNQ*xgKzP~HWlp5JF>@}F>{r$fsP?QXO-_ZC( zex!9r{^1A(W#+r~7E>-1FO#O^C#dBPFk4<3`tDt%^z9IAV8GIzA}ppI#qv=9xR7K! zLFFfDDSpsmLL>Ktn*pk<3{=Y0`y3uge5a2DP0%boCbadn%vcz$N?jncg7D!KPalm? z?g?Yi@0=*<{T zuaGkGXDax5;iQrvD(EXUWTBg{PnPe7R!|p^j=956g)y{L&Q@z6J;EQRyJ8a6&Q5{iYf^E?N<~_cmXAR#5L0XD zaO@r#Y8`##VXM$vU(zHUm|%IvhsOqJRkC2^42q0zCW$+1{4|pIOHe+egAY8CvR1^* zP>38Zx@hj+;~~HdKSPP3m4{v4OO*rz@+TJxwR#Y_Mm$eC%$VCSz&A6(#*YGp(6Ox< zZ&k{L>Oz!A=j?#(rZuH;ygCnJ zCS&*OoJE!?Fb)2BtBbW99TZ5-BLrUz^y0*khy|XxdT9`DLfDG+0XCU@BTI6s8kTND^ zR_ed`zpEg-L{5Riz*!H}$b3jip@`+e^PR3f9Lt`SLYm?SNRooZ$37hoY!X8b#36Oi0jSQ;Wil5o5SrkA`` zq|-DPDRXn!7aBzxdMT3d`Dx2AzH+jl!ox6`g4!qHq?5>lh1e!LA_n>C zH*BV0DyuRn(kLM2B@MR}?dg6sz${Yk zlHIib`5b)U^o*w9zM#w0HYX&xxjzA;iL@4XM$cY*d#^fiWwA|m@R@b0Z8(#_z?_RB zL8$hzMZi|~|Alk^c91l18;BD5&Tdm4)D&I2N_q1*E~&_Q~_bP6+J>*WcQHwwQDl1Axvh6sR(xcCduhX{{W*L@q zfVJ((y}}{&EpSg}<@V~$dJjh(a#qq3T;zgyODrD0NT(7ND1NvI4KOR0kFbr@_i1sT}A71%DwO z%)ebA>yfPg#g}KSB>PO~A&uJn&!=tcZFwn5k;iOJp!c-e0T%nyK)Ee_|Msx&d_PcB zPAm`S33XnUQbM~?ssrb7cGjSn3W}kPyMgVoSMl3DV(2pEV=nhvyks_K%ZNa_mnpM* zzaP8o8?-u9<7S2#sByDSlJTppV?yWt4<5V-04ftO;f1?QIKyM4@ffqNLYQzv=D$q% zoR4vplOxb}B?ph5d%yP(`>~@pN2*=2rX*um2|!*l&x+f2yccum(`?V{BdKoDXyi1v zKQu+5{OL8DX3+2Pj-8%_(=Qe_=NkuqeSb&X-kh)EPP@~wz4r6aCiJ}oYe3{>z<3Wg ztZaAY>+uiT=B5rqwOUlD-|nKvLZgQ$W3`-`4=%iL67SP6~oLpbb14{7*jiy zLZ1o&eY4yY&%{B$?{DvWu8$=Q%I#ybOfYr5^gImKt&O1IJ?uT9SZVv*R8EQnKQX?L zBFd_qYtB)eJEhsK;x2bkyR|fIJVCH<^ibvvV#ytQbUO=9zr8rwE>3|dPy)CHc%)K? zUSk+j96q#mJgLXV%l-H3BMQzxMM-COg)+J|M%wR79Q2Ao#0UlMfzO!~_{E7fuk{vK z_QoV!7Vw(;Q}jEhZC@S139$JsW-(u87EAEB$USkDlr}98$XaS*!+?6j6zX-N_Ju(K znPku9qw7(Pzdy1%f+{hmBZ#?`E;T zqXnSF-6mY5esLfh6;#N;6C*{XvzBtf$a~6^0kuv0RM^6-aVX=o}tP z%NNRca_bDTT!GF|rdHGq+G#0Dm9bODSnl_zI2GFdJY3U}DR{gXJ4l`GAv30v%NMA> z#mJ}o5~y$!q1_(rI4uyE6e(X%az;Cm6AWKyx0~XpoDNOwb~TuO$PizcvSlb}&%n8R zZ(KpkP6|PaQlxcb^@*{^HdzCL*({W|p&{*D2b`EE8mjjxF-5z zPgjjl?0LYP>XORmHxdgFkHcFYC`%vBClcvY?m|Vk#f~$O26Yqx`A4L=l`wS(xSiV` zn@0BYCE%5iE$+Y>WgwAYK$P*wgQ2B^&WRZAluk!!nFgRYLC7N2j~NlANEnSD0qNw> zDJfv(mY`HS0@1;nm!?YkD(!zvu+iyscN#{k$Y!1{o3$f^_gW(-Wv? zfOJH}nXEphmXTV*=lZY$6|cJ@Vb}n;JyzX*BVW*mHM90ub#(!y1)x}geo-kdN!+^f zcrx2=Yx=K&1N$w2)b?fn3^-N4i?V5uEI6rLj3T$7vN>UVA2X0Zkk2B8&|&BZHOQjm z>9AKfQRcL`18jp{KXgc^{2RpGE`krD!=!MXF;j$3!P@+Qb}N$p5f~TJCs9CzR-U`r ze8#{X|D_MO3RnybEC%@$PvS$YCR&6=UBg||H_CIjxjF$2a`XXEBQU(E=5RO^ys^5Z zw)PuWP=|~iHE?*)w0upL=!AHh2)2rl;lC|tP@LSbKAyA!bm<^e(4R`#qFzf-#fyNC zA#c2%NFrot^RYUQOVh?#3$m%Ec$Nl(kK7_HiI&qcH9* zo!QbhFuIZd%^qrfLSoC@u^%!a_Y9o8$rZAN!c?7 zj$ruZ50TLSl?3M04c2-TNL|($jA^DT!EBOZV@(f+T_P=CNBbE?uI|;_;OpgeGVxiK z+eE9;Ej@s8q}f@ah4>#J2{}5j2^P!|tSwHhlR#n`KRKn2xINscU+KQke3QumRBKhD zsJ)XL*L76xgy!x*wSl8b(KhE(hWrP*g%Bc(;E-r*O|gd#CG3KJvFWpkAIs}2`h2G$ z?z)T32B&+9?7mqV~VtJv967 zkZd{P3IecVR^ZitlNmAeh}yI_(Ome|q=GRm&==^Wx{oLW2x9hZm)mCqxyyl2>*39g zxO_(ZH*x51!;5M{4VF~|&!m>TV%@yQQrpq;G%mcy*(BZ1IQ}E5@^M0u7MNoia@S~Y~o2V)$zCQb@ zRF@0K#<~9Y0|aM_bcV#!%nwu)kmQr+KE4wZ!nqq5`ZR1O$#G zow*h~Hac3JT?P}UL zd-0J^@L%x3N}YI&jJff_4g{2tG*q{$$vP(N_h*XCbu79%biPe6CilWCW`bRERqc3HwJ?_0~S9C619 zN!xYth(}ep=EaS>fV*fA5?xgQCXG~_Fq)x|hmK?KJi>}m{#pAOB^Y>2#3q`WGp-pA zzMpFAe#-p&Rt$Jt%*Ky&+lAO48F~zVPYDC8No}?3u@Eel!rX6JPaHiI#>hVl&C^TR zoI!!W!_nj170(iNjCt8v{CZEb!xm&fT=3>iq?2gvpj%L^L$cCpnrW#ZbUz4$c9dCP zNuVz@Iv>k_Y(+au=^(3^!&7OHG6kKDsXy7oDNyYujo36N-tCw8pc`ilP~;5l z7rD%Rf>D#;x!L!NuVZi&o?Mqi!>GUkCbJc2xU;LslMj-L=)aYrsS>67eG(2LG|L0( z*Ltg6emZ(xSG!D}U0V;fC8x2Eh&_(=%#FFvO1t~KZ4rn`NS}=2wqMHMncT$>k*{*5y5Tp3IrEMPs%h>62GUL?ygdj)S`s3rw}=` z)Se+v1XEVoT^dQAtlsQOhrXA`J03te?1Q-y}`plMd7pKZ7lCJZmpC77Bm8lm6&K>{LrCS{f>E_SzRS{eq1EN25fRNW;+pT z#qO-#5>F`Cmw5j7)WH%;Hd4;hQAP*q4|RT?f_ymWj(ChWwHCac833I zG*4KdHpbj%>SR@VSdQ(Yn#4-q4Rx*>B!#AaTf9fFf$R|eVVvA%gPr_i-qR|X> zQ1G85r&`|Hn)+SqMH6M_`f)5K9c5tZa-YqA?fcx`^Ri|ap*j`?>}(5)OG>60W_7?h zs{@qMnte&3%1G_j>KsAJ=ZTOxKjYYVl5?dphhG<>iH3{eDM`y5fN1tv95!f_%YGZV zXkhh*oNC76t0UEVTcJU&mFh-(e7S!>s^k=BQ*us|@+u~RU8xf6sERAgo#ADzk}Z%@ z=`ZR0))}VUvg z%!yJ{&E|cfK%H-s6A5BdjiOq8zc>SS=G0rzhd(*Wj>YO|-&z?p=!ay7P*p^5%W3$J zb&(ia+VfBynmmpkq$z7cSRqU=C{oxnx2c;D5JV#}8m*Gl5KDtQN~m$0`0Od$_=_p@ z85lLvk}eJIc94L`Q%>1v+{K=wumW#~RW*1|<%jAER&|n+?y|k3kO<)=x3OqA&1$I0{34Cpf#!eqS+;3hUDd~#Y>^- zafpSWYaf{ZkiO;f!*Oe}AhP|1Mr9w>4>6~ovewfPEeqWU+=UyvE`17>tAMO?7?*gS z(M2~fMgA?ryZ(Ig!|l2*iQ|!vkCOM#SY7&CUPu=JXfZ+H&<^?_bFR zY@~(@1lT|dI0rXwe2P@J53j&9!w6;zN?w;DRsjU|gZtyvo;!xTW=6MAC^@eP!^Xz3LQ^`X-518^$$ zb_R39-&G`WULo=ros<4aPR%U%?}`U4{b{)?jHqkrAjntR;5kh|ztEoMFMK@JnL7x% z+0ppgEvOxyLoT15S-rRoFo19{GI7S6I8af{BA}4jq(3Owu%r#{h!pDAU_Uk%&-%PR z0e*_B&$qnxXR#+%O1uSoI_7RReZ8(8dSGpunt7)<-Jrd+rZr63qAnT zq^?T$SP1YzcHh~UNF5g%(uOrskAl+_lZC1YGSZlvf0zt0%rp==^bJqk1nQHi);Uw^ z0uYlDaBOfA#PUv8SGS<5yg9rwhlk@j2oGFoegHb1Oy&-c#5nr6fQg}wnBT%*6I3&} zGWz6-PtwALiM<~s^uYOoFH@a6=qFJVUN$hB}(;=aWA5*6In$q)4VL2H>~o*SP4E5-Kb+^KyzXL!KTR-_xnfOCSF4`5hSI)%pPi}Pv|7jE3*U{` zKB&$AM$JrG<2Wxy6Td7x0_y4n0)WKLv^_J!xIAc9K(kpgcfo%< zYU85BPoMW23qJ{eUT8VYe@71(H!0b24Ikjzcudb`Y42_-@oTuusg#6euOW!YZQhL- z#8f?{JtJf+$XafK+Aoc#7o*izeOKKJj&?wVCr2CLYP*fl_EcoTyFeOn?L#Kng z5dNX)!X#Uk-qDJVjB%rtFzqAd)87eRBSfW7jI?|^28&8S8!leDBCVOIGIx>BX`mij zjRAWlec723>gKqfEO6Wonih1}E+vo^appA*_-_AMY%}AyElV${S(c*^Wy-&$Jim-cIrOKTXh{GY5By`N-Nq)Aq^0Qz=y zApgdBbh~6@Yu+e+qhA!mn(t3Et)%mHCK{%~q?uC|pZ?rauV7^_Rse&!64a2tQhbl{8G z%3yu{$ZbLAsgCS9C2aQj%gN{QgfmQSzZD4v1xNS-jLIA*W4>lu$Wv8{L@fY3Fq<^( zV4>3!EN41#NfGaGY-gjl(Ue7bu4DLvMfvh7*@QAij=bnjZtMea@>b`~r$W-9uz%!N z9h{(xKBJ1$jMnYSq{HiiFe!>uK*B8dhlAgR_lmhIP`Ukl98AY<7CE(F>N0Nm(qut4I*9yxc0fg7cJ{f`QjhAGV zkMQX_f#JZ9kU)nEP`{KGsC|Bk`Ka^)GvU_@Q-=t1`)m#AnT*DIo^LBT61s_@POgt= zibZ{jbWdbQ@i2xWVSPd^aD?Z}EKw4OwF^Ek*iBbowQzRlz7K3pH$$# zC<7M#H+Fa8tMs#4nxFN`t9_afs?9hnS*;&fL^lg1pf!8TCII%|oV)^MFI3%AhH}Ow z0Y44|@IO4xubL*XIQ|SKFS4mdI1Aqp?UOu%%uj)cTb}LjMC`*=A`)EjgR2XJxrzG~ zq6|A%iVR=9MfVw>R|%?Q7q4L6}^^EG-`IT$@onrJ4K)WjN`!FzyQc%Tb<11 z6hY4^m*z)C$`w3;JQ(5`0>Adnub!G`igNIs2abbIy%g(*=J>!ZNB2U%C=nc!u^oFGoIdW+IJU(rg5H+<#hGS3)KD%P|!P-LP-s6&Hi49Qm@chK)IY#`f&o}>(E zlJ!f>iTU{75v<~#df@g@A?vuv#H-+s2$M(<$<0oC zXIO4o!Q_B{Q8u2{^Z0vW)YD+>1(eiWb(TrrBox=7@iVyl($2#dRuyr+?4owDNjYsM z=gj!Ul=k(oyy@}Hj)>MMKgi2U`>P1-n9g+%9vqK8M!xlB9k8&ccqPcY{B&i*P3ve#d_V+{!Ajdy{Go`eGdareM+WSIg!diekDq&>{9`Kc;vSEoK!G2)r;$<_>Uk z^NiB=NeL|4aLZfeYEhaF!LnciTG-=KwAECgH2=d3$f5ixGaD*xr;8E8OvJEVtXe^} z?UMeYhW#5VSwiTGCZnuo7Ofo+Ukbq-TAY%&SpyQu@X0an9-a!NrE-xCXl$}Ag? zhiX*vET7~|$J<6sk1_7~=Uo8WzHdC1%9v8L`fnzfnVx}TYaUqCufju#c?qGuRC&rl z4pW5kr^?I{GR^9w(p=W`f2EWcdl;Xm!|io9bvsDi8h;qa;lGyQdk659Q~W`kVLIMQ zeIz&8iBPX+h!3q$3I$%=Goe+xZ~MbmGKq*za3c3R-W^)u&pg%dJNZDFQ6O&)-81Y4 z?LR>zV9JSqLLTC1RADnu9Jni)!1!5*|@Vk$RuDqs`M4?UpI&42X#xBBJrQAtyrhpGwzy*3bKo_)xo9Nx=B#P z7H_7373=r0*4LDlDOjLFFCQYX(wTwfqY5^KY!e`wE3XC8U)9-;8ow!D)eHoeK`%b6 zzSW8xOt>{(HI-P(HIEaumU#1J;doY0`*cuoE~?`_W9iQPNI71K_83qMo;<3Os3y`ymDws&%o^FD#0tQj{Gd6 z@RKu#&DyWf4&D{~1%zqc&ct$^TG{={4EzibG}XQbmu?PF7dBXLH5P6P@K9PlK~UZD zsoujXLFs7xt7wmaQq)x#8rMgkB5U<`6=0Q5f~&sN`HTwAgxc<{8`J1+s#4Jp3BgW9 zqdC^nB4tW^xs%#SS3u?w=a`3}`=Qiv!_35`icW#qAo6E0Px?+;tQ!Q0PtUxVdRrC9 zwX0)Bi)Ay^uT~TIx0dZ-;&wzNWT32Zdi+9>H~xF6%ZT$EA^CZ~@E?7VjxD&_g*f*Lfi77A zasurW>Cm{|!A=nyKZesO+N7&5g@bb;r~);M6wBCw?94`0@2sBTt)s@eN>f)E0K0(? zBOx`NjnXnN(2M>t$6cEk`}g*s0eOB4+GmUamU3_Z#AYG2fYa55(bP5+#yf3ObY`(s zIe0tgE}yse_RkI9hkGD3B4-HIljv@+C%OhzS_D)nSTL8A=|JX_+Kel>*d&hfKa`2i zmX&XW2HI)kHR44=DDIIpJT1lB6X|p|(vk2|h0iznbB)CXr%zeUpLH5K-&w!Hm)IsF z`Lf4;OSXuCo(h$RII^<^!P60lJ|3zsZaaQx%yAW9#F)*>OmEaM08$Q09-?YitB(RR zp?(%N(|;;E0bxbl#D(msG|-*U$vKZ^B>9D(OSrQnaXX@E1rSM9?mKJHeqVw?-F`1k zV}fw&`FCTsG)K-fi9Eq-#!Ii(@e}1SjHqGU*V?mv{D}uu@-T@HbPg3+ZM>(y(gzeN z!9iAaA9TAQ!PH^vfT0}zeu0ZN0fN@lbQU__-&8!IJwK@BYL43g)jxOTleF)OyW2JU zR(qtx%VTw{rdynsn6Kg-OY&Cgo8UDWjh?GAr%fuqy6@a|GStNuDJO|J4uyKrzFyjw zrCeGU2d}XpG@7!Zjxx8;9+TVHcWLK^46s%`m5wG==Q8zeFI%ENKJp?vh{cSeHc<0L zIk({kmx72$$?~J4!b9@m$^_tjoSb~ewC6!f5lq+DEYj0vyt;2{)o_&4J4PD{X6%AC z+!LS>2-T08a@s+z^gP?My<8H|AF#d1A$gdbqVzJBx?wV-QPNw{yT|&*Yed z%x9gnb;A-+)00h!CtIa!ZpqLqgp`fc*im$xqJ57sK5tyS20TF}XtQ;)2i>x+4ftE0 zr$Zg3|Fuw<#rz0g?o0JT)VRnx`wqN7!XidD8tA@}tufVrh7Q~#umiOiZc4cLK)cjA zgX^cIOu=g2wejdenj`SCRlo}YdbcDD!j%x3&lELW?h8iKDAP62H;JGN-iiWtvWEBE za-Bw5fKAL1p_lqjo}g))HnB-OWi|rmZHlPbm;0D+SAsD6@P`}Vn%EHAHiGLm_vwfW zYQc{Vp_Zq+g#kHCe#eyGTLiW|SWNwYGjggzc=<~ojut>x(vJO_*Pi%K&ufODd12~FCKLBq`f#Qa@K&XlzrmudW zENaN5fq)n2L5vV(86@k7jo1W)MW}>WC}yQwK0jZ-I7E}Ee`Sh;sxXevvqhllX(exz zv2dWxQFk~tJhDkYoRJG3tA~o@F~3l9S0;W*rh_g|y2~GMprSS-rzsQBMt)NQOZhGm zUTd=2?!pq6D(;-Bc2q}m3;MZ!sLUN`to=6p#1TE!7{DSGu}D2?kp z-V>Y!(FeKk>}BIIDUiOGr#e5$tdG*i)2>{-4epTZ`*{v3h_g@Q8v_CiK7sNvq*CGk zY0?EJ+WWLpuh_JIOqc(~9MnJ7BCD{5>#w+pnBd9hTH16d%J+vq=U|K_!jTK9N!%+B zSe;%DIyXx9oGe3>P@c*_)Q#+jsf<#O8U3o*8fIPoip#bbmbjkDB`~+lvQrfrU90Gu z{U90r&%F&rI(VB%83pluWxVW zM2s1%K**i@4W{5abVe*o#|tekW=$`!OwdR#X^P)f$sbT`umYB!0eb&t3K6QJOy1Pe zJe%Q3L>PtR;e=4_z#{V|2}apNhn7Y?nIoSaHw707>1DUqZZPX!NxjgI<9sf6py>09 ztHy`WJ$1y&q`|bE6Tn|eNUA1-#$J(id(y&I10cG%B+?T5#0o z9FVhc3uUQ}cf(3?%Sm6OBG^tk_4beby`}kDIB?nhtSt6dtY2;B=W zy6B<2J$0=A{9&R;2kGvhcjuWkU-;#qcgZ6o+x*ZIF!mfnIPh6Z;xUsk}%UduRS`*dI?_$;<5lSAoXrvoya}n468yvuA zbd*J?O29Gq3kBMqXG^4gPy%ub7-H@w2rTZM69?HOyJD4#cvo8`CXR_O=Q>?KpmzMP z#nck~Sf_8g@hOYIchnJHy$sP*A@d*GqJv)rFE5?ubcof-j&g6R7Zs0x0G0c_qaU%e zcOKwCUXYWt7g?3Bi8GNN$0_iybo;>Bc!`cuhgRSWx*aiD!L`P7y#6E`-KMj&e;0D}nsdFCQ#*$uZ~!;)wwtmx>dtK~KWO2kzBe+SH`3<00Mv2; zU+5MHb|tDoq@EV|_i&Hfm`XY@geTdYK)qq0G?5JHRZE=H;KCaR-bXUWJ+pLB{yo7i zGYDOkEuuP_h(f`Q%<*1fyj%sOJOY=JWCQ%uUjRwiZ6?ALbanqdIN5}0a2J+A1saKE zRUt_#Yb3AgSL*bDoh>h4K&`~UJv6s!x>UD7gS{&acM}jN%E%MCf9?`7!p-%FByP!S z1I<72_UwAuWs@9x0G5!-1?=9}>Ug%^LM3n`2ooONhIhFPf`#ci0GM)wBNcNVt|z{c z>AOfvysMGEpVG1b`}FQ4csHx@jD(6}I&ac1Vc(!OQu6YCmvd&$@p{8M1cq=A6U9oP zo_Tib8T22L{iAsceRS;}Msm+&^5LMDl=mb;^*LttzJm+vgHT3j;{_)qk+`~tE+4wR zv%TfGF#1LCR*&7oHtE%5xHI`VJGg%BM|dT~gHfVU!2bJpuxaG)`9#>F4>tz_8 zQ;)Ppv#&Sj-16A2d3>uye(ndA&3@M{MdvrcXr~8<2+kcP7P6i0K9JW^1Wc=6R_k%! zIVR`+_MR@p4SN4QzZ1rSZ~@|B%LH;En(F#>(i_{b$5F<8io$nzoQ=tsL0=<6^7dTR zT!msLaLHVS@zTyyjO5gFUC`E3fnzZOTt5p;$4d}gdq?F+o2GI(9jdicq%FD=Q9RQ8 zH@*<=WeP5$*xXoroiJVPZqRB0sHqO$EyH{j63_Rz7MlR^wi@4RkXY< z`1u$yLVtz){2q*_wOW%%>P23>AO$a-LEeL~0t|H0Hk;~AXqpiq4Obh-*$cx@Pjhd)x5Z4ykmrPwJJ(>+Xf*OsB=u)CNU zT5q9Ab-cUpkVbRq4+{LHc|p>n&m3+bwgoX+*n*Q%8hDvO|IfR3P-FKauT5A1M9>G9 za~zx-mgAvleKq$Ui@VpVOq)W>u@7#&$)Sw%u4Ds|f+ikrCbE#@&3=L&N4q?O9L!iAB(AL?X8UFsg2FYg|JXw0P#rVtPKZ#$Qb8X`hrvdl~ns?}S z{mkbuXKsDNA~#Kj=EuOZQ=~$D<9CV4Udb>WPTIXez;D)KBoFkm%!^Kk-*>XYoBh}| zQUp-C12!5C*zrRwpUd+jjqOU%o2$O3fn5H7$olShs^9njoRrnzAe9j!tB53-hYs0$ zXOt+r;vkz!86`rJ?3Jv@o+TnOj+vQ^Y%(JIcisAYzQ6Uy`|*A}-t|7`yzbY1U-xxA zujli5F+nT`;uB9xVxs6TsJRN@&Km;QFh;{zfB{kxa#EQDFm*%zg5_P7@SeKoM50;U z%OyPF(ACj{b6vRdcnK7LgUoGp*htmCjn%N&sK!60wiRm zR*1)KuFiqG+YRJm*FW|(oJs;VoS(2Kgn~eF!~-g@`H74QfCHhD@-BsC6C5;pp;PVv z<#uVtvuIwU%gRl@Tw=XGX??A3DB9Q^Ym;b)mzsYVO3Mny@ntgs8G<*>5bPl>Qk%}V zH{jw^S2FpUWdTvG0M#+NhR~vGqek!AfeM0AAwlBDt2@Bo7nV>D#<%|O6XisZEfN2zyB zmqLanARlwwv>E~`^tJNmOJr-v*|5jdqZwHGw?XRjoEaR_pfg{naM3?@0=R4W#Y65* zD_Jw07c;qi=b?djEkh%NqtLde=;m|D?)5i!YbaI$73nMNF=f1b^q;bDi!CAb)2aKz zB(A6L!!}(K4hox;V1l*tv!JlbcfXUO?>CN`=kxcgrqW`JsW+ifU!r~Zl+*xXXd?n} z%Xh#dVW&;5=P(t66JW0w9W$OURkr%-D19?jz?5S#NO7sjYdqq-cmA?=Ou z)Rl)RKt9TP0z;H3crb;OV$a%bj;I|-0WxEi_%pqQ4y&vOX3ctuzFHq{3xm3Gurslh zuzv{H5@QdBM?=J0w%t@^iW=eDR6f~2VmHWL@Sq(cXWgyVxH+kwhE+(MEy2qt*V^(O zl&vMBgHLvMPd$5$Zn{0G;;aqn0M(;lcb0Q+v;fCiYqKip%XHU;q-uN!WIjW}M>B>z z7mtuPo~7;4H5gk63bl3ukC|DUv^FcpF$3fwRdGM^j%AX1bg0`DP_-KD2lNMDiU3vBuZ4G?ynwVg=l< z)dkVd0V40x!&n*H0|63x7u0QLVmF)HWT@w!?wl z8uMH_mpD{%5ob`dF5+O)O32m+pE2oN{WWkQ8vm8G+Ba)I@o(L#LFTbD$7u8zmvA8I zYSOY(W6LKVVl6%(O~vp3PSkhzNl)b^Ju;H*oG+9dtFjRF>5%uu`fY z!JZEk@>ivHef3n3VI%03abM-lW$8N$5`FqIMaN!h8a-Z1uLfz~SjHD|v$$uacj)lD zuw#FX2Ttt1LDU3T66AVd7T?BBns17;K3aC;Jsk4`rkU+hI>MRq1kLHkdROcqk?8jX zx7V^Y(>}Qss~q_Uk$l;uuVC-CPTDM0bgMA&o|FsWr?dm|(hT_2N%YA~af+=$8)|;3 zIx>yM>vRytO5m#429rCq%sk3t4oB9=?u9_|$8d?B-w4!W2jb#^XJC4hGYDkKfC97i z8UTcxFWkG&c#$$IDWQuGF$4z*Z&^1Wb^RmK4_&6&Tce^AZ=421m@^rvB73r!W{}3% zUjJQy7H>|)lG54^O4WerQG#28&_L+diE7D+p}+V#J6aY>GOM!ndyrSV{EEI^cH&Zj z(~SwuBSpneOE6YmG9V)r7Rx>%D%}4>Pjw}ychk6Zh?E$25LvPK?jCr>)wk<~r=lTG z@qJ*`ET4$e>FBdQa{E9eB`duhuYw+tJO?NqZb?ufwl_TUc`mv7DrQbryWTq5A@w%5 zfnMm$IG^pVK=oF8^Kjy}aQpvT*+t%e}len zP7hLIz~%Y_70zp-&@NWJ;9zBQI@!I)1vW-l`Xc@wtuwA?N;?~<#yH8aMLFF|Mj0hh8cwG6UeE#V)_;tLfw zJJQu%qY8kUqaEk8GHIuot9!cF@ts+9RgAXS?meaCwu&~>-x+4MO6$Kd_H()Wj8A`h z9c~nF9Y5JxDF!|8jfyvRcgo|6U0J$vS7^=^B>5UmouTY@{+z5&U~Y^JiTsij=A^m^ zJ7S^qa$)67fQ=wFHe~tUGHTr2G(Km}A@*UrrkAF{$ZwKx#3bcG&&aXtlatMpG?_;Z zq?zu^ZgrmDBFV=X=N>K-%u7fjPCsZ|Z5eH@O(ZTnP+FOOvc5)aehC=Klc`(yb1GN% zcHHk`up8W&KV|d;cm2Q;8PTsYR9K_}{Pg$()JA#!d3S(93rbOn`Ww)1@`Ch5$tQu% z1m^_ZIpx%te_oec&_149*hzFIC)LAE9X_Bsb?kev(!J00cw1B3!AFY!{Vth;>dhzJ zY}5wuF=f3WS_D6!be8MVE(^44t(^RiKF$Q9!PJXi66B9m0I9jpoFiwol-q7l@miD; zT^6j7@aGwwsSc#FXEgv+l1~aI?m|Dv%);oSIa3SU-a)noJW-8SI9-O-4s?hNx%H6z z*#rA97!Znr`}B9|{%_tHIr7a|+%T3YpuEIzto1q4{`nC=-K0rjY z&}`#Up~c3x|8()3@c2A8Qng5_+Y|~Yd6*Y>&lW&h?X?aXW3??ZC({4-(U6f1q}vOd z0PkP(r=&-fK=AJM5XAYVRtSbCH+prM0`rshw|vsq;E?JCVCmm5!n|P>^A_LxOYO*? zhu^y8A+CY6F-l+|=RWW$*~*v3j2!@Oo`oAwRm~D%bnqkuc#@QGEjA3f0&a?&my{U7 zHowaqaEn9CLF5Sf#`ZJtVInqakH9dcg4G?=TtCZxbC_g5m;``0`ppuDn-(&=2(^O` zvIy2Xg&{{A?pkkZI{Y1=rlZDGE$_&l529ZLF!0kKMS`LhV8Xn3Kj210JDlf8JuP@O z1f3o&^uu`#r{$n^0Cn{CzJ@yp=tJg#dNfz}6d5VVsf0622W(Z~Z|oit2K|{gpK|Bd^`e3>jWzNzJ2LtYo|LN&aJNm(`(F+}S z*&L0nrXB@h*jwA8&}N`kNQKqrbpir13%< zNIOvpEM104dG&L?aob>|sgx!(3`_3bz!cu)-rF7j=U>|Ul0?XCK-+hD6D~yB%z%Hk zmM@Te`d?A2?#oCe88{*hXnPAli!Vm>KZfl+H4l<;Q}9Ky`}&fCwTC$fZ!xDy{9;}D zzJh=-m(*r>Pg>}wgI<+vBtRb@DRi=py5P$K@b!5S}cy z4spjmgnuh&k)ek+hKC+=*J8ww<0(R0X#6$5MZvkI4f%{tDjgy%9e*`VL;k_H{c;1W z8W(&-?UpF=)KrLsHGgM;l+Xju{KxQIs}B>xCZWs_&r{m_3pmy(Olr94X?hT&avuPr zjOz&EJZDxVu>?SjKo;Z`tGPbgO@r4Fi->{RhTsgqKq4V%E$QOx>RZhxky?Jy#aUz^+3mcEl`a zsnN;N(GI#XX}YuO`c+^p;6fjadYqrNkQ@U z+vE_(+;#Pzu#~!V_h7W97R_WW^3O>w8cnZ7q}tFt1MK?-re>)Wa)5+%D_E0MWy z>d+mu@Q09gx@5noB&L@G3BJD5TxFs;@vOziUTItuTHY#6e%e82hM(bS@eIwumH&5a z#QGtB4*<`YmTi=wCV~SUyncf5CVXbS%MD}?A8HNn?jM1zi4^r$Y{6yhTc*}sRVjkX zYs@7qo&@^+J5*#on zVdjuR;%y-?G>Ro5s0HGRsNUTBa-x4sZsxJ9*5#m{Wy+L`O%!|B2^a=Jf;mNH^Z-C~ z9q`ekQ;6cxKI5qRx^NU}ek&&AhzC{w`+H#o#h+TW1DcH}OD z?MTc|v!H41Z^p57U`YpChE&Z+vR8*=gLWm%QcX4DXg5xn^r>4cA+H5fi=SZJf0X;$scK`r(B zBzqimVV#$P%NhPGe5c{e`|1h$Kv%{FWnlv{n3a zA6OpviX5yzWj`>s=X;A><_~lp@M5tomw$h;CL8Akm|L2YG@wZi`p-06=LdSrT;^;5 z&$IgGZ(`o{70?27QN2Ky$63W`8P;G@G#J4HHQ*8X9qRYK@r!)v>waV`@4-^P7x0%8 zJzX|itJ-zx-Sy{QfT5>xmVq4i2BMlU&gq_%PYcG_4raZ5(g-l~Fvjb^ z7X14?kv*7T@#2uG?06TPc1_)|Fn1CbDqTW>&l^&C`I zF71k3(U`74TCUfQFHU*P3SL^8eFAne7pH%HMlFz3k8fay>b^Q_4GPF7fakOx06l4I zh_k_Pku^tO%pLK&&(c4kuXqWpj@#Dtj)ovNd=KHU9ApIckmSE-yYje%5P;+c5Up-Q zHDWRRpa|3-WtvXShmwfPh>ggI1#rq1k{sAz)U7HfKN$E%uWgAzb3*O-F2?d~nVEo*?tkZgHf}uR=op zzt+n6$eVP}E`!ww*U>%)@!thr=xcc^=lwrbFAZ9}j^pnSayUQRl^Mm8I*N1>dt$Xr z#6S9I!h4$NHzRZs2oPYL5^zL@%PElD%Ksx@PD#yyHfU|$>-b`QD;Tc)TaqD;ftza! zcN(Lm0L2>QE=;S}o>tPJJWZ6D;^~7M{swAwYu|CoPNaJUJ!ZtQL~5Hp)YF4>Dj(MB zNsFvC%f`rWkPO8*q;(p1Bwu}g7^|ilt}1``Zbwg zb{F$$1X_(PjcyU6Qv(QZ&}C?azLz)vv$GJ!*0x{t84b&uVSqh`3f}(-GcS05FZk^C zkCP%PphKaw>ZUvps9zFU#0YP%Y22y}pmAWCg;nd1l2EGaEpz6RDG=~LR(${Cb?2c+ z$vxc8Jp-zrn^3K1NkUDQf}v~cC|mrE=n78ew(Gx*(^8A&z-4-uG9;qj+SAGt0cos3M$?QJ}|3xeu%;GuR-P|9a#YbCaV{5E?4 zKL%Tlp5z3)mzI5E>3E>GP+}s>7NN`5`-p;e_WRA}s&&@+`7L73(A)P~~eF--4#Gheij30f;eA&#Nw=AiVA1kL6iR#*#&ulD3h0gNs?vIQD zMV@m(5tQ^=MOa0pM|S7fH?ct+{rXiewYq2X^(+o^dgN=r3^+?cV@P?9+xpV3o0s>l zQ?=7VY^m$7q01L;w@CbWUSD~^sb#|R<-|nGK#yn3=CU^k@(17PWlvZ6T3s-sK19R( z$S~n3weLRCAX*%r;dTuT`4|()y43TN7aQkGCs+_Yr{#|{H*=15dWdpkKFnx{dMdv& z-YbG96y5e+W!pzWUdyGyvvVUvb|0ox7H6GJ1|@!!se6XHSI{={B z5X9A99b?zLAq7Dxw&_H*6>d%>1aXo`bLludyL(tsL+~7ijR7=^B^HbAMHa zD#|!(`se2-3{0>ewy8GyfKtHqF7mN4Q4r=_Xx-(%rY`0(YX#UwrE*}-*xB9!fCf9{ z^fTZx5h<7HM=Qgs@n|1=89BA7ENc|`)8m3Q6zX+iLl)t))v=tvIHfEdul&xR1u0Dp z;zhf9!fn~I@Vzo%=bc94gF=VFzEqA=AcyC9>_lw`OeTkm8wDL|O*TR=!J#^4K;=9+ z4jzmAnBVMf&*5W7-S@+To%h-CWS|1F4DnbBo;QdxAAD8CqsS?6tthL>yjGke3Ci}g zt}X))Kv8C4w?3Do3z$GSBNE!Vz(2RMrJeUaT|m5m>5p-HJG-@uTNM{YA9SAMGz3aF-J)k=bt^; zCGx}5i4klOoTfshUDH(gh>j0GvBHkUu^Pbd{q!2GwiX34au=~~=MV<~eK4&;OQ1oE z#8N#1&7Ki)7?|`-@%S^nhro;HbXnEih!bf@NcM+o7@kpePCu9Bc-5j<(z6%9zjD19=sMJTv&8AIVpJJAXA!Zm4QFRduoy$Ogf0zBPMw+; ztT7wu3e0=m@2`K7?kM1FX%lM`_3o1$I!--ygKi}o&O86P`)M710w!gh5igh??(K!^ zG&nq&t<>aUQY8-7wI_(J4ZBU#yfopB4*T(FtUm!>V#2jJD zeY?DlQ|9)_Vaj9He!T#?6s}c`;rL?Po)$T#uR2#HX)3$)u35fQ?DN}uOb!axq;OI> zL2^>RkI=W8z`i8qyyzmNLKh+sr@jDez6rgI@Ql;kmPJ3jJrH?Ds5=%~zNLhD80^wu z5F<jSm<{dCXEX_1qFdrUlS z3KpP#ONR~xj$)XWxh*j98g;-`T+FwKt^G|$Nb<}8=KpfCC+dvxWueD0kI{Zi!SADq z#qv{LVJ#=6YnIK6%A2L8-c-4Du2V@khVsLOo&rkvAqm2mfXR09dn_0dc2=5Vs22DA z;jS)YUya_vo~CJ5B6x2<6EuR&?Ft9S>2}T}59u4eC$a-Kdf8Z4qY3Hj&}4UPA(Iln za3vEhvyi&)Gymw@YQfap7f9-Oz2OpU^7Jy~kJzla9q)Au%^Wyq@cv^_?=8lDNcpk> ztV^RCQyR@SXhj_EJXKf9d+E*hgKRXxK&NL5x&R{~GABs*bT_J>b8rf;!-yiPGT>MB zzixkA-v`O84X89Dou}G4SKsHhf~fv>2a`n&dsqPQ2HL>V@pa%YtNpNU`$)AoJ}MD{ zK5`#HZp;r}0HK`iCOkAN6MPOE!D_C3H zD02T}!&M~!wlNbu;`e`F@3|v4^Bur_yT8AG=FDC>Dro@*FSq@CkC_D${Fk-2vGcqO z(fGdy7>tLp@!SX8&!{GuIE7L`82w@V7>_5MvFbR8LTD%KC+*u`=&2h{`kA=&)}ay} zu?-iz?jwT6bgrH+_*Ys&nkZ~}QfDawa^Al0c{fc?alEud&P9m-FB#P)fl?%x9rGTs zjQLB)u;oy5%{{%)a^8~P4&sDN*+O!NSmWzUcjj}Jy^%$Kpq-_71vHmcb0s^X0zee| zDUh&WkjCk zdw27>4wmcWW6$JEy%w|+*cWeADn^$Ij|3qp8FGW2po$BW zQp79c$O2`~d{JdYPvz*pkCDpBGHvqq5cOZ^O2rD~i7ZcD&0z#nD(9VAqi7w<$@S%l zeyCPQ(>m11xb0`d>o(ZyGhJc-1-q7f7c3<-98LvlT@kb_kBP@qk;Qo~DbDnMNS~sT zIWr4<`ufPzv>KX^FFHtwO*`p7ie%Mkd&~p99Jme)J zK%rWzfL58>Q(lt#hexHX`aVWEbR3lZqYQ=L2lFn_MKwAAXG1%M(h%STRqtlV4fweE z7>;JAG;aTd<}#s*2tD{hs8~`GRQM&s61qg`Aj)3DLiVzVB`)Ewq zS=3}%B}viH5On!b(WW;l%{y6tY_Cg*X?r z+P}Rd*KM;1g~Ldh8RDM?j)bd;eWh@dYYd=on_h-Q%)H0g34*dQ))XWxPah`**Jn%A z7w~8)sssJ9ZAmA*y%0+Tf}_&ZNlKK8`Qm;-pt(^0`sLCiU%hjZiCNzp-E&e-2xapx zz=#aBr#!VWwM&TVEf}blEzETjHB|?J!hS~Nw~22xUceN#8f#PgostI8@INs?c@i{u8; z4q??0fi0MSZK!!T{^|) z+|Z~Uclgsso08ET{&ov$)fs$&@o_pe{y6L8gsd!C{ds1gx`!dRjZ#}Jg68gfwdIAK zzWQ442mLb+?KA;%CC6lri^uog4{7*5cClB>vbh*}+)k63=b}H)dca!pVIQnmpA~{s z#6M-tdSRQe0a#OSvNH%Sp@*94skm`6`@n*5hj*SpAF# z*(_T~z?!Xnn;|F?3wM0sJkG zPP?h$sy<>}pf0}Wcm1XB%YOg-m9Jege1%I_yPfJBTI7!l)oJL7eB7HdGOK+1OeI+K zL?>x-zy5-Hh_b!(s=3+R?D^mG_|C?u$6TT6_7*LG-l1U*=vZJ?|M#&QsDTS>=7svVCQ^6JLmE1#}5>6 zy{iR3)iayotX_Kf8&hyx=XgQSd?jWg!cx7sNAl8R)~6Fm!Tk9!hp_zFV@l--do7i# zKUT(Zd`ZB2$hP)3VK+HMcS}8no%8zmT9(eRkbfQ`qI&EAucYkI6KtYV$cqM!gz@>L z*~efJk$sK0mgKGS>innFRetyEk&Apnfhs+j-u8ckCwR42`gBaesMD?Kr7?RjY`jF&^fetjPswI>ls_9zLln405*=~Rv%uH=HtU^a;7x-zG1%swZi3r z8@R+>Sh(t4Z~13U)pFl7aC+t}9GMkdZr-rl#uiL}&~iW^N?v;VhOyzH@#-uq#;KO1 z^yh5;YuED3{Xx5VW+pGi%x#<=VLT?>(u4Q7gHb7gl%x~Ianpaw_9s+J+q2aD%Vx_- zv0iZ>u(5+~iO#1e6n?vz8$LI(*4r{)=e3p7l^uVf%%2BmXE6KzE!p#%g2vg$`J57J zMtNR;24rsA>Wih4P7z*OW%eO9P}VG~Hs5D=r^T}V`Ql5xbiJgbU|!Q61X%|y8ud393GnpVLadxOZM^RQVC7{zGB3C&hvb)m2X7)_nyT zh?ufSxNV1wUq65mw=cg@@Kzmi15c0N|bI%VR86P@~C&)YzYC! zgQ~Xxc8g&boz#q8EO^aRZL;2A3MK>dno`ZtN*%MUR^hq2-u*^iJ(Sacdua}5zVSS~ zZlYE79a89GU|RK~DLogXuQTs9B^%{FFN|^Cz<3Wj_w<5SfHfejPE{lDmrretsJu@y zL*902$G-;!XGc!j#o!C7&YSPWPii}epVwQ}-@2!tzePC;)61S>h*|G%1(lyAwt>D+ z+eISqwu5Dq&$K|rZ&?PIr`q19k#WD>=x6OWIcrT%8@Y>`T2Y9De15T-Gl_Fn@Lypo zLV_o}pWsUONI&FLrpr&<0)r3HpIHGX8aL7jElWL-cVF5bjPBivo(53(^4cq4B9^@O z?z@}OHRb6-e;)wX*}cq25=|sA5kBHGnQof7!Li1v2dNm?3c#imQiV-vJ;LUoF8FG^ zX$$a}%fu~UAifg9z(A~zK+!rCMUZ_?dFogZwUbO-w|!CX7q75Vkk1YV%>WPT7V^UA z4>&RAonC%t1CuU5eN^JN?`C^iq!p(Q#S0$!LcmE<3=;*=myG~UQa@2cs6wh53JDc-IxKQ;)pCeuVtB5Ra5Gfol%+drHx20fj|DI?l zW|@vJ(qoBCoP&$qIp<^_`CHwuh)`Ov_zqG|@CojcS|-;a$Hm6BxgUQR5R9Bg;G~lfNGJSD_0zYfGO(O zIz2E{yofw&+x!LiXvn#|I9a0t4T>rJFGon{))tb!=XyMTCs*NwQVqJ+(B^27tflR8 zUok(;CsYTc2*&UwHH$n`0Yb*NKGTgF-wgFB6uHK0_^iEs)6$VslY{2a-X-t&5H83| zzcLJN`ZGv%=n6e+$wx%-&!3Q@+xoL=f_ki9o;%^&xcCe7llSX;AKxy{s5n zo#s6zz+0`G-KPqUCz8Cz3)DFJd?dBtC}VP5aC^B5Xa>kt(J7`7 z(`{L{->-r#;n|CzOmC25lnSpoE@(W6g7WHR@F&O)?0}AZR-f7U&>nyk%?a=I>_SO4 z&+9Khi#9PHLiTxMFB4b#fLZV2aB_T`jJa47<;Irm3FXpQV&DC@<+4DZDd%y2!X6N@ zhcpCzODvfAS8j|QA@L;1Kd;|skQ&KVXYD{t^52{ghfI8?JvDgQV+TrPlas&TDvj{) z=cD?ZtVlj~z;Ew<4aqL*YqY%78ALFg1qy~MQQ07npGqwtQ@Iqb-t~bK@3O4^&a=N`B)XrMufP77a+Xo z4&f7E6P+muW}hulhSBvAOC?}C!*Bl^8Q`Vqy*=}d6j2(BGlak0{PrXd<=}K7OOR3) zLUq%}Uown|+K?Ff4w}*Fd`aFqQP^Ve;iYm=t%t0`jT*Vhwcd8+=;dmKRIZoLmr?%@ zP^D8+E#uc?)T0`Gir{`YBa?dS>>5QU;5;s1P7Ru+Dk>AMwUGN#tw#RcrsQ?8U;d@} z#qfih&YK1O$m*!BL4Rn!hGb)9tR zL{&4iskw!9yk1(3zqU*EfBzeU7C?=fwbtZC2Z!rnPKjQqN9#n@u>pwu{5Ps#>zVB} zu;F|bqeKfiG-%s>M{DzVb|f$45P2w&_#MUeLUCHQFH^=>N*{sYQ< zwj^4=QBm}(N9@;Zls|CQb}KJAgw(yNzMf%I0n$y9W}0FLkk~BG1*ii z!mOX6RR{O$rL_~M*#H+czx8G>>{wv_;_JVKIAjA2J(mA~D}>ZJq>Q-GD$!ATx0Y^{ z2YsUuP3Z0>D86TFs04)NZW1S+vaes?yIw;*NSC}h-GP(FpY}myr=iENe5#N{A+x1; zz>x|gwziWtB}j@4Wk;tJ!>ehb1{m`paGzcIO-f`3zE)n@Y5(uh_p6}YBkO@9u$x12 zLXtgq_%K*@8kdwoh9K^GDj zq75`vr{4;}akt;`6#!NbFTj$6+MPxcIR8=J*LrIT60h#G7`d?sSp@KCGwpp=2PsDq z`2HEc?S-RS`a60lT#uAB_0WITr-W3NFR4&QqxVyUJAw_0ARFpMNhJ)360Eb|D|am( zeuOl4kYR)0q>RqC3fLp7aMg+TgdofmEUTtPP95WG*!wlq4Pt(6fmW%#l3+?{%xkjF z3?)`FzhIE((M!D?gwM?zr(j4K;LWEKnwxF(okXF9&0x&1-T1ww`hzx4!Vf>IX>|N6tGDiyDeXbSy{zU4XN+4c0X0e#qE=yo~x-K_9Tk z8lF(cPt5M#Anu3efv6u)E6PJ%5CrKh^ogb}>y)&2g(?6ca*N-gsx`JqB(1yr@4h<0 z9vaRLMv6O*HKoAiDuO9WeE{Cg3;A|IC3xk#)jROk%{fe;}EtEpEukq!AJINs8 z<}u5p0z;(HtrXNddcx!w&`*|Rb?V>Fw}`{F1X!g`>AA>Q{%Mz{UYF#1#Jsnu^vNfc zeRy-27QdwaU8UtxD}b(2D_A5S>xZT-9MIBmsLhrpnoVp74p!iw{fRCK#{%BFaTOlFmxXRkb zZWMMYr)-+H*s&w=3faWw01zm~BK>ExBQvEqBegn1pp0y_ywW)XE>3BM%)I|LLk%gW6uS+9AK8Ulq6xZAKQ+k3 z(Hh`#umN~>C~c(|QjnoHu)rlZA)vqUcyf&OUo-mmbKEe#K!|{b71(rVO*v=-6(DZs zS=?*1|1ErxFA|#FcA?bxuQjFDC686RnSs&|&I?0>FlGCOWANvH(QcbGf21W*xTud> zyA!J5&cUVS@H;=?I1i{39V<{*I#ii-%rvpN4YjKIZM@ge2xG!L6Y{IQ9q0DND<}U% z|DEa1qwuZaIN0|p($NK!#QvvYDk5)-(5wiUsi2(}lPZbUdoM=AP#dfUE9JS58#lms zz-4nKs5?F~(&~;cnm(W`kuW$8&S3@6gsOueGU4*x!}9(k^h26R3lz&BmbgIT_C4$> zOY~x57Bo3w%GsHgIASore{W+gE39fK^G~XIcil&$Z z-G>2y{U1L0iJ;Z`D~n^K*eG3g^95U$}O1^}1`RQ6GaNtl}&seIBa&%h!4 z00QhUalbC_?aUMf)=YLpGjNI_2b!G?@P-ugX6OL>Vg;Bd=;XXx<5db(S}-sjwQfQK0o4=M1H%sRVehX93EbUI2a3G*`X_(Gr9O;)cnQMLa)y zJ$zN+F;dcy8v4A&Zh8D@ao zqJhB;dNdz^Z*jop{t~eYCV|g_msmd1;byBS|GxR}>Xq<&8Z?Qd%_#juDccMiKUOQ{TDoH*+MmGBw;*y)Jx>E8`434 z1jU~7+6qN*pS{LDHuhYsyF6p_{`x1(CXjKBf8heHpK;Q6n248}aIa>>qX`#fgt4F( zDU*A%S*8ryR3sW}1g%CBq+vDglUu0ZZT+tZCLkKc$HBfstpp(~`IwX+aw|hItu|RU zZ_u&n+~v#M6xZd(G#+;&xgrbn?jyb=2FUe!ZUccRm0-f_U%fdmUJk2e4|EYkPxOGM z5E^1am}hoB^a48J*(yvAg%2W@#G_!GJicHHZ3)?Opc4(8d3l%3LKuEf8TmJ`N6xNl zECiC;LBPP`OPlXZy^{|A!#y1;LTBTzuPMl@t90({E^>iErADra1Go^gK^vqzw>kPk zJ!*>gLn$>2%)+(8Z5>`TelS`zt^J@S+Sw_xA6;pXfsGM}%jbYt=E7*GHN^yf=C{Ru z9*LhV!c{4SrmcVA`5#vs7uiJzfA)kswVa~!)k^p~9zWk7y$6DWJmgfh462Y9Dt~!m zZDv0I!B35ig5+N5z)HgXB~K|5elhY_KgW1V1UnBhKh)7my??O|zQvP`)knY7oo6i7 zoR81(`7ENW0)A9F0hwnDBxwt^$^D72s2CQGWw7?6p0m^i(x6dYT#V8c~2%F?}P8_X(zXf1VU!m=3%~JykD>=Fxydr~6 zl_3s71RUf@C#U+0?{G|iW(*AKQ_xWWIfg7dxG_3Ufb7#xL1*99q|A3=Z>eNCcvl8) zs|e>QaGZ2w*8&Po16(u4y;fhO^!dyDa;RtD?%v~=_L&WlW7x;8`RRDz26_1m%&VUW=9V*mJufKr|f`%Hl@(3>p)uitT6lOa%cj&qrP`Uut1P+iuu`?mZsqmS^z zL_XT>{Nd?Nx9I7>oP?`Mfd=eE1*Q4hyW4B7%j{LrVs;0#^ewM8RTBvB!T0T=Qd@}vH)ML0e%om zp!`#A(H*#-8EC_iFNe5@Wh0%wnz`W_noNN??2kA_oHCvAd3Q8Nh@QL_nZ5kE!mf4Z zXNFjkRKMw>LegpyUz%D9T)dSzS$zRtg7`lK41q0a?0J35xR1JK2=meRAzZo6adN#u zvPaMb!~gD04tNYtHH5aw0lC^MgHajQmTrAHNgl90Dh zRSo?8dJp{IgYawZ_f78J6yU3WpmZMvj2Afeu$Y`rXWmIWb5%~JTUn=>3{plCfc$Zd za3Hy0xA-oE9JELp+xD?g_SN;T(?Dk-s}g4Uxm_x@alRH-Vy&cRRFe?T;e z@13Q~ght8>lt33|mQ`5jqH9h3OcbqGcVhwBIMmm?ovoV5@}-AEaz5_vaxw%0zfE8@ z)Ja{F$wv-LJowolTp&N`5qEi(u??#BqEhPALmd4Bw|6y7L-Iy{qPx(K08_LBURIm0 z=q1bDDNJKxxX8Re=?G*~5~N9)$lx4*ccS8%$567XUryKUc2r_Q zvV6>003?cZ(Xu;LLhFirZ{FN~cLlQnhuf7xAt{r3KB6}7wyF3~atPmMyo#cVdAOqD zzO<$!2X=?>5d@+}E@|uO&<2|2wU<fW#@XkO5nAXQaa~9ckiV~E^Qziirh##xx6=pVJ#ot z-pLD&%xWVbrly>UbcHsj)VsscA|v#L1d5MuZG$(p5(y(ha>O>J0@(nY&xs;3?r8#S z*iI!at=HCnOr({9QqQ|CsxTGysz2|0^;R_sh+YM6n*vh9ldye%Z7O;#gJBBnb*Ydm)+}?5Hy^$2xADcN>pG{ z+DK72_D);TM4~tt(EGeLdZMb!u|Yb6(ER>uz2|heTc}FC8wy6|q5>b{f1&wm*iCGk zk#!Gc-r1sDBl~XNvH@KV$D67|jS6ngp~bhSimw$)fmg*~)RBdfwaI%C!O!%j3*EP& zmNg>Wd^@9FCJKW&TXGaz2fx_HUqva9411^uP4-deY&Gsf5hFq7DEew~vg%U>egs5< z&e$E=Q)LFkM$^gY#)fqLiHb=5>@!Vt$(Q3$YMpI$6lMv;>Pc7x%6Yk?`#2E zX{nUSY*M>_9M!)W8{jD~*u?3fF z87V&*(_P=GH!5s@gpaKAWt#Z^=(y~)UNREIXKW?9&d`)9Czvq)GEvut;!f%C*OgQm ze#U92T6X8Z-=8q{+uh}RnOtJ6;%E|9T{2-l5(yCjWMP-3?>D=ZiMo4@(4d^Ik37Y8 z!dwS;HEB13o#=BtRZo!+qD7;*5920>OB_wgj0r>JX71w~5`jTun1h=4CS{9x&R!Fa zbs2*ZIROIztQ?p~&d4+&4VolNrt)WS-p?&MA*sm-sl%FS*;W7djFRqz#GCGk?`?vH z659xxiOhjHMe<28Q|y{OVGY2eW?$?Jrj%#6B~T{Bq-iDcM~=vm#K?G>c)usi>L85# zG_4ALteN$RlnjOovRK2|))bIj^5ac(K9=2eri`61B*+cQ?bnP5fh%1cOaF;RZUv?` z*?yY1*E!MeKd2o~H-#U!AnL9+Ad97qz7$2;dOb~zq|AYNgU)lQth4bwrV=4=W!Z3? zywmUhDL6;|YOSo^u@F;nqV8(Jy_M-PSSxZaKRoe5`^7%|AAODeEcJb^NzGYQXmOUU~)4cSeV<@_ZD}Kn|2e=SG&a1OHSkrb;&w=@$W8 zJL4cI7UJQf90S$y&C5*!+AQPsh_0>PUyNKj?mot-o>N$oB)i8(I?T7*El}PPAL0Ab?YTP(UjqH>7Ap00r$rSHrN>84nch{} zR^v#}<<`T^IBSw*-#n>KC^S@WvJVuMr>znF@dtrF@4Xv!_!z2nM32oGvxmqvE$$WI zX%rFH@W5jA{Fzj2aADB34^^}PrYtePJm3ALu3cp@`{wAWdcISPwBkeStf#^?qE0Py zMj4OjrH&tEX=XQ^J%S6Z$NfMhD94oqV|F7(y8>&uMMe4;I`mPY&q?Xb4EgE$-Wt4~ zDoXIKu>(A@Nbu$tgHH86H84>tPSoc6qx9n89)?5Z=O{OE{IX+8d7T?19sd4gqYwgE zLxudn-mnJRv|}u*(wIh;42?&Q0-7(cSW4ESFMi~YiE7da|JO6Z# zT_!|2)|eQHJoY)XB^DZmpWRY^1FELrrfjbZO4o99J^oecwA>#8^gQ)ay>+_<_Wny1 zHdLU1-IK|=Q*%8iq4S>Ym0wUNO`B`jUNj24gIIl9xykU`_W7~Wa)$@#y-Go@c*uA% ze1UNfCNZPnUN#dx^Pws3Rqmk|%*E|5nkuh^sX;qL+WJ4x0Q`?tsEm+4l-j(9wvF-m z0Z+aBkT2!vW6rBM>%L?~1LJo~;FZ*eEYEFUuZ+3N?ZF)xv1ZcG$97F3qQjYbRB$5CW^N$v??DjMx}x%kmpXEwTK5KOis@0{?2?qA=axEIr8 z4*DcW4yi#WsTP)qT;{B`e?svjd)WUI+6>&!K0*;T%T5PCM`c0M8v%b`tz5p%Y!Pet znC&7cAo{>CQ@J##16>pO%B-9;k~eBdQA@`9Ci&hQh^J7%BllI1!M$od-Y(`+7q){e z{-CY~S2l6L1t;2#q$BrGF)qIpxfLMQ?g0io#vQ#R(+|+rvn3t#P7tk!47s5&hpeg^ z8JD9;IP0(ZMiWKynj)$9PK+={G1w}#uq(u-Zhdc0j$I(J=LY<@(E0+?OK%K(rBf+6 zVa;jwdmevknK8v)Pa7`0f&gzAr8f%;UI_FVvW(r8184!|DYDF}hx~7y7Ae^Mj)-TTAH3%f1fvjw(_-HMK1DHr)Agv`Vw@d;em1ohrtf+-2j9|!Ff!C2H$Jg%T+-?^khg_cc;O(P>>d42^^%&?irI_@i-y6uSr!(8M(Mqyef zfarGw1KgB6#J^p~HwlQvLgjFwLtQaV)%BF;Zy7~7S-1I=a5cwh$2(iB5=Ji6gXD_lua<27JV%0PKMC#)<;!+Aj4Q z5#;!x7mA%hNd33xMap)<*SSC`;)I5veRP*|MVTh870k#(81Ph%i2a{P-X!P#S%5kV z5hup}S(;Ipi*4(h%NM$uxB<_?v1XI5qfGc8X#7&#qq%#K9F@-L?w(v*7E@bk>6Ro zRy3ow&dkwX@rTFPBj;99Z_Dm{=HI6Q-H0n4Hs}8nMm)j(92qG0TBpap%q7QMn)lmJ zR@OO#EChmw7NLS7%Xv;9hg(v>PfM;~fE%5=-`MLG=ncPMO(1k4`3ab|aB03~k7iG3 zb4A?ldwoXXOE3_L1(S?+#mo zvFC)K zQe+})^w$u2(@qQbVZ70UenWS7yfFzstXzTTK0ckd!{$l9wiHh6g;XJIDBlyIBA^o` zJiU0=FpV5JBs(N09`m82yo+=?fq0=V2@rT;-0%stlT1#(zU6`U`1e~e_Ajq09Jk5A z{sp9DKw5o!aJnge#|4^^uKni0=ZqaGEhRP~1GNK)$p7)%d;lCiptc!+FPjgX`chT^ z1g5X(rnP@>4=+u86+k+As_N5;x*u+-x|+H=cD{5uo;e33+w#$^#&Z~@w@LKP&}HK< z7K^K|%+}3(sS5LP9DoB}{=0_CK->)X1dhv_CRGRpl~*KWS-B`8 z%CLh#y?DcPJhtT&1~zwbFn?Ktr8^o9rsDGf(#dM@wdmxec!BG_7hUtf+UeC!9IS>p2A|!{ACD%Q+}U4y$**;@1I5=eLl^(SRW9j- ztjrU0wV($211HX}~k={!P^kxkgdyKX3{)G{fL3a6H&xBdc_C2@>w~Sc| zwAyp39z2wwknnqZJkYMNOE1hS$oj|TyVicIFEv{IisQ>4IvbZ#e5J?$TBXl`9s$S3 zflJtDLo|J1WQ!Lx-!t$VPkL{xEi6cBU*tMYs>B`|w4=Pb{2^skNi6E^&Lch|B~426 z38TW#tkgP~zCdZ)?z6~#?i_Dgu(Ng!Q(5XYLf2{2qEg5a`m#guyA-APBk#WPO-LTI zWp&48_HQhzWVRa-R|ljBo*#Wl($3l%b=5jP@zrZV>_S(4Hdl95^6{+GL;Lu`gy5bI< z7nje6FTL8Djwn1E#thg}T7TJB0|I>?y)tKu9ia>8i{Vs(Sh_igK=Xxb%3gNhCrI^q zoOx96FKI!dMob4VDvoWnw13v3Fuwcgz~1O_fFyrajR9o1nQ9uYeBi;~=vT^HlYCH^ z!B4Qg@Jwar&2dshEFJ~%;%!ZaLz&<%9%vFFZDD;zx&{X8m7Bv9#dF#B2bRk`PLr@W zlW@D3GE)7i3K|mPbqU%3=`^bh*14HYgz!5p%~*2!dtS2F(pN4tO|#UC`qCglyL6hW zy$<%`A?&FQ3OEfH_p=j?Ko}Tu2l-q-51pa9M?s zva;Dn>`WqP_a!zVqQ_>+b^hpd0PVcc?bPKYH!E6m%?{q;jeJn5R|I1C3q89R!0;e?iu5hs zc+>Dlw)p3`EL#||$=5#k!GI+D9tbG%=Kz^#n|Qt|lpCd?W(m1O1-J zld-|&95CtlakPCFYX!O4%*U6RNTrQ|sipN-dL z@0iY?P|iN@nh#nEdcK^eu0dB!^33bZCqzLAh6KJmUmS$rDZdw!ztgUE9jyGE=b8Jq z%HdB+ypBPzbLi{;iuZ^2@vXK56db+mp>147#ih=hj1O@d6xDB9X`f;HcnG*1I`0Rr zOdWOpLvYbx29(QQ@?(NH1$Ni06!gWxFlEq$hf}-pBGt%k^BlpHYH9_4oGeWxcpnI5 zrOLj@Kf#rdWdZKVQT+|+KAsr{yfK?#=`X;L0a#*HU_V}1%K6i|>w{{DVxwBVb8}Ed zFU#PDk>CO#v;$(I$C)rlHKp8g0j{25LQvdaHJn*d^5VxmuA!n~#mdjBOK-j$M5o}( zPfdCd0I?DKT*wcumLdP*7*H1=M1pSga(6+Xdd_h<6K)u-rLI2<@MOR_ zmbFrRo?Z&)yPz*I-CsE5b^=tV&dKDijD&Nk^S@QE>gh{BvzqCdMitLfSkW8twZX*V zyXVCfay>cQQk5By@55JmJLsCbn00#DpKp$Y7$)cvuwfSgK=+x|^Q~;5;iT9N{7m=% zV(%@(q71vWVHrjp(xId!q@-I?kdkgex}-t6LqI@~?ha8wq*G8(QbJlmLQonc1Vj;w zcilXDzwbVd@Avm_|5F(5nYrV-);iZ2Ld+8E>%w0psvKyLLEM~eP2)#naRUdX<3^|N zM}SfB#a!n35QlIK8Xe!!vG?XoQ7S3!>{&}wo^63WaRzY}wa@p6CNrG%XVIzpm4R<8 zCox6tsa5B|`oey+&=sV6{Jmy|FX0~MU>X4wk0##NUP>jDIs?yNj z7qwnp;n?)vz;lov18$f`T_J0J1_wuO={r{ISo&ZnRzC8W%r|TFHXbWK++(RP+5A%Q z)vN8+n0djt{`Vhn*L^x)PWo)k#eciohA#A^W|;G2Z`@{>6ienG za-M5;)eJ*E*{f`hD+VgDb)G2hf=F4y@RnV=7n3|iI&vPp$xDu!*yLN2YtpW@I+-2b zd2-WN+Wl2f63rS>Z{=nCNb!Dn>rt2!+gaF*2e6)bxS}$l()AtlvHG<~16-M&=bu z9}*?&h+cZ&;2M$Y3JcN`v(eL*>LKGE5UO)hcWNK-ka`tx%AuY1WZH;b zRV$=YCX3-JX{UN_z7+FEsb>Qj&M+1eMHxFDTJNlNXH(l8pc}=H!D!O=;a=knxuDC3 zPY9q(ImnVVCD{q-HfS?B?LV_I4!uTPv4~)$vzD9^Nk+7zcoBxtA|(Ov?7r96JOI2YC%d3U#-7XS4xYFa&eOFI5$8AImS{VA+d zy((~MKjhG$k8*cRZJ*5jNKGsq1+}lXAdao(J9|^+wCt&>Up9&2Y&s-~^2Jm*JQ4}i zLT^en3%=AJT=Dt>h3O8nf4SPbWQn(H9$2loilJ3m#~K~=5R({qsE0yf`XlFeZRP$6Xrw+* z>2NCI%QF5%FLBxeZJRyu5<<7UTnk%m3#C((c4!6S%<}OYU8F208j+T+50Z@fD9h12 zPtAo--A#Cx!1bO0A_2TEFgzF_MTXOAsq zh3IAEe#C>*Mi`C|x}dcV>zbozg7xnsY+43~{54G4Y$&ABfa*VQcf%>oNrgbS#U~w4 zfbu#cKbjOE_cN!&wJrrl0|m8@9IoFWPl%$FNidU}Y#KTcjU0YT9^>1H?FvGOh;?)| z&Unh})Z#T#$+yGG1%~vAWF(j?0;+&&Y%)I03vYvHE%4^Gv<%oeQD~?S?V3e=V^XPUKTp2v;hc_Zi%vMAzStwR#Dxl$3Vj$K|UCDDlGY zYhmduz9G1^(o)V=?&+9c2djY%wJB(5T~Gdgy{mzm75Y0mIVb|io@diU;aNh|b)>ZH zv%7`ww7y9R7)-!vTwDOxiyXK1U>;2fm6+G90Sd!0r>gsRz!kRY>Dk1Pe+`PLB{gWH$fCbw74(7pjBKs?b>Mk)>R$;w^Wn2-1jtvvNvB?m^oBPRX`w!`Q zYk%&wEX`tQ5TX{;SbiNdH2CN7pZI0kV=?E#TWASZM>rj8FaG;zvQ{XMM_!C21oBH1usRZ=5Y`;Bk|5Ol3TQX4DBP-6!pA$O6PQwi z_UBvO8WP&|CWYNW+KVbAbg;1u%tqW>F!4OejKZoVWPC|0z&{XI962`hyfXv;#B zTR}kVMWifKkPklxIeGW4Vgu3PVOS0UiStamC_y!ETvUMiDLE8c%Hde+op)nifBmcJ zkopec4k6^ZgO6bB{xW-+$S84!a#^B@pU4|-oz6!i2Ml$3_rPH47f=vn%9;~Ubg~;43p|EQP z+Q0;$7?L#{N!b1%6l;4H2$8Ij2N*;_GagQJ%~gO#ytNz3PC5E`=1jN(ayi^=n7GDc zYdwJ32LwlC{RVsOWrN|ie~oUR7Qk%Jk(3ZH+hn?M7~N6?GHgBr@9M#ri3~a!)m3(+ zYN=FMUuVF56fIj#@)H)oD%dI@$gy3{?1n65S0@2Ton_^W@`z4{r1_%z2G7X-s;acb z&1D%akloZMcI1N$!5gCRkBr5mPoZO&p>=CZp`|a)!eSW1m06ZVcaT&koNaou2-tra zOYTWtJ%ojqBPxv9qaP5iaXHp3P*4NE2w;2v3{*5ufUtR}+*%0!rV#{O z$cQo5UuVKQNMer6b#LbZ)nxX^r@MD)VUiG(gtT9vNwEXmh2SIA1{NJ5%ebrBm^F09 zFOgB51Vr?qWMiDuNTMQ)p`PJ-06_VYLa2R=&0#(QG4V!3)Vn(Sj}hNr-H7$8;z}37 zyU?nEeDp9J@P$i1^MHbD8=*GCe*Fe9xIy?7rg$~*?4eK|h9%O7T?7t$Aw29x%*8iP z4YZ2<&5+s%QJo;I!E;?RSeE^P(jMM+dItiXtO4ONMYYa>n#(MDtnL1}GnjPQVg(Yj zE+0q|hV_601^iZG{kt24XwQ_ekK#zcT%1|iD7Ie=x%B3Iw_<-N?_vJ$(gdI9gyeK_ zNIaof25|Sk*BLn2XXF>{X;R4E_g||UeB55RqH#itups=um#VU0{cNma4c$o*pjsRQ z`EDbKvKr;u4QF6Y&shixFe=q z#gGmUnDr*`upk_6pT&)IZrh5g=KAxO{5wSFT!bV+*sueSvxodNV72*TV|B15v{pJI zlu$s>ZKuEw3F(AX&bYyj-V6}W^xk6k{Pzpehsj7$dlqG!p>jGp+5LlX%me75rut-7 zDvIY4^&=O$%5trqeoA`or|78~3pd339VxtkbSeU!OO;w`z4xXrG5{g#m*CZVp9|In zvxu+SzkeAq4v;&9P-hV8C|N)0Xgye^PlCwC*--PVWK_?)puPqu+deApQb`9kngIf- z8Mzg*HABGH_F3T%7)M^ul)r$V^pGtqhWPuTpcn>y9|5}84DL3eqyP+~6>~3ETXSDY zxq*lU%p%CYPpJu1`i~-E0<>WYVG7mt&)Pw>;yb{aH4R)~N2IBAyjcYi)QG9Pp@GU} z#DJqP=rtk@R%kXD;%z_Fedmm4YwMu*#l5#LgX;BKuJblxIA+`C91tm+jM@16P?8N6 zpCrV$AK@P#_0|A?dN*zr(4pN|EP-Dyx63q2R5{;PE%AhX=H<~SG{Gwd&n`%A^XEzW zRc}g{9Yv_`e_)NaX`pz<+w=}JjEKWkgUP2W_KA170C`5^3HJ;u`<*(%{w749u@gnK4g(WCq0yacxOoY}Y*5rZ}1U zUB>4}5}OP{miFSOf{dIdbC45n1&bi3*a+OrQ5`TL`Ix00Lj9gTtAx#v*!Jh4LZjcc zh2L`3I;4YvqZ&~1P<$T@NLu(_mlViAneevyC^HmHTxd+v*RdE85Jo;N%j&!iQXMG1 z76tKDm9q{d*7Cq)G?DBw=2!s<1crqcTfz+^^6#nZVnrzi#}g*; zRS?jYV(u|{O`E9%n6!Mp!w$2c()|bNfiD0499PE&*V|OV2zqxFG+<;w+3O$_z-hJE z7>N^ogG?!V{`J3Z6;5;e4sD~Ez$pE*MrSdY+5Aa5GcNwGFTx&13>VOiT_S+u88(Q@ zQHWIi0u4aI58&p!)Pr%d=&^_&5crCp-nn`L%h+l_P%sPuZkPJHx7^G$_2D zpZ|=|;_?V@o`*h&MGL;X6c@sRwPj>vxQ&NIg!TBmMh0ADz3c_vsmO%YxAQC|Iq8^_ z?!&gmO)Zp3X#N9KF$a3J?v@pa$ta?xcndAXn@d0+uGj1^3DH~$^yt7j-=fh(IulsK z@k@RD^*2P9nw&VRJonPi?MPAov>Ek*0_uH^$646#QP9`+=G^4AYt@EO5`o=GbA<%| zyeH%Z^5Ym)K{v%2vd;a%z&v~(O##oF237}B9wp0MhSl%yA(ATT0|L}_reY3Z!BL7U z=C2#;wEn?8Mh@Tm0Qy*;#l0o73LDZ-fko+u*Cte-`}MKuNYmYyy@u^zmte5V5s+hi zi$*;9Kiorl2rI3&1l2mZt5%C*>@eHbliKiSWB78b?jw+w4Yv_qnZ9YAuWihR+%12S zGpJvJ&m^yUI{yL@3E2PBQ= zF;+Ws1o)nBTNMky{OF1g*W^*^?ikCgX@yG+0;H<$=ux4pSYSO{t`ua$pyR^~o}Xes zAiSXu{kd3L>8PmjFRy-@-y8nEOtMrX>eP{AxUQZU%I}7#2hQ(G1nCiV$=SZuZ z5*tnVz@fnz+BmQI51-mz)p9!BDB*hmRd`X*$(74b7lZ;wRrHhkX6027^b!s&952B`2#m|MCG8PLjcq-~v+T+=` zTJ(=v#Ku@_l(&8F)rO<|5a1wR05Yp960S5CV65AaZ1}w*_+|b^Fk2%PMQNNZp*4b`_>?cqPushIgT!7{BOwopm zcA3V1cYIJ*<-@)>vbp)_+yAJYW)OMtIiFcXzVKhwQ@50Tx1b{uK*7B!e{jL9NC?I+nqWh9NL_Ll zi=o1?g;2{dT+~#>XYU&x=d4Dt;-+Ln0K;<>I?eGk^M;t-8-4`FB;Yf|aVfP@z3DWI-_L*|g=j`F!Cu?9WX| zlo6s_0oxW6<1bH0F}G8}Z%RlduNlsyC&{fg|2jkn%X`+Nm2=4&u&6Dxgx{_UVv37o z!Xh)S+wC&mV9gHSrc5>d0KV81<_~l?)jLqdOE`xVTftw2paIHrMaT=t9bKhfR%7QB z5q5%hU*9VdKW%zLnfWGtoy~apfLiKoDju&UW_HOS zUY8mqy6p0?g77LvjIzR22;<|tx%B%kBU_}VETaY*Nx(zOUZjiL2~yT;(#OTIfb$d5 ztF{V%BZ1dnfNPlOP5OivgT5E`ib@7X5O)AcRPAB1cp$Z9?N}s-;+`9^ zZb4jQ&sXRjf&vUiTyXy9zzcaBBqT^-`Q(WvXE;pXvE!i=Pwv1st^&2+{(Vg@UCw*5 z#h7hqA$ac`LbZJ-34APze<|5BYNvy#A;)QB9?n{f^@C9jzRSr5w2<1NDEJg!co`#7 z-_(D@o;PU0B!VRP#5fdn=<>jx6N)m9RZ=J`;u4W#bU*_UMFR1u(}+N-^y^!f{)UHv z7310kFwp47Lm`U#D)V+4zr_4(9tphg1Cpm;VtX3y2a*qDI{wy>+ED&~9^V{g9F4$zguf@sEevVrKpp3Z|qk2-9>eQBd(Oo)={0*{T274I+&AnC^#I#|+6 zll{u_Js)h|AwL(+h*&N6t!2W>%Nh}8MA`y@MBqjMe)u-PUH11StDs=HLVD|kcR9@P z`FbljK#&?4D~n_Y!If@?a(Mxvca}aB$fScEaimk_(OYc)4b(vtOPkrX5hy=O28VF`w5;W50l*Qz6AH!$(wYKw5IgJ7I1 ze^vBcx+J{3Rmkv=92KA@TQMfs8+$6XMs8H-XJimG#sy}gAi*?%6VVV+pafF~eT+Xd z+(!CObuOY7*^0IOzO9cwhs7l-s5qXt7LFKRKqCRQnLoO*Tsu1EM-0;NBFP!7maU7? zv|%{mmB>jrE13W%p&Se3gH|qc=(Y8EA92HD=82*-_kV~8aFEcvULOKv87}7R<)lYz;{GsB1G$aAvda5i8QbSNr;IYV3ih*4- z%v1sa$Dp&O;b%-HfwzsfEQNDaECpA1^_hN3U}x2) zfx|gQA`L@KSUn{t(NDnkaZos)kd^m_Ixc*#06due%kb{_1#AJ}3F+8q1fu~$Lk-}E zTpw|5y~AOvChieNgm}d)DBx=~0kpO#y9Ll=9&D|NLZ?i1p%zoB>nG zTh-((#$*dL)GL1L6Dbyt>>B^a=AU}%o^IF-<|--{^4H-5+{J4IdtFd~XT3f{74YBY zY^N6(cu(;e8}(z$G?*0zoY)OmxiTMf9bf%U%^<)mvE1*7g_|tu4A*%1SP9&|W1v?9 zK6b%h_=X)}VKs2FUM)TGgs;(tZXyi=yy-mVZDJRc-eMb6U?J7` z)d)nbbX8ee0$a!);BMkRi_|H?$iAlk6qa(WPCr=vKaspcPV*&HVa&p9vTN0Y-6_sY z$$V&U#LULH8B&*msRfsZNOAsg+2TnJ!$USeqP>Ks<{BT^FM*BB5?&=@(jF+n!3Z6f zLSv18IiQ&wd^iRrZ`G2wn3C^;32Wb6dj8!<_B}a>p$)ksLPr}^0PAT%t7T@)iU>|` z#!;8enh56H*Z+5jQdx2RP<5?%}k)t-ja#vGE z&k3NF2?UQ;%GN&{F2$q`2}LF_!6}T-w6cqXqTFP~krL{q2k_4;>f})JT4UZ@?o?~DiVfjE?kLD&bH_SY1WBOud9oQVsB%L(9ZC2_PA3@dL5?D*(0 zL8qaKf=Of@46Ow6@7d`8&pzN1&S(KH34IbV`0|ANLld9HZDb8u!-;;mHqiIOhcxC# zTu{ISgKvt9d6uv1Jwf;$i39kM(Y^=DXZKYxC2}e_Ssm(U5eJARl0AjSRQ6LgN-EP> zy{#J9qNEPzUFFMJN)MOKf$rdKWI`OgnL5d>*P-psswY@rJ3-}!)vnKgx^(Vit5@0; zO=?VfEZ_y|(=Bib`pKbU6H?6eWyJELkmxg*gxz=6j%R})xbg5@VhOM$B4-6Bc^zg% zhMLNB8aW1Jg>1wEtXLghW!v>e5zb)g>Bvyz$y@{A4oJ-LHnicqa(#gGrM5?ULK6k& zkkynNjv}Zb%bcqbl~W2~5B(tkKvWx45=Q= zIN|kxXJuv3eE<-FnhDON68}34T28GPJkovuId2vjpUesf*8Y#KUWL?>RYFA@(b_GM zZoq7Bvc}>QxQCG+NdY|=!8hwumnGm?x{QV~dxGQ*C1p5fmr60%9ZkKydlwK5Yq9hF zPwSQ7F%mfx21g*vFNP~BM3NM|{gl}G}Cl5C@4oOGBz1QH2#7K`uGHF5m(t-r> z(NF4}M-!3^GvL7U;Q*%{;E4;~44ZWD-{8~E7256@z^t(dDLo$nRAU`W(y;v&C?q=( zdcm@y6nREh^y%<*F_LT>Tz?uN!M(J~oO93uzWtwqASL4P|M8c2D4>)SA{hv5T7~&! zf1jjvA~Ve&f5s#+E7CAl5xuzfw+QD9WAa55JTcENCrM4*X<3YJe1C)W7;(P+1aZJj zSbqQm&xbzPU_^U9=s+M5#A*nJ`O;;-H0B0p+m^r=O zrykw7Kr3LaFvGQ|3ekrUy&?p+8VcTkeq-~Y7q(voWoYUVg@mucc(*BkTc&{c94rL< zYvBk6;emlwm6AO-V?)ec3}Q3Vb7X;?Fz<8->ImaCi54t0ZvW1~E!RPsEYlb%j9dtX zKE~t*G?JQwOvhZo_Fi1_oJ64+Dx-c*stTbB{$vOTm#^u1+qW*?hOa&cxX}bi6R;$T z)&W_sA40ZW;a=zdDGufzF|+$J-coSe>9UN0mTqhAc@4s>gi*U~1}9Uju0r-v?{VC~ zZyB63vW#MAW5^SSUbfjJ#GnScOG{`ty}1;Pc_yA60(bxxj&@UJT1B6|^AVDSxQZo+ zd_Z5q^%AT-Tme847e6OaAdPDAx}7A=10PipLkH7(7?+nxjkn#Imv}dd7~c;b!u&PA zgV$4>@6<9y!4mPhfw;14QJR0i2e@LU2AQ5E$F#_)mM@0J`N@KAMo=GEvXEzAdxtJK^OgKhu z@bD0U%7KjV=@39kPK$%pLnC!Givp$Bf`}7SvN0O8O_wM`9@XFD_T;u>W@1D~%93IZ zVNiuIY}eq1yNV>70DSK^1a?TS@SIA0gSXBDd5LJO-A(&wl4NYb;(*?VC|ProY4GP5 z20xoAJAc5sTW4U_K1cWN!s$`-G1w=MfG2JTx1|7P5&}0e5&zlc8-Ux}=WCQ#(=jqK zNt{o6%7g*W&@h$ri`!5c*fhi}KU9Vh8dEYkYL9+tEbSwn9`PT3#zershE+r=0)-xg zMQ+4v=D*-UAH;X4a#h_f&G#Ifz#4V9|0Kb~f%i_8`wVOfe$Z$=4FI3=3ekPU^%_vR z=U>F>x!F*pC2`;=(1cYMK{*tDFtVr5s!DDdd&}JrKWD7rQ#L6M#}JG?qf(#=WGKo4 zToZ?ykZL+YCQ;i7omkbnyJ|7KKmIr$$}u)!TwulFeNRr1PlvB3sqrNn+b@MeNF=Sl zzEqeUCOD;p|2ZS>_m=1R&!S)CC?&bZJ197~ZY)TPaP#Fyh2N>$AlexHA36naEg64f zhG1m5hG^Bsa2wMUA?vx{S$T}=Fri%OL^1UO8PP6>LSAWA{} zs-%@O~@HgazfX*Z>C{6xKaL<%j4G|N#`4o%EYShBfiQJ@T^cB z!!@+IXmp$8h*krSK#ocR^s~-RIYtU&3xo{U+8Q|!^1)hV!-!|D%dC-Y#?6nPoGPl; zgr}@#^{L_Zc#v?WYYR%PEo;R`9{)fdpgIygt@sIU;9b_DtiQ%RYX1;Q&rhb{eVBjw zT)X5DOX9SDjNrF)7Qh}AFHmYVFtbX!eEYKUqxm2Bgx3(sFT0DD$fUjuI1F#ej;}Hoa!J&ER3)U<`Z9Qo{QDX&=GjEgUy!0o0ijGJ0ZsSc&hjwYL zHW)t*Ll~#lhpdFF3)A}wZ)S}`%@)jcUcn(NGn^2kgqD~Oo8?e}Up>U){2VFF(OC#5o~Lz+D@* z=mwNFzj>>pJpjhNRFe@VdNJB69$`Oh!d!^vMSkox4v8CVr78vFDA>P;-$K;VN+X4f zWFcIuJ9%WTd?Q!0wJb$brd+SPoZ0*Yrer(6euEN|x{>eCo6NvCo@V#v5Ak}ZxB1iP zGm^I)yh!pFDej#k^7MT-&HEMZ`*SiEA~zB0YRDZJdX$qCS0tRj4|$P)zw*vvg8zU5 z@hbebJ}#rZhg6NKadP`}?3c@oOfRloli)(*$DJWK+w7fvtrGIq?s&-JCeR<0mlb{? zExt`emmlFyCjmyG`8WP#7@9jyN7!23p36|w?(&|pyZM&_HGCb($6Z6LNY!|n(&IS2 zhebYQBR=o#`8Hjh)pS4J8(lU>$Mnj$O!I#OhxuFxcKi4_sY~-Ri5lXnR!LOu_%{PP zFZ^^e8QXsjg-OArVMZeZyO@$UXr|A==&iZ6F0Xr^#5Kk5S0|n)5cxd2gb7L_H(fr~ ze0hQe+f9Y-X}B`<7A(T_Y%UxWwLEXmiSxaOviV-X`N7oJe|Oy0khBCbd1iI8QvLTA zaqz}n_s95{v>njXBi%N6y{HSGk?XW~S~^2Uh4>?TZGtlQdExHU%zmRnIvfv+WcU)Z ztb;`U6CF;=k?;tdR~>b#eO*!yg~4S((!L*tMq`doMqwf=1PR>|CT>@#{^dL3jo-C? z=q6WOiz&9UG)ax2nZ~{bE$SK!dHhheDYDnMM%21t=Di`cA6xN2REZ{=dHyU-1@IlK&`MeoJJ2tRyXc@G$Ebn94xr}hg-NV($N2Mu z4+eM-&Mi#4EXIc32&2Ti7_T8Z(}T2>Bdw{)0ey5{>Q0wa^z$Uwj*IUR4?e!^(6qmC zD-_5~Os=IkNc03*@pwlnea5`_coBwCDlOq+c<#of+iD7H%yZTxR}AHZ<1V==h{?xB z%I0DjeLZp`9hRgY{q1yFjv(OW=q1)5vvb@ecvumm+og8y(w5jQYncL8duCF;JwZlN zl%t#_>34kVa^SSCMS0ZO#n3^=5v#rhoqiIM7&)t8TB zo?b&`w=c>>?5SH*lMaz5K!N7^Mr1+mvMiyt;Bnpe=wZGT4uwS$Mr%Bi9I!9o=c=rY zVDgAV?!;|oS;7?%6!l5Lx^_;Fn6xaW>oWhOzMeRba49{~*NX4=kab(+-%BMe@#2rA ziNIp+ogt1Cd&8vAU^yhV9B;*|mxGVC%XMVC&%m^2U=iZ;4e}Qa(#N!5_o)x-fPQ;b zofP(J>2E-vZ5=8#+VG?K?casRo(W)fJXir6#`npO zQkB`VJFz=>L?U>`2b!cM->!AjSS*rI30Oy7AG=IO(Hu%zyxhv#L1Lc^Zh)hUaqK5j z--gJvR-Y-tDe1v7S`&$mfzd^H01UBNW&2`;t)lmzcjCQ-zCF5SB7iN&i$&}u6gJ+M zUU_^{cuZ$2S(tYSUs1yyJ@YgW3@6qQrtqxnr)6M&&b#s)e^UHQ2kyl)?%hNhZh6fV z!&h|VB-9W`)7Gu(m-*~L?JqDRS}`rqK|D4W&~yFA$%oM^J;=5D!{+l7p^buHE%E(t z;QNv{?qWBm{D2*$oa$pwwC6OcHoY$3Y_p?$7ea)v>{5s*q3&l?TIJdOcD=gR?peT#W3!GQ0Q)DRdVjnH?(fvg*}m+T!L0_ zl{V^NYiUfVjlM53An-sF5kt&XD~!dxougaN11p@-w8>3LqU?mTj<>|H>w(>a?r!FHG==@RZ&x67XRp@n#-d_&ldtKOeFchpp`5O z&)`-eraED&)cBjBID0=uWX|OQQyM&4N8iong+S-B98DXB<&Ct+4KeLX-99+6u0TCC z0ov|sPJ2l|2jCV912a!+opWGd(xr$jkdWD|iM@nP3TWB?>??~fY96ntdS^RjIPbNl zzM!@815$8ORV>)VzZYK?e&&vBh!L#ujTa|#UmqM;4W?(g_4)~qw>4TVTvvD02|}l? z#Hviqhq7#5)7=Wd`zGaUciTwY7~IR>R8ejd_fbER;MPKaQgYU-In-z37^~Nj6YIavbojXmaqk*S`ki90P`BTbyAT zK18$YTJbV%_D9-METP!I3q1mdjKt&hp1Z%8tqN~ghz_#j6Od*+E+7{BeCD~Rr7p?U zsMT1jDK;6me+>o};-8+XS2^c->T{Ktb7hS1(`!Nz+JAVf)R_hw~mvK~px_J$#9txJnkgd7w zGMaSlmxCAd8LSgZ6B(jQI0^nsQ0#9MYW&1T^n#B#l@t5p1yf(&p#*>Hc7YF)lus-^ zzp2JH&mj%davT9qqxNQ0<0UP^dybFRpGWDA6pK29=Wx3)NPR2jdJ=;Ylw8vV zaqk+dW~sLJ6$Tsr8JOhJ3^RN{_E}9v#n|p#Rg=O`o(p@iGlxh ze!zKW-|Bu%ImtkHEv{sL2R`Y}NUsN@>R$sY3^Wxf+Qa6cn-8bxlFN9KY-`-9tHwhS zPcJE2d)3f#m=E=m!sNvQ&Cz*~wqTG>r$gdf>oEcVIPaXmPhpMwkmZu*9ZTb9pLQCza;477>1Gu17da63Urq{B(JMzEUmNR7Ak zv4+Q%3)QU}n-4c$x}!lB;1klgP+x8ONWjEZ4w3b}tOs&3QRSJBrT_S4MxYhyh}ITcKbY}_sV*;;dp;!77C zddL=^OnLW9(@Mnw=O@@w{{Gn?Q45Bj64^aV7ECjF3HP6*e)3egT&SEAXjAQ%=-iDx zdKBV8P+7e4GxKJ1OF$i8R=7{sv$tg3ma`iH<=>^Ix}~>;4xwW{Z@{$plU*w1lhweh z&*uzkiA>t`RyUR*Piv-J;)~hG81|{t8*yh=y#ePa?xwbQ0Q%pZv$}p3$DxyHd|e}D zHC!WF6^;0%+13xv%rN3Brkr#gjB@N`WmVmjaTDC(z)!p5=MjUe^CbyotrDulwT;nLXapty1BMpzA3q(rz$VKv5F7HV^oI6=-NG+e^APt|? z&3JHYr_81lldbbK7aIc6r-iys%T%6zIKN<;BS=ZdWU}aj@=TCv7KGz=f&od?d~!;!vsuC(0GEZYkaIa{f(uCck82=OlA z76H1q8(byOz@uQn&)4ad!y_SwqxcZesQ0&syz`W)Z~NMC73`=pFh%gl#E3HfA`UDPQ! z%iF33y{i6b;F~nZtF${cjmP?)Z5teBR*oGuC@&in|LmWAOrXa$6l^?v|0O%WP^UWP zz;R?oSL;Cs{hZp%GcEg>pDI+Es14h{`-hor7vva97sd#Q1W)Xu#IK#H315Jk%$H62 z)u`3lL{`Et>S%UBnWmrS(o!XdQ85H!EOV6H&qJr@uRiy!wex@0OGax&vlj3$VQ^i5 za95}BVdH_qn>!-KBf)&t4s`Eftgx?8U7mlR|C_x9D*w3bW=BTuS+}8;WIZX7RKAj- zvKpakEx7|kqcC!tl=?ECz5`QkdOIc-j0tk6pR%7LvNR+)UlzX2`q1i*jbIYMfBzecCop?CJQxIU#4O zWuQv5&-Qt@dA0*CcZ?Mt4mqur`!N{#Z}Nf#X(O3#`H`nE*HC2*0Vq5j=&wi~`O4gpS zO=)%Dday~twVgXLFDYYk-oCi^R|w_P`dWpja2oTDe^jmMpAWg;S6RbjLI`*|IEWxE zqZpT8T8SNEdho!Lv#;&ZfnwD$w;OxnS6{!8E36*ELuuH*FImXW%TZV2t{vVzIl8O) zF6MJz7XXLockCV!BCb!x>yd<%-te_tsNdRF7#jY4juK7j~ zYI0-Ow-Ry$ry_elN8amZFt%z`oyvTm_b?{(vvBzGrMf(lFJp(y@%JAFX3y^2UgwUL zw-e#C=g3&MA{Wa@dCEro?1KT_cfUsBrTb~Kk_oe;{NE@`38#Cci|Bh_PWfz~dss&g z+~g|ea%uCp^O1|OQ&)Sq>v~A<_{C&lE{-Urzc-DB4xwf$o3_ANSoCfo{uvukMB>zom9n1}Qzl zBkwU8`v}skMd9-(d{gS_3KQ*+kZ(~YnzEZ7=1CD_bhh8R_E%;rd8O^EmCKa*bWVi7 zP*~vX#p)ZiosC_yF%s`k7VJgYCf>erIoa>C^Y}A8csDMbp1GYqks^9}wd}3z^HRI9 zsc96?`f%Yfd*hGzw^H6TUny3SN)c2Uv85`!$vWc;YMCnTXHT7hO7Q@G#?eflp*y>`J z62;=MTgdTHpv&gNO~uc>Be9;5dIW0e_k(4AYKjSMS79Ti>tc-h#Qlhe<=X|`-H25G zWGJ^Be!s8hgVEPC9@Xm07ul%7&Ko;gl)auRyRr79CyhZ(T0P`}Ue{P%hrqR(9h-{x zANs`kmoylxf5${pW!KmKmNmZF*xd7-6-~MF=?C>a_J!>7w?{=$Sd?BONiH9cLbST@ z4mXtwnV67;<#H@N0b0priRo^&=@!=RUspPwP|;zM__KViU(q4K6FpL6qKlUr->e;@+h)9?4&^I~hj8Pk^kKNbg_Y{NGm3b2XK4&1_SfeaY=9Ja|K zWK+wzf6#Pg=+FMP73aWgVm{8kx`61E;S|%<3#9|a>0@sqmc0vS>DTnW`~D0|c3 zYJE|0kIcC?&ifw))_1iLG~JkH*;=7M81mthoWI{yGc>)4U$3@}<`t%T_^+~db1;jS zct6VF47gs_)6)L^cGoxPCHZ(nZE$4g65ZlYs-&{OY-6_72jhHlD@J*5J0u*>`LmD( zCc8WPTaYQ;|McFC9*=(4RJmVCDkr3Q?{Sa&PT&Rjc}ThsL? zm^OD}PR||hgLb3RIr3=Bcy zp8Rm|uewbP|70ZqeXo{jmu5c6#M;+=~vUH(DxGO{u$JLUhDeCa1F2ZTupf&*_PFpM~otY=3sKD`#k{6 z)Y9!^Vl*pl6|5eRz8T78WT9Ca$t9H-qQ09-9?d?R)t~FE_vrZ1SKqR!sFvkdOq}Pf zBx8dWV+@%YJ!*`f=xVFIiV-YQv5s=lz0_N7ZlbgDy7o_ldC>Qf$!8228j=sX9?zN4 zEyif#J{>d>{RPxsc z{|VCwt?%S{IxN=K+ce(C_TnZb%Q-MPUtB^vTr{kRDsOu{|CK*`LaPf`?!8J;H>pQ@ zH~#C%FX&GPS>;Z)oJk_`G0imJJM4xlTqH7HcMn-)3ldg0e9K{xjjV3eEPR;zwD&O= zUt*U`C;tPihKcFBnDg(KV>!^bYNhwvB15jF<-QG+svFPl{=C>{y!B4P_&lb~mwl|= zayCYWw~aj_IblS5tA1jiA6e#9GcrSBas8&oR2R&q6eep_2*xApSN=UWK6$oNwrM$i z+%RCQI%-(u!Ft8Q`=XFJRlq|+`O|YHr2cYp@cpybW8>nrHzH~&t8YB6u2ftlVdv1J zepNcO8WPh*s^QjtAK;|2=0>67x6#lXcU1;oa1S(=~qi%XTnZEYT>5 z$wwpRlX!!Jumb@&1!7`WBdf9NPo$ou$e8am%)-VfO%HqHqlIkef^PfB*MaDwj)1XO z%Hm|i>R+};Y&S_6njU!o0AztD4Cs4}iG6&md%+o{fZn>#z?D>-2> zeaw|ar6fA-q_6hssX_Bh-)FAu-;5XduNsxf5sW+LOJzG<=DZt2laa6$JRQ0y9k0O> zAuMXjf3EAF73G7lz6eg$b2!_h%jyh8>U?oGxxQKX-F_glk$r_dHMS(raGkl9x z$&2QS%j>Zr?B0B-sc$Z&uz;BL66O~5vS7mUkY`b)hUpb6^~}DMvR98t4iYm1Xbgq= zNHdRw8?+h~mo62xm6^}^I!}w=TISavy+lr+V8Y1W3BoW+V>OE!qJoOVGozPAi#en* zKaVZj2XR%%3ho=Jk5ggO@s`nxQI~vauPVy-yBK!N&Q97kp9E}+eIuFt$dM#Q9-zMc zir#zd`mWZIyw_KBq4X4!P9S#AS>OEjJ-PcWYNM_hqlGK0Oap`4mJIgvLG=^!a))Wh zPi>#y!kiKA7sYHufeT;k$>*60Le^~mM^44OsL+H#0*iFE7rWl}kjJ&WDo9KW)@m~h z%RScJ9jz`=(_A*j;XD%v&%w^!L9MolEwi2o*HGey`_W(z+q49kJT3DAtAN$pSOtOw z@iC=Mch|Zo`MHFAvA0{}M-CWE37bdJ3NJa{OuRUny*Cj2cpR@OMC)4+T7$)Rpl79T zfb7!w3y8%)ot10?QNyO7CF$L)pq6}PvqW4Wsd34X6s2D_m3i_gd_0n*I88OGDf=TQWzV18Omy|9Je3{JfkGxrn``Xgb!*&<94FB#65grrup5`G z3PY3AmC^}OVD*uNkULV#+cc|Q968*4QXXwh;)!YzF8=Zc33OPNmnZD|G94@SFU8;YB0lcS?jQQ{H5`} zZd$*zHqRyVbg{>i)o$M0s(*Es&`Jqp^G}P6#I@9w-}c~=-b-ia-T(U>RjdDeng#G z|Md{XM|M52Y>Tn;ymu&RrBo_?zD_Dpx>QQ=bO6)(Ip&gE4hUQ+#z5B+4H2kZoW`jn||YEI#d(X$6+uybX|p`oth!_lvj>zrej0i`r9p`P&9vE-ORpe;Py}F8gm%JU(%z&qzFFCu(`I2mxoUJD?Y46kr_)T?;R?~txZ$L6=PBY8t>#?ywz{y zq_rgjnP`+Hmd-u@dYDr>Po23E-v!)eQ&GjL-w284cX!IQI-J{eoj+<*-MQ%mtkIim zTw0rcgcuiKFYK~Dq5k=0p+ef~^XS_pAg7qWOdes04*QgF<+)<@6qWD50^W<+fV)Gb zo4zmcxvE=Ax_kq*1rG*<9EPcel9}X5zk1LFSTEg696M7t;W1IAU#s|(*+-G0;IfY% zX_Y*ZX#X;7+h`JFFGqJSq*9&kwmZk+$X(NWO5?MrTqH=-(#gt|cNod+$S16roY}}u zT3fvLEGz5ATMdg>zrJwYyO+pej^@u1-+Y|*F%G-%K_wNg`3C9V=zcmNLx=3hNu*&= zZ5`dC#ZJK~>D}N<<#Z*L$ZEYZq4X`=hGb;yd84x=ufU$xDG{La^-9;j#K9yXc4!=R zd=Fk$8x|6y)W^awFY2Rxa)gx%`!CMfx zH+T4e-?S~hOK6{r=^4&w21g#ol0i#>d@WtOT15xWx~@%v@mKJwUY1(|2fbY#J`JA$ zyz%nF!Z!sbo&&6A4-v+gofREk?RvL)^T*>+3-{0A=U=${|MOdAC%_{hL%H+Fz1(&! zZztlQDN<$Llb#c=r>In5d-Vw!up0s6jnJd>_xCMIi(U$m&4r3HD@jFg-&g)0roKC# z>c8)w)1hN_jLdNCne41|oXo7ujL6DXWL9MF5g}V<*^!YXA){=vM@AV5k%lDwUZ1Y} zx_|c{*W+8O56kxXBb%+Du1A@JRn?;_LOKM3La z2zPHu*%{&N{|x+7(|j|V+2l^X1`KC&5hYt4CBw-X+zej2!>sMq61U1be%9VupZ}4e3PCO-5@bT>#F_p{lnqq{64SfKfJZhssQ@2@&Wtf=VKkvOsVbb zHJ@fZypVshl_`RscG?cWfQera^JVw)p0ZT!sD_-<#RRPHY~d;{*o=$(F|_oB^;ZZm zW#u{sT7TAr@WtD+Hw-TdWv_pIA^dtF=8>`h$EHn;P8?~t(X8y7{68ydKRk=YUs~UZ z078K0BeP4~k9M=hF%luyWeYp-+ zb)RNVpLzV{EvHAX;NvE_urj*Fnu$5z#^cRD@A_r%N|48q#6+r8sS_z_%=oN^&7aX8 ztuf<0_YXT)WqOqh~Og-_2N5EY%S)5k- zGlHKPY2FV+Tochwn%!=QvfTinXI;fAIyxoO?VmlF(NC>^x+O}D^v(1~l^CSA{CGc3 zyggG>;CrRleKfCIXF_hcgz;I{vrERan>NIOLK4#4TYl03TNK*X1Fk67#}-@On?13& zznBgEZR7jsl(MQM)YLHAYcc`-90obK+;tOutSMbw+kuNw-p02Zu8j;+`#-NARZUww z{w#cvJ1V{1t;Xi`f-~8W18auHCX#zuVb){80VaM2=X^;Ij~)^-YhI|D>cv zxn;($q({uBuH|%-M8N9j$NER^!;lZiqe;?#~OQV0X zoAWCkjfgIr@?7Lc$tvAhTWh)#?Nxa12aVr+`57zp&NQbV>P15aW1*nVD6y2gqPUe* z#iUww4At4W;VlEZiB!k?I^KDXm+k0K48$D@?QbyGocvtFHRCahO}D_~m$gD%9PyKl zm#z>3c*uPM(eY=16@4xD$}ZA)o9LlYwrUw8hcIoRjnM1cDXGjh<~ym_+y|ML&WGbe zF{i-Db8v|&Bx|gqaFDwV6%rW)+;uN;Cxxp;E?5*H-wb0-7YHB`IrR(6jC6gbV{&NB zPmN`+`(;aBX%-6W5*a)y=^+JAKy?sgT>>>y$imbqn_eZ|HM)quz^l8C?{RAD2)KW!b>#e2J}=R zP^j;zeE0Fo#(%W{;ItAwph}j#|Mgt-M8#_A1MK{%obgz!lN<_xj8K%QuAk01NA{%w zVv2Oa&4!jR!7oJ2vStFJt^2l&LPE~`1F5eIw#h&!#{ndVf*mIQi?3Epxn2cp1hAo}5qK!ltqHDa`5_YskZshzOMZ(V6` zxo(nQh>B8GyvlIdwlZuMu$}Wu&E5ro!KqRY*1pZ7x_r%l#`&>NmeW(c`#*`^QTu+j z=eN9*=Yg|7-LAa@9ewF^OW(}6cB|j90eNqYqZL^H4(&Lohz0628-|yB{ilYgrb{hrQkB81^O}7&Xn)R9QifouP_v#QLP%Rn4%jJI8XFyc zH0V4w*a_>(jd&1&I(;(B$x7;}T5&y}R%WRkCdYp{hE%-_(2^B)C)dj(F;a&3uJ^a# z!Mx2K{THB-aHZGO?7e=W4^9sdk4oEgs_3H=szbM|Q%*h?(7Vz4>rtNvWl+85quv7H zxt~|aMvAjm_2>cj=XJqJFy&$d%gf8nh1{5_cxO=;^b9JRu?XlrWz~E);S`bfFQ45s z(9LKC*ej*YE-59`aI&cV0ZYsGnBUqRM1?mHFRaXly$fN@VB%(1%eL?XMvg#))|;@B-5UD&L?;q9m%t$uforj3Y2V9}=hY=BEf^ zYyJQ>9ysOIzqE-;H*;WSMD?b>-Q(947pX?@V0V}~WZW`dS34z2+{*hPU|07kYv4^X z_Ws^=X}X`OzT4ZbMn#wf@~L3t3a^S~|x7%n)EBq&9n#!fjwLNhHth zUBeLP+^GFKHh9IC-v{i7Nr|y3e5vFol=k!6Rrt%xW&qg#p?|K4f2v9~Lv6{slSW;Z ze`}L#taYuDSKzAHosi?p8VR}k5DhH#ADg|ZzsU_a>~nuhMv7u4Z1qFA4X@VhS-4g; zTsd$)}>uN zY`C36n~Hr@G(1t@oqOROicVd)Lxwk~^LBSUw;8eI2wrCLHCC!V;jCEeN zgh8E0b$>CT`%ww8kqmfyfy@Y(SufzSw-^*U%)ftGcr|Wm^jlC39~(ihr<+i1QynrK ze@=nM_mTkO&?SA*??|kdyzs})@t_d?ug8Nv%`{Xf7iop^?Q_lS$fmKfeLBaf_qHFS z*J*#Os{>GSTlWX%0B$&#tQg0ss1T}pWl|$Lk>V}o#p zAVTomdG_khz@PS_r5Mb21Qt7hAVlfekc8IDDw1RQPDiz=!CeiIaC=#2*ZqKbK%zsk zg@uIjMKtE83VTQAH2`JaV9{q>=E~ATfb_LMxd=f zxtM+tEQgKI8qwhcqGtwoX!~at&sTX<@+pYacKv!?j8=Z2G@?#ewQ&g*>(f#Sl`6P( z?)?CfiM3|y^POV*&*%PPs=i+J2X#MrgMsY}P?bI6Oy+z7K69T zMavO4G~adKFqbodz@O7y$QkB7r5xSZKEYuVg?RU*iwo(YyycTGKZ#8wQlwE-%zb0`4lNjqc7YM)E1I&QOc30_FM6Cn)|^$sxseTW>5uANwNuex9??6+M7@ zU+ z!;#h?rIfL=r-uXBl|RIgZk&~OAyL`WT3~X>`)bq~b8^1+u!ZM6xb-@j%rF5#GG773 zGK4kdIpcW0R7&!_^XKGCC&KZr$ZMG&_F428*}eQe;T40Y201H9k3~GAe4_T989@Q{ ziwhPZK(#Z;&nuMiG!3JFU)_*G;44FTh$QQ;fVeRLn#cn{=Gb)mgSpHD{I&dB80>pp z#oAl`lP)o`d-k|FD4|&LP;Wlel=6Y7cCuNeCjxIsbICjLQAciTck~fH=dD!NxHiYd zw?CpcO#rj#A)Z3^W@uss)~{Pl9A94^64$SC;t3SC-yd=M1n#NjRzDDPRRE}DKcbjP zD^mqkF6efVljM+>gy>ENoP2&3&QitSzMGhlsm$aRO{lbZ+;2BJb{NyGtR~zt=$Xa& z`9GKC!lQ5XHUN9K{#Fm{n3<(wzP#I1v`J383<7smCRibW)}w(`K$J$8(~y3Vn~Ps z^~5UMJei=2V})1&&bIA22st*;pGK7fHR(fu@2AkS0h6vB;T?#we*?Mw<<_E*2Puw) zI~8AbaX1}s{j zHp#YtjVFek2|)HT2%c21Wk~h|r56;W%|VPfQOk6$Ae6`f z;LL6YTtEcN_mJZ(qR!Ro20Ti1VsO7)q1}UQVQjuQZ_?J)uZFR7oIeIJ!$73U_DX6W zytf&SINF;*M3C|&+#JbTIZ@%y==9t$hL0(*5n|=<*C0h_b&w3Z#Y?)GJ(ID!#l8vN>A_rqeb7wk>Z+@z+X+G zMlZ%K$GjaRscm`&QfrPKk)$lgU7e7$+2-;ttNzru{TjZZCk$qdzem6QY1qUc%8ZuV zb%;X{mkrf{zG?hBp)yzT$4CJ?O&0gd`4R;ICxzM^){Lg6x*SCgeyVw*LNqp;3|9j@ zFCSO%<}p~R^ZW9it^6`0S%?F|3!9F9TzW2i9kOQS+Ou?7DbVg2gxMgktKm?)K2eqJ zN*2|D@TIV z=<^BVI&3r}aIkilGs|Xb{=BnI?oX}7-T*Fb2M)rkt-U|aINk|e%#~b&jl z!5cQuz)wxBm=F5wJp^QYG~D!MbIF~UP9?Z=?~%0bWhBOAawp8#I&LlJ7dKn7F=A-r7EG$fc{w)h|IWAWR=C5{ znC(-goxw_nJFf>~`171h0DteBX6Zems{7&UcJlk`Bx;1$P7WVx`%tRpb0nMhLTlgv zSbmOHcOVqPE5I3DU5+~=L#J0cMY=^a8lny3E^qkm4gD7#+182de}cHCM7`kcP@5 zM>)Y(urhJEuu8b~(2vs5k{6wo027?;Le6krXGhOjd$27iGb39YF*@^X?xu9#dN}qC z3+z~RGIrwMFK4Z<<86=qS@?T2xhN((>sB{j8N7J~NbA)s91W|g{~l3`1|Qw6j1>9Y zozFiVlg(JNehGP077)~GuTRF5{7rv#SaO{in8{`r@X9Rzy|5F0*}D@JEZ74jL(8Ab z$f42?t+I=|mW;gc+fM|Da2lIu&XJ!#17f4EILqIH;)3bVLSQ4kg$fYALs5cExrHu& z1bzaJ?%W3ExUyHHD(c5>BBvN*Qg3cDY9rlkwD zwc+H}lu3?jHZ%3a&OxR<0PTha5A9;&9)*`C>n|ctmIg!ka$0#2zyyOMu?|GZ&pLK6Oe2ees=^yd0- zfhf!IGB?2CuXZy|(PBm*2cc@ zf7{7wOUbbhtA5`RMl}#Br9c_PcfFgYuq1+-DxKVclGwRWQpU#9J3UFo?a5odW|=I6 zqsA#h7i1F?Z)pt%o(nbUr8zK&g33gk=ATMk^}4}+z$Nkgk^cqX4iKQ96gBlERsW1^ zy33_^$-y-dLMWwG@lgR&5uCGS`D?ZP^KJz{ryV)!)(hiI&KRu-B+KD!RpJmHKLcte zx8nR&B##4Jf*GZ5g#J1tSM+-EsEl3*a^&fxEqA>c+y$keP}E~+Gw6cwW#^+4&;6Lv z;Jy&eXH`x(t`@2j5rG%N2A_%LH+e{Yc)^3GWvg+POHK#$qSi?NvI;Vke<&}SR)Bj`kipfT)cmi1Yfb zltsQe55{81E*!aHgoi7B;|#v3Oz{4F3nIHalr~$4-N_Gr)>9$;gylG@;L%V0TsT3Y zI>X7`L@hc?wiipUqq1GApLbv5-yAK``cCjkyianoo7u6_knk9pIKn6a#$r<^1qBsB zd5HE0kfPfIi|fxD`7a!@fb;I&%aJ}!OpOsw3M`$$kM%y)??zQF_na8oi(oK+6P^%c zZbm_&28DT{6Q7^KmuAM}v(|s4AkEec1hpW@IP=gw0G%fbv}I}0Qd$D82)WRDr_ZBV ztLO*5i+c13LlWFwagQVNoC;%(6ls>S8R9Y4Wu;gRX4I(4vxi9^5)}sBz`!KAkUnja zE6+4BvUQCHHrrb{Yu47SCe`EQN$mYRK$N{#@b9LZD{P&jF@>zddM}L>jRii5yDK>aC zov15RkElYxyk!W%;yOThs$(%Z#sQ&CIQ|_C{@5R&LS`d#_mg{tG7=lfSNryj3XEUo zymXA1miP*Evm6%NLN3-{I>yb09MPtj9M5iMqz3N*x(gx`ZPMnPSQ?n++^9#(zX*#E z14`NTByL^0vL3)$FaiMx4jm6P%Nsc<-(j){%Od@)U^sNBh()9G5U`vh>OGVR1o)66US4pafCqOumdV}uPfnT`NdJGCV{qM(3l0On;$ituY5f}&VsbOII+0tegGP|B93xhebgc8u^3`FovME4 zCt>D5$w)M&4f?C5Z+2aW;ntXtn`j7J$iQfwsSBuestXbLV-TvICDM4j7e(~w6p|iJ zL$hM&3ay#p&PBueyMbg|?@$HqX=WjlDMK>ZJ*eF0HEXqSAm%x?$% zo<$nm$}fmoMDx~(M;>D_$WN2-WgvRc1h0jB0XqhBr-DPui`+EGpn7A8M+1VOkRs}E z_jtwdH3B?1NJM?CXS_3v8G-7FfDDG^CA=zO@dSD?2GX^PJvtfz_{(r)pYpFXk~og} zSDFXi7-=N0qE;*7{5K`!@3lds>xj%<)s=O{)bavry%0>jDZ+DdWWTx!Hzggl59Lj*rO!V~39fx3?*C^BK_;2*f4NGQ)` zEtF@8MqI=8U{}omxfkgdVV8y=hr^=8xpu(SESA4J`f<4s13X8Z^g;yt7Kr_I`lIvi9ym)*Uh#U)G;)-o z1doP6N;^BIW%iLY`(UJl6wGv9-Tfn>NbX~a%or>O3R+q@pB7#P1)MDBjEGxD zocvtDg{UktIzFd%feosBPekmAnmF7^kYT31@EDTJ(-vO1P!ejU7TjeEA@FeHwFmtA zkn#|4GHBqgSt6QUTD4f#!t&B+m`HJ+&h1-h>X6lRZuTr4sVl)v|aP*%qh*X~Lr@`)oZSjo0+P?U8wtf@^yx#iEuqEH%b|s z0il7WqPUv2v9UN*BrW@Iof#`N6Iq4aR`NrU&^I1@!iY5jdSsK=o>J(_F z(ZI@OBNjP1z=r}wQb{e5S``_ECszM?P_R|+;o$3bZWn6(?+!F=Yi5eH14 zDnWk(=7E?%nYIHB{=+EPTf769O@E>?q#f`bIwRVEZt4tFX-PqbWt7o(dKNe;BJz$7 zPq1?~L#OW-N9ljINK6P+tYHYzrb1k37Gxe0_`TfYpyz>I^sO87dYwl7}! zyPCKY^8JyI|2~J&p%WY|#lS_cSxPh_)04^w-c${PQ_-TB{}+TWA;U?{pVo^*sjM); z3C|EuIhfll0=ujPTtb)h^0@i`=*UrsyD(7Dl7k7@W9)}<+`kq09lH-90j0zq`23=z$WdU~?g3$aC$n{TeG~E_P22q2E7`+aKO;v(7YO1`~7HI=y>{&(Y zyUsqvzN=J*8HA;UU^89RvLE(jrmy5DK`t+!9j4Off+DX>HMsbr$9j-G5O5A?4W$MM zle0k{P-->HI2SAMKN@e(Ile>a>jL4^?EkKIY*IJxg_M$Nc#R}k)Lp-7_=qvjN=U=;`Y_IxshOfvvSwoSru1&3(pO-v zxNrsA+92VbeO!x`u`tSBl>!H(jZ01I()yx#LQ|9ITRGPx3m3AkxEK9euM@${}Ujf zr|F&*E?`)KHj&@485mEwpf%hT0BDyX_$Z@r+*1EHj0q;>0e9Q>sZa=;PCq2~!8yW@ z2dBebWV_>;#5?-o`-T!Qvgr3ge*^a*rR+kIKpka!WVZ#GMG2^anHtU*auf1tP&+Cb z0kt#+U2;LtB+dpwnODxo)Sml5ppbr zzz);0Vzg!Y>zCMq^9>LSkp?t|^S``}RvoV>?inpP*#_~wBRe4LWd2ll{goGXv$H~C z@#e;_-Hf~{&z?to<391vG137d*1p%5q#`r0rSS<`Or4Nc3WOV(Ith(m~{xCygr|hFY zzn?j;LV_#~I)maWOlCF_C;OpRXDYC*+o$48fEF(a3DAR$%EQo4uR$Bq8L8~m0b=DJ)O&oT&*pLf zfHkrpiJw)3^lLTbIJ+1iWnN4}ZEZ%(ew1PX22^xF%p9|k1!${4Y2)*(4fFa}+H+cO z>WP4o;VJ-~WMKNfyai)70ROVw{cKSFbH)W~x#EG^1euvP{qmnHtDmH*dP#yxS2j3K z&A2C|pIC-Bl0iCmbiHu+IMbn(S?7BDHeMhouc=RZumT6JH;%D$s(%5WsMnGtH=2+- z!*V(j6ssDr07?3T@Q#fC3Q+xKS7+eRn``5UZmx%9L_Y6A2*No<;-&T+k}OyQNaD1r z1o|Vl_8{bLcDxNJ8{f5xKX&H|UhQ!B%PJ=8>3s{4!CtJDbIjSdX?p>1;kCHuo;?F} z3Sq)~9w2m~5R~kqnEV{{&7M(C)#ONRxq1C z4Bx%DPBLu`vA^F?;T4lfgYqhS z2q$cYW5SoSMLlYsy=g?!VDv80=W_sM6q^-?GKMcg+aO;rQTa&zOBOk7w z1M#v?y;Bj7jLuwbvxAVt)90C-{*MoB$4*c-g2h+c%o7sdc5=6F^~r~bq+MDwNHU;= zRY*2gAL27X0jQ_NnYvYqcKE?^Zy=Pf{|TW?b@A6w#Ix>!RFKhx!S5&EXO1B5 zVsqFjE^TVv?Pd*;e&P{$_$BD?XBI>cG7NDXW2=+79Wq{zHsPM%X8KSQr?ms)B;5oG zm0^f0Y;&cFLcW!rZvk1Q2Rnr1k|7Df*r_hQnmDz#KUhe2oeCDG?C!byYrcQD5J|*i zgcCU;lYZ#bumf$iDcp!mx$)ou6-Ql1$({F5&#bqG-Yv#|yE4$X2GXz>Yt(t9L3Jn- z>xg;-;~nhp``dZ?{%=7lg~OB=Z+-P^SdHR{5KRsx@!#NQY44D;&{S3BSAopJtyS=q z9a)5|oUC$~*|evrtu_Amj@cXXwM2gXXD8*ukVfrppufc7e5>h;o1S|rLJb|xQ63~c zuz@PpoH4%sC%+nImJ(2w`a(`7g@G{V^Z@az0`Ojynu26tUfvUCtGu@9m)^H7PuRZH zpYce11>%RX+w6#ZZg1T9A9mFvt%F}64Z@P*n<|x(Xb?n?ouyJaUe1Lj`CERn_z+GX01Xj6bvK%B{a)35y^Y%V1g+Zg+Gltk&<-DDVEySAPfe)FCOpQ3Lq+{fF&* z*>20>MnP>ZY^Ug+&e-!;dd|5-%!{*9budzFvc9r#!mcZt4R~6ooZf%9rDR55`J+eN zFz9|)L>H8VeRrbf{21i7IrR_?RbfAH`E0s?bW5xoc8mHi}eD`CPf$>>YngRbZ!6D`~u0EBZ&U{l|CbUon5W zUVI$}pt>FY$NDKtmi+hkS+!7)A)Q}+-+&ahoyeJg8|jBES>dakLlipWUX1G=_jMfl z^JA!Voa@Dc&8@<EwFk?WMo8W*8>6EnpMRP3LMvh0NRdL{+Z77fh^V zPCm12rMQ^;eInFg>3r+{@5f*L8x}OrziQY>AIZx#)^EjHTO@4tnb24yXNdXVk!;kT zCbg>=;&`o5yrzDyIm-RYdIta%%zx0Z{_gfw^>Z|>Sf4&5r zMTE2^&n)70ZOA;YN%xgVq+LZU>FgLO?MTK)1+`UR(TdtJO4waR&w19AxW7;^5^`j` z;1^$|5Pe(hVNJ&~uk|A8#!IgHQ;iC-!WJqX6G5Xp?oZEw=POi1XbBOWvSCxv0vnCh`#UT@-B67P3xjIB4@yqzpl6Sn^g_5^y zkFNSo4w%}!vm|XChDOY`(l7O(sz(#5g~2`iU3Bo6ft6j16}wem?R+l}UPC+N?gr58 zsZQ{6T%=KQ_T!$-SXp$VN}F7vwm*Cb#3H@?<^^`*O_MDXqu+F=6%7(>%(lFeqGj=dOwHHAFL_ zDm*<=lGhNmKF6#5TwP3?6E-@L^V5y57tJdcPCe-!{v-aP>l1^Q>!1y&Hn)s&m;x8B zEVV!>@SsWW@6hB@&}TzTJ#C1{u8PN3)OndvOtA^~C$u%%rj-3^WjEa|cCsg=4_~)3 zcMKIs*xpmv?Kw}+e2Mq!H^t=U4VsIX=x9dg#h7nhA9{b98zuOqan!N+uL&Cr($!X} z1*>_Y+8Kdq#j7&eoZ_umhnaGhTI)=HM4T#F@bzvQ$1?{SUAjLH{~q?3S}3on7+T3} z`Z3VB`ng;AFOh`#PF*jMPtdLU{OMTSCOBMDvCpi0Op)|xWhjs%&@HhYH`_n@^7m*+ zj%_(Zy^yZkC_#SKSMKi0_epsxtH>|oy}#_@kCiiCopD|Fl&|Eus<+|^{i^{roR%Vu z+hJW81LK-geTGVO=^Fh!#xN&F-zfhI9&3Mg;LE!43*Sg)2ahOE)EWMRU{o=?Ua{hf z5Nv=rm11(M|CL`>aph@(=erL=FDNMFjAPWbPQ9xDNw4%t=)hI7r69cx!ys0!VbF=^ zACnfzC`5@cWZc~0`=Xr4?D%oz3{_f>7zdD)3r#It zo&}m&N%M(j4DW9__sfIQW))zMs=_>8q<^gcG5M3?D57TdmX@P`VS}Aa(C^uWE8*Pr zqdRYIqpyaxWM-X-yG0hrMKCxQ%5ObN^uuf=CjLaL=^(%OE9D~hKb$O8xZaytkCYYaEk2u7>-QNY z9@NC?dK*3LLv$3}Dy2m?30Oq7cs?}5(uP|e%I&#tyP))BDbyPp0jP^ts=b<0R4W#l z6XI_o_Vy^)e#9y@2tQ(<5brYTb{3VkeHHHfqNr&!@gc>%lRW|J^4;q5;q;Zh@m&wF zx^6wTb%R=wK2lPh^Sr`*4iZxeq1jj6YS%jXaz2+bwg6l%4dZcRPmyk-@+A6i=waO( zzoEU}Uxh$u3`kxmQcJkW$E5g5Nk@8!iC{sWr;*Kb*p`^l-1|pf_Od1>4wg{eo$9_o zT?A;iwqRQnU;n%~peOdD%;x4Vwk95bt^_~&%hlIJ7wz)S8-EB&{UEE55WdB|hYMg2 zVkd+?{qV-zn%kRc2X~0`p0bI04*UP?P{Zq&XQ8O=M|p_p$zpvWqj(zPM9@8gP$uDtBj)@M>Hg4z6{-1@z4IQAaXRY zp_j8|$~idyDk1N+>};{9f5`j**`l5zhlzmC$Zd?UCOu~&4MU;^V|aAEZooEEfsA{9 zfM0x)@5ImcUHN_7l4O-&#D`@4TU>!$ijKTjKln>8PxIQ++V~=1p~d@JqMmc8D{xO@CYRm-!=;H?fO+M5&IgMLfI&UA{xea#mu4v(F!IdS3NX{%20F7dv@ z=s&Hr7a{9-V|3x-)rQbij_?th)#_=Z&zy0i_ioyxey&vG>s@j0HFvMETX(0H*$)}k zn6s-;Et`=cNG%=U818??sLe*J& zcdujPq435kbA$QVR|hVjJ30`HClz4>`uX18^ehm5vPY2WW!*o)+_g-83F_{v z&t*Jw_7ALx+rM12oeL+4c*BkLd0#kp{@*KK?k4n^sr(`kSy(sznxf@F8$2kLs4#6< zr?ctt8c}6T|K!478J;`!Gb3cdGscEQ%qSdeFhjU~Hl zSgs|6IH& zZ7H^$j?=APwd(>tF`_GbZ|>ey6rYfv>Dxc~f0=OahbQu;d0pCV&geEnrD6c2($`Pz zdYr0du3oOR@ykYuNFK{1_zryJW9v%&ywlsLi!Tw9w{tNlSGkJxP{>i8(xQd7 zA)5v9taVt9-{(Zz-l}Nr4}Qhw-IBJ5rl@Nf=jN2ZAz8kg8oJ2n*=~K1LVMkHy6e*e zsITGvVDhhMSny3c6s)@6|8V`p>}KMOW6wO}_#_M3esF|kZrmjllTX&B{G^oN9Xed@ z0)-@rSZJpC+p6*F&Fh|q`kbMH^oFS;R$yb|nxoje2B;lAJmT}O`lN`%zv<(O zZ0dVG=S=wtsmtA7Q#{5lsG8%Ajj<2(2z0h9rKs8m!LG3N=w3!@cITmclH|Mo_3Sei z{m)Rbo0s1lzZr^L1bf98={&8%KjuCcG>npj4%IrP_Wfoiwh$zN>Qdodsrc6oy5xF^(h{MIbST7&0qOnN|4o0SssHcWF{A9aE+nx_I%xMD z!** z@7N)uTIo^#v>qo06&cnXiV`2qzW7oO_yF}R#J=mDbQb3*T+ZvkI1IjXa}jaJ!Y!hH zf%VA$nttfhWPmdD0FYo~nTp-2xnzHg_^ko;SUt^aa8b~N=!9gYnPt5 zg+R2d91K1EMs;PzK}0 zCqh7Ee;xr5A!Q`pt7~K2|7xNspt!A%6rBN(F!S-jcnWv52a!~D7PjB zNt?dB!egs{^C&Bxi16*7V+a&t`JPp-hU1TrDfdclJh~T~G*LoH83@ZNe+NES7)}gv z>R76URu|9^pk*c!+N?AV|3~-xw^A>-s}wV_?;37vSen^iI$aBSx|`)9aEc+5zfDlF<6sj27Kw%Ma;68lSQwMwVq4tO1+q8+aRtS%#Bmk$wy` zzR1x?D#@KUMgDQz;eQv!46e;q`9U=l=bj9BBGU?wp+6Y{xCh{v=<{Fwka!XXv9_bE zcQmpMx`Wp>L}8`7OBNj=mlQ%z6##zsUH|Ah zRf}{VO=OMDSm3|&Ce#%w!QjCKu!XOV4qOptPD43`%Vo?*6x{E5h8k5NJA9+b3+ijg z`eH%7UoxLrFC1v;W<3it>4Tcl_Y$O&XIlsbrYB{ev*rKUIzaIe4gdiDDc0stWK*@w z^s&LOz0Q`fT!JpQB{p9@@s!!Ei?9B5TOuyxYo&`K(6=N@?1A{1{ zEGG?5eZ#ZT8%f9tA>doNPNS>_R>_g4%hIKK#DcP4-+*EaGq_qcyp!ic+A#vB0PoRB zM{#zG#iJj-X(IC_<8)m#!Z6x{mT*a(*GAgEiToBIs4~nR|NjG*UPRYh8*vpbAj8iL z-85MG{eTF#YR@F)%Q-9v^T3Z>$uj38FtDIIl$W+${t$kO2~mL&Cd?FQ(K{IDc_FhU zAXK_#fyTOh9}o}&fQH0r{99!a>ej&VlR)f+!q+<%$WZx`$n2xM-7F&sou%W-*6#tY z*Z<9o4W=seZX$?03t>s$B^%KsuDnfwFc=8hb)<3#Z1XK&wl+oNbrNK$w{c+J^y@qe z#jYa;H40G=s|PVCgk;HCoU-{>Lc(U_5xj8>cQE#JUWRPH3$6!lJGlGN#mOC+C#TsD zcwwH@!dtY$6cjCTtxX#Z(oj14z zcQ*(QUW*f4zxb}XW|%;c31I0UfZ|2+>t=`*IDgQ;K**nlyIq%A7IYetlxDh;98K~8 zs`HxK!@0Jxg8k5Uy27I&ZNizv3RrQ~4>3rU1ccn7o*2{5t+)%D*Tv%N#zN@PV0>+H zsT3|RREgd=di&kudvrGuwRshX5Rwm2y4L>pt8n~NOdBJAru}_;R!y@lLtVL&Oenbz zK>avRV+iD@P!&QtG%!krQp^Jzv1jE3vL;A$r0jHE+O3rhPHi~Mce$uzI^*y((j&O& zo63sf@9$_foJ2OUuFTJ)8ZWVyQdL@j8?y{gEC&2A9Fx#BdlX*R3QIAG?LV*3uX~>$ zrML@p=aftfU-o0)rTzmbco4M=1pVlX6%~Vzp}slZkMdSv86Kt>dbN|(hS_~zn}U3#&pK1)e^;=;k)yt{Ko{+H};u&R(b7=3WQ6hfNANErs1Zowd_`2 zgThkU+1q{FZqg!X`9z|C+UQskLXgBZ0#*62INtOW4vg(Cbe=FR>K4){W_N68&j?^@ zEmV{@3dAcj;6{SBaBj5H<*&xv4$iMH`vrl#R+E{!8GW2uePwQX-xjQmJ)=H)j7 z-j4^!^0<5^zE=uZp-H zgnd<=P>~FTdVJG6Sk&0xZcnWd`ybL9e#j9zjc=nEQ8gBAqbuN%>PD!GTBk)<2r_vj z-z5-0Y|&C$+$f*}qyh8cBicQhtc*yft^zjoEW{9Z)kUA&8Gy(B!WD$m_=gfCMaaco zY^Tey=KyDrCAXTvO~ZK`T62;JdjnAY9x~TSzZ?9wjs4Ytp77p5L6n6fr1%RCR(r{)LaP6t4QUE$Z%N{jhb;o_TQzgl`qzdU> zS2*UAY!$_bezgsm{{R8h!)WY)H#AC{&Be?;u2yJrf+ZJ^%HQj&55jFj1Zl}_Ik^_63ad88|3^9dT5YJp}DKlKPSeWDkJZAPp{xO4`*}myP;6=*OZd< z9v_3KnY|bcg855)+KcTTm^p~2U%C1AVZuf>Ou8}^046UyCP_&w_UGbb6JP|bhGnDo z$2L;2>lnDYLa2G<6U5W5ov(S_B3;|&zV8Su$V{Y#*$-M<<(&kfFPeAKt#3MRACZG$md12d$P?c*VHc5T5_RlVP*(iVCEX5wu=lx zFLU$@1n4Bsc=!gA=a8a@RpR(r!mtU2yk&1x?bQ&A!{>{k6t)@C#++z#1a&kqq-=O+ z2arz9KRj@P@|-I7Sfqu-)zhI3r*Q8Zl)bCgFS$ZWGL8uVAo5_D=y2WAxnHw5pv z;2XwuA`Hcc+W=&(Xih5kO|cSV+JqLlGs4l?1G;vJ%jQNre9vw<^S{vgOR-(Lmrb0v z4@L4OkwndTj-WrEuUa-K2r}L$n0}X`fC)B0kmucsXFoR+VWp+%XZ}6`fW+M;>$*?5 zJJl_rH=ZMwYp+ml0b$HKvd#f-qy9=Z>M|?}B|V`rwxr~d?}#WK5_Hh+X}(<=bJ7q* z4ghj~-nXJl8fQOO*6~fNJlb$*M@UosuuVO5dZj1-iRT|P1q?-GM|_Q*!y}I%41?vk zS0BJQwT}4+F{@oT!SgK$P-EWYx!?jPQ@~(Uh1Fn(hkk39Q_cVjIhR_3(Y7f4TWIa* zMd3{1W28t+yc4mGbXO?23NcbWiV??tCAR1Xu;^wPR^f7n^;zTo|D7*r_AmMtEyV zt-L_XE34M7r}0aCA|~X2$^dkJlK_=a*!ytr{Ke>MA_mR__!dsFmh4L;FumQpfHIPS zFgFVM4-iBvaD(}p&n<&Npk(LpjYzD{=Zt268t55_);}a?)0Be;G*nXd^t}hTHyv*a zBOp)OAoPW^rxW*}Z|f}c-ruecxCk%Ovl?HHoPoeF3)$H>oPL6Lr?X=6$E%3CKC-a; zVI0g}UXT|8>@xgi0gAeL%=6{m(GRiw7{w{ zlKcR>=4OB=;OP7mjNHi?*N;0~&maOzKA(EdR%g zT3@=XUL3%vr_0OPbf6u7_tk=vY|~UJ5;ii+dgO;Y39CE=L5soKYgx{Z1Z^ICWXy4f zG?Tp!W_t!a#W>=Lr782cIRd{#V+s)7en2u$cE>QefBReK$FSo|Kgq#C&TH!N44uCC zqlb)z2Oaz3jzvp|!CVPdNeP;t-Y@EsNE68{nan9oiq#G|8tHg;vr7E=Jqx?xyjy$CW8GnOIwGU&UKNZMjR(~o4EPiVSf{Se6 zFKhPDH9U+shpPz#Y9`B$Qe)~dow@-I<@?CeVP2JhR|#ex*h)#CfB+^wW}Paeeo1SL zFmndEY=up`>>OvzO_dv%xJ6PlpqouODO>3Z+X5EUP{Y5bA7(;Pr1rTA{;YT-ODglC z2b1xAAk>fs3=#dR>qZYk!Pw)2z-Mp40^l;nO(Yhk>aj?YJc{a+U+To2n6YzOXZ@|1v*$`WVPH!O(PmsLC?+5XX(4avVu4oRoJ_VK55?mM)c} zN#xk$BEi(qAE3XuG1Vr1?u%?I4*YZ#5q-g44Rk9b+b#G5uT&nObjpam{!hd9Inn%ybz}q&sg~I+B7>?^ zxe1es+4ubBABKki#zMOde#w#pnNyQ&1uE8%T7C*#t|8y>zqah}NN48lCP>5rDEE2n zo(tH*j8&kcOaILiNlK>ZmW0b!|N2_Y;qzvzX`F%N>px8s?G;@(+yA|WH>jL~QsdLY zojR}Uw4}Ju{WGeEXbkkkQr-qpX<9$r|6Qe=A;sx7_PjXl&5{!3k*$fo24Y0V!-Gfi zHBqW~_QvaR=c}pMthxjTV02%=P%BrK^0qw;1gL|=&f*8~q%RjGet~$W>pXxPpnnpi zI{ft&wb2WbwZtkbRRbqmM58s|Ghm<0@6MK*8@`AmTLBv8N8Gf z=0nnQqdV0w{H!z)P|~$RTrIJp#;m-2X7hTP2K6#oW&Kss+3#Xma$}CZcpt^|c@mOf zvO3QT{)~xVFQEGS)ib6qmTUeMh~g*&AP)mwxKnHvm)af(A(R2Fu*=+|!H3NGkcNqp zsMV|Wqlgutiv#(S|86IMYB0g3EWi#TK$>#)!RX&;T0AVqx#GL3y;61AXx^kne2Sx4 z267v@H|={yvz`HJ3Edf%%r=&zJbc77Kur_W4kaW3Nim z25`-?L}16o?OcSD&)FWn0P)9Q(`%L+MO8R@$S3gkTP8duBZ1L|ZSx25{ExULAk%qtQw{<6pplqFotWhFSaD`yjkkW#=W6q7nIC+DUaLo zC@<^0@7{LJ6_(Mv6bovk$SmL{(8WHUnRc8DZWJi@dGc&hLh3%Zl3p^eHz|7fkI7dj zA?j;N=um(hn45q75R@5TuN(biax$nGj4crlu1pV(ZZ7beJ31_IZ;=0fKTC=i+%SjN z2A@($R6XeaBY@+O1uFw5{kbW7O@0$(1hA6TEeG3ZfElVMr~pGqHLbGn3-G5h>zBy5 z+{D^bn_w6P$`K%+%mSB)juT-a%fEms{Bj}tUHdaC4rjXjpBmr`uGAX8+L_2+w)GWV zLKF~BM#J|_yJ`QqfkC3kj~+%7x&4ozZ!`v`WS+ZQrzBofCQ0J_E=A8u$RqZT3n_in zzUY~Fu5N8N;VVPuKrHcyR7EomR$ID*-g#pI>oB}663QfNyh47%iqz9+hZnGFLE$g2 zkbOTQ6_$C8KgPzpn;mD6KJm2eCo8%+$TUNh`r*56u=vg`Gf=YOkkOKlONt z_)SXx?g^-nK9U^vJTPLgV>!iDB3RFlhi#GaPf8WeE_#xY~dJW@%aU@7^b~MLC6FI*0 z?B=|u>dtvj2>%AOmOeSp=-d5CV5js^wTsfGmp)tC?Z07F9Z|rXD!X_bc_P zvE&a3mvlm0jFI+?fV<3>^hBjc&#L+C^soIfK%RAtGARVF1usHWR~vXj{{(n4xqief zt%01A-x?c_`ITS>kE$4Z)UWQ5JnBq8^9?ar=a(6lGQf-)^wzOP7luJmqGpD7pm74? zI-5_d?=U;lxN&Wkj4d2sq~4PxlzB8X%d9K7#$Zxem(FbCMDVJhy@I(Ul%+?xtrS=^ z76Oj0A>fB~HfZcZL?D$=OSRZraRfbsm@wXmMKCD}2u*#(D8f7{I{tCfpo<0MORbu! zrSjC}L)D~5?gRlfc4>lkLDCL$5haOtL2};ztbeh&Stx;IPY7}W?p4XBlhChmg~`wk zFhxjSo)~IF>wNnIX8q<<#ZuRkV&PW0+;|fgbv3@ zu28Yk&ioINGujiwr_N@g2DQ8TOIZoxdik7#kVQHj`Nbkf5;|=&9 zIG`{sT$h7`b0N}LzvYJ2J{h6c3_GJh1iAH;PK5K2FZf5+LiBDQG%inkg>z-bgIq2U z8JY(eaYMjJ77-64{{1aA*F#-sK8)-{aKDa#8$jVXD29 z4jK`s0H|vNK0JDsbrS)XtnZb7(hW=tt1LrTI=mM^EW>dl_!OFx(SsIU~B+w_<_(y+{4~<(wsvY3yFT@W)T_Ctt+RHYIr*2KC!MM~}OXS3Ui9A5T+GC4_wwYL(-fUI5O8gu{qQaB|MblK>4DRQrn`XGmst3J)5p7)_ z1@H>N`w6#q9hPdc*9Sb@5fvUcf0u*65?`O_aLf6egR{w1IR2C}b?)7La=HXrF8OYr zx`TXojcWsY?uwaHb39E}Khzt>&|j?@gs8P9TWnrdu1pYl*uHHEtvfYTC&gOV#l1cm z-8X2XfLg2mX}#UZxit`@5^#ZvO-*549~s24KF_jlc<~DOEVbD-*Z!O2tkXw&_qE-=`%IC9>|z?4ei-U@G}ZbMZyN;2H}_vD`b%}+;&1!$)&DK-7t>tu zGA{a3-K6TgNhrj7vaHMf)BeG=NvH9QuuDZy`m4wT*)-w#vmqdjSs(_q!hsm$$`E3dZI-MXQ5h6 zD(-8ui=FH?5-nxwXW*znsf? zY-YO6aM;uC(6vWXGbx>(X$8YMlvufZ6Pf3{R7U$d$1o7Y(%okfZG(k)c+O)`^&`gP zld)YLVw2~UfaRhNzL9J5N;<>RxeF^1#C6oO7)wKj;BJmFnsimiKbGpg&B!8`6Ff+h zxT`hPz7)#zl48cyY}3-QG@|qw&v($gQ@)fU{x)99d6#Tr$}$a6NDX{~GX+@3ubYz% z@H6=j8f^*gReLY7K2JnApOIwl-dxhwyG05!oF3-La{^^Je?op1l^i*IcSXoT zE-Xaq4(SiNt(;X*=SPR(iLbU_i-Z$_^)gq2B7RIhO{BHnrRF;3GPeKe+;H`tpTK&gEzVCBB&9o4ChMG|a;mkQW zn>I`|PP9T2*2m~I#<|r|JCUkFe#?@AqM2sv7w8fJ z^;7z{rUU)^d{&%fqe~%6=CoCLYLm9xxIeC%2V3RM>W@@ z@nksxL-ZdErt0gUd&oc7-bfqcd7QKouHgEb_wyh#$ zti9`131>y$p*2zCx!}Z>ROQdK?Pp+5cp97wgTk61_N*$?RYrAl=-C!?TMU%HufYBl zCmzj|7H`!kuyTceIu0VaE%yy`qwa>f9o?>s)7YE>OYHMqY@e^t-Ih7rgPqC-*}y`+ zHrwj$jrengQ2@}31bcw%h{hHGW*;6xHLcs~Fzi{$79znM`uf52e!<>{<3)t88 zq}O&zt^gQVK%tspyoDZIS7i}#wRwU3c|pQ=38VpAAS0N3yj>vZINouQG%dG3;^LC| zKrdH;+vLT!R=86RY$D~kW9VG^zSWy5`Y=WDO_}cXpg?NKR;6u&+f6eed&+Q(2ZnoT z`&nMxk;!=@HaFv|em&V0yC7#*FLI2!+%1gfBsZFNRzQxvF z+c+WlrV9*YpUrFMh+KN?B%j%0E_t}p#TfF(klCf!J+RN_VD-bXcf+3KH;R9TbY4!m z2{ypoUH@)KrMU&eY|LTYh}t1RbSB3>@u(x7IP8`9HO%@E^xpY#-p=G*yD_VUDk`V$ z3Hyn87C&?wtjarU=g%1Q$1w>6eoZ*JOm*sElpLW@Ac^B%uixX;+NV0Illfn#ls=kf zYHN!wD$ibXqQ+t@#qU<)-qtdi+c8(3O?{f!;=?Z#y%*?3@(!qswGcl@Ufho#o*lj4 z_*t#Kt}ReNbq^qjmt=TlI=z}Fw%Kma{56u$v3+!{tkyYA69tf>XWo^#YFqc`XdTK? zKN)B1RmKN^N1;S7Pj03;By5;LpVLGnC0h7v-4`9R+YfpH6u2p(KGzFJdtE-?a zbKmK;;CkXY8#lQmWW3JqLO9n>$G-0I)ti){VOJ?z2mx&|{gr{S*G0ST)IZ)04If7; z4a@FBvP>J!RbI2PdgW+|&6+HX<@GrIRi<(Kt+{>$+}s8>^RQIKM7U8);Z|O4?TZFP zm5R>H1L)rejk3~fd~=d$Tav(9ri!kSpzz+%mh#Y*%cHJXd4gSN5||kcR=7;g>D#z_ zJ+fAsHBTT2k$WsvFWqy}uFe9?k3K6m1-BVj7U34}`K2Z`4T-*y=Z|Z%^l*WDa+}o- z#kqoxD{OctvRStp*guv1Rvd4-pjt~9oAWwo>}ASw@5lGjaqHrkL|#I1&SdD+spIL* zYXsL2&%~$IC|Ys_kJvhs{#rszn9QtCTUD+9RoRN`11tpKHC`iB<3<1sxzSp87}muQ zWx~0bOYg6uqiXNkQ^-k1|Dy+VFK!?WTd5aragmoiaSq6lyQid{`FtJs-9)(PqKb%D z9H_;05b5!*tg5J?dpg7XPmb6|GB@u_t;Tcmn~TD-GguE?GQD>HgS$((IcEmwsxoyE z$q2z~>g?*VGkZ(91EMS6$t7tdI5Bj9Tr zQ@TH;g}E1Dv1YTl}2x4 z4{Rb-w34O0>-=AS-jw{|6mIgJk;9XThxl`u1isl;_X!f8ms}|GJ2xS6LhC2gBc`#Y zW1)NbYE^(Cn_B9r^K;fRa}4hucZs10JA%2$da90tz@{Q5dd%Tw%p}7aoiY41hqbch zk?CSIYKd0{I)xY9cIVWAD6~iD5>luJJ@a z?Z2S9KY^<%7+yGPf$y2+Yd^U$lAHIs*?gXgP#|M!_SEmb^R4#N(FxZG(m8!E66u-+ zF0%yc@u}Z-TwO(;UOvEic}hDLSbGSog9+ja=+ZX_p6Z3pH1kDvFezk(7B@IV@~1;} zbW*|M?;B0Ll9ojCLZqaueZ>|`j46rzslI{0c<2)M5>5L+Kx~+RZ3n6%*xdmtPza;s z==b~CyIq-PU?c6+2Vo!0Kh`EHSl)f{=}bV!vCNUwL;8pNrx|6(vEmluy~Ehj9jcBQ z0e|M5yilV*>JuvQ3s?W|5tlhrV+>>`riJbrLwm9=96dA@5sJ*&tJ@de9jQgG)sxX=^li@0*A+7yP z$oNNo_BbcVRMI8_%@rpyn;NFontOdoQ|!m(kngFftWGwMH-R_wqP1%O00HlHG{61T zp{bGE)v{nCpe--17?7~wc(i}2OLMAyw?3G@Bzq}H1?9PXN*&H;n`W0h29GiRHdc0P z#*HMI%2+R?yxw+r4qgdQ?$_Vq8a^c;n93^2-xi-JzMNdiZ-g`#+@)3?9}6&&gH=U2mEGO?}UFwrRsFoIsyk4@Put ze@S6?re9tjCC_Cid10al^zIm(NTbOXXS6RjCRFm^D=3@`@{mnfelF8DH(5Dx{*u*- zNgH_^zX}ph7uD+%rUqunTz~bwxa>@o{u78;-4BN>58c|O+RTHTe}ere_Oj{kAkO8N zOdq{UwT5UF9=RZ??E9N4ow^*I>t5dd2!{DLNc#lo^S4D^n$GJT zxX_y#Sf5;Zq>p5m<4JzgPr8Xbj7C3WS@ZeZ{&Gb;vi#q!+h@)*BiR*C8~EHX=*lZX zgyo^BG#>u? zIY9dYQayR_McOkwB5-q0Bj;G}RlU19e-8H5@-Z-*OJ#$dCI8?b40Nz(4am}hVmH^P z!)8$&sh{s2eSmw0b5ENa@tn*7k`^dY9^G~>lTI!$|*b(~FI`A8`veZ^!9cQoQ)@7I+urAqM_vUsn_k_W!}r z^jIZzPC6;YehC>wun$XRRf1 zH5KOq(p4v4Z|zLgI_L4+RI?xeWav+eclf+D?9>Y=%$skWs@BC&=X_k1s?2RLP4?Ul zKZQ=sdyK0p2RKNQyEu+4@*>Y1vJs)?7EvtSa6_kr7v4%U-x$ar6}@@*n7=?lLIy>FW{F+oK9*dPwMa_RTcExmmdi){PrYCz4Xm+wK; z;r_FUAiVoT(7Sj_AJWNj!K}noX>xW>D{8`i9CO7UJcGQuOD|9Kj5_78bo>M^x|FnM&4ePEOi!#2Ed*|Ss~i-`h2n6Wjc|x0#`XASGAV@fofjeDQfv_j9GatA^$o7;IJYI4jatW~mj<;i-t|;n7lPv`Arj9a<66mDSQ}u6)Vjfj<$G9 z6W^7D`(0u1df@Bxnj;8;@L|Ee*+&;Erz1WNWY5ei${_nxG;&4X|>MfX_^{x7iQeIB@ zSgCDn3j<3P+OoZDIlA@Ba@KGo{e1K|xh!>0L#kit^ec+Z`!Zh+4Kxe)buOM0xo2J= zV7ROEiYGMH?*)jR_wb9S+TE@Fg&koQHh#jPC{|e$`N7xR*T>8$vHwNhC*O;=QJ=c+ zft7qTbJ18_(aS2fNQo_mCWg>$CVy4uO~y+B?c(_k&6VcLv6k%gOxbXVBYPb^Ggi|p zo#YwhnRTfoS6={OaZqiF)wV zjpyiVm;jj;Us2d&+T5z~;E?T}r)M=v4i*B&_tG#DsvK6r&jb>@t=MdP7mQHd&^Z4Q zE8=olmh@d;?k}oewhxv*bTx)nogc3oo z#_TP6B5wLQ7{02UBonpkGFdns{r0=kw1;#PEl$`%@mRw>KaP;m@`?u`Clph-r?a_% ztr)Vq^krQ3GM=^r$erYJXVW>Be?8))+zwgA8ZOD)NQ-W06UV4|1y`{@EWN!L-B?)u zRo1(noZW0p@1~z=1lX}RO(YhNA?kGTbjAV@21Xc?r9bZ=DjIf-0Lnw!9ll!L#Ewi4_^=yjuT; z>0@PI!OR(|aP3a;{oGwut+npzd-C}M4ZHoR{Q4@?eq7}PFK2}qG-!>`9Nxj7`9k3P z3_G0rS{9#@cvRR#r*!x$8`C{-{)BVw%$>uBU5U)Hm)Y!rsw8rRA3V~3x3Di9Iu2}9^%CJ|Euz{v~}98#FX?W^~CTtnK3_@mLyCw zycf8C*C!6!>t%gvvDcZ6dqlr&m?mF=1qnJ|1rofhCSRQX5@z?UUySzdESTpg^Z4&9 z^Y*bsveQc1-Ddm!iWO15H(j-B#`T%nSB9Oo1)qyP{RWP;IZtq`&A9MX*-x2O)W>c1 zNa4qP?SjpEQsi0wP93)d^gKPdwbz~!3|GtbPYFo$tDuYD>?yu9z|uyDy@)iW&x81S z@(@*42*F_+MD-CZ^-Ie{H^HOX*!_kJ`2q=5HkM+0_v%jS)}ykOtSU|r?nT~-E@jg6 z-Yytz+!v(fB>k*f+-Fbuy;X0Azounl9E$Kp+<+X0-V&^{=Y$_KmR{`yF;Roxh$6hX z&KY)wPo2~KntQi~FnUyRe38Ah=ov!~ob=kj!wqJMBF2W>*bsxWFI>x?CX(w0{lEUS zYnz1}{L;GnUNV19^@*&x!PCCr^6y!wCl^>1}hjb1)h*?{m(47JvUXc~b~ux_QWK(a0&^%Ochv+;AU%ZDp0;m}|O9 zQkB&xj^O(FQ72o6vse?Xq9KSxC}_Mc#Y5@dGe=yS=_8NX(&ykCdW-tre{LCO2+Mx| zY>OyP^Y_CcGVe#-Z}H_uWQVsGFR@GLl5B+x)b#@H5RRZ1ve?zhv7d~lZ&1B|{-*NK zo|1lk1e|%f>+iO}=WD_6YYJ0fSqe{ksOO@58dgaMyAzUkYDP_#+_n=Nke~L#f>uyx zRszb75YpHUdRwY+-QuQVaaQ-iB~=#?I^&uM+cb&(6+T@w>lF4Ocz(mu-jUEaSCXqSt|h;R=@YxSl$SV}9S2 zer5rE8rJw)yn$*AXk<#xMc`H-t3gS=$P zkMfV4(q6F|v6#>iO0;tXY0y>pQK)l*KAts~e>KyY!R@WFhtGz+FF+L1uAbBW`hHNC zyy8Ch@hExM5MLUI5v@#14H(P*kzD>B(%1Q2{{g*HUHzqju80q<%w81Cowk(oe6ybq2;-uNI72B zV!G{icHnF1erXfjZn>0V+4gp<7POy&rUk#sJS8Er|OYuJ7WkLLg1 z)v#rqhwM;?(@8GE@G7}}YfpQtt^FbJ zAKnz3jUN^gr&w>JLu$hHi?iHjK>EkmBn#g9GT~gmNLu+I5*cK!&gg4xH_3?9ii)ft zD$VoM6L!sH-XTs^&g`wPo%N<4_0e2=U-l+Ya)fK_afOvkVw4=j82>e|n7%!IQ>Da& zGufWWRT58&uNF3XSYnQl3ybUOx8!xDfq_UagM0CO_Q{3L@ryiH)*Gm2Q6SW5STeMk zbMV03M~vc?vtFcwjr&!rH`lgQg23u1iOL;zg7SJww@;Qi616(L%il(VPi zVxdge2FpYOfV8i56m4ny!YF+Kn>mBLM0GwtZs!BZX=s!qa%;I(te%uFha28=52n`D z;5X+sC0BG22FM28NX8XIWSeD~LL7&=6n+ZH(A zP3Be}r};ZcS-d`C*B)Er>!UM;pZPVvjhCOaDjxe0t!8HIa!guf1U7zmmjilGmvWpQ zOZ?7903y!NOSj7ccc75U_Yf=3wDAyMqJM!kFX~#Kb>GZRc5#x*pu^I_&08^_-lB9X zz3VE{+%$>Dg}?s+>A@!Vppq#^DQO$z@@4N!t!}K|raT)tdxY7iU#p z?CDVGFSGp}2-)It6M$!@Oe#tgz3rNtxgD6Q`5=DIrsGwVo4`>oA4t5 zrOP74HId{u_GT-)5L1Ex9OfGO)lRIWG+4hDu-_iyHC>;NVPxa@z^rG>_YM&UbRuyqxYRgNv>vIIrGy z$UoJU z;!^0ltNm@FRk8I2O^?ch=Arb-v7+)ktey+uqo*2S?C_{5P)H7?YWT)ZOw}Vd z6IdTynEHpdoWN4P0i=KKpr7Sk<$vNtmfm5MPj#73O?a#KJjY8!ocA#Kqu`6FIp#tU zwlEg3&54JW?RJFyZ>I-}?B^BeFLU7fm1Z<@9*y^dE9tJu=*QAhc(KI7Fel6xaK-r?EKz7p0TbEZ&>_-RP1a&6(gb-Y;eagW_A%lE=#d$?jFLBYu@QjggyYMbD>Sn*- z&)YvZdK8yCc+GSswt~KSWTeL~D@0;FV{r;;4+9qytaolP=lg;14jGd8_5mO6ql=-O z)v~`@{APQkY(j>d6D~R2;`A}=h|o+BS>59^=Wr&A$T$-QPu$hZ3*1WvF+c zx@tc?F!T-gydUIe_N6eEZlBhMhsG2^qftWIo9c#gQ8z7SzBdr9%{X)fyC(kvTI<@+ zWP#b&L?vF6p~$#7-kGIGq4aHk5=vkIbC{ZND}urY1-y{Dw|!ud{-b?v z_0sNJ4;>vgryHwTaix8c*wgAylVxMoXy83s1~EVAQ@Fd%_witE4U4HUm<>~Kf(pO6 z6)fA;i8fZ@xs<+64+INBP(LM*7EO|3cj^M2b*IFSF#L3<9=}N6Z)Ak%VaIsvAXj#} z9h0lJpmz!s^vVl)P?PJr%0%zAHN_qk7U?SibwU_bIpoxdFnAVg@W*e7+)&;%+)bD& zsr7LSAwklWfv!H6ppXD{tg6ol`amEA3Hh{tsWPts>5PpwNv%ljdjDkGkWlk4X?EvN zpuzb2ue_cEhaaR)KX|#>f0=oU8rT0~QDBVy0mJ(*czlRiJ^xL~%29+IIgBXHR4^UW z^-TUE0_yxB6KdmScspk_G_$jr>Fqtbg}$-4xJ@k&J3K?AcJ4rAY{ysNa+>w$kPtig zjtcl!Lg@-&Rl0}~x@B(WBT{r-K-1V&vzLJC%VMARGBNRb|CrS*#Yi$l}&OE7m zb%ET{Y8_GK+P7|@8ASmkZbpP`o8$%-U8J*wzQSWa9lf@cff+9S$!THyY^)@03ab-; zoA>2u2uo_<#l*_F2T1Cgiak)4+H-j+wpLmr86E<K}8a20{}VfUD(sV5)K8ZU&kw!v*q3%I+Q*Lo z-8d1{_X%--vhlA|S)u&FNvzd#4+iKwdslC_)qD3>p4&QXpYwi|O{O>;pZb|ad{!4N zJpMZYH=kz2I;g_?rsKdSw!*~n>4j?C1@Hf~I|>h7tM-9-aCs^^&0*aB0*;3?a?ngv zFZ%G21V4GXD`?;NStvmHat|2>?wP${F(G7s8RP89#TyZ2-=HYY+ntBy*HIxE!4kZi zzRp^xSN!m2d1TA$b9wNvfx$8rmoane9=a*tg))#|v17`l z-rBFH>%WtH=kV978lQQue=p#kkxbLOF>~UDL~rm~;?-4HBr*uYNs&Iz`u{7w22k-S ztj3AWPj3rX{RaF1dqmp9Br)P&{0UB$7S;fFz#= zn$Ll?@e&^~s{AZyyv6t+3GHn^?oh=16==l4A(HQp_Hd6de?~+#NQ{);2a{~cSr?n& z7)!~6-#`-^dp%e4P{|L1iz|)LmJLmKb={Cf$U;YBDQ`q`sLUW*WzP@SAhOI3c^uO`PnjYE45j9|Dkd zuj!LVFp=l3Hpu(>qrXpGZQcpZw{C5BAr=Ht+kMXXOF#+>aAc}O{3MEIutuSV;nn5cP8)ra% zev*6aaI|vx)4eZ=`scnv3$RbM@AI=U0GG&9qqy_uonW(7SL=Xkpk)Xx1sm^;@6kQi zuO~-`$%m(N)8X?jMnX;gR^i@G45SB8FY15dB*#1#XcN4gR`E1rDzL|qb=48hS!*?{ z;;(02yW`VFGp4`pd5n`nCw?<>`>_3NF7I!<(A&8|D$nfi^~6t6bF3?LH})Y5&DV9< z(H9LC^ia2Ngi#s^>rhb7+5@a}IZRO0mK78E2!v&9@V~6f;@3p+3@L(qyYH3+%uF5H zFzZY4UIfZMjx!~|;N{?|-2NGGGHl{)(&C~8I2-UC@;5EBsvRBmzH`b+dc`)~JV!$n zgs`&KcVQ=U2Dv5=5P>3rH`NLJUG32p5pe>)*kSWCCYkSc-J789u~CEnp9MhAkSUSY z!lngfc@ACww0-Kna^s? zrt|ow5=%BtysGzfvWwe`lVU@toEip{?g{VX$E~U6S}*X6I#?8M0ajtO~ObzRn&_$ zc69!X%b8t;uvfEAQ^Jjn(8+%PQT^f>5D>Cj6ArBFZuZf3YK#vGNU5qcBMb7tDE*^o zzVH_Q7ebDwTW8Qx=~5-2MLT$jyO0v^xwxJyKYc73svege=?c9Xil)dEnQHt+?rCuD zP$gPGCyKGLQUOz#FdxA;>|_p>0pg-kvpplp$@F;wC{Dg}cR>hXNnN_DFl6guPoN9= z)AR3s1}HL}bHljQzeYCdj%kPQ`otRSz8ejOGsSFo{`GNQK31(`mpkNqtC7sRVvWw7 zN)=wq#y#?}f?#d5frpju>RlyD#rncZ!YD>2)xm;qIDqaIWq1;&{GfCb$$OXL^M{nB z`3-YSn*pJZVH43QV&di)D9uS;UH_(-8{)wA> zDxRgK5703)nNo}%0twUop1h*VaG4vwysR>Oi1*v-I3N- zTEwt?MPxdYYE-hb(JMM80o=02^O?@0-|JH5fjoAG6M$ixDjzR4xa~!=cq>Nl{2LUIoGl3Aq-23@TD8s1OMHxB7hWts zd=o<{ln8B*B=TCjWZeE*eSw&4Xj})@ol4YwMLVyj5+BFVJODElG&bSz>N=zmNQ~J5`W_B~xeRb8QZ#nM%OyAz+ofkOi@5*&;TlJ8+RmN;~ z11826+MljTmq@Z|l_9%st-LBux&R+fd*jUcO{cYou}@!8tJ?7kyI|ofQvQMuxkmjc zDFynHD!Q`vN?G0?QCHqzd@=DRByWa~lwxRVuD4;F=#z7Ix+;?RjDU@}?Z=zfhL~%u z+{BW$ktS?RY&HA+j_JOekqJV^%M)EriM#hu`tRonjS{mUg449^r&TWk}q{K>(d+`YBC?-7ZVAxs)g9G1DFUQCye7GoP z^sC^eqC&?f&+;5f0Fwhd4NT^FLea)35-ucR*hJij8)^0AdsgOmtGU%^hoU|`VP2Kl zcTzWu-KF*i>b%8h$t>=$>}L(AY(hy|eo14lYxG3{Oqn1|Y)FsG`#XBmV6-QRnpO@? zl*-dGCt1YoU)2V9GNacpHWbvmnSeK7Z*ZF+u_A@a8%uOjfaHg~Y$xi3+FC)DK82Uw;C&%p5kfYxeuzE-mTRVBkhE4VH&6$@bUZau11hN`Sn%*Ip0{GE$+r#1uW+c-MLe2Is{4_t`tz z<&4n?U^#1@O^Y2{YS#%JZAX>W4!exSyFs_8e~e>SGn-b7~Wz*>n~szq=33gJF)~21_(!D4$RBTBda_t z7!-HNbl4C}ZAB#gG5B0BKXuAfkxH(E^xIvzI^({MB`162yZ+IMP2Yjd9WoyBb#9bRx2o0Ey(XQyHc4XH24kE__$mJ07r57;X{3jfj>%WBHA5@%|HD<=GfEoLO5Yiq{;?*G4hqrqsb-aEy)_y;if* z5h+|$uYJC|lQ-W$VNxyIDCUW$SRmjoU9h3A)?IM0SfB_43gnRxGH*Hx?m|u{)w~qj z(_UoMoNF9K{>1{!i8g8_>O9q7o4c0vV)d*zJ{Bj)l@^|^OD9@!Bk9We5(X!I-QS7e z@K!D3;`Wtlam4e{i@*0hEO<9^KXECltF1S-%1`bQ$z1<{Q>PA*x7Ys9zQQaX&E1`f zlnC`J0jkk{6-txFEal0zAYZo+ysKF8M8rjc6=h92%PRITxWJ&GbJK$P5qti?PSIwk zJm>Sf?&SA|>UufV7$!KcDJ9qW#R~g?Fb@WD;|z1Q?(l)VSZf1I9WS>chLHT9VT`Xn zfbw=Nn7?tYXcHK#^Wc-<(einj0+xawUW>kAA!DOF2xNdXu+B<+iK{PV;7d|rj*=7W?bJ9?CoMY#C=zn?Y5yu&6vpY1esai zt>kFi+{=4h`*PLV!At6zyZyJQP6y8B!q|JZTL=vav}3q8BmJz9l1!mJSV=ki-7qf~ z0W6$@GAI3h)65v=?ve!y+fN9-G^xBi?qtY@JPyUeMds;7ea?woN37-iLphFAiw(OLR9|cJvz3#BJ>Lp z!1-*AO6ib^@PEIrUOS=n{rn+3L2sVo@xBk=0J24PdE?@`q8ig+UGaT&udD~lM$(&q zZ%9H-(?JaibkG4u%3Lqm)l#GRrh*?2C_1P4bF5+)2;}sgadzimC9ECH1J0HH7I%uZx&YA@=!@vd6b^CB&cR z%PN(>5YHLcB6XE%Ze}wGTDjKrZ@poSxbJdm@-{^8P5?Lb8bp79f$|_SfAr#VJ^4ic zl~_h+774iXbLm0FzT^)jT z?&afe?df+zpl8DzdMPo~59bH0`f_Pt?`RjnrNR}<^)M~|)WI=1kWpkrUoFarNoM*u z%Q6xs{!7j5J6Ct+@-BiP@h$NNp>P$OsC~$18^V|d@BD>K4qI|5T_0U(4AnK#`bWxOM z%;o0~-bN;Gxm2B+9p-Bm;N(Ad zPh7=Jr}oJN$f?5hYhS7cX??{KXDw||_=Rb7ZKM6&nFCYvsM zWRL8Sz4zXG@0690N;VBb2&rUNga|2=k*tvMJ70ai_kDlA|Nj2IkHh;oI(p|dp0DTg z9FOxn9}i0mBV$HQqVnQ#K(P}O3+RSo8Qx4k1zyPWPm^74V=Ghf$WpJF+EXg>IO*8} zF`tWIdFsu7xv z<+_CGh?MZx;IId0}!faF#W zl_|d-3pRA(DIC%g0e#C049si1EDp8AuPuN8>gx@U`pIl z%2Cd?H(*1K{{b`iC?l5nht@l7g*Aa8b5O_xli;hnsKHv`HkKg*|1Z2IN+_0I6WB}hF}J$Oe(Bj=fA-m9h$Bt#iajS4 zB^xU@3$s#gV(Qub&P0g*OY6$yDnYQuQAoLB_}~m*SYvlr!f0TV_!56P7i-A~ zTF82o#1#kEAw<;r_2tD%Z0Y0s2(i$kR_TJioUrfEc`LyC*p2FbIngL0E{@P0+xL&I z;zs=M z`)WXB*o_KgFCO&80%eXp&KgDnE(^48#Oy=G<=x1%$J55jMLN&+-w7hNH#60m6O_w5 zbFh}n>sk-Jdr7au=;&3?_j-o4@1O|A%I7ClpP?}bgJt||sY}o>d2``fh4~J!VV);O zI8#NI)Go;9m52i^-%{;s)FS~7`id(@U?NeE^mfO%%E#lKSdk>`8Q)_@?T{OD80?qF zR-F-x;CIipKUk~2t{3lgo)~Erh+YEofRJQ+i*C0$qs4%SixWrjs;ov7vN4+CjTEI! zxr53TV|R0^N5*5fDzLy!8_t&chGH_j1$sp*Oe$x0?#o_cZhc}o`#513IQ~5BGsIeJ zL|>vQTGnP*o-*>$a(~%J=)acJ@B&6FPL6>aP3rDQBq<01j{0EQq3L`PnA8Q4P&mXC zDR|676X6P3RQ}3Q68%mdsY)A4 z@@&S_9VBn5F!rNCl6x2*g(M5mB-qp(%bgqa0}n zy;MBE1#rfteAIGD#j2f0=+>S+Nr_d#QTYdI%QWf@-~Q7cx4gL<&cdfkYk6&{yIhMp z;J7-*|OD*8p6KfEnqWI0E$t@4q+ zJJs*selNl$+{|;{TiO(FO3)ZB)nIW+48Xv-4`fe|L+m_C44%wLYEu5*&*qR4R}|bz z(MTGi+q@y0rRxMFTs+~s-7MQYg-%*j%FF6cnYG~lWzh2WU_XEQmL=O2GGIeYweteT z9T&(#>k?8jE~n#NhlYT3YN7HUp?)?&m@r@pNzm0sV!og}gY-BY4)ag%7~nW@;I6a4 zJ=HgQkDCLMBT5}jf(cX}H*+iQ&Sc$#?O(H23Vv-}I?K-|ju0_q2LRiyRw_}lqEk}w zw8s!?Qg7PITasvm3FWs^dVDML48)n*VzIzxUF6yELvGNb-nCnC4ixFV93(+sF!Dc- zfC$=>)+p*%4neQ(Ke86WQEqGV3Kn*y3y;Jy%`7GB``Nr^8U zUN@)}Q2|G}^$^TzKlS=EnRTdL`W;ozJb%`8wc)Vm*CU^72>RH3jyDWmxrTDJ+-V4jDpC9H^w;RvkaDpG2^H?pFJ=|CnKzXq+neJ=Wo8Q21;l5Eg*nbqqy^W z=v~VYdAZ2x_BWa4z!PXcl-)x*{>+|KTFb~o8VDkLUtgd+hwu57!#CsWj{AGQ9PAy@ za&~@Tvl*g>>vzR5hc1Wh6u5kjfR91w`v-b`o^%RV$(Lgqq{UyIH=x#U2g4={hZy1P zHdBal3+&Z;Rib&55#4aK^5LOQ1W!#NQ8XTQk^pCd#U$J!zVX-ORg z7dsO-)`r7z76}Joo^W)S9ekvj*SdkXKxx*g6U!$+^vdBW7D5t4DnJ|#sJTK^ zgASTS5)hGBFPQi79e{hfe zW7_|(zh|O`7e4cWvMT96|LQ-#g1e5S^|wp(Wr1t;-*5l>`xwNn{_lm6RsG+kK!G#) z-(B&)2jqWk#eW@yxc^;>|6Pjzb*BF!hW|ftg+|lQTLf5DBm=QU~3Gt%YFZe(Bk<;UN+>?l|&O{NE8GUfOGsvRfVl^RGG&#_E0Q z{Z&ERvf_46**%^MX#6jCnz%>`e*ks!-=S}od)r%|fOURKGuL11EwJBPZsKS8ZH4|> zcL$Ef+kTDmw~WzRqT!dn-!wR#T+hRfR#1%!e8_uPSI~ijtCk-E@vEQK9Y76=oSW*n z5jOCva(Fo&#{ecQb5Q0xj}kH;0Ed88UC}-e>*hN@DB$?yuKPV6*M^;&L2xGD;WIhi z^_aqCfWU`K<5ig1$WiGKw9R0c?|2eI_`Qtu0Q~5Xxh?}3OyS*#1MWZ6=>2Bw;<^5g4XhSR$`Ki6L_Qh20pb*U)X7wmqfr$@2Vz$Yjcc+2m1c;#rG(`q)nsZI) zSu-~P*YDCZlBys?U2KTzm%a=M^8o;*Z(qFdx;LbdG~}B$u~ZT|WZhGztDPQVwiJ(Z zM*~m3UQe50zyTsS9`J0_jbRtkhPH18P=axrY%^H%B!`#NLfwB?sA`SyYW7ZrffQF! zCwHPT`MJx(ll>|gbk6X73g}qsd5ELEuygePh<9OTkV5)RDKSh}R+ zay>3;PW;?q0{tCtX#cMbG#y!5Z{N~>+(+Yzwp2CZ=Y{kkaf~tk5lI>*!O*Vz5m)^p zU46$v)nh9Q=!r;~R(2`Es?p~w-y~;`O{C?dZbi-G5QRHAZ%afPr0WgW3Wd6N-Q>A*4&#=u z#Y-JAHNdca|3W#LLh`^(<5nQ@w6x0x?q|7rE!`KN5fMJIsOt%;bwD$(&FT#l$M}Sa+h}dk(X2x~Rvai) zl9`V?)c|pZi#lY#_vd34gd<)Egc;P-26}{Aehl$W#mv8eK)y#o6~w!6!(=9bGye1f zT&;|1d==$ffa7oRC?WLw{N4S?yi0Ym6hFEOu)0=|7t2SZq*ohapF@e@Y*}WCTJYti z7C7I7AjHPZ+?o`t@9sj3vU9Nn*^~iNuAHqit*}Cc>OlNGW|lx{Iz&~du82IWFvbmrAr_%V z&l?tY=;BoNmi|;3#^Os8K+zbg zlluOM9f7V0rDfl7{DOWpht-M3#ROW9H=B(GGM6$HL>ZPVp3XpHrN~RQb7Uxcd25ip z@%8=$*m!r7=<|DbdfyD$etiI{DWhjE3inzQ%fi4(z!TLgKz)onEfu%rCqDFINvOx; zQ5D((M90efPhuE_g5Q_+-w4B-qC}mHBPDULW~Ik>6}&6h^QVir<2kWL2a5y1mhPr=jDeiDGAAlT}H=$Pmu+f%! z8QRC;^sv^6iE5M>M|2WT;{M5Iw9Jd96QB;_yC)K{X3l zm-B7SS_QUK)vv{*3k^t|i`zPzQ34QVFhfXFF0b0`=q*p*F=Z6eagTA`(W6=32z1sPxa2o&1SB zanzRG`B?br{13FkHcxd;PBjO;|3^Lk;u#n-erPdfo=1XE?al9~+qn8cTXrFG92?TF zq5E?=3JJsE5Z_d^Av;|%g_S`i()lD^ZC9Y`WT7Cvi4;pr{l|=aKWHBMBBJxnZ(g=* zv7tktB(+>p`Y;rL9jxvdI(v zI2$Yz%P7JV4h%d`x}Q8UQ)CEPW)74PC^!~$Wn~am=%lNu-l@!ZiZJD0Bm1x;tANKz z>gIC}rdJs9D7pthyCPr=W0}O!J_noE5mBMXS{u;EfttkLSbc>3@kZj67B2eUw*ov! zGra?x32_O-#b|91eCb*50($S<#Gh3el1Sm3RO*qMky7ZpYc!_i)?L9I^~uK|*%9O3 z2clyG&^%KrQTcK=i|3jlT5S%IFv>@YJe690Hdm9|Z=MMOba@b{qE`-LEm7H6HAE~) zy|hx;OkqyEb3ffz7@<1XTsE5yA?a!KiXr5{w319XkWvdW8H=Uv2z~+qrweF~lAs&0 z1hi(uKrF&40CuE=jr-zdmz%^^Q`=hZps@A}dg_J%sGr4VgGtFjs%83WRmHckINEf) z7tpfN>}RERffcu4LH$!<4&47`C##eiLSko`N`8e>@@T}IKy)AzpQfz2$$OqKxFNBR z^qD>T~=8aDcwwhr?kuO0Iq$sGt=tWJvSB=&QB^q=<$E?=Gu{=mR6=G9-9J82TONx}xR9lhIakEQZ(J znqyBJ=B*(6lRs-2cMEg4<{2r1NGPP*6yM&S^bW(8EBf-P?|e9&9_v=*3MjHh=|OEg5@OE47CP3N_KP~xc(3s zs)Mo9=mzc%P|h3xQ(GjhH3+JVzn%$uHw42`D|Se5H>XS9yW$=KN+<)+5Z@+~P?JTW zC7ONJ_4D@%QUHCjBvBJ2AS<6%mUmHm^e6c{S;Ov({F*qu^Ztfp6z!m}&_5veca{FN z!a?*6TbNN0MY;otMbzHN!`36fPp=WxH7ej^9;vsw!Wl%0SnTF&>0o|dxFm|m27qV> z4SAm?(7h>$*w!Km)+qpO<`R8)7_9B#!chqF3PD1`M@b)RgQZ z=X;JM4T9J`WQJ}JWiu{A_L$n{W-p^R^Vr>U^R#E|b8@Y9G>cC_i|-X86v1K0C)%Hm z%-MxOs;~&(PfYH-GsdKf6eOG#{C*#Z-sWRD9DzV&xk=qfYOWo|(H&rJOAzTyG5`Dw zVxbmYBk(v-YYa z5AvKPtWlFR+6gdM(HaaRkkdCVJhbZWMOvH}p%lHouc*)UvS{)JSY3XjT?kvW`%!pYw2 zEC0kCu&8;a1_)&fm8nd7`TUY|Gno!RTCKne9H6@~`A3E1gJ6ZAgptkeddC8Lstbw7 zNOvo7NTg+32f~OZ-RA2yOqo|{`9imD2{crVBhSp3b<5i@%7WmlWWDDzsYZ4s0#wLuMbGhAdK!s zgv%r857MZD$x9)z# z+*g)WY+(K4|DhSnC4y=%dQIn>w%`mw>LUu=rQbpDou$v8&Rd$=kF=n$tYzRLZ77gC z-dm`?qrEx-8II-ohrn)Id$mBq;Gc8{XclDow;$UbRgP7iwRFOn4A>Atn5X7e@FWo!Nao zUNA8-mUPxqN%XA(?e(>NM6(wcj)v%w6v?${$?F@Izf*7qGz%JFlD~Fe+9@B(ucZfK z!AHgvp_}(=8?Kw4VHl8!io`JMU81CUWUP4(i|S;8)&WIkYBW{}{JiaL)FCqeBB)$h z>{Kb3t2OmyES7XX>_KbaGOp;yNyU#0ZHJ{b58fqy-z+r+>FPKl0^+? zoBV`Q{$-G%#l2Q24q!c7t_Dw1=(LB(f~^$)byA%IOL@4TH>{Y&adic}hC#5#330S0 zQh_2I2vMuX)PVEgCpuGTEZU)Kk?+vN{PGR0XZ!rn(}m%#Xo5SAQo{)aj%D~N=D(iu zU;jHu)mZE@C4}16WF84eAMC6wg-OFSv9;rudG`oHE%lZ*-8Zh$j~Iyq=m;Cvw2O2r z8Wu2~wW^p&gzy*IM;$+3 zNTL#(OFbk8T@IZPqZu9*-Edrj+2)=K*l`L>;b_pMu@q^${{4e#oNb>s6c<~bj8%+& zvCGxs5paEXf;7{I@)K%79eMHN{xl*%Rvuw}Ztdw@$2_m_+}}CnN_bJv({Lp^1!U4VW}6eC+79BWEc9n{)+(eO}(? za1kLumG7Y;XP(=Nrvyn)jYyrkmPgQC(Dn<8WCPjdz+!&*{QE!6^qE zA{mD$A|Si!a&#WG2CUkCfB)hxJCQ}hti;7)AEu5E86a;J(!rlzxAJ3u-NQUYwYNmh z>JLbhjnJS#V@&Z(z02B=R~<)LKOl#N>hW?XhrS{n(-MwiO-M_iZ#n=M0=bSW%nGH_ zNPJ7`(yIEO0$Uu85c@_WiX)mRTYY1uYnwL}8^XRUcUj0Def|!seUwrAf7fVr!EHE{ z3+=4Wn4|o{EB(Znkn--wi&ZKA{ZF9si<3hRS7wAh-~UtIg^HenAMQ?B)xWadf7gu( zAXC_1<{Z<<eOCS&F1&GziomZB2%)Cwk^;*cgYnQo&LlZA?Xc7ZOMVe&|{U zeOXT!%6$IaMOFpZUiBJV6;%x8MT(RDz6%jP`2*hOX`h6T%}D(|4p)guGLD}TsWN^K z3!qJ-mj`;bN>Lx1!;t(G6P<Yoa;@Z6m`lI~(KOnbqydHVvkIW$!ToWOXgXC`!;?2u!#5jX?7?{k5 z+z^}<;*J_Zf$A|N{(xW9ef`F`GjXT(-f#^1f)2r>(($T%og z_vO74xfxeVp>&{%`Nj~<8$=n`xZWfZw zK}akXJ$+}t8r_*2xxy}wsH$iSVDZ79$I7;`$ZMV%H4bcV7MQbnrp`?ahJ4ZOg;IHz-2 z5O2w2)ryS60f}^gb0N0T44pXB$iQnxeN96*(IN~v2_hp(TG!(@(nqxD=kJ4@-@OHA z%y>y}wG8Lb$y*X@PjB!D!Lbkt+OX~s zfmF=?nz{RB5s-m0963DA8aTQg(m?Anue{fC2Zg7WWwfujaTnF~s~%4iM3A{iFQs zfXmDlBm}AbLi+&PB znfe>VKBs=m==QV|TuBEmnKpu6v-Xq)%!n#OaK*x%2|dLPL$&t)m!((Om2nyI>}z|% zmuX3e&9n0L<}CJRPmVuX0ISTD z!YRG$vN@etvX1>+hJb~S`3~WOPg8O4j>ZAMr6KmsXWPFELG6lmj7qs%P^>6C!w}S! z_fqnimCyxdHp=J{A&L;LwH06FuEY7;$<~vN4-+BJG{FY<9lW}#enbn zEAPE;uxZ!#CJ-#dBbIUgL|%#Ica=oWfh7mHmfMe-uTEdnWD=ZYQp)9|K7r{uJ3!+v zMONPTbUU?F=*oYggfy`e`Yk64u0zR0RC_`3tW)>6JT$*P-HLD|j~)2}_zFMd`Df9u z6}rDO;|w^_-Yj9(k!l~$t=<>9$b9;;o=nn`MkP1W@AOK6iR5}bRpCep@6f0cAee_k zFKMEWU=YNYgXU=vC5>x&AVlIE8HphuE-Uzx`%v&T2N^Z?m70B!K{X|NFkG@M@I?zQ zFrBw7PU@FG{IEU(9pK-{sjSI2q4pu2F5GBPa5HHo?E}lwp5Z-W{+2H6r73A<9}+WX ztJgqIGcL*S)hKfkQ}zhkq)yI?6d+nKY=hNv^oSmBYVNtn?zdPHk*OaxJqeQE75J_z zCutSe&%56wkOk#K?P^&lLAIR`jwa_m3CfFAjMRt&+=tvhV6sSKyj!D+1BamBdAI%d z8EbcuS=W5)eB54d@~Wki6OEol)S{UH?)5KL-V#p25eC9=;^?m|BWLSb3MlS|D505( z&je06b2@g*-z)``sd<4)^uub-WtM=McD%u-PVZje(RUiR^JFAM+{TaYdmMIASC=*( zcNU-HmP)Uk{I|1ir^qUVNaUR&pbS?0=oN~Xw=`%d9rrO9oO5eB-UGYzAjs3Tz8`YV z86dH;DR7q*JD*m+J3_Z${u{x5Kl_iQE0nH+_vZhL(f!-T)Br(EpyahRJTs;n+O`Ji z_ejm|BC0%xz~mJ8!@qjwp6x%^&W7ze6RX48-5Jv|A|YhU(S3vQZW`d|lLdE-K5&Wi z7%V{yv3C(75gI~;(jrV2=9Qyt?}BBno_O>}cLdT)zlX+iJX$fK?*1-ObD7XC~a^Qb&7N*Pz^L=vmdDu~!-s8$vM!aY7IxZtNq zE!q9iW-BfDw0T)i;<;aO6jSu8{TCNNE3a&A zK!8G&`{2viGi?&X2otC&WK(x?N!9h<$RtG#css#Y5rb&wZt92jL9x&Iz#B(h?r8xS z6Tdexs%e;4?d%d1IWN{N#0sWi*PQ#gABw`9>ciAk`C_9uFGlYLXo2)@yay7uT1)ae zvT8674!gG<6ZfWinIud>SHcZ|ME*m|;7~O_0TI-DrQ6T{T%8K|dWWU71Jv^tLI1?3 zu|)Pf0wW=D)*v{y2_d0Ue}^U$fcgi4?&>=o;R|D#k~O+nuO5;+&7qwUM;1gIwi%Lr zk~G3Edy|`j`hdLs2VGdp1%KM{X6(O7I~~t_%Vyo$#T!@KMa0 zJl3zXX_l1s&XXlWV{Hvajly!gM8LaEM`9k$xxw;;AL2*Ltgf5TN;|ZSEk8QtlzoYC zo=U#p+bKkVPI|3=Yc}uC#+4~|xsnl$d!xG!jC8XeZ6}VuWV(tgJH#TEwQwo=_gR_f zm9xLdOU1AEp#p?~lUqKH_yxU@oY%afRxP$}*X%VDJ&tP5{)J7cgI52QgVoG5x0tNY&7PsP!$+u`3v*{s7sy-O zpLov7sR*%I;95P|6sduOCe1ZW5$-UF+<~Vl+1HP4?h3wH`H^Fvw3{a0Iypx5wz%i@ z%t?I9#M38>duRaSjO#w$xa>NfR^Re^MePysDH+uay2d(N+j*#eo@%_#&4xpbDt4tr zR+lHrscN7)$V%C##I<=Tnb9!+a)F+(DLk0mjiWa~o@*)#w5oq5DEMwM4!u-(;yBDO z`PB7x*Vv6e>odX-y3N=ntsJLZ9PBY5@-cZndRxSCNNv2v^r*J*yLuGbO#``!TT0Fh zM|3G!DLK99lbVSXwElgK`!k!1A<=KBy~?6;((|jJdL5uLEciNhFdj6|jqg9Re|JmR zcX2?8yX8SlhF0Rb&&oL8)BGNtB4n{;|1CwxvTp_394$W2XovTWM;K<^Itxkqw-6Gx zXJCr#=jziM0_uuR253Y6tk=$O4E$NI*X=07D&p^QH1HR7d<3`mJZ#^m=IaH4>mTVi z-sZ5U`loZMMWI=4Lj~=cGQWF98k$@sN^X>Jp|8zN18-E|{r#()u##kKV!t_b%VkXy zUgQPJo@+kKHRrUjI9OD7yZCjINV>^KEZ6Ar2N0Dn{xnG7CWTO>4>X6d~p)oZk7!@kKEx~EwpV=BJMO#1lB*WtwuL^G()s*@kOOm-&FP0Y6h ztghG3H@ZL6zVi*bI+Dod7rNt#QK|b-SX^%;En$OeTl~BB^CyzRXv(&OJ zAZ8iWAuSgl?R?Vgtgl^t2sc7?Gp}O=%RG5Ng zGtpJUlWSW|z@x8F<^IMCm9syQ>X|LAjQW4=#Qr>qV+VP-=3+?UYc}j<^fCqgWrM-! zR&tzZIu)PU8^DyyouAuC@*7rW(OrqKsO`KX9e7Qd_D|Q&jrSGmg+|z4WR5hnq>$RL zr0t|_)#=xoT{dmJ5%2foy)w~xuF`9cTw|4BRSrJ^$tY)!W~o1ZgzusMAAsRv@0=Ubl;5$0oVsvTEY2Pgdg^6Gd zf%RI0v@>hzXNPlfeiRK;vsKF_BpD@WrDcjiBOG#>|Ne;}FC|~&6r|$BCujPfAD&SQ zg8%*}$RFl7;XV;4Tkrq#uaYvOnw75po_W0j|96)e$37+J`>G@dj}TVIq!^E!_pI7nyjp8j}y9pp@#jA@|;lvt`S>tCJLs3&h)9qzp3D%RHuxgjrFv_`SJ|AZp%s zQ$W8W>D$m9a|*8Co_VyD5?_v?!N}Xe{@FG=MJoniIq``uY=gAw0;SvD_+`9&ZOj?0k zi-Y$HPwU5?qY+g*O-jebXZ~!vK)FOFIE?>Y3|#j&1&B=4U-tb0&Z7c`SvDa(QcpO0 zal0>gYf*?k;1FY*r*=omzs6Ta9(9c!)}TtHCnA{53p@g0M7e3!v(VdXl@05Ik`7oM8J?MX>iQjGuPn+-*VH<~_HS(8vwj&P^ks{LFbnq$u~@;r z{bb7c_$KAqG>4E6-rQOmu2!Hw)w76IXR1MKm&kBK4shG~!vppG&F}ULed+p*H(oXD zCa34(5yoZW`_@sdZ10_wMT zO)E%p7nq|PuwD{$!mi`IJN|N^yEX0L#}Bp>Bid{VNX0Zzw^(6qdkWC3a zhXlJqIhCqPZQ*W2|4r_r>bU#J(_o{jPw&#Y&9oYoKqt;cz`N~+OJr3pb})mH@jz)l zBcItC9#zFeLmH<9hKV1ZG>1d0@J-$&V-`C*y3>Slx4Oy{6Td->eY-#J(n{aV+UMqX zH@OWeaftym6EuVqF<|=4H7TUk;qz9SJ*`53ohJ9sVvmgLOAgduM_#xJJ2(n&x(s0} z;=WdszdHTg0NdfQ^K(qU)z2Blf*g`ebfr>gZ+9-6wGIl&b!)6TuV>wFa2PDygws3e z%d~o#FfIvC-PT@-jXaOpi4^ zDxUblGi^ti?cPldKk&h)uAGOpI3eunzSbELj`#z+mocaG%asUEF3r>5V!>;Uh0^4h zet6SW$K&I_o)-nJqmas{ew86N!WUy$e|?63#_EWqgcZ3OQ}lUmeJ7%YXYQXhz|?$)G+4)4t34fE za~1FSBuOpSi8gDnNcG3tuP!(9_tEgMECQetYRz(R-ZsIZ6^e~D7=@82P z;OnbY2m&so?%tYV!Jj2$dVl(cOeTY@8N4nBxRUH!eC7w&@vkB_IO;Au#$Up;SL#gU z7~E(S%gj?(W0JM8X3tqQYO2@Rc(liCN3GE4-<_3*%qu`iR?#txIi&{skSW?490I*& zYfTZFS91=U9Cejjjq9&hshy*j`q=%#{*xM$ShWAjIGD6>Q7!S3N+#|LU)>ELc2Q$G z4|h$B5&q1CpGuB<5(6DlZvdsam$D522&Z}uTceZbBp>-6BfcF??+1#wYDKzj@XIb} z(#?N&)+NVOlh0JBQ_QnA>uQUCX2Bmu`z#J{tFfk&m^}CSUe2<1kpgNkUA5-T=pUom zN|+Pr&Beh>uQTXBVCZbtX^P+wU|6GL&zwuRztg}q8Z@1de+Sv4E7&N1Pw+b^#t#r4 zsBM0;+269JXIf3R$5?AY#WWN3hr}&Ixw6mad;gC{d#Se>UGdwhFJdgcQB2g(hAQuQ#(H%{GElfLwuU)*4xo6 z9F7nSyCo~j+*6G_9-2(?Qj&(IN{$BS4;;Z@AdGAhPDDB?nF)0ZMV-qFiO=Z3xj2V1 zEMF7SS!E}TOUEq^6d=Z{aG)HJ+f4w6<7x%mB!n?^l0Hf>B(46)q_Mn0tLPe>Jr0A1 z+Ih&|_u)wr!Tzpf!m&&7JSd1|gSvbRI3l|r@9(3UNlng{85*c{S)m{%zX=3kT64ag z;eP`LCH4lzXVnUX?GI~Hki0^TX%*S#;Y1U#q-UZ(J~DXM9=wGvP|04sG2b1(yDSlU zF#z>O6yAdoG5+6RUOWr>KLKPtWYH=TBMF^_--K98jSId9A^YpUYD`3gOsVq#3UJU% z`DMX5{^@=G#$47BQ-jMG4~g-o7omav^EihAr~#>`oaa8Qj8{e;q$>p7ozplHLOsC> z*RvvITHA!KKLPOIH^I6QGmNr+V2(#y5Hd+>fB=O(WEltBUjgOMf#dk;DJJvZHO3Kx z6>=~UOpbwbnHkG+1bt}lU}t}aXqLpvaiYl{18JBF1W1)0w69Vx7@zDHUUoHm5V?v$ zf<1H!#2L73J6SfcDoE+Ws3LTGxhzFA%#wRleGdB6k$?Ded zWJ6|4RMSxq#%1A}J!aSx6V6r%zGRX0{;Fh?lZMrfI#t-6V8Es>>HRehEW7Gm4W@Ep z2q+;@f)r+1t;8}T9j%~*BpNw#;UO~;0rTCb3dGzCttnX8n+TaYL~c6RJ#!nY%0qf- zd|>QbX}45T3R(TVNL1KR-+c&e0q%pXlnbxsx;l$w574hnpf7N#ks*5Fb*5|Ftig zWsots=Y|9b9eZDiCm{`x*_(?cll36YK;AYwaM8o~F zq684Hp^$73&vnQV)k`xQ{^yRxix8tK!$&NtnG8E^q+hUds57K=*u7`Q)z`F|?B22< zp=^Uclr94Iq+Z?nx1|)|GUiUg6O`mhq9};S^-}%d99MvCMfPVX{a3Kx?bjFLN5~`s z*NfYs@6`wHMA$^iZ`o>it;U3zUE?Okl5>I_d9R5m2-s*f$-zShaAfhBi|u3&a+K1~S-h zVuic`dFWLQCW8l>O(8oj+@?(xA1n0~J5G*6mvo_}hH@2Yq`S6cNq!mAHI?>U ze!$W0;&f+Z=L28InqMP>_~sA?HZa2Vc8f%I^U-eVOb}kfdXVol!rRp)bhiclF8^3% zbQO}EN?U~{Qk5;tz%~neXc(>KB5S(@%!tmv2a_sePO?>;0tR$B$>fJhvidU zpZ2k1vzA09EpNa%REPM%$h%s%EGD5l_7Xmt(5$pqk6hJydwEniY|rb*r~13EQ?&2O z@xejJgLvTQZ7QX?uJ-d>x?Q%xNQMq?pQII~;pfI7gWx$zC@->nDgP>m<{lMAgJ_jp z2^G4pd%TtMt0imIB_fU)+iRbVU;lDW`+r*^i`wcyX!NTzzOC;b|Ud^3+x+W;sU|yhdaK?ftA5$8iDYq zKsvBa^%Tum4~?wzdp1cp4tQp%B|KU3SkO1hxj|&Y68a4|NZ8;jjBDM|Ey?y4bf0?S zAoPD8mJPPkG8(slEHxo3o;X2^PYX{-iA#*XAM`Vp@c%A8gYt@eWZCu%lsp}aM4k1u z{2qf_@}GTEAo4JgCJ5dVc`hq>t|RuoP2@5hx(~JK)BB={^efcnpO!#r*Z6@}(Sx$_ zcB-$Z*uC~Q?f)eg*=+~6i-JlGZCU35uo*N!P#O3x7(CcT_)%|o;JRU~QC==d7mmPJ zsDWEG%HE|)r9N&)h*&%+Rh_x_G9e(=4-U~U)#BBy`1|LVz#El+Q5#(5EL7p zH@}ksNBjRK`_WxwEzmY#2c4ubPi7gzx3kalTbNu*q@;;4W+N%<<%p6Pv0hv!STiHH z($IgP`N&FJm(9GP|MS~fQAd!`KbtLX zeEr|Ifj`)Az#|S&bS)wV{J(xyRpDSCEvCtG{MT)e?J5FmV^e}B_MfftpGUR6f&^#c zj;lid*PU?rgu2P3Tj+-1|9MU({J-Gv+&d+0|80){x>L6%*rt5WsKNjHMJE8kaQZuV zF;w$^R)h`f0C(z7(Khj8 zou%~n<>EaEg6#o5Yzq2)9@I@_JHd({(Ms++XhLZ5@b0SSIiW&k$8N( z&hirCcp(8`VK!phD}h+-D%gLDuq&R3C6C5Yl(I{WC;@IpjRui!{zrgfq|r(0n>Bmz zfvF=)?3vFStWeK=Lf5a*81vrVe)5@n(RqRE7E=B{*&qYF1%QcK2A!nc)DvePXz5)x zsk3-azQGKf^Ej9%Ao%gQxgO@^7>5qvAM@W0+e}`adipMSi0Diz72o^)?F!tzNW>5z zj{ACB%5ONdsxX5>j|6GAw*2_PJ>)=9AQ-w1@{S->{04AwRTvU4Km@=JHfMV(qy7Sd z-pGdT-h@uz?oZ~ZYoxXSv^<_}4;HcZZ({j(%n^&0T=qItorx5sYw>feno~WOBM-{?;Z0TE$w50E5&>q~*LrP2qJ2As^3u;?_~I4dyEb9stvV08&~*Xz zW~|zn;L8kH3YXAIiQvr>hKi?PQLT?6zh$eUzHspLwjAaGdfpHJA=Zb%^M(J)Ed(Hs zINo;z2_GIqK>B}-*P5qYEg7;&x6$suEl_VS#JLAicUl*D#L}HWmn+sg+G$Z+5cgBZ zhTr(PwRAnL9>6Xnd5`q4HAUVHLj&(7ulO!S9mfXQ{re}*2Fm!=PdN8s{>x||V!x+6 zoVpKcd2|i$0>_Y1e_tk}0H8D}V@4*5WgF-o^7Ot^!PbtSvy9S3`&I=aK>q2T$a zdI)02cYqjuhuypt!f0{bnq51GE!nK_1*K>wZ)q`i$9kcT60$nv|B>Ech#8`2)jZ~yRwFv+dPkFA2upY>u!|} zKk)nXVaQgdFY!IzpLhr))oab37Ju^E0K}H~FgKO*(gS}4)uv3JAvFsh!za~5PTwfNorsEq=%f?+lXggzbWD<#GhTLwvmhCf^^F% zoP2@;=Bkv1K*Y&WU@#1)e1~?dS8E~kRxu+~(`s|(0EpUtPu{hjRK7b;>~Y|@&;jiw z$}>C`f0MTA5CFM4{x7(Up8<`LClTCGP;kwTqPHDlDzRYxdLu3ri5T}`Zr~d@nbwH8 zTXq@59TRX|4X(JSG4jhy*fZq**nj*z(s~wmmqWvG2d>eZ5CMEL^?- z(z>^hmhk{2KvE=}8KL{n0^f3~gMA{5Z$eTgs4!hSN-OSR$j)lj5f(CYbm2TXR}GY@ zjMy(yDR2EgMgVGP1gJ8x{Xoeu3rgfBNIOsM>Hd9qsl**@keI#KiFLb{dbL65{gafh@9@R`zS!s{7Pjd|2Lm-DOjSuw zR3^y1JRp6A(O~G{YH5wxUiqX`dNY>!yam@L)*xF^R`(cuBKO9a7(0J}?$5FsMY33d z%Mxx=Pt+~-HrUZAtY|A<*d+J+rTfwbbF-ivNPP1;m!EFe?=R;~%H~@a2Ga??>mpEA zOnM~jV&l$DWazktrL=ydtOUZoimSJ^&7QhY-dk^>E@SF2w5DUfddqO#w@Gm|RH`+adW(WOuDtm$c(`uTkHo~krt+so}9d%s*P zsULC*g#|s;e&Wcb*_cv#H=+BNU|8$UYu#IP0Diys>Fd*|{oG*oGnBPEp(6rpZvQvv z75Mlk!QS}YMXJznPp*5UJ@2a*Mk)FP6uCbMKXw=t){#8>?pcXQSlUjp!0K6L%U?n_ zbCL1`X*Iv5pprv)?E884d++C%j*haa20(%`jXnX|N*5k{~hsoRK=> z1!+%C?ZcO26X%x&#q6ZcrZ$G^CC=;A?`RbWzX~4wAZejwC3K~)?`q=j`2OE8H6yvX zH2h1DcY###N8p!C0xRl^WH+AhZ>glwyS{JuSoDx?@r3x(aRHrijY-+}v+Q4NR>{1+ z#1iyqtO=H+ElXM*d<_wiyM%C3%2NIk+KT`u_4}6Xzu3f!(^}@oSm&elCFf+Z;{{zc zrU|sQ^V;aWL^Ha^`Ln32!eMDMGtvVcU9SkJCPD4Qw05`wN8A6v+Lykj6f5F)kvG)q z63!62cbXCEg^UGlDh*M0g&9_Q4v6KiU`t4MzMi?}YGST+sfNmD|J1wsvTrLFTXb58 zdG93X*JbXK#B1=~byyaAeD^`51n(EDiHc30^0!kmUlUW&y?Z{T^~)c?C^Gi=_kOcC zE7u#D5&wK4U_)M&e$M%Tx=2qCz`UxpqE3s?-9i#C4@f#4l3e3R;^tzz`d8Hj&v+5M z{D6;{R=2+Msy)h_30-6+^k<_vF`hw(Q&8JDIg@NUYHpF>T@yF8H1C@_nE_Yt7acp} z<}Tp@d;}NBOiq}8@4VWPqw5*|{B1QN{TCABF&QEOC(nB~&Yy%hJ!Ln`xaio=rz~ja zNawV$qXe1n8s!Iz@@d;kacx954X0BZ93E8Thk26qT=byQ{WcewqQ|MVKwo5+?k<> zj>I)KSi@@ObMN%UYu9HQ&M^A-9Mmk}Y{G;Tg415-QjhCK3kZo(7WzQ;BpXLeIvzK9 zZmYGkeyKu86l@9>UWDppG8fKd}_2+eahb>oRsdfVX(`gTQQfLf(<$* zx+$@jrZ9@rwC~-C-=XPe=rwdWO0TxKuR-}@*2{jxSf_KKb%s($#~uQ~iWJH79!g!f z8e4-o#&is0LqbSS%TJxN(pFP3`b3>RTvhq1ZULfvdwVDX_G!a3%axwn1aa?TeNqKj z7te`J+Ys=SXm@qWVsluOiPDAK4|v;#O~cZ#jx=ABBP~{E{*qEri;H=Ll=Hh!4XX=m@fqq0}F(-uO5 z$R42(LUx>XHW}F@MTCs9vq#9NtR#{VlFCZPeSCFY|NnJAxF6lWS3j>FE&hm7Xe9ICZ8;`;!Zz9X*=SH?cip<*Cs!sB?Vf zD@WS4Qw=ZiBdEr&UJ=coTp@~8%A@8Cc*InUx}i7ld_Mp7?3ciZzl5!#Z7Hk5+O{Ya z+qCeQCVaNrl6$C#-+i3-iCCU=^%7+hbGffY>*_g+&v+=@mrb1XjUG&?2o>=*2}TAj zdpT7(;qUuVYf>jY5JmuLDfnDp(s`q;iSHVj7u0u<;UojPxH9(@mv3m&yqHR2Zub;+ zSg71}^A#INjQ2yh1K3h7a#o|eYVUI23N7ROMZj&)X27)FaVM&^{;a9Pxbog$M;6Jk z)1D+tHnm+ipXYOi)=!%?5u^kM*&g{#58v>V$3A&@sNlI22V>TQokZY~sr9KPV9c`S z22iTqDsu+pi*zLK0j=9+y=;)ZooRJdkG!=ij6&QuB<;-7VTG089DSH978MMaJp}y^6F&a7jS}Y*5%~ng+)h8yu=)zw zJ;_dfCcDlY8%4|)yYQ#HRnHbk@jAv-L`ACd?1Z@Vxbo;ZS*?|L8c(Hqn~YH<<2j>4 zr6OcI8$0$!83J+D9;%fG=X7SpREk`mIzA44tvnsSb|%5u@84q`Az}>fvc2E%F>6%T zws8Kw!09;yFLJehcSd8gs6sv2Q|HN}sK$qE1RQjRi8_}=J6_6OU?z#_G>n-KFH}!Z6O^tUhHr9qGj~~7 zs#zlw%hSbDS-xl3oOyr8SE^%BmQsjYNktMru6?p^MfAXpE%NpA&do({#`zJ}$PXWF zx~`5pag`8WVxi2$4H7LceCxzp@6h#PypOi02crc=7h5F$jExJg7w3K`lWEr;s*sh+ z6+OyIX@u4zu#s=f!U&uMP7}|n=#$0rFRl%1tUqWCi_mQfW1?E zRn%Otmc5g#16MlQ#WuX~ou4GFa}?F>VH<&1@~vUT%U$4*FZC>I6E8bm5L;*IK14s{>0?n6%CyA@#p=kP^O zMl0cT5Zq>0BO%MbNu!E>yN%zOU4Fi@ZIb zx8C<*GRB=eU2BVNdx7~)y`>hD+ZPdegYC9oq*iVcJ5u{$_lgYime3Vs?w3M1s+lso zuCy4-GH~z=ij5O25z%`XYzQz)ED^+Q#_&@;32v6v7Y(Ch`Mq*)NRZ~5mFN%mC>1?d zHx{!Y{~4=Mkr~4CXHSTPJvE%e^Vqq62VHy@$}b=tf;b44t|n+Q`+DL4QQ zW{t;saP?yQf|q@kZV6{O`#tKjhEMeHnZ4%*LYqS4jHo~XHz6glV4D}f{o`%DHwrzs(4Z*?n!Y!_BjFXtTje`QJ^El0{OKnmrz6`f`I!NZDi>@b6Z*yRmg>- z!E`~9Gg?UnUs!TRvF?kIXW`Roh2>nFR-(&D^U4C%2}4O_h$Tl|H3sz+ zfIXDk2s(X8&585BpiVG+_D%-l!2iNG>Wa|fUV4ot`Y*_H^1=UuoBkjC>5>7UKkr3w zQU3?dg1>BL0QwVr|Hc1eSD5!SK)Gz|hTtHd_1`}Si1NnDR`7oTssI0%*?-uAUA(|~ zomhsI9KoF77-x#nn6_UFt@jQaA8RM8rko~g4np{?>pmc;5gf*{#a)~O&B+UcU#Cvm z2~e7>sJAUVAJr%TKpX)fv?dB%C8x8`yS)L>;xPt`8Y{U}6tZ&wfaNcM4iU(g5BOT# z=hUfmG&YL0x;yKb&w~4G_Yhr5Htk zvNti-Kz}05eWr}ZquU3+4S6}A!t>w<2B=U z@j<~94?uz93Tu|Ms}7&sO6t@J+Bq2p|2ANdCIH6X9JUS2k9>~9d1=rf1=KU|T>(xm z13^2Wlf<)R_k zeh4fH=WU6UGG0*HJhsE6CIGkFUwY-K>a;Km&q-5KMGu!Uo<*?}3H3HRDnr-;AgcX7 z@K_0fs{2(gQ`MjuSgzW64dR14zn3nWzH~7)`P<&}?+teV2JE#r zS#tv(fie*Q)@p#N%74;(D0$QO*b6vi4a$v#OL7MfG4u$XfhFRhoIO^jB6!dVSUnLr z79?xBKt2~EFkBlR1^OwKvU7wk;aL1JP=WVm`@j6V<75nc4eua#F9t=WUl4qJbm#Am zD+Yx&RZ?7K{ST)B^;cgP5u_Fk^nb4(0rm4DjQb zD^E^Y$c1)gd^i!re1O}2xqs?KFfU~{nw30C0iux*oEM4W64ZJGg1&0^c~uo7rh8ivolC2LMrT2^(alPx#b(<$$Id2nySV))#93exn^$A-VcMQ+Zfw zQN##iZW**~^bI=D)U!rsv32NO>8&!D@?PJFgdN89kuOO{Bo!&|I`m+HF=4A$K6*S{tA;>PuHP-6E{E z0Oxq3-0l`H^p7SmiGXoYHi-VKg5oVeQ@$b3CU$;b!UYIjwL%)eF#X_o*Kn>~bc|s; zL6zH(r)rW{9uMD7&ua|q zbRz9J<3VY4%3?e!Im%K*Glpy!Y{-VutKgZ+YfXyM{+4iSLJSI*O@QUw9-hG%HEQWm1yi)bxjnJsk75&FkkhUdWd zB=IZRhp)8m*WSE)UC(gUc-c4{U(l3^YBR35nVHtbRjD_@nn44 z&xKy8LO9`}nH!&!zSe9v_pk-RFN#5oA|8AcxGc;5?y8$+mPf;C5{}$dZSW3MVLVPpf`TL@cD=mc(O&5I6+EVje9c@-tGU@;j z%Y1KA)-^XHbz2~#T6PA z-RCfyEViV)ls0JVh*==Q_f>(}6@8=$WVRGY3{q*&Ue%_lQhz@u!#F_G)@VoKKVU#3 z&&Y+af41CIDd&I|0F`x}`uh&E3!M8zU^-z)k@a){twb!cgnq$k>y;1KS-%WAkZ2Yb zLVF8T^pymWr?B>7M_~NtYQII)PT+0ft)`#V2M>WY2wIrOZhv@Ybh+v7_h{BV!1iy< zHv5S?jTA()auD9Qh$4?tTY8)8JoNU(HC{?Af_Pvbyx~d?F(PjJPzYPeFEGxlm!rsb zHe(|-+(FW4oJ+C;GCn9RU=qNUQ**ij&9<3~SdACq>7)QJuz&9XnXYk(p~@%OmF!!c z{dO71$ZgYtJvs>8ZF@2taR%1GfBU|h%kgc_{j$L?J|KTn2N6UEnI(Nb|J_97E2xn- zZoBf+65dQcF;MwxBn-mk8>~v7?H59ky$!b7n~cgV6k!(ssJ=~A*r*sp6)wW8l0i+{ zM|n*GF`4qUzsOG66VQAVYd@ z!}zR+8{o?VZ1Q(g!E(zp8-wI7Hh|b64$g{nHg%B84!ZM zH}nQRSjxaJ@ca1l>{LP1%iyz=O`I5dup*HSlJ6nN;A(V#sSP2Hqbt7u88Nq>7f_0D@d!++k1vIUJ?;Bj^PyC!XkK za@RNxr<1eE^}$d>v~;Lq>cVAuVRL~^Z%wC{hzn#5i~0n_l*)|Vo+(eOCrD}*K{kZ& zD<}Hxz(2pP)4!2j#RPr@A8Q}@3Cm20Ieh1ETke5?_zy_uKYTd=*!SjFlA0RWB#bi% z_e`_J?RrjOxz~lOTp&sH(BB{UCPcs22ab>r9SN;!b9l2L5;1M@uWL+x=#lODwT~-r zxQ_Xc4>~|w)25%@IOD&81idg-edEU0zT{sZwIgk%#2~xd zx9PgF#tU9QW-a!&zVSTZHyC$zcN4Rafqz~cq*Z&$&DCbz-Sa{+cLY3ynFdcpG9`Oh z%OG_9ag6+@zHxjdYjUo$gZZ1M%~L#^@U(vo!B)Lo@f7Z78YC2 zeMNaz^17u24rNQEcsJ?;-<4o~&`u^`>lIwU?_TXzHm$b5MOS&{X@ax2f1FYq9ZE-m z*4QMP>e~XaZy(lH+z{oHPM(OB()a8JEQPV(J5|eCr%T`ErOob&3T|$=s!a7d=$X8* z>F@O|-0Zw{DuTm`**d6~fs9CSelmxso0+8bPlTF?|DOntf@c1K#tgH63z&A>7g)E= z0z)SisllUiGc$g8VyZb|=r(1&2c|$EI|USEMyxiaN?t2Te%I-YmRS~XVqP_2JR$~x z+@LcN$gQ$evo4m#*Abc#($A5x$@^uJ-;hjon|6{B3_=?5GPbkmU}yEi5X`7X1JFO2 z8M&42i|%y0tvtl14*G3`?xsN%Kuo(1t-$KdcaAKF=kcus0N=mWy1h@rxBuhyvGG9j zglh9ioylu`4z>+fDT{SM;;z#;&VszH4~GoJxvJKC_?{FqBn#~s=yWfW*##O=A>(pJ3%M_N-H2<4*O~|UZb{Siv&6pKAs4ZR0yb3f=Sq8_h}gbs z;aXM+K#{og&OigGu!pv1?hgE(fYkioCJd?y0LAL(YgqDKE<4 z4oCzfoAdpKwxwF{&fP*y)!uxt=8EFDBXE%SD7Q;sWr3wWc0^=a%R~^g?iiFb0&WmI z=n_uBn4?qp4TNb<>lseqx%mI)t$HGCTCE-w?EK-`VrlemR_SA|eGS-7wpmjRtpfd) zhq9k8&JVB>{Q^*I~StC&6c@QSeRDLo!?Wvf0=21uzACQ#vbrs`%0f13yA zxLw|z>*Z7`owV01KZ*`>Q4_wYZ}e!>Vp5uLXn{1N2*gp-dbnfe=fJHx2F`n$UrukdKID`ed1 z0sBb_-8KvN@iE|NGJx8h8fgZl_VHC)xQgbWgpo3qcyx=CJA%ot)35WGBS*pSGNkS> z#b;q3jzkZy-+Hd8=O}Q$N}ne|T(B0ZiP0u1Pv%SfRm6yf&i^)qrQSNz)1xf$b!M(T z%=Hf2e(UQ(dsrXcYI+ni*AMENwF=2;a~pD;NXor+r~CO8nCgnc+> zm^Ui2Cso8s>lBAeZ?8VpCPQgSQ5h}zrBuj^z~+xA!Q*dvNjgDfV1@`KscI+^#-ac+};5GLAV=7vh2wUfk{GL>mJSw8cRq-D#*m z)Alwnyyp(Z?qY-Fa0+IH2=gM0l+awfAljeWB*W%;ggB`WF|S&`V|a|jCr2{qLIo&3rh zNeB2=PSziW!7Eew>IEzDp1L)t<(Th`&K%FZyC|p*)tU&d+djL{=+8&MR)SPZARWr8 zf@}EGLF!f9YNZIyDA+O4aJFpe(A^O;cvnEu- zY92;EjYo^uYK9SaemCR zsn_H#{TCHM)FismV?py`EA<+;!M~k#G7>t>k9u_SIw&kpl7yrFC;O=Sv5zlZ-=$7C zP}5pnZ(mYUA1(nYrS$?=w+{+r)f0Q~CX+7wQ}I=aKzX{8E5DP2rq;uu;-AZ2+g}*s zn7+ajts5U7>s!31g<~8L_(!xTXwT=o?!PYSe;Q9|u8Yv7*6ZbU&O|me#A@I#6#8;49zQOZEg3ksG71bl96)2swP4JulQ}h3i zvY!H%*DzDcIi2%GZ07BcN-eMS4U&nTK;;}aW|Moy#sKLN+T36PJ=E-BL0}b5QfnyA z9)mR=rF)jo0}V}{@6Rv35^Sd{=w_j0M|xpF`1fHN>Dwuoa4L*|ygiZ(vdisaKhH#- z>ZGovqT_MWUjM)L>pLjOXn{kS#no$Q#4Wt7HyH#BF*@mj7cvss=IsAThVQ`VswL!* zrNBnNwfaF9!uq)S47G__v7j^^0R0u-H7;kQCjPyyF7)3l0KjLM;1zJG4Ml3V<)JB6 zP^M&5G;#ot?XZSx1dwGU*!3p#!-6I>t%i7;kQ+uE&$E4-0Lq4#zrJVD=<`EZ=z47u z?0CuAA;hUdX2wuc>t-b2c;4_nxnmOK%2YC4lR_>MBO*1#v^aTHo0uF664}f5$;q#e z>mbiY1%XdWd`)GECo>QSpFJR0S-G1~?Y;Pc5uEkh(>MlcR5eI&Vs}$N{lnMX5#TqC z00B00(EdB?XQI1Cp^bp<8=Z#%KP}Q$LOXuYcI_?mKX4&blhbeHmP5Bbs!G%bk17Ja zt~v)#r*U2gbTz2gpD>7FBk+z@NOmX0gv0K2V-0G}^i!-dhTw*|J*8A_H_x{TRXS-0 zHv*KGBP|@!5-Pi9vwjEMHw=&e7AJ+I5pzn+g9NKs{XTVLp=eSNv;@zGZ<9i&@c`hq z3*ah3P2uZ~Zv(o~52Rr+2+a^7o=J7$`c+?|9HpS^rw#JTvjaQd;k|^XGZzmN&CNd4 zo&6Fy=-TMRvrPGj(FsU9(LL|p_spn_7kVRX*<=avHn(|R<>ewxvhDtzx@L<33qTIR z(!i?`Lz@g0_D65VbstKiHXJ;@EP!XM6;WIFU(V)KY4K7@qYyV6jNY4fJ22-)Y*Xol4qF#v?|Dibbh<$<}-0cDO*za9Fw_4l<*A|%AsCZW)!HSR<@ela@c0l}r zIf?u^5;b1`=mEnwT8rU`x;~GB3yl8bJ@nu`mN8 zuNa`+6wqZdE`Q%6_&Jbl2%UxS-SvCw+p-`SlVbp7)5vyOQhR$$i|4ZbA7~Z7!p^(T z)W%#1W&rC?NZjY@?^c^mg7)PBqpCb`ls(VS1V`a&F~rd81}$0)%TDjzy42~taPob3j!O+6)Gr?1<-r>1gN%}@ur4j)Tn*AsZ=d*R)c z-+VezZps4}N8fb{Byq11r0W^yg2GpzEmGk=PB323uT|gvBnR^#*dB10RXY-Vsy7mk z^)kf~--okos)m!80OJ$;z9f}`K1hhyLquvu81n~KbC={(eN9eLeVo`p`oljPO+UOO zxae4%2OSXS!<2@eYw!)&SuQvGdVB*1%S>yK8LZ#jvUk2FL3?$RnxWjU{yQc~kI+ir z134S<{r0Iv7LN{q&`PPU7vMM!Wf?78+Xcnifb9_CztIQYZ(bsNJ*9*+M*fS1T9N>I z)Uv=DjCq$1TSob};!nWKm;BZTg$aC3K0kV_+gQyl^Z(zZk91 z67uuT^dA2Y(W49Ury|B}OtS6L#A)rLej0rI37MbW^*ewutu#u}IxPQz^;Gs$*+U*E zrpO2J(92b=pdMcoT)wvnS-{8%y6g*EAmwA-y9th^*~EL3rZ8uhMa5Bac@TY{L}u3( zf(xwt?@~o8Wh*5kha%NT?y@!KcTX97?s?Fel>B@E-QUE!mx__y zRP-t)pU#YP8CKh0IAc;(V0B4u?hfuN*77O9KXt)_sx73mZ}hfidN5LirR#lw+2qNkhGp4Xl97^ zyzE=d!P`8*?Q@N99wy?xW^qNfHf5}VTc;s3^__+B+X&wi=2 zIwIa7n)Koec3gqc%}?iNgaTJtrFT^HtnA&+(+L=Ij-NI)xPd6;?psw+jkr_&yvjT4 z2|LU?@y-6b2tsD|nY$YaUjTwjFJx49T_J#K(kI+3?CWs;=Y^H3OiC7lA(_TWoCvZA zh$^ujiv0D$c|tf7TtSRV5&z;048(2)E*;GA@o?;U zdH0j*@83M}%6>q6_aXptBe%cQ~!>=piVMASYm<4}^+`wKy zh=5F$Uemexr3Zxob?lq*olwUi#fUfIy;qgQ6OpH{di!9byc~QhEVGEI~UM zW1Y2y)ajh2s*5pHZDJadw@#F^ivC+kP?}NQ*C1|7djv$*GxgU%sCev9TZI>tFlQ4h zHd-O-(E{3q(oQfEE*584#uQ?T?u0i=NRO>LlKds2jCcZTn#W6)&l|I!p0N2Y7og4-dMb2B-U0;RY&m}fCf{klctN2<&@tWMcBJ+P0_>b+6byF1u%@~R zurDcl8Z26WlNNHwtN6GL-Hw1c)MGECaqGPTLALm8jyc)B`7XUPlMO3XEE62z%5iVf z&eX*?oKqpUX|84qP6(~fPxv`oE6}^$;1N`fW&AD>E9$qDVRsdZN{#2cz{15|QvQtW zv>~5}M);?1OVH0BMu_7KFtMM5py0kWr$Y6nqt2b{Bc1qNk&$B|(s4;<>^cX-wk$WN z{wC!(K8HW(IM7?@EOECnt43VDxy7pMUO3(k8x6UBkCgg~V{tN!snixwf0*6!xtBczCx{4(F@$gVzl{@}s+ zBBJ_<2+0M?y@F|t&2BV{5WjwoQ-*L&J^{R{atU#FZnhxv(7FRS=OrdtK@0flXS?+o zy+^pwYs)?P!>70dB%MYuY5TA_$Z2j*)<)Ma0C3=y7d7k&bb^$9acTL2M2*uZ_oq$g z>pi*WH&3b%7bNaLI=jm0oA9HXwN9g|af1$7ml^3RAhw`1Sl~b@xvf2pC!Y2*g6{)z ze=ZMdrEigH>fbka5*tAS9V%)GenTn&!YVm6s&;~NCdUQZO{9;=0zr<>%!ZzrvzmG7 zY6cKgLck^6mZ)E7e8Tjg-&9S-a(!=o`9>hupd~W?o^~l)2IsbPUOA`5A*44{2HeT>9N>CevL9p`Eocx6I6YSXEyLz3TN!58^;OHanSl)@$+eMShNew6de;AfJA1S z#oN_x>_pZJg68f?^}I@a%&&O3dqCK*O=x;*{&k!a%brIDOlIPH2UIR^BCJ}H_szhD zDZn6XPRSD=P%e7cA$Rk2{H=FU}!`c8T#9lF~p z8r*&Lk2haVUZdU&!+FGKEx|Q&CCNO@)&RBXq1k~{!XG_xAR`2(uR7uar;>ZYN>6-( z;SxxIUAm)647V#fa#-mg`9Fm8I3fv<^K8#c!n6@_ZvQDz342e5!yEof>S5dwo6d#9$9%~c@w$GeQGLgidkBYd z(4Ql(xgj@%L>?qz!fEDuI*9VE$wl^w%1vuD9ftFSVi`md==yp0oz&q7khIg#Q89bt zo%sg*-<+PA+bburWHBDr6BBZ@;r*qvAOhgV!6zlQuVNaC6Ju5BjBPCBMDqj|E)Qn% zM3J+N`M!1kCi)-)Cpw_7w<9K$=Vps*oR(zcM1>7|WaFgsPH$4R{o4pymwZ zWsUY8LWxUb|Db3RsE{yl7hjUNokddnYIKD9Bp(U+Zg;>4$c3$dl~>^kNge7NiM~3F zi0h^2SYpd#k@LO{r|pXi`9%GG*B*TqzV7#-0DCh`x; z^l6>@=C5vB9GccTCF?yDZFeSFD)5R-BZ$Wl*W>==GxpfyIP;0T*2t3(x|4g)L4P?~ zV_kmU46G3)ISSctJ?%1?&ijJFWmUe(u9y7jv5U-pk2Icc0rZsO;9`_HSIj4;uYmjp zG|$K-I60Y$5u>Bp018IhJ+ZSO(*8QWRT!^*$?t)QSqI`8i6S#V`8*J+z;RplE9*KM$f+1 zv5HJ4Vt_4Pgb>cyQIEzuU2W~>k=IXo8NnE0vg4C6Ux?Z`rnBc>C{@!w0#S%xGu?t8 z3ym+hB8tI)rGLBvd)t(+g`G*eV`b`Ab#W{b#sXd6SJ;uq7{`FJsa^Mv|LQk4 zdD0i_NKRm`(u9mPgDycOXA#PG{kJ)?MLJ>~UG0Q*Ky^Laq6gUNdB0)7A)UZjl{JTg zuVI>3Z3Ae|w%sD=SiFrZ%uRW2DL~%!bly|xf<7M2+B1fEHs281?(D*nU#q4t?!DmLECJsjQ{_OLW3UC<8 zSfLu>CsC)JGH8fHC);OC9H{S?PiUJ|f|E0`{KC8w^2pl|Q0F{nAd4*8x*N0+nT;3A z%^NO@JH9P{BanL8Ww1I6ldNSTcYrj?LDhv@1`kD55U%~=ORQ2NAZhzM-<|6=YLxHu zQr=vv0~}$j#?heE>UHbyn>yT7KkuH@Mzpw*^)*OAjpf9-0b8vsO8HWFU>GS{dbVA} z4>SkqR|^#5S+Xg5YVS zsNjY9BCHJ_ZC?aaS zyGqjnBCEgp^Z^z3{=i4SxRkM$77HRg&J+ZY{yG)^=!K2lESxdLiL?+%G{CVR&lW@Z zXfpStJ5kvz>`3oF6uHlLj5YgtZkP5j;Ku|JsdxH!ySTSZJ6^1Q3hTiUg77Fy%-V3F z5V0V{_;vj{^4EAS!t5&lej6>|gakPTcsepCIZ#Q1p@68zT|W@~{muly)= znQWrWdqMz-W15qPb+NU^_ChSx9QEA3qfCh;(rOoY;cT0-C+ojECMv)y}}Y$N(R) zZV$Y;toyRQkGYXZdBj$1dq<^b|~OFM~qCKwF?NcUU$CuVh4)kf3b zg;xiAIdnZVbrKDz+Ot3sdchL0r*Od#Cwi|h9tmH1 z3_!*hgwi0^!1siV+1~_aiDw}CzS_2%1QJ@WeFX?;Vkl!yn^+0A;@UBjXUqO@oDU2f zxGxF%$YAoxFc~8Arn?s@Y5aguZuB6(*?0!B(z7H>5fD#pP7z=s9=tMBT3>rY+n zGZhozhJQaRnpYA#_R0g8+o2YczOw1=Sf)gY>8U~D;3%&!=t$BzaTNFofUK5hkr3WY4Gpc65lpAcdN z$nizMNYmE92FPcSX{)P|qr^mGHj&Lm>JDKkZw?Hoj;TNX{1auBk4+$YhVwlXiBVZH zAAjr2LDWeyqO-YT!jmh-=Zk)mE_4ojf+^5HynzidY{C#dF}Tg#*uUsc#^(GI7_I+K zi+)b(NuYR}4}J}Zd#xC@Zv^V-p*EW=pz!GlO7P>G|B)mY1ZnTTfO&bgtvoNK@j2JL z(qmY5!g(gp7tu3{L*17U=U2Ne@HSsr1iZlEfH+@6w3B9ud?vY3e>DKSl6vdO`Oz25 ziNzoVdUFhbBeq=5cI}IQeDJLX%LT_SYIq(y{4l|eJe~eh!C{T0cfj0_BmqI1NY~B+ zaSzV*mo8@z+KtzMeeytiP7&P84Ul~ZbjdkJi^O_4_?-O96%%_*FVKi!0v4o~$E?%- zXaHo5_%=YILi^-bLcrPNQ8HyL{bTbZNZ@{PwgyuUYp$1Yq*vLp?)}G@j7V>zJLieAid#X|gj%zK-t-3i|3*;LaZj_} zvcd9?7*fFDUPEEeJ2#BKwWE#>YlG2vqIm;6rTfxZn+(ub@qjYMn8TTu4x82lMMpIt zMv75(0wc)&SQ2Um(SP|YpyDf^fyF2^U>^cXpm`X7EQ%Z*xdn%XP%bxuPdRkZ8FaxJ z5a9X-ils&9Ht7XE6Mc?S>L6r&kp(12-I>1kCXgxkoF@A?$=3lS?kZOvjS4T}-Zz2u z&hBnlr7$c|4VD(^coYb;gDFw>qzaF~NnHkQxO44&klzv%Qy|P#z(xbV=GXg>%zMkM z3MV_X{{3X_a$;E*kPlxW>JUI9Iz(4`cC4flcI#(xz?WS^#GtMyQwE#wks3_5&yg|+ zrvgYN_k0|LRK~?D^wwAM09>rQu68Dh!wV`lJJV-F)mZ;-=)<(J)kMX)zc^sotC%=8 z#!E!D`nh13YMy-I5p9swc=b>C2Wb=3s~BWsqKSQ|V*?uA+6E{UC#TktAlVJ}w=_zu zNEpNO755Jzslpc5dp*}D#g;#T|#qjTPwed$8kjYLSc)DJ|w+IkOh z+3!u1ThPtFqTm3(YI1_8P=lu`e6gQ(0U=T#tk`pR@Cu(IR z&$G@SNIB_}?`p{NdbuxLR0wj%TeKu5QH8baNWo|JuC8<9Ev6j_wklvJnvpLsoG880 zYZ`HL>h5-Ok48?EsyLjW@txwJm}`aqE8NcbHJ{xSLv>-S5!<*whyx9zHNw}@*3$A3 ziidhr9acd8vkDAXQ8fJesZgE6%diNYCO;b`#kfK3UL+H=UyEq10UH;FT8`SJs8I0q zV-CZ&$&b-D^Wkt3U8CqrA_or_9zaz`EM8^9!~I3FbGy@M(09mP!Xv{_U~KcVK8U9D zvLyRk{*(j`Z^KfaMw%Z6nI_R-FbOz`sM?5E3Cc%^G6+9AnLL7i z%(DH7hQNC8(VwU5pPH_n|5o@*)^o|}F}c7DrAhgG$Iq3b#9OcD!s6WhCI_QL?^fMW z^NXnT0x0(}q}#4OjWZdo%Hr9P?D&Fz1s09R$y%aTqk`s@=9Qz(N3u}0V(W=Tr*vR< zkP1OtkA~i$$o{J;4m6T&;d4w2g0KhTxlxs0`)&#}ehorAMc6mF)n)iX9 zNofdC{ezIb%$b0?wC&ER;6;cj6skDu;z?$+H>VWprxyOeiKgOOz>sfkCH$lh&_(^H zvn~?S#Eo@OJ^w4hHUuew=lGR8WLTLQ_G3j_N1Qk|Z8X*60eQWi^yuRQ4cS(?=l*O; zv!LawzO@@VEdq*|FJwB4qs^9KsMIuIUb_Gd_sa406*EL9#i9Ezo&{Z5$=Ob03UiU9 zDGE3?DEi@>A!zwv`v!j?j#`ov>HOQXyf#*6kXh z$=WH}#sy*q6pC9(Z&{W^vS66z)v!HBes)_vfDL(s+VxnrwwoyFJz6Oi37i}G`xlg- z{m{7Qes9w#f)f7i?gnA>&^j{()BNt{@Z_z~6;|lqKU`e^PMGMzZnfjEFzkOH9!tV- zB)O1n>;!Z3XHDC7mTzP^7J9O2wA1~UI|u4|KajML_+r)=T&_p>DgPUOUgI|0$^Y^0=H~v8$4X**2b+ zAN`H((KXJS+9l}16(rP#Xb8WIyBg1*dQUPK4m@HOVZM>PAs4j2l@}9vXbuzi0~-Mg zliHbwuV9UfsP3t3I&o~a@|0u@b`bAoJS&zn0u#!>=vLP#LpyEyg`pnhc%msUH2HQF_%j(^Lgx& zYLaY#7}hD~_WMhK8r9Kva(Kbm%}OeUmd&61W^-~*x6xcv3skE8sb?N=s#!yTw7JLt zh~$b)DQ%cErrliLs(U3Fk{nFVJF>&Y?H5$ste9}RzRd~|9U-sLs?KA-Uik=eh@s7| zm`B|)tP(~Px6_ZCr2K)~1_2>^zvJV{9p1^V^G+k2XCNx|EG-i5c>7c1%~6muYQt?9 zy~tAzZuR;;ZA3u9E(o@nmpw^CQmQA2Ep;m9J za_LTi-dH~F4Qr3`mdS+g7iCoBjZqr`{o~wCh$|)6f*>X~03Cj-3dlVD7cw$0dkTNV zM*X2XV`4g+X8A2gyU5`uomHr{fKKXl(UiwBm9Qg>+NghXXQaQe+yFv@Z_Bpu5&q=f z^PaZ$HeTR?d1qrH!WP**ZeuU#^>rwF!tXN}LTf<|v5>Qj9J+)gA^$#v9lkbhzV|yM zd=Pm+?BB$nBd9~CpPpzKglP$3NO?1!>BSp+59i~t0=~IhDty8|FFA5mT=Q^m*>7+P zy*iVB%yAUh8v5SCSQkZgq3z=_PopRAN*SS>4iWCcgS4cN)Ik&nIgkv>6*+bm+DAI| zg%Ca_VUmpvHU~TRpCMAefMJm~=m%VbH{Bx4!k_smR95vdw zYVN3pPlQQL7^ZH^-7Dm99~Kq*C=(^X=>9nb$?{%Z!YXB>&L{kW&`yAl2d{7ikkyf1 z=p&%lQbTqb4?zmc6AZ49dPswk8Z}w(1K4O%?P4jFKw0#27n9Bp-)%d>iO0&Wi5O^X zvvT``P+RS^e$Gl=TtoZtC%0NSO9N0E*7#-f8i$MCKSEUFQ@bd;iX$tfxUrD$J!yBi zxe(dL*JadEIA{q?F4UNd)A7;DOa6=g@p~F?abW@F&B6(KL=&Gxx2M|Xs~`;P$4;@6 z;T!8tO9(Um(0JjN}s8$<&5Y=i*wf9qjb~=TwFR-Vh>+Lh=U$_>&*@KS5>a z-4t@J8#yQLs)HgKzA&R9JEF^H%&^gjl-D*79g3^61^rns5=$%=sGFP3;$Z~^D=&}$ z;(&%5ty;igA~nO3j|in{ROiO2ZmT?wF|^M*wJT%saoPXyvxB)P^hFj-3P^?=_86MI zqeT3~P;E}u7m-xyalnHR+&Q~E`}C2D4uLx9K4>3JM~(J{)s(3@87~`*Vtfb zH#c{JXdA-H+CZtQibEnOJrkh{l6PSkEDwNId;{}?A4tE(wn_%n4V>HbPXaoSNMz%i zSGc6O-AlF^l-fXBII`(8X&!xb4&+f!7?aOS7R|$=f}ie6bdV4FeT%1Sgykbp_T4xb7D}?ng>8Nqmt}F0v5X zsbEFN_Nq=8qO=&*57}THZAGa{-mB;0d=~!HiH2)cRXYn?`3uS21KNAF6_r8Sm6iwU zP6vtb%DP;8*kl`VY5u?GOcF$qeG(wFn$@OvF_EmMib+0m>G><ATC^GTLFI6+9F1FE;!d6Du927>QOaeUC@ zcU@$}LPJwa*aI>iF|>eO-7lXli+ zC?JfIJlIkBOR%yz2j1jS&poY)Xv?~YO!?tC8R1V2o{ioc(xywev)ArhsGc1<=muW% zWB?exomLZ=uNakv^5T(BAV68RH^e2^h!)pCZ7C(?U!V5JFkz}rsN&j>HSexyDukSB zXCMpw)3r<>Hqm)zwAHe+KhO06<~{W$ZN+eYeQMrDV!G$-7lOvbe1&5kTY(tz1&BdO z9Xi&J+$8eIjPE*4u2$&Hn5=4B=OWNYuJ+TL@q&S^IjHTOUXw*k`5l3Eo}A|(w^MuO z(Th9}+-1>L^M<3umL{ABxpFKr_uT9Ex^ur+&crmqe++*^zR4B%%sZBa#6Rz-Y)&8q} z#e@85SN17YGO5JUr+IRJw?Zz~1e;`3zl_qKSG1JpsDz`=^_uY>NMGr=_28m3v3-Fo z7`Cr8-aB0TIbAt(?;(oh;948!v|6DRehPDWThheABy?l7R?684S`a52=;GWAC#k+a zbV2n(DqPFP+7!dsd}lDE(yza>(6?(VG9`6=B~m{ZTA!cgr}(#XctNVmPD+v7>6dy< zJRFG{#CR|3Ku`XAZK1z=x{bqX(Xnsy4pn)q5e9o3KWjIm`J<6fq%oa#mJT}fEu0%& zW_5lZbe5#b2b*fv=#ER8OvMAkPl7ysB3^!Yyk1m$Ww}Y>tqjj?$RtdsRIohup+y|u zKWB37Q!X6sehq%QV$#>9+D|sBU%9ikRSa1%v#?z~K!8wQgy6kD?N$s;%0@Di$ivXnsTcTjCGQFh-aRUkmcJRy1g?%ulkB|OYo|b+acAA$ z#5wc%4yzBLzTCIR4;70)5JY}E=bSik-2Y_CamCBjqW)ZGL#OCZ5P{s=9tipsoH(Im z<6TH1kaVSuQ;t8iM#|Xw_2CZba*n6Ii=_wnG|%6i_fAhASDJA4S9yYa`vNfR3x8lF zv7Qbfh~>j;Q{0`Q2@CfgH|0kT(mbRWeLqzmsT2e!x<^2XCu^x7>D4aODhijd-nJE* zBANIhVj=80>Wv!D!vpz=~mfDAZYZpe~W2-5ED?W6L(#+Kze) zU-6U25)K+@kJjCqULDF0%KJEjo(t>&CtYe# zsa=-ByHn$1E;rzO+*k&Q!#d?G*S}3>9ElT@!`I#x@}^nv3(K*+6(11H&HGWt54Wx} zzIe6(V~HEK(tP>tggZ7_i(MADTV7Gs&kzLuX>;+KNRo6bfQJt2jBrY5fYIxj#p777 z^y}7G{0_*P<$!ZSi?asweqD)zGZ8D7obk55!XdC!w;R7b!SD<1#1_|t^g2fmN!5F9 zK_p4o92fr%38VUQ_??$y=%ccSA!SBmfk5? znk}wFGNV8ue5T)y_dB;Uu0>4E_Ry12T+x@k;o>m5b?(!nbK1RCLN6z|ljg0nU0>cD zea<-fVaNBoGAubqH9HUBnqG&nk^VN5byOxAvCQPEy>|U(aZ-ZlBn5}rv&1=7e5+}* zcdZj+o-khWJ}w45?RAD5xPrkR(ThfJxOpg7>f%=0CT%20XQ2H+)vKEE`>(ASZlqwZ zcY^Fv^!Jgnnj~7YEYE=Eo$q3EKc`={3R~df`}12lxU>6#b z?e?dsZpp`cy~>aH!`f$j7vgi}E~Ohh7BgG_InCI+TI}0w9J?2cPwONNbJpP_PlW+3 zqjUKS+eUc;J}#f15VKLNCxQkU$6HLo0zQMe=#$T#6Js)P`8}r9yztY%Q4*hag#~`Q z)x*1#jFM5($&hu({e#i9|A)P|{>!TSx`pAQq(P8IKw6}`L6HWrQ0eaOl9W(VP(VTu zkPwh=2|)?zmJ*N_DQOWnbNjuY=Q-~=|G@ix-uoB*2$%cXd+oL6nsdxC#)`K0Dydse z!M;}I7s>c|4*VwQ;E?Jnu}%Wvyu@~VuMZ;aBMVjhF+4ZXv3?u0FxGEHpFT0T5Er~T zq{bVKkra0jBd_TXn3BP@GAYN;)_~p;!NK?bbfgzMNR@<`ZW&%izg!|czVEeBm%epR zE+~j}o6_BA`g8<^iZW6rvQF_p&NUBmd$1Ul?lz=?f92C{FwXe3c;3|}*Y(8^`LAG&sjo>^|LcpJtj{p`%g@o``R==%4yYXUS z7Y;R7;0+_L*=vSyB;y)8?x)%JHW1Z|ya>OOPr^U08&ywO;KO{g`pHt$ipPbCx)f}q z7D0sB!}Y$_MBOx;tN_K0!9KU6Gy8gXY@w7ZWo_QLG)b4g|HB_ozAsqJ5TnM z(?j9|ID5CkiP3aC&>DPK_holh_3cF-*!}s2)dfT0$ZpE*Lq=vX?CY1F$P>aUHb*^z-r+^?wDWc3HA45X05!QZ-gZ-2yD@DRk@w@9_8 zQ1aF@e|cZkeBdhW`VLKShixHU@%ua!-XbYqL|^uzlm?fa{W@n}+Qq4}J8f0asO)g_ z+K`dM*ATX<-btPP!#40dy81hTy#0pj@a1>TpQQ|>4N^}oxNneaWk$ftBIRDO*9n4S zcSWZmdO8Ot_RE}Hv-tH84(v8{nyBQO3Aw~9h%1}|s@MR4#&}zSZbiBYbYZ@Jsogyz z>n8I|Oyrt4Kfdm|LWbcCn{0H2n~DZwDIAd8&PGo_t2ud1aWxe>XKj!O(IaQFvOR43 zUbqbiO&Fv)To%(0t=@#T!q-ze)GZ}?+4{XQPvmTpn;jHFT`tk6;L;<-g1aq+yEPV= z9l%3H4JaiLNnF;1bD537AUnqUZI_@%gLR6(kgM^jm25qn&C;>|4G08nmBX5}wo}$$ z%7#?Agm-^5IACE(8Q=$tH=yu11~5@X)kBz)AMdY927RW`y|$fskSMRjQ@@(^kPD%g zgu{NpMHJq0ovK%v3_F=)4m7 z>ELq?>-ze2!DvW1rE=dg3(p`fuOQ@|;)CraoV#XPps41XPmokw;xAmU4~dQ$4WHpb zd|=FrYx?yT4Dd*Deu;s#n5jK4bMBz{7AQstn@?u8TH#z`=6GUU1 zavwR#Zyv}Au_a_ivBF~yjnkalpAx48MF|Zl8-F5IB0$cpI|2k4qgOx%QuHG#2{$XH zlXXadS*vZ?_>cp7O~aMSZAytKuzp?e`mlW46!)vMa`GO}r?6Vbkr`ZSb-H=6kAUp54^ty+zdSFWu8-)~+*jykV z*rSF+<&tOI?J18ZbcfOR1I44w>L~}>ZxG>2TTuAq-l@A+GXqg?FO}*PRW^Jh+)u- z)Jfw(Xv)B)QaGe8#&-}!pHv)f8ey!TIVhS6LGyfgAuIJU$|8 z)7rjJS#fvW~hgKIu7VItL+Q{e_WVr_wBjm>??%_?HeO!#9+J3 zgfGP4*T3hBuSJ&r?U%zAxBL2+t1UOLbkxCi1~bGLuCW=Nb|Z+xNTU+xdISDLvYAca zuN$z}D?)B6dS}?Yswxnm{|wauLpzme)GzxFx_UH0{2g?H1ftVU9~tX~8m0-M5H^5O zIYjl>Bq{}_tO~Vo^lLY~H78T%B_EdLHJ|dYc>rI_aRnl(LbtaXmt3!&Phmsl1qUxq zOtHS=r#zqEdoaTWlmvs^h`G11M3QOIcQCN@b=-VbHOAF8tJ|Nv_7qAlB`85-<9A;e z%z;V%vtDskVkWsM{DIAkg_$f^cpL*&@MPmMP4DE$8S|;Y^lMZra|2$_ls{!d5d4}E z#u8h;KKX!xF+xO8$#=`NW>Xfv^fE3oQ)S?{kb2yAfxqJ!&{!w^f>#k-S3&n@?%yGS zb-np2b?fRYH2F~BLDs1?))_t9EVcTD89R79m@4|oJ)$pew}=dyPp>acTL1;=!#^_( z$w((V9$bn*Ar6F)*TvD`RW_n^h<7ynXNudxm%e}OcRz?@8)}fSK}A{(Laa%Hz=gE` z``L~_6iJXJO`*ypN7B3FaQpw5Ne{_3g0v`tZY$A`rmU9#w5z!bSuHC}WGNA~0(-!>zS$D|XLUkmO%gsR?jUcghdh-mmaV^f^T5iW82xs_$mo(aG7 z=9IVY41_YROi^S?4emC+B~i9r9Gc7eyL7Y}rTc~#nkq>gYLR=6h6fatYbru}g~!oG zytVdyaNrEZ90f-$bOHwO_EdC?`ySl250lZo0eq=h9ikwiWWETVjx(MU?c5G;c&?eY z4dl6gKv*A7*zv|Xb!7k50j4zOO*e|Z5QE8#Z0#VnIP!1?}u8jnNE2YxcdfI|J<;QszW4x1^@ z)#X3`eB%GU=)V`j`*)1~ok#yJk$=Mo1joN&^8X0C|3=9FS4PNuYK2(7)(g(!Kt^&M zToP|a*|TdE-1{y<2sy8PjlDzMKii@kH)13BM2BClm(d6M8PRGdP^UE*yeqk?g$jZb z$v7R%eViW!Yj}{UU5h)@%At&9&2<{S1Qis=Hdq5bfts)}K@tA*_S{G3b|}eehiUz; z2bJUI$nU{X*Y~P5Jbl%Wyz+bcRxVd&3m*z;Nfv)&YIy-@ zCE56qotz_8jz&S3Out&%ux~>+f@j6wvk=nq>J|h;FiiHCw3h7E5{)phDR|oo${dpH zEzPZ8xLB$zh!0Ljaq;NocM1d!HY%853}>>k_-;96NTy&;=>@EjHV|iBf5YfgTs8Jg zniLBBU*#utr>vBj+~ifR*n_OHB&y}d);KzldjE71g^abHycT*4ZJ@Maun}G>(j}7~ zduWPx`uOD;N3=QpEgG!oyU-jTnc66;dUat3M1+=k#5hpMeA`U(39^0S&k^sWW zzWn4DHAH84oZQT{`#q~^slIc5kA7~#*K&9)E7C5xwBl-{>>m4Q} z*iG9YnCMVldYAM)pn)LE5+Lf*v)}9Cv!|P$fVcSP@B3%mh3cX+>GiUv<4LqTh8l0& zw{juKtKL9f^_H?03bD4Liar{KD!zJxEblgW+w|ah*yz;B6h7k9+|N01fLc_87q&Cy zFF7NJMWQ^Vkas+IAISl~a3JDJtlwChfP=YYHd3)P{gzW@N>sgz6m*# zheY8P-$%@idT)m$RNIAzWYPrT=KyxZzmtwad7{zyN_A;jt%;?~)&IJ#b3 zV{imT-8xSXCApOnx^32!b4k6%KIRbPWXh_~br~4I4IVkkRY2=mf5KP@D_vEuk$pg- zp&bm6>y_L?M$_<(?7K4OiSp24khy@sAJihF_V+TGqwh*|#<3uwZ2}X)SL71wk+y5M zM*k&+iYFgKaKZt1mO&1sF^=S}zOYQPF*Hg!rHm*~aG}5Zh}QmjYL?vNAU8A3kk#Lc zO_0cLaBbH#pK|MY&4-q;rO0d7U#$zVq7XoEHCl&SCOyv?cVl=YkpQ;`Y;Q_T=kmeW{QxUzqaS6Po%H9#J+sy&dQ0uW z^(H8DHrSkc_&Yw0QABBp4Jwc=O@6rED_*XqXJ0k?-*9bdk3jbm5aqC(ln>)ohPD9m zMXE?^FC-dY7dk2(t zR8k1|Z(P!NaJ90!5ijeQB@7#(f|+KP0c;%(oUFIuY5YOJd-RmDI;w~!eGO-EhElWz7k01dLE+5GHWD?n;FKE_mg`%44;lgy8dk}8Ya0fcWkq0nbrhg z>OR5*J#6>agID)Fke;s?;IeH1sg4(pt|u_22w!=04nQ0!Lc-vKUv=nMw=-G_QAMJP z8RbYc`1qGz+^uf?Q6GaRq~4OU8g{#$5pXJ05c_MdE6@&+ng^eesK-a0p9Uj_Ealzm zUEIE0dE5ynN7A0{<;h|>L>ofSk%;;=>bS-l%K%!X0746X!f(yhS9zcwBtya&aRx#2 zw^0`l*fWx^$^vxsH~zc{rNB#27f@0E{Nd9@H6c5!-%=5p<8Fn;)p&@SK9f{*AwGeq zKPJ2&CLs0p9DE@@+>NW?`;_$@xePofoOZ_gTTk_&P~R&<3wg^=HRl*`GF1v78rIN+ zu+QGSKYmsjjpwv;TO8AkR>Z@A)`b^PTjLNykqbGb6&wS0CtV#Y_)VqiP;{ zRZ&%sD>EPl#|}UGQ`i!SH-J75DKO_GQH>Ot8-g05t077a zmL@$wSZW^cF0Khg$ITjk`3O*LjGCS<-p8LWJ^1u|o=9Rfa~qm0Z!3--1fi2=QlwPh z)9ccKJ-NjyJKDlErhLP#=0OkzMdn2hq;Y~S_>`MUlq zFgUf6mCmUJeI9lnJm}B4i!H|8KzaLc;}JYTli!1(pw3OPc^7o#C3`mr{Zb~|Z-`;^ z1Mb^xNNTc)i2lQq-#I|{fVk3uF^mO6DM0u$-8Q2H77e5xt*~$yz~E`W6#7k{_@`(~i7T=Us}iEsLiazpJg;@g=YQ6mbp3Zhh_pCM+pVZ!ODD`HxB z3u1K8<2Hu1fe*=mi@M(6)O7z%?Q-kjRkybVr_4@l$+=2P;wx!nwiwipmWyk?sao!q zpdMdsDQ0w0g>lIS)Q0-7eyRu>eFHteQf%Kp)!>E>%X%s7yEsFGuo%jAHk|_;<%h8b zIa{EAOCMr==LKJV52SG-+6 zs^982Blx*JSES(_yMCAKF_;NHfu@Lf5j5|NfvXW&pe{0HSAkye<<@wAb1|pf<}cXa zCE1MGYgp_HtRiKR?peUuZU)lcvG#H~7<=JTJavB#W^ZhjSp^i1iXJ)nkNAItcYH_6pv{lej3JFO#K$QnXsva6jBn}>h) zG(23nn`w?&%dI;6Rw~^YX@DnA@H_xaTj$msL{TEhg{_lKo^gJk^Me`vn66?TdhNFL zXf!j41X;sv!A!4U%Kk)pf8lU2ywEgk%Vp033f?B-!ICj&+<$r#X1~~@Li~-a;gmX{ za_}zm=M5J^1B9Hh*~U|T4_eokt$vvy85#|zoM1V$R%Lr6w>~&$l5e8!;-78>oJ-OH zDr%hdl+9+bL)h&RH@!mM#p(E2+mgms*eA))ekC=3vY@x-zEU-kBVER!bt`b0Rv(LG zQ6P_+)KmHzUS&4gDXNoDzr=~CsQ$-0{*r)3Z$E>-`1v+`efx+T5>CHb=cSwMB7I-c zzo`_ESqS%@+8zv1$h$pLXRL%S`6u-SZ z$GJx2GN6^eZ*brZX+t0b7Z3S*I?i5{p=q?R4rHVdAgLY(fxJdEhNaW0g`ndxytdDY zm|NgO@nNB1H7k2e-CmF(f{!&|qlWk>Lbp_H{Vj(?4$PqEx6fQe| zBIO^@Tsd?+#=MYai$y%$YGCooycN=>dotuTwQeZaK5*qB$Ar~+0x~{8L8D_*4o}KjDkXPCpmvV1+x7ZRHa%GQC2|&U$MUArhSb{ zP&?j${5CMwiD>Og)nzctAcw8+~TK+3F|F{^^kFB?U`yl{JO;);4*L z2z#sT0CH;SGuCq<`vQB}^h>t0);3v%_(2#oX}O?C{gLzsidHVUs~ymMHBa8imaZ(= z2xHoj#pBpE{M-5?3cH?xd-duu5!20|NL^B4`v)d(tcDfRSFIM}vL{L3)Nq&p)4wnL zRu&xzk#QqlEC+1Gcs0MMUD8N@UfbYo!6KJ;)yWUC5MtTXc}UKYfM)%(b1-GYNDRpg z&?CMq&Q$|{?QsyqN+vqkp@TF7KdJ2HF)FIsrs1Zk{A{7Z-!M3%6t6rdIk&bN(m zJw?hUKURD>IXUESp;*I4b?NyO>2gM2(8To$5+TcycxtbutVkX_DaYHHb|?aexC2tg zWMpUVR-8M4Z&M6x_!UK=Y^1*XpnX*gJWm!r3wv%9M6)?uNw(N77kHUSebCVCiAOk3 z9Twaw9!%bvE;BwCc=3`+;CD&T*ay{m|WXn zmBY@oV7U2mF9RVJFk)TGepaPAuM_~&&GzM}yYwY0KJ;>PggnOYluNjaq|}m^63&vS zEMa?$_B5+F4aK%IPdy4D%=Cxg;;IQoUwXH zfOXB?^>1PQ)G zafm001`{Jjyde@Q_QZB}ATheE-RCK5;_3^9w}qnw2yDUy^#Z*yX!~b)nl^z35X%1%~s81wE}gB^361DnjLZ>v`j(;Ori?(_$o zx%TYI|CX{Tzc)V^($lK~YQH}(jWkGfxBi$y(1GqZXK zRHW}AVdvF`KjvJo!}TGlttLTKEhdD51=^#ceB+C}8O( z{B#dlWt$)^!cOlRLN}ngz8i#dIpS%biiZq%)qYJ#JkaRAR-x|aw6vD*qfv>Wf9XGu zP*1|h!InOa*y7;`sbX0C61uA#-JQvn^CcZnh3_G^QQWSy;w<;>hAg^X`_C$<^0mrO(GcE3!ICn4~{_5w>O&$YssW}$BQ9^%;yRpmU?Ou7U` z0f!Bt)l?(CU|el3n|p$_y$I@aT-Om#rzVCN%<| zM;$+Al+!#H=yjwCowc|~ptV*76TIE}YGV(_@NcEo?c!$KrD;^Ka*BW4P6*jchMS7i zCMnI+XK(I)r_i+{&DUo;`uTp_RLSWKh`a464*o}-{1ctX=`EjxB*7cV-EQ{t1=I7x zfOE>#`|38Ye_YpTL8!ELaNI$e;Acn^C@>}>dFnZ;iC0>K&F@@rsMBb7wG*A%g}pTt z)(Jn7^v^$N{(WC;H8bSrx|j1QytWPoX9<=aDRYE~)jfYb%`}}n z?L!8>_HXOB&G0nW;|f zhh{J1rP>oOKoa#eWWJn(3G87I7Nm38eK@!tRx@yiyPIjy`9>`0#08 z&Be^)L%Yi(&jXEWmd00HDsu)ZP1{$yoN`~XwDdJiTT856(pTZRceN|Yn3X!WZg^dV zkEDiK#bkD^>YL1SmM^hWaZwMyPTdrqIY}6l&bEGisoU3o>5;ZzZ<})`{}=vOc}_ps zANgKQW1eU|RJmiScMJq^$=8~w5!ImwS1zw~g}Qg_u3>fvbhuyqntSVWbJtB9y*!q@ zQ@(!os)?5fIo&^;h5AXDckx(2JpMs#T29xayrcvny_btuGULtYf9&ZnzkUcH0aq86 zBQwS5y>5=b`Wu^f(RT4K=hGHN5yR~w7L2#|AgdB)vGT5xlH=QRqQD zUg44Nmo0mj=aVeD^EP{VfxY7Kf>|@F7roj){a|^pVw1c~GNLyFJKD~3i?cZjvZ@;} z_FjH`??|`I{Z%DaJAhkCNfrDXV1yt0d?V93Y|_M4pC+z1Y_UXadOa)jw3*1~1SD_R zMkHN1_~h#~7Tt>bTVsMX6$wVN4rIdJUyWK1#;%S$yvubfTD+#Y5dTJ_q0>h!<8JPn zD9(tY`}q{V&Z95R@!Yw{fs>+ahA(bZGv&~*4M0-+HJJ~J8dM#^D`fPsh4>u;!abum zOvfc1*?@iQ1R2NY+`DPANd`XFbYky(>i%TtPO1!$raHuV6xM4M_IeVpP|R{xEf@I5 zoe3KvJukrWtVSXyeT>X>87wJkykOL8FwL=&((zInj}D3~eTN zK-{Ma$7J0yqq#Sn7Mz(#H)@TRZa%LTuKN*YES>ptqx!t;YcAdE?+V}J!5PrrcF*E@ z!ly?5wB#;9WRI1Wnsy3Y_m-|!jf2m;AlV%o>X;fI%y8|#2 zuJ1ANc7a}qUlqslN^Pii}g!=^1pExxhqfe1O?^ADPy9VbP7uyy66kPk{5kL zBTi$-90Y$XGbIv@?Otw|l*ilY3X)q06;x#Y*Z0cA%6@ltxr%ntZ(htY2*A_AOc+m{ z{A9}pFGxUx!MmvZ%hj@3IT|raaiw})217?fAcY*v)VSB!0sS*dSk0{ms_s*hjcfA4 zRTcZHF1{>HpNQ!U1uAi~H9Q7eiTTh4GDJbI)FEPmRO))_66MV2DpRy%a&Gs1hP>lF z>@Wx26pAh>$ij!ps?$;geYD28Fh$5Eq%U3=ph%Z@wW?6%_NO@-Alig%C4!P^KXGXn zZ-i`)Duogno>nyS>>rm`%b)>lQ#MMbA8nBG$yzYN_H&B({ zCP6&rLLzSbajHzM>y7#8c?Y~xj0-Q0qt!wz&L|23IrA!YXVT?X99uWP!TI0pau)dh zxOy?&g<8(NqtVs-gV?f9cbJ-x9lmh=S@fV+vZ>qz7KF z%#oB0HtbaUDIb;UrOvMctT1UQU_qkb`o`lg1f(fZkS&sQ__o8o*fngtPOa5m6hf_>iH3mEsQB^#%ttd(u89r&?+*-6xr(S3UB(; z7M7xC)eEJ-W%7HfBrJKvE*zG5)LpTy^u#HwF7xfq%|G51dZ_QxU1Hi#`T0_5gnL5` zW=hkD{TuVUyWPa*KB0DGJPFOYVP3?&W}VVV!?1Oim256XV~A7awP5AwN^RDu>(Fhn*Mim?uEnm{SMPkk zeWFc(UNVNCJ^RX0yCX_X{)BSi+SBFR z<3QAbel1?!^8E&{ddPUiY6aaAu6i zJ!RLBvAGnCj<=p(<8)h2kf=IHr%@*qEri$Af#D9<9WF_nEwwwhKGy(;y|s#Bc!#d$ zJ*TvF^Gq2DyDFEQ2aUk!y$nyGez;T{^nEB_k4!s%aDMNsIeIls^kyG*;>VjJtSYx2 z)MT_=RJ%rs@dxZ|Bt(WI9P@vNGp4%6ZQB*y=F_3gZw?pA#Q@vo7pPuVt#jR?y0PJ^wHxUE zBUB?#blwZ`ZA8X08?mLLVesYDygsHwx!b%&P^_30V(5Z8fn!~VI!z^phV6t;gMpvS zl&bF?>^J%MkDS?u(H;q}w%ifOo8b3-$ArCV zcHIzb@)5L8(hnW_?6b9>@qTXN@v9rMej*Z6 zxuA|=of16I6~RzmsV^L0eb}`1q}oSM*r7e_l;R z@6M^ z4lD=1vOZa}tobYPk@r%}NG}Y(e|gUx^Sux)WIe|cjj57GdT}m0>Lkk8m#d3wLg$_v zcRmuikyN2!%|N`_a{C=d&o_UI2U{zW_~Aap2b>%I!;G(*^(ybKmZn)~cll#es_0G6 z^|)~A&F$*_o~sgixLhLdtUX|j<--Hv##M550s|eT>TrDfsBCfUitye2$&MW<48iTW z1LkbCw;L6Ovak0PrDz^$8+b2?wGenK6>u^~to=w7$~>|%1@ zWbXA=@ZLwp`}!KY60t&7?=N9iIE;L)nAvM?fqK0~?@wu;9wwc~SK}QA%o3Pih8`J1 zd)MgotG|gH1N0)K35(1*k%u!}jz&(NTst6yE7Z*kC$JM^czBzW3$-8c<_zf<{N(&oX1sRMYp3Y>1u$~ zZ173`3^@XCzV zgSFsVvWkQZ_EXg<$vllR)zGbWz@AHx_P@p|>i3<%O1;I#$AIUWu%Ubk6S88yts%#Jq`04(uPn4D3x_z}bu$&_+w=l!EZW-#+*~?ndcBaFZu(d4oI=^b@{tcm+#j)f@zd!uO(*aytc3xr)Gtdw zK42e~qg`0uC$TQ%R+YeTtojR}4b-ISovCT-36(%S75_>uNx=U3N{Rv585GA&mVABd zSD$vgPMK;vTT!cj<~H34^IG5t7;!+#Om|zwWrb+hO<+JZv?GkxfoitjXYZzpz@<-1 z?84@c%Z||Xmw|wxrxN@@Z$0kt&zGuFwiHh5J0ypW7nMcaisR+)TvbWP`U1e0wu%db zr;dk!j^&@}MNRb%Tb0e-BR!Y0i5)sJm1ujv*XD0oaeEeAyYd6p?za6g_l=hndFj>= z|D2RdrPq}eH$`%J;p?)Wx&*0*)Oh|=z=kTyKVs7g**I%dq+MxdxY<{dJ0bT#0jLJR zrrS0TcU-ka$L26LlMj=4|1$NJ4znUz(R&qF9 zTmp3fx<+E3!5f3DvLXp*MjGjyL|wljPXgcQX^-Aga8~>g0v=JL@0bj2%89(~P)b zB1n83Kdxh2j8Mk5XCGR9{}?E*l6wBPaU2w~+M|}&@S380PD@aq{z;x^7{W$;%`Hc^amy37s4_Sd$4a>wX!=TOCC6^z!y^zR+q|ct%y^-b?k=9x z2Mzy}Ewv1F+ZPQIjt`S6<1WX0cafzV(Ctn_FM`?h~*wEyQW$OyDt z+wGUY)}nDU?4p8YK2y%ev!Y-}xebvd^ z)p;v~y^tC6#{^IDW&aqS|F_aSjdfg>%BJaATwkI}i*ri@SDs()%SfD|eSA zt)to&V1l6{!YabAe9w@s$5`Cx`t&EO2+$82ZZ3&uqcAw-B$Uey^jx}FouBl7SiiHE zW?VW-dJ zjD!RF-emfu%59h0JZkFGgqzL>#b>;D^f~ohe_HM`YsgH4SqDNl{qy`*#f5vkk;d(u zmGV^_0|Y3#?k+|igyDb(O+X*IstTXRRav>N*GTOp;9FEVr0P5(AJ*HK2rg)nF>bX0 z>A6d3q*^LUO95@_22GJ`ACl>owK?hte%rxjNBT1qWThHzksGDuMLZKiW6;ozi?X8# zU&;-$MK~+;nO28gXhhbRt;3FF7NP`O?Q3QzypIOij2LZ|8X5iy@Z;KT5i$`b&WHqA z#K1?J&`a7EFo#A%?J(dqd-o8v+JXu*;Hz1JMk|Q>b)^dd7@WnT@=2BCLL7#a6)#3> zB(9qBeUD%cLps(5dxM*=jB7aQ&Jol*&Ela0&%+vE0r70LI2or#k<)~4vk|h2XX0jZ zBF!2ppb4Ly5U*Ux=(miYN{^x=;Cg-VxU+BlZy>Qj-avJSYGu=6{*pFd-D}*mwM+#` zHSFNdM4-fWFj)AfMUqcGkEi(wV~W=m_cJI)YgF96qB}b=wnduoTSbX@v7F$AIMRhq z=GZ7=MgE*qfe8_>dr6?gm)YjHv--jpXq};Et6P5=m~d=rm>JcfruEMu^P#hp%-51i zm7&XY

{p0}+n#_?Ac4gfMJ2{d}S@cbm&po;IQ0nw2Ee)Xva7hE+_n;_>2}9Neeq zZkIV%AcxYv2(O20gM0k`-*y`EMCPr;>&5rMI)Z?`)VtN-ck;X&4z z`<3whlYbhD(9Grnom|b(Hy3@NoBi{-lz8fo$o}}T?r%YvuzG_E-CKBUd;-+l! zR3pJ}J|J0AJTBbj8sI|@OLBZ^8n?`rCR$wHL^!3wqYRJQ$z;^gN~he1-2Yz{9gK!H zh*}Hhk!b=W`1cQSxV$T)c;x@~qen1`MPbAL^BKHFNL2fR77@nGe_hNBf@lSeCf9%e zfFte(TpTC9;(tEF5eK@xJ{vJGqWQ1?DMths|M#Z;9jSlk6tYhKU40Og{tXELZu}cO z!OY~}ZRY=9Az|=wQ!g5tWP^%={JsCA4gY2gkSqP0Tfx@&Zx;G*zW%qk`2S2u2qJau zSpZQ*S_{Hu0&0AL@z@DCvk})!yQm=&*l=%ukd6`o(Q2Hp|Cj;m3)?opQcsX-GP&1( z@ww7pb2~7uO)`GH;y3@{Kays!^WoY3=D1*A0(LlXsM%ltvo=Cey^chLD;tEf1Xx{6 z2|%94^d*vSYVKPXH|S`B2DE~nUz5nQmEs@rL&O;T>txX+l;!j@`x@#NyY z{@&0PfHp6IhNJSYzya=874G-W=D{yJp&$WtSNAvADb?wq=kTaKuMPEfb1-HA)T)+|w6XvWlEpSKND!wt};wq9Yb70zNz`8rehM zNFU|DpY6$%E!2%9D&EV-6SfRgpB`1c%Jy5s{)S8 zNFRktIrw@s4QG8A!?zpFJsnS`zE!~&(G!Rpav+3Xgdbe*pHPB$Q^6~U=YUtRuq+_J zB%X~+P;{GXfpkvfPyyscX}oEoX|icbh{i=O=p4|O^ZwQX5sEo&A?7j-#2FBE`((tnD#or~9qJxFf;ou+c$@7x56?N7Vx(n{~n z<0U10-<#zP?}o84jlOcxQ5&s7;HEIZH7<{}o$putHRnUzpb_`oP(;P`0r)5Bm*KP) z_q?hrvG%_g(y;5KI*4Tg3Ii68WkX7Oq;`9Zfrs#pmX9Q&LUR-yM_#eo$<`O#een~( zD4Q=tT_op?$_S$>Rg@kP04Z%-m-X`k2W0p$? z!Zn6)i*<_>_*IizWNF8k@8$S`=Ow+c45jd-2COpj=q`vVQ)w+9mby_rm+%n%v+#i;zJnh5`9cD(mZ=Zt`^`YpfRhx z>UgW47Ta?E4_m1PbsE>&ar{{$7^{AFowioRxp}4ZRzKxg?n;lkVDHGnZp{F;WChIb z|8fv_=auWF{J4W?b&-I)4f+5(f+1<%90BL>c)M-kIvR*;+zBH{)6L8WBGvJ|dKrrd1F znbZQFoOG8#uzoZL5u7QIEq^fvXCq!*Nz4Et4aB%wK$~MDiK*7t=bKX$+2ZceXo>&m zlIGiHlxOykE9<*u3JbA8iAPxmmhGLGjMssN&+cjhLfBnt^LFRLa4h$a?jmt(l zWdMyw#MA@`{TiXcn-r5(eHvjQquT`BnO`f-zIvjS{I2@oN|1|S?Xb*?jiuZA`Gah= zY!6i#5Pb^4`&XXe_3(_I0dVE{mXNv0>haNM#U}y5@_l4|)P3|kV&EayTPu&*jRjhn6FOAw5qC}9EkfzcFLdyL{Ra=I`9;nMBUQ_A*$K_)r`Z!rL&zBzlDIL zF*w58NL0u~de9?3*Xc4HSC36-v(a%07YEP3Sr6cUdV3fZdikX z_a@`p?tP?{5eNRe5+(O=R~9(yTi&}Pw~(h8vvj2jv`Y>Io5<4 zS4tW->zzOk*3KBMDKU~lDzqD-z@1_~hh@d9p}|Hx`E=;#0w`{yVi7H(2x+EDFNnAp z9E}fILb_TU&25X&?VStO7Srm$ZWiR(7ND4H$z-X7@Th6%`NE#CAV^?e>Y35*VB0hZz@xY_!Db|x=cXn0St3Xk}-Za+P z&wWykm>`tGt5h-!=AA5EP#3{9$mW*p#V5#2-Bq4Hv4e&$AqWzc5G(4|P)Y-;T~kMx zN@X*9;CD{D^~w49Ux{x$V)&?;-)pd1FC2KR=y44+T=U!1CfZaLLD5``Afxy_n-8ioWdHXDp|NxnTTOSh$puPwwoh58O<3 z7qI^cG}9nA*Tk`li~$Eoo@4uV?W-h6jl>2no=VqVSn`dp^!>X)|K=aBM0{WC_G9)g zTC-(d;X-46&Hoj>M)+91tTG&JBY*8l=NT&(Cc20xO=p=J_f?FxhxL?8+vr)#(PiT4qI@RLIo%FD z^rwnsRxVXNvAWI>1bzX6)vU-ta2N)dgE1_RnwXz#slFm^7S4T zL00nxF%_{mIY-1%JqzlIXulB0r`<9Ub0i(A;C6TQxq>9SpLMju%+Gf-&goLwNYoP) zW0|_TALctx!@FF#i8VpndGE%QOCJ-jShtsg+cyiFH(j5uvm5Y~=cgG3roloh-0hLc z21g(W4h{+W8qw7BCY@9*+mqArXNbtkRjQ_ot)7~YJW~b5+1SbB47qxTyt|c}hOQy@ z1m_K7p^rC+#93Z56WH8{dxB$cWM6(ej5+g2MBw6g;1SR$spb?*y>eQ$ZT*~tJQZS_ z{{1BFo_o&d6(w%NMK`}Io~6taRYx%!d86zP^&A`bTak^!2|{-m*)1u45sdFKijtJr z#c%3{5jG`NZJ4BpkbN2Wtp;xC{?ZtBPKwF(o4pJhYtWV{bfsQ^4{Lqsp>PnfxHT^7 zLSQWxqGzqyrl|XY!73*h3&jf;a`5tw$3AR%w7X9NpqqwKrTX|+ogMKjk*_i-l4Xss zLtFM-nr3HMy*{NANE9v5m;+S-0r8f_Ek*W=cFjM~XRPB;82hfw|=3|jG2dQOQm%ImcbjoXx!kgyZlh7o!yJMeKt3Q8$WN*ITqk7Pr{uvT!w+yTP1*ToH6qwOc+7)BmfhuMUXn z+uEk38yUJA3F#1!9!fw;x=UIRkd}ra92)5sknWNWK|m0sTj@rS5Z^ub{oZ@;@8i#z z!y4NQP@< z*l@YC*rJ*Zl~@OkFO-6C7~yCyQi`sXSw?aVp>Hbn;Z1K-wv=itatF@%+d&U1Ro$2( zZ}qGSOW~~UYtY=+vI32tH>}rz2j|0a$BDL~5kHPZS>eb}I7bxZ{H3n7s@%qyKQxG^ zzC$xjVOFPLqapy@ z!!)N^(1|v_bIY?`pp1;6F&4vK2*Z{CVIkRsW7s3Valfqd90Eaw=(fx#15oAAJ z!QY{AA!H%G{BkLN#p*DwC5GscyLtupZi8R&mbI!Y9YxflN z;9p54o_v4LM%me6xZfL2-18SiV%qyZ;>d)TYB z{G-Qs4bez;p~~Z6RThUME$ETIXRFC_r@_`$;qdj$^7ailE-PH#9)_!vZvi9A0RRqj=mq*u?bbz(%c}r zJneIIiP?n zmM&}Gwa6z9D8z`?)VZVp;JV|EJlbxdM%fW8o&y*^n&vp202ub5W;-CWQKaVJ{-e1- zjbM1FMYYk!AxCfY8Ro*7-$vuDc4%6!@;!wWJ-=M9rQqOEvz zytK36k>&$T@x{CYpKvYZ^hg)NrfK0*CNY8-&h*`uH1R+sCEmZkXtID=Xuek6hw|bp zYo-?4^xvV3n^U?7dE55+z@|e$;CDFm?08;@NQUHY`aVO@`j3e{`7I!&LgV4vrvaT? z#(7is(s&J6T_ZL12okqf!SwGv9S=}ZthEUr7cdU}0RA*2?aSp?>5$k%1aL0>vL#N^ z&VKsKfU%dwEd*WQiB!9U9Rxk`Pp<-!*g?`wPMhMcUE_jS(h@X!^}E0K`C5C`Cmt1T zfxiIDx{3`b)|p`zt~%kt7S5(F*W1Bb;))FDX7-w}>< zJVV*`22S(pPdK9Jz5DtqUaQ^hE*K(h`rK94#4< zU~mFZpLO8s<5yJHsoW7U58L=Uh@Et!TG3}UIL8%cd?kWw^V+Md4%toC9s*X9sHe|v zB*pBpHj*xfLt?`k))W&TCx&-SL2qaHfVY)N(~aS=TrT|{CkfCCzU)S^pN%{ZhBsWr z1-<+ux29@K>d1}NLSdM79un`k^l^;sn>uPQplpPE!V#@zNBf@jFENG|c8X{Z z09o2^shB3;#@+so;7PzF6m#8S9$9ANVaCck&k`{HU3X8t4YD$3Swn|D<26CQS6I1l zS2t}&dQA5X0Gw?>JMZ=(-MH2>de`?~7@Zf&rC)-y!$;;u@|2>;H@{T>Tn34cusjH| z0a>ntGlA*LTxe3LC>>3ij~P6uc{2d|$b*IAZ5!M=yNnt6slOD~I44xyV@OhsIYiF? z7(sI|cHSVMrbw`Fe+7+fXU2@q{xH>)=lMFZk4?7#Qi3z5D*#kM5{w#Czq-};XG0y)w7TM57H7l0+CtK#F|B=#W# z`iz>?mXT7LZS1Z&3zs_U!)@=r(aIO~2arLI4f~mcAY#(K4ZToq3;vGk4Dn4?-yhbt z7cfy<&-vPfITpVu3Vg=XQ?Na-12XQ(L&^9LMIF@(?<@CU??scBR5V6(2PPJ%kT?>Ri08u(d)#VD z%@pG6%|R%aqR=auSVmq>X+pb=0KI*7&uzdvmfSwA&C~2ATIsp&)Dd3=621pkWj~hW zmf45aa4%6zAgW)0R_s~(*#3~V;M?Y;8iG}`OA`?8c6jb59K!B;rsGSo6Y8yCHE!hC zftp^#)l#Ep+!N0u+Oj>J-r2ssy#j%*V5XF0q7U@6q(tFlErCvYMV5(GEq(HP<`gU;)-7pd~S!yblS^_E9e6=vnyLX%JGbNJYPwu#W95wkXI?} zKl%i!=Y_#37dQ%3j3`H6<`s3wV`7FJfGLSjUXe=`ebHNtq}ZM`QjqvNQjCGJvHzU9 zp4voJQp81cp4Ace+fCp_y$Z%-)vbPyAtD&@#uw<>`8wyxjb$QHsmkxStL#NJ7JV2J zkJdR({zR}qrCS~{P#V8?!`mKcGSI)Y%B%C72u!_@p!rO6#B>Zqn$7b?VP6HY+$a09 zOBie=;#X=q5@-qIpBg9%Sy*D)`54%V3Xk~U?tax$HY*t_{g@+NByI*L!PT$0YBc-rBMDzrEwNf zV0Oeh&5bg`=AqN8=I_QT4`^PvKT}wyg6T*)Fu(TQ%$exqS7+2^={aE;|2kx%lo0*O zxofl6>!6}b8N+=WQ&G^F-h~Rw%0@!`?m=PvoF{fI;durIgF8<3lf$zL;%p~!{xQs? zkB+ikAx0t%WVjs@GiX-t!xnZ#qa?-)wm)8WI^OoJmOg?^R3h=abY2t*P%GgIX+vE% zjNc#I>e|^$@UQJ?%#ln4MiUaE%*)FtdSV|_sorXOJmvT)$0$CpMtcNdWyV_%C}-}{ zrlYt(kY*LjeZDoAV!wd?=2Sb$IGjk{!@TotGaQYpSGdEyyOpm5Yky5VUaom%y7|JdGI3IomEM7HVSO<< z;*wCKSJa#OYpS5Z2X*G`5xt%gW(y|BXITOn2E_$6eec zw?Xk>{06L*21F>Xa|J|%4OiFfFT(;YmZCc)n{FF=w*klLmqmv`vh#;f1u|q(XR{HqoeckG;SN={W*~f;^9ZHN!Nj*pMJM)S}26rLF{tggxs@ z5Hzv_S|B)9i&&BFZZv~WxCEZ;;_CSGEfE&T`-}9-znj7Jj0^nTv>2&-51#4Cmj)r} z#?Y`}!eL>QnePPk%hhVlwD&sfD<_)8&?td{X{9V-Q-cntFk5qTIH6L3R<&ton-Il> zcG+Tb+lVjE=<0Z1AH|KBPj0K!;A0HVTcvnv2LeyrSEMJjFB?Y;iYMsnGM)iM4|Eg`w(XF zS+yzx4K<%BW*^fGd3cRVEioCj$X5#Y&S{{p?Po$=tVa$)g-%UfMa?Lm7X2m7t zBbebMyFDa22(h6qjP1XXNKG%<#f7>jx(I9TZd4)Nrv810~2JN z-se26Tg0ksUI&69R_wJSqC+!q?tR?Tox7h;$>*WjAHu7pz|%+1igKi+sm+Gs&d~zb zBo%XoY3a9GqI)%!KQRk6#NXqwlh^Oe6X5&ETW>ePBgT}sCHaMAr;-zk^hK6H9Eun1 zRicy*aoJ5)z7?I=v)3aOJ1O`TtrILT-Mb~<4X&eS(;7p?CLT+xwVBCbfT}mp-GUtc z+(Xf-QrL5keo(3*!KFaYi~Fna4YD)S2CJnBJE`;bryxJny}l`@uYsz#5e) z$RF-5Z8RiVQ*N+SL??W#y!2(e+Qhq1!%0~P`HuS&4NWicO_3o>y=czCo2u_)$7@iN z4xd_ImVinZfr-`WR?I>MUjkl-OmwF_J2gg4Aw81uv{~I4V3*Xo`ju>@oAyZE`0rKk zz~caWjj8*6A8@LQCj?s-NCXkIHFQTJ70s6&Os!G=C1i@031AuFMd2RNk5P5iPl$fP zw#4uuWTEPkr4#>++%Q_Y%2_m0iXqni`jsgo*uY;SHt;$>r7Y&t#usToTs+w!OF4$I zC#L8L^q%l!Go^MB>-77VzW$^ly)%p@$^w?EK@OQItTC@IgGA;XH?@nu=RGkH_JjoY ziiwB3iF*l8C$~*0bGI*-ek+$M&z9SuWVMgyrR=3N({kqXFBb`45>}{yzxFX)k-);&>(}0l$#}STqP&Xr)zR|nN5rKq!VLimP5+~7=cVcl# zn<@NE;eoE1XjilAYL{Mr6l#7_*qemgO#|afnisDLJy8qdY21QeQwq7s!9jb&+%t`=OB6K`~dVNEXE|cKl+*cVP6^czzR(^NHh?e3~@-B-Qqapt13K zra`AhuN9U4rZUsG;kqh3Vda7@OD%AtFMAlFTo$gID6R*##HvYuTbkvf=R|uQR_}A5Y|^J2FsGqcdroIO}5Uau4@b zap@(h*oWrOK`*#w2fPx&BP>Le$R-~CoI%CD{w`X-ENUaB9*s)a3Db5gUF%(JrY3$l zIjy*78II)Fm>~b*DM6Oe)2<%kx4$>FUwi( zMOx+M)$!`8ig{^`?;~e;#hQ&xQ}mY(z&TuQiUT|9yjm&Gjg+RheF&`!QyVKuSY!?h zi=;i>v&fHgBeAk4)l!BP2nzkk<)Uyt><`Vb;7ZUcD=dzasdEad+&GLjIbrn6>JMrQp$s3m%*3{tc5k`j^{kbO)%> zGPl1C{V8e4PZl|(5>w3p2XymY)12IeW-&(hgT)n zpWf)*dBl5b(7Pn7qY|%4RCQB7C;L^R+M( zG+Or69rWd+b2xiE&unyF6qb0_NW8AbsSLRYH(D!X&7WlZdur1$Z~e)2xlM@vTkilV z1}K6ivrx=VD^L6k33*7?vD8p(@2Ojp-vLt9Kf{L-HA49?S|)f(=^;XnI>Gp4Pt(`< zSKg_-3H=UCYV8R@eQ*Hzz0kvUfW^`uiURe!I>HlnS+Wm@EB`G1t=jo^Wi2C~ay`&y zc%b^b-=10^!9xaFBNu=e;F+_l6L4$*k8A{D-3h|NamXpT&|e((pWhhtk&3jgmt5Bb zJB|+XSrN*H)CTj*Tn5brmmsUAa@VGNSXvLkD6Ar6nEWg6?@~@8k5hz6N=-=JqccKO zMeFhWrWUlN1k`;wwkj zWwh&+kaz z;>{BtzXS@&cI9IEGN97EEb7%)o0l@4mKJbg-)XN$ zvmrKxDk{2`f5m%zEuU8!i`~{=)S2p&;PI#v>zUwJs`zlWg=g_%(&C|{(JOe^Pi3w1 zn)Y?H7*S1vfVXPTi4qg-!kP6&nu4HTA%4^A{_eU5Ab%F_aEcCK5LBOW1HFJxFEnav^D?bzP|l#S;p= z$Q6mOG$C@mco^W)NtsCPcGGvGtUsJ%5-0!)L4m)Ckdrh;qr=W@LY)a^t9LS=uqm0YZe;^5I zv58Jfc8;4SB~LQ&$eDs$g|P#ELL>Js_^y>P+-yW^^~H$uwvfvkGvl$!6msuxKDFbA zlJ8j-lI`*@+PG}3Y;u-yb)acT6~OWSNLADfM+cd2}YD7i0u9k*JN zuiIiJe8i!8-1|{wTv;$1CLqjy^#>H_8tZRqT2qbi*juf);iHG_Vu9Bs4GbgaOwoZt z#qQ$KgzTay*xOO#oR3eWS7!=+vWL?9TP#`$DS0_lwC;M< zu>}^GgxaSu2_N>srF8fB64~{Vnv0;gG&krIzd?!BF(1qKnfSp)%jff7*PQ`Cu$_Ag zHo``R1rkY<@R7hAcb(NR89l^dIwNuA+HRqiXGq~*(a*Wz%lKNK5p9D}#^C~x5G~9R zv+JAz3RS~eyz-|%wag#B=zQaK0$>CNSt2BwtVL_|L@4e=^dqTcm7s1sYS|AQVOmXG z6@;hjP6%!v>W3G}k0lA}Nj9Ko7XA}t(8X`1A1ps7Hw;6{Wzv+h#J)xIbso&s zGNlXHX}BHK4n4$pAg&@L_t|7R2s@Q<0-L8lRG+K=IeYN8`Nt^0S>sD3N>|8JzXPxf zr-6u5MsU>Ur6ZPPJ%A2~1$Prjsh;yNt)x<)YV;j)@*3g0`i$}PV2#C~pLYA0MPm(; z+ya_KBi+ID(vcq)K;1W!tRIO$wqP)U&ehZT)%G}E*(5s$E6Saz?BTFHNv3+4fc{5A z8|Vk@W%i%xn$_Vl5OBCz2zp!wZVkV?w0gFVE(s&W2$6V*Z!O6Ix1>j^7V2225!4jV z;loh+WTR-Mhh5sRuhGNRY5$e6FS6#LiZF?sdYLL*CHT;>x@wvdjSmRG*h&J}CmwQ& zPIO3#BL})2v#bmzCNfwrlVJB?a>{B!Q{foe6SwX>N&#D#F2Qu2*F6SuvJo?mDL@4F z0Q`x4a+F%mJ}#r?t1;B}`lo-4g-@Um`RqR|lSK6;4Iat5>XL>ML)*Ps{+R)zfq04v;e?}1FTc4iv zOC<5JIZz`@zB5^=dAiP~X!8y3T~$x@#4+_J@4F)l<#z1shl2>| zcr@@-;=}`vcs&t?ZF1l{ZemiuJH4#9#$%L$&x0_|xo`33=UEIcy~>(BmwqiX07LEp zkSu4a5qqlx6GsVWK?f@bPh zxQsAjWB77A3)ho-H*^WO@X;kbhgE_=!F){a)y^bC#b+-7`Jj^KTzM~0<7}`c?&M2o%Np%J$HRFmTn<1X^cpE??bY{ufF`zg7^^ZQ#AqU2mJ%r3= zWJ`EsAP{&+r2@74lYzwX=c|A3o9}UU-y>d`5KOT5Fa__d9${YW&5YuTz(lq{Ca`Cr z@nu{s5|iuhRB;a|x7rLMsTOftoS*Eie)^;vvUv+X!X(75@!Hp`;KstEp`sO#71jXy zMw{+~_Jckc&tWLg*M#PG_3H11zW|bEvkQ!DhKmTl(V}yJHg38deZoqVj73cA0`*2q zpF}`agrA;@%N6Sd1`Yqp!@U9#h`QY*70cA?Ax%Go!$}LQd>?6JG2S1Fh3_aFCNJFK zBKhs4K=whC+j{R>ZqPk{AAyTw$G4i$?)J}vQn&*hI3srL-pBX=B_*-=`f$M};;|D2 zD3}&{BNdlg9c@oUDnSBTZ?l`Yv zjMM0nFXn?eGy$E4o_o98RYd*y%n#<#;n^rG%W{IRznJ%l@pX#!MDoZ_p*AghRI3a; z*LLz=y$%M{{Sliq6FuQfel+`fdCYIJNQLRnz`rxWx5AD16@F#Zlb`in%fY-;N zbDS{r&}-mRgck-z4q(QwD;X#B&{j9Q3xt_gv!z`81Qa2U_Hl=NEwqso=-;5;68ypU zTbZwxS4&7W4G$(%rUFmP^lK%L6)FW8b_mwN=K5%0$3n*y!OP>u zn4F4tfBC5~jj*VwJM8*o29-}S>Vp#A@e-kYltDsGPegB0dHXIVr^a`AJ>}2N=dOU; zHc0E15MRDi6W_bIJzQrtEQ&6VYJqz)RKwc)FI_m|xxlkOM0ob?iL+jo4Be+5f-_?2D z;8UKHFZ$oXWd7&sJvlI}fRqr@l)3pC1b* zWamTKh<4fhVpeafj?gb`K8NYm~=^95{Thk#FUUiZzI{Cjc{d+s1Oot8x-#6$mUcEeuY)0pE`%}VSUjO@@JtPD%U6C}e4f?jfVu9a^iUg?-A5mRN zNAO-Daw(jFCE5RH`9d_(jQj1y?)13oo}p~sBqsIG&sr6 pending_ops_; + + string name; + Place place; + size_t version; +}; + +struct OpHandleBase { + vector inputs_; + vector outputs_; +}; + +struct SSAGraph { + // vars on each devices. + // * the vars in each map in vector is on different device. + // * the map is mapping a variable name to variable handles + // with different versions + vector>> vars_; + + // All ops + vector ops_; +}; +``` +The variable handles are the wrapper of `Variables`. The operator handles are the wrapper of `OperatorBase`. Some `OpHandle` is not an `OperatorBase`, such as `NCCLAllReduceOpHandle`, because `AllReduceOpHandle` will use new device contexts. + +When the `ProgramDesc` converted into an `SSA` Graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem is also need to be taken care. The dummy variables, which represent the dependency between operators, will be manually inserted into SSA graph to resolve the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem. + +## Execute SSA Graph + +The SSA graph can be out-of-order executed by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is + +1. Maintaining a map of an operator and its needed input number. +2. If a variable is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the needed input number of its pending operators. +3. If there is an operator which needed input number is decreased to zero, just run this operator. +4. After run this operator, just mark the variables are generated and repeat step 2 until all variables are generated. + +Running an operator can be asynchronized. There is a thread pool to execute an `SSA` graph. + +## Synchronize GPU Kernels + +The GPU is a non-blocking device. The different streams need be synchronized when switing streams. In current implementation, the synchronization based on the following algorithm: + +1. `OpHandle` will record `DeviceContext` that it is used. +2. In `OpHandle::Run`, if the `DeviceContext` of current operator is different from `DeviceContext` of any input variable, just wait the generate operator of this input variable. + +The `wait` are implemented by two strategies: + +1. Invoke `DeviceContext->Wait()`, It will wait all operators on this device contexts complete. +2. Uses `cudaStreamWaitEvent` to sending a event to the stream. It is a non-blocking call. The wait operators will be executed in GPU. + +Generally, the `cudaStreamWaitEvent` will have a better perforamnce. However, `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed in runtime. + +## What's next? + +* Merging gradient of dense parameters has been done. However, the merging of sparse parameters has not been done. +* The CPU version of Parallel Executor has not been implemented. The out-of-order logic will make CPU compuatation faster, too. +* A better strategy to merge gradients can be introduced. We can shrink the gradients from `float32` to `int8` or `int4` while merging. It will significantly speed up multi-GPUs training without much loss of precision. +* Combine multi-Nodes implementation. By the benifit of out-of-order, sending and recving operator can be an blocking operator, and the transpiler does not need to concern about the best position of operator. From 084cdd1f4f78eac9fcae4759575e172d87e81598 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 28 Mar 2018 15:23:39 +0800 Subject: [PATCH 250/314] Rename code --- paddle/fluid/framework/details/computation_op_handle.cc | 4 ++-- paddle/fluid/framework/details/fetch_op_handle.cc | 4 ++-- .../framework/details/multi_devices_graph_builder.cc | 2 +- .../fluid/framework/details/nccl_all_reduce_op_handle.cc | 4 ++-- paddle/fluid/framework/details/op_handle_base.cc | 8 ++++---- paddle/fluid/framework/details/op_handle_base.h | 2 +- .../fluid/framework/details/scale_loss_grad_op_handle.cc | 4 ++-- .../framework/details/threaded_ssa_graph_executor.cc | 2 +- 8 files changed, 15 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 53ab8eb775..7a1b40c0b6 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -24,10 +24,10 @@ ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope, place_(place) {} void ComputationOpHandle::RunImpl() { - auto *cur_ctx = dev_ctx_[place_]; + auto *cur_ctx = dev_ctxes_[place_]; for (auto *in : inputs_) { bool need_wait = - in->generated_op_ && in->generated_op_->dev_ctx_[place_] != cur_ctx; + in->generated_op_ && in->generated_op_->dev_ctxes_[place_] != cur_ctx; if (need_wait) { in->generated_op_->Wait(cur_ctx); } diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc index 4fc05b3248..9180903b86 100644 --- a/paddle/fluid/framework/details/fetch_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_op_handle.cc @@ -60,8 +60,8 @@ void FetchOpHandle::RunImpl() { auto &t = scope->FindVar(var_name)->Get(); if (platform::is_gpu_place(var->place_)) { #ifdef PADDLE_WITH_CUDA - TensorCopy(t, cpu, *dev_ctx_[t.place()], &tensors_[i]); - dev_ctx_[t.place()]->Wait(); + TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]); + dev_ctxes_[t.place()]->Wait(); #endif } else { tensors_[i].ShareDataWith(t); diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc index 6798776076..a1b913a863 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc @@ -74,7 +74,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::Build( result.ops_.emplace_back(new ComputationOpHandle(*op, s, p)); auto *op_handle = result.ops_.back().get(); - op_handle->dev_ctx_[p] = const_cast( + op_handle->dev_ctxes_[p] = const_cast( platform::DeviceContextPool::Instance().Get(p)); auto var_names = op->InputArgumentNames(); diff --git a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc index f77a4b55a1..5ddf331cfc 100644 --- a/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/nccl_all_reduce_op_handle.cc @@ -23,7 +23,7 @@ NCCLAllReduceOpHandle::NCCLAllReduceOpHandle( const platform::NCCLContextMap &ctxs) : local_scopes_(local_scopes), places_(places), nccl_ctxs_(ctxs) { for (auto &p : places_) { - this->dev_ctx_[p] = nccl_ctxs_.DevCtx(p); + this->dev_ctxes_[p] = nccl_ctxs_.DevCtx(p); } } @@ -34,7 +34,7 @@ void NCCLAllReduceOpHandle::RunImpl() { // Wait input done for (auto *in : inputs_) { auto &p = static_cast(in)->place_; - in->generated_op_->Wait(dev_ctx_[p]); + in->generated_op_->Wait(dev_ctxes_[p]); } auto &var_name = static_cast(this->inputs_[0])->name_; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 63affb7054..e4194a7442 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -42,7 +42,7 @@ OpHandleBase::~OpHandleBase() { void OpHandleBase::Run(bool use_event) { #ifdef PADDLE_WITH_CUDA if (events_.empty() && use_event) { - for (auto &p : dev_ctx_) { + for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; PADDLE_ENFORCE(cudaSetDevice(dev_id)); PADDLE_ENFORCE( @@ -57,7 +57,7 @@ void OpHandleBase::Run(bool use_event) { #ifdef PADDLE_WITH_CUDA if (use_event) { - for (auto &p : dev_ctx_) { + for (auto &p : dev_ctxes_) { int dev_id = boost::get(p.first).device; auto stream = static_cast(p.second)->stream(); @@ -70,7 +70,7 @@ void OpHandleBase::Run(bool use_event) { void OpHandleBase::Wait(platform::DeviceContext *waited_dev) { #ifdef PADDLE_WITH_CUDA if (platform::is_cpu_place(waited_dev->GetPlace()) || events_.empty()) { - for (auto &dev_ctx : dev_ctx_) { + for (auto &dev_ctx : dev_ctxes_) { dev_ctx.second->Wait(); } } else { @@ -81,7 +81,7 @@ void OpHandleBase::Wait(platform::DeviceContext *waited_dev) { } } #else - for (auto &dev_ctx : dev_ctx_) { + for (auto &dev_ctx : dev_ctxes_) { dev_ctx.second->Wait(); } #endif diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 78f566c035..71672fd24c 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -31,7 +31,7 @@ class OpHandleBase { std::vector outputs_; std::unordered_map - dev_ctx_; + dev_ctxes_; #ifdef PADDLE_WITH_CUDA std::unordered_map events_; diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index a6a67c9b14..0a6f6129b8 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -21,7 +21,7 @@ ScaleLossGradOpHandle::ScaleLossGradOpHandle(size_t num_dev, Scope *scope, platform::Place place, platform::DeviceContext *dev_ctx) : coeff_(static_cast(1.0 / num_dev)), scope_(scope), place_(place) { - dev_ctx_[place_] = dev_ctx; + dev_ctxes_[place_] = dev_ctx; } ScaleLossGradOpHandle::~ScaleLossGradOpHandle() {} @@ -38,7 +38,7 @@ void ScaleLossGradOpHandle::RunImpl() { } else { #ifdef PADDLE_WITH_CUDA auto stream = - static_cast(this->dev_ctx_[place_]) + static_cast(this->dev_ctxes_[place_]) ->stream(); memory::Copy(boost::get(place_), tmp, platform::CPUPlace(), &coeff_, sizeof(float), stream); diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index fc84031556..105e21cab6 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -96,7 +96,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // FIXME: Use new device context for (auto &p : places_) { - op->dev_ctx_[p] = fetch_ctxs_.Get(p); + op->dev_ctxes_[p] = fetch_ctxs_.Get(p); } for (auto *var : vars) { From f2d29be784b0d529281fc40bd54ee66cf1eee50f Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 28 Mar 2018 15:31:38 +0800 Subject: [PATCH 251/314] Disable transformer --- python/paddle/fluid/tests/unittests/test_parallel_executor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index cb16ce26c6..bbfd03c638 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -424,5 +424,6 @@ class TestTransformer(TestParallelExecutorBase): writer.append_tensor(t) writer.complete_append_tensor() + @unittest.skip("transformer is buggy in multi gpu") def test_main(self): self.check_network_convergence(transformer) From 055fb215a1f6f4f260b27e947bb81672bbd5c34f Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 28 Mar 2018 15:32:40 +0800 Subject: [PATCH 252/314] remove unnecessary 'force_cpu' --- python/paddle/fluid/layers/control_flow.py | 6 ++---- python/paddle/fluid/layers/nn.py | 3 +-- python/paddle/fluid/tests/book/test_machine_translation.py | 2 +- python/paddle/fluid/tests/unittests/test_profiler.py | 3 +-- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 1bb1aa30ee..af55ef49be 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1357,8 +1357,7 @@ class DynamicRNN(object): self.lod_rank_table = None self.max_seq_len = None self.step_idx = None - self.zero_idx = fill_constant( - shape=[1], value=0, dtype='int64', force_cpu=True) + self.zero_idx = fill_constant(shape=[1], value=0, dtype='int64') self.mem_dict = dict() self.output_array = [] self.outputs = [] @@ -1434,8 +1433,7 @@ class DynamicRNN(object): def block(self): if self.status != DynamicRNN.BEFORE_RNN: raise ValueError("rnn.block() can only be invoke once") - self.step_idx = fill_constant( - shape=[1], dtype='int64', value=0, force_cpu=True) + self.step_idx = fill_constant(shape=[1], dtype='int64', value=0) self.step_idx.stop_gradient = False self.status = DynamicRNN.IN_RNN with self.while_op.block(): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2db4e5d27d..e7b0ddf1e3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3306,8 +3306,7 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): name=counter_name, dtype='int64', shape=[1], persistable=True) if is_new_var: helper.set_variable_initializer( - counter, initializer=Constant( - value=begin - 1, force_cpu=True)) + counter, initializer=Constant(value=begin - 1)) helper.main_program.global_block().prepend_op( type='increment', inputs={'X': [counter]}, diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 3a1a0859ec..de72a7c3ff 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -83,7 +83,7 @@ def decoder_train(context, is_sparse): def decoder_decode(context, is_sparse): init_state = context array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) - counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) + counter = pd.zeros(shape=[1], dtype='int64') # fill the first element with init_state state_array = pd.create_array('float32') diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index cf6fe14a86..49ec9c9020 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -33,8 +33,7 @@ class TestProfiler(unittest.TestCase): image = fluid.layers.data(name='x', shape=[784], dtype='float32') hidden1 = fluid.layers.fc(input=image, size=64, act='relu') i = layers.zeros(shape=[1], dtype='int64') - counter = fluid.layers.zeros( - shape=[1], dtype='int64', force_cpu=True) + counter = fluid.layers.zeros(shape=[1], dtype='int64') until = layers.fill_constant([1], dtype='int64', value=10) data_arr = layers.array_write(hidden1, i) cond = fluid.layers.less_than(x=counter, y=until) From 9a9d67dac28c362b6b2e86ffeec7c68fa1704d01 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Wed, 28 Mar 2018 16:47:06 +0800 Subject: [PATCH 253/314] fix dist train selected rows height missing --- paddle/fluid/operators/detail/send_recv.proto | 8 ++++---- paddle/fluid/operators/detail/sendrecvop_utils.cc | 1 + paddle/fluid/operators/detail/test_serde.cc | 2 ++ paddle/fluid/operators/detail/variable_response.cc | 11 +++++++++++ 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 598aaa4c51..2d33f026e4 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -59,12 +59,12 @@ message VariableMessage { // lod details: int64 lod_level = 5; repeated LodData lod = 6; + // selected_rows height, aka. original dim0 + int64 slr_height = 7; // tensor data - bytes serialized = 7; + bytes serialized = 8; // selected_rows data - bytes rows = 8; + bytes rows = 9; } message VoidMessage {} - -message TestMessage { int64 test_1 = 1; } diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index d7bbf79c50..f318f8ac28 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -108,6 +108,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, e.WriteUint64(VarMsg::kDimsFieldNumber, dim); } e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0); + e.WriteUint64(VarMsg::kSlrHeightFieldNumber, slr->height()); auto* tensor = slr->mutable_value(); if (platform::is_gpu_place(ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc index e646c894d1..e9e2dc84ad 100644 --- a/paddle/fluid/operators/detail/test_serde.cc +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -40,6 +40,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { // serialize var to ByteBuffer framework::Variable var; auto* slr = var.GetMutable(); + slr->set_height(1000); auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); tensor->Resize(framework::make_ddim({2, 10})); @@ -106,6 +107,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { } EXPECT_EQ(rows_data2[0], 3); EXPECT_EQ(rows_data2[1], 10); + EXPECT_EQ(slr2->height(), 1000); } void RunTestLodTensor(platform::Place place, int from_type = 0) { diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index bdda570343..862fd26b54 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -68,6 +68,8 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, if (total_written + size_to_write > length) { size_to_write = length - total_written; } + VLOG(3) << "copy raw " << size_to_write + << " bytes, written: " << total_written << ", length: " << length; memory::Copy(boost::get(place), reinterpret_cast(p), cpu, data, size_to_write, gpu_dev_ctx.stream()); @@ -147,6 +149,7 @@ bool VariableResponse::CopySelectRowsTensorData( const platform::DeviceContext& ctx, framework::DDim& dims, int length) { auto var = scope_->FindVar(meta_.varname()); auto* slr = var->GetMutable(); + slr->set_height(meta_.slr_height()); auto* tensor = slr->mutable_value(); tensor->Resize(dims); void* tensor_data = tensor->mutable_data( @@ -348,6 +351,14 @@ int VariableResponse::Parse(Source* source) { } break; } + case sendrecv::VariableMessage::kSlrHeightFieldNumber: { + uint64_t v = 0; + if ((wt != WIRETYPE_VARINT) || !input.ReadVarint64(&v)) { + return tag; + } + meta_.set_slr_height(static_cast(v)); + break; + } case sendrecv::VariableMessage::kSerializedFieldNumber: { PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || meta_.type() == sendrecv::LOD_TENSOR) && From f707a83c80311f792aac594f3f401743d90cd687 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Wed, 28 Mar 2018 17:09:42 +0800 Subject: [PATCH 254/314] Add link --- doc/design/parallel_executor.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/design/parallel_executor.md b/doc/design/parallel_executor.md index 076c55d281..9aed3b059a 100644 --- a/doc/design/parallel_executor.md +++ b/doc/design/parallel_executor.md @@ -8,7 +8,7 @@ The executor is a very naive interpreter. It runs operators one by one. We can u We want a `ProgramDesc` can be run on different nodes. It is better not to contain device information in `ProgramDesc`. However, we can write a high-performance interpreter, which can hold an alternative intermediate representation of `ProgramDesc`, to take full usage of Multi-GPUs. -ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](Out-of-order execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs. +ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) `Program` in data parallelism mode and maximise the utility of Multi-GPUs. ## Overview of MultiGPUs logic From 802dcd676e8dcf78836d4f8f8fb5c2e333f592d7 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Wed, 28 Mar 2018 18:48:49 +0800 Subject: [PATCH 255/314] remove CPU restrict in While_op --- paddle/fluid/operators/while_op.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 8b62b242cf..8c1a2549e0 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -54,8 +54,6 @@ class WhileOp : public framework::OperatorBase { auto step_scopes = scope.FindVar(Output(kStepScopes))->GetMutable(); - PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), - "Condition of while op must in CPU memory."); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); From 7da1ea07a2cb8927522acd46d6492632f79701e9 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 28 Mar 2018 19:25:45 +0800 Subject: [PATCH 256/314] Use PopAll --- .../details/threaded_ssa_graph_executor.cc | 26 +++++++++++++------ .../details/threaded_ssa_graph_executor.h | 17 ++++++++++-- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 105e21cab6..a6998f45df 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -124,16 +124,26 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( run_all_ready_ops(); // 2. Find ready variable - VarHandleBase *ready_var = ready_vars.Pop(); - + bool timeout; + auto cur_ready_vars = ready_vars.PopAll(100, &timeout); + + if (timeout) { + if (exception_) { + throw * exception_; + } else { + continue; + } + } // 3. Remove the dependency of ready_var. // Find the ready_ops after the ready_var. - pending_vars.erase(ready_var); - for (auto *op : ready_var->pending_ops_) { - auto &deps = pending_ops[op]; - --deps; - if (deps == 0) { - ready_ops.insert(op); + for (auto ready_var : cur_ready_vars) { + pending_vars.erase(ready_var); + for (auto *op : ready_var->pending_ops_) { + auto &deps = pending_ops[op]; + --deps; + if (deps == 0) { + ready_ops.insert(op); + } } } // Keep loop until all vars are ready. diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index 8392170311..da559d8553 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "ThreadPool.h" // ThreadPool in thrird party #include "paddle/fluid/framework/details/ssa_graph_executor.h" @@ -27,10 +28,10 @@ namespace details { template class BlockingQueue { public: - void Push(const T &v) { + void Push(const T &item) { { std::lock_guard g(mutex_); - q_.emplace_back(v); + q_.emplace_back(item); } cv_.notify_one(); } @@ -56,6 +57,18 @@ class BlockingQueue { return v; } + std::deque PopAll(size_t ms, bool *timeout) { + auto time = + std::chrono::system_clock::now() + std::chrono::milliseconds(ms); + std::unique_lock lock(mutex_); + *timeout = !cv_.wait_until(lock, time, [this] { return !q_.empty(); }); + std::deque ret; + if (!*timeout) { + std::swap(ret, q_); + } + return ret; + } + private: std::mutex mutex_; std::condition_variable cv_; From 38b53b37b491f1dccf9133e710198e3d0af34535 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 28 Mar 2018 19:37:10 +0800 Subject: [PATCH 257/314] Remove Pop method --- .../framework/details/threaded_ssa_graph_executor.cc | 2 +- .../framework/details/threaded_ssa_graph_executor.h | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index a6998f45df..2603aed62a 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -125,7 +125,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run( // 2. Find ready variable bool timeout; - auto cur_ready_vars = ready_vars.PopAll(100, &timeout); + auto cur_ready_vars = ready_vars.PopAll(1000, &timeout); if (timeout) { if (exception_) { diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h index da559d8553..2ea57ac8f9 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h @@ -47,16 +47,6 @@ class BlockingQueue { cv_.notify_all(); } - T Pop() { - std::unique_lock lock(mutex_); - while (q_.empty()) { - cv_.wait(lock); - } - T v = q_.front(); - q_.pop_front(); - return v; - } - std::deque PopAll(size_t ms, bool *timeout) { auto time = std::chrono::system_clock::now() + std::chrono::milliseconds(ms); From 2e577379ca8fc67dbb4fc436297cf7ae826b3fa7 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 28 Mar 2018 16:52:20 +0800 Subject: [PATCH 258/314] add cos --- paddle/fluid/operators/activation_op.cc | 18 +++++++ paddle/fluid/operators/activation_op.h | 49 +++++++++++++++++++ paddle/function/EigenGemm.cpp | 1 + python/paddle/fluid/layers/ops.py | 1 + .../tests/unittests/test_activation_op.py | 15 ++++++ 5 files changed, 84 insertions(+) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 979115eee0..7f4b23c526 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -260,6 +260,21 @@ $out = floor(x)$ } }; +class CosOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CosOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Floor operator"); + AddOutput("Out", "Output of Floor operator"); + AddComment(R"DOC( +Floor Activation Operator. + +$out = cos(x)$ + +)DOC"); + } +}; + class RoundOpMaker : public framework::OpProtoAndCheckerMaker { public: RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker) @@ -561,6 +576,9 @@ REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad, ops::ActivationOpGrad); +REGISTER_OP(cos, ops::ActivationOp, ops::CosOpMaker, cos_grad, + ops::ActivationOpGrad); + REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad, ops::ActivationOpGrad); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 4c575b4a7b..3bd3f0bb94 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -331,6 +331,54 @@ struct FloorFunctor : public BaseActivationFunctor { } }; +template +struct Sine { + HOSTDEVICE T operator()(const T& val) const { return sin(val); } +}; + +template +struct Cosine { + HOSTDEVICE T operator()(const T& val) const { return cos(val); } +}; + +// cosine'(x) = -sin(x) +template +struct CosGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = -dout * x.unaryExpr(Sine()); + } +}; + +// cosine(x) = cos(x) +template +struct CosFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Cosine()); + } +}; + +// sine'(x) = cos(x) +template +struct SinGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * x.unaryExpr(Cosine()); + } +}; + +// sine(x) = sin(x) +template +struct SinFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.unaryExpr(Sine()); + } +}; + // round(x) = [x] template struct RoundFunctor : public BaseActivationFunctor { @@ -782,6 +830,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { __macro(abs, AbsFunctor, AbsGradFunctor); \ __macro(ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, FloorFunctor, ZeroGradFunctor); \ + __macro(cos, CosFunctor, CosGradFunctor); \ __macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log, LogFunctor, LogGradFunctor); \ diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp index bac4659e62..4c81ebdd31 100644 --- a/paddle/function/EigenGemm.cpp +++ b/paddle/function/EigenGemm.cpp @@ -63,6 +63,7 @@ struct EigenBlasGemm { const EigenMatrix a(const_cast(A), sizeA); const EigenMatrix b(const_cast(B), sizeB); EigenMatrix c(C, sizeC); + Eigen::Tensor ss; typedef typename Eigen::Tensor::DimensionPair DimPair; Eigen::array dims; diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index f5c6b47d24..ee8de219ee 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -25,6 +25,7 @@ __activations__ = [ 'abs', 'ceil', 'floor', + 'cos', 'round', 'reciprocal', 'log', diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 4a2b35322d..b78fb8a319 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -14,6 +14,7 @@ import unittest import numpy as np +import math import paddle.fluid.core as core from op_test import OpTest from scipy.special import expit @@ -196,6 +197,20 @@ class TestFloor(OpTest): self.check_grad(['X'], 'Out', max_relative_error=0.007) +class TestCos(OpTest): + def setUp(self): + self.op_type = "cos" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + self.inputs = {'X': x} + self.outputs = {'Out': math.cos(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + class TestRound(OpTest): def setUp(self): self.op_type = "round" From e868950e5f938fe737b26f5040ffc7c09d29f6e6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Thu, 29 Mar 2018 11:33:21 +0800 Subject: [PATCH 259/314] Add comments --- paddle/fluid/framework/details/ssa_graph.h | 1 + paddle/fluid/framework/details/threaded_ssa_graph_executor.cc | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/details/ssa_graph.h b/paddle/fluid/framework/details/ssa_graph.h index c1e041b8c0..ac3e2d8699 100644 --- a/paddle/fluid/framework/details/ssa_graph.h +++ b/paddle/fluid/framework/details/ssa_graph.h @@ -25,6 +25,7 @@ namespace details { struct SSAGraph { std::vector>> vars_; + // aux variables to represent dependency. Useful to resolve data hazard. std::unordered_set> dep_vars_; std::vector> ops_; }; diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 2603aed62a..3f8655147b 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fetch_op_handle.h" -#include "paddle/fluid/framework/scope.h" namespace paddle { namespace framework { From ce16400daedfa8f793d20d44081db7f417af693a Mon Sep 17 00:00:00 2001 From: "Yang Yang(Tony)" Date: Wed, 28 Mar 2018 21:15:12 -0700 Subject: [PATCH 260/314] make append activation in place by default (#9417) --- python/paddle/fluid/layer_helper.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index d771837fc5..4341e06596 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -398,7 +398,6 @@ class LayerHelper(object): return input_var if isinstance(act, basestring): act = {'type': act} - tmp = self.create_tmp_variable(dtype=input_var.dtype) if 'use_mkldnn' in self.kwargs: act['use_mkldnn'] = self.kwargs.get('use_mkldnn') @@ -408,9 +407,9 @@ class LayerHelper(object): self.append_op( type=act_type, inputs={"X": [input_var]}, - outputs={"Out": [tmp]}, + outputs={"Out": [input_var]}, attrs=act) - return tmp + return input_var def _get_default_initializer(self, dtype): if dtype is None or dtype_is_floating(dtype) is True: From 01c5ca73649f5b5b65d28a9d81301e87d30ef724 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Thu, 29 Mar 2018 05:15:57 +0000 Subject: [PATCH 261/314] fix bugs --- paddle/fluid/operators/compare_op.cc | 9 ++++++++- paddle/fluid/operators/while_op.cc | 2 ++ python/paddle/fluid/layers/control_flow.py | 20 +++++++++++++++----- 3 files changed, 25 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/compare_op.cc b/paddle/fluid/operators/compare_op.cc index 86f7046058..9a139ab27e 100644 --- a/paddle/fluid/operators/compare_op.cc +++ b/paddle/fluid/operators/compare_op.cc @@ -29,6 +29,11 @@ class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { AddInput("Y", string::Sprintf( "(LoDTensor) the right hand operand of %s operator", comment.type)); + AddAttr("force_cpu", + "(bool, default false) Force fill output variable to cpu " + "memory. Otherwise, fill output variable to the running " + "device") + .SetDefault(false); AddOutput("Out", string::Sprintf( "(LoDTensor) n-dim bool tensor. Each element is %s", comment.equation)); @@ -75,7 +80,9 @@ class CompareOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); // CompareOp kernel's device type is decided by input tensor place - kt.place_ = ctx.Input("X")->place(); + bool force_cpu = ctx.Attr("force_cpu"); + kt.place_ = force_cpu ? platform::CPUPlace() + : ctx.Input("X")->place(); return kt; } }; diff --git a/paddle/fluid/operators/while_op.cc b/paddle/fluid/operators/while_op.cc index 8c1a2549e0..8b62b242cf 100644 --- a/paddle/fluid/operators/while_op.cc +++ b/paddle/fluid/operators/while_op.cc @@ -54,6 +54,8 @@ class WhileOp : public framework::OperatorBase { auto step_scopes = scope.FindVar(Output(kStepScopes))->GetMutable(); + PADDLE_ENFORCE(platform::is_cpu_place(cond.place()), + "Condition of while op must in CPU memory."); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index af55ef49be..fbfc383d11 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -18,6 +18,7 @@ from tensor import assign, fill_constant from .. import core from ..framework import Program, Variable, Operator from ..layer_helper import LayerHelper, unique_name +from ..initializer import force_init_on_cpu from ops import logical_and, logical_not, logical_or __all__ = [ @@ -949,7 +950,7 @@ def create_array(dtype): dtype=dtype) -def less_than(x, y, cond=None, **ignored): +def less_than(x, y, force_cpu=True, cond=None, **ignored): """ **Less than** @@ -958,6 +959,7 @@ def less_than(x, y, cond=None, **ignored): Args: x(Variable): First operand of *less_than* y(Variable): Second operand of *less_than* + force_cpu(Bool|True): The output data will be on CPU if set true. cond(Variable|None): Optional output variable to store the result of *less_than* Returns: @@ -974,8 +976,11 @@ def less_than(x, y, cond=None, **ignored): cond.stop_gradient = True helper.append_op( - type='less_than', inputs={'X': [x], - 'Y': [y]}, outputs={'Out': [cond]}) + type='less_than', + inputs={'X': [x], + 'Y': [y]}, + outputs={'Out': [cond]}, + attrs={'force_cpu': force_cpu or force_init_on_cpu()}) return cond @@ -1395,7 +1400,8 @@ class DynamicRNN(object): type='less_than', inputs={'X': self.step_idx, 'Y': self.max_seq_len}, - outputs={'Out': self.cond}) + outputs={'Out': self.cond}, + attrs={'force_cpu': True}) input_array = parent_block.create_var( name=unique_name.generate('dynamic_rnn_input_array'), @@ -1443,7 +1449,11 @@ class DynamicRNN(object): for new_mem, mem_array in self.mem_link: array_write(x=new_mem, i=self.step_idx, array=mem_array) - less_than(x=self.step_idx, y=self.max_seq_len, cond=self.cond) + less_than( + x=self.step_idx, + y=self.max_seq_len, + force_cpu=True, + cond=self.cond) self.status = DynamicRNN.AFTER_RNN for each_array in self.output_array: From bdda08d9f2846cd4a5cb407e993be0bc03a674a5 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 28 Mar 2018 23:13:38 +0800 Subject: [PATCH 262/314] add sin --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/operators/activation_op.cc | 24 ++++++++++++++++--- paddle/fluid/operators/activation_op.h | 1 + paddle/function/EigenGemm.cpp | 1 - python/paddle/fluid/layers/ops.py | 1 + .../tests/unittests/test_activation_op.py | 17 +++++++++++-- 6 files changed, 39 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a4ea74a6d2..8c8def6bf4 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -100,7 +100,7 @@ cc_test(init_test SRCS init_test.cc DEPS init) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) -cc_test(channel_test SRCS channel_test.cc) +# cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 7f4b23c526..a6d9ce0f04 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -264,10 +264,10 @@ class CosOpMaker : public framework::OpProtoAndCheckerMaker { public: CosOpMaker(OpProto *proto, OpAttrChecker *op_checker) : framework::OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", "Input of Floor operator"); - AddOutput("Out", "Output of Floor operator"); + AddInput("X", "Input of Cosine operator"); + AddOutput("Out", "Output of Cosine operator"); AddComment(R"DOC( -Floor Activation Operator. +Cosine Activation Operator. $out = cos(x)$ @@ -275,6 +275,21 @@ $out = cos(x)$ } }; +class SinOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SinOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Sine operator"); + AddOutput("Out", "Output of Sine operator"); + AddComment(R"DOC( +Sine Activation Operator. + +$out = sin(x)$ + +)DOC"); + } +}; + class RoundOpMaker : public framework::OpProtoAndCheckerMaker { public: RoundOpMaker(OpProto *proto, OpAttrChecker *op_checker) @@ -579,6 +594,9 @@ REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad, REGISTER_OP(cos, ops::ActivationOp, ops::CosOpMaker, cos_grad, ops::ActivationOpGrad); +REGISTER_OP(sin, ops::ActivationOp, ops::SinOpMaker, sin_grad, + ops::ActivationOpGrad); + REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad, ops::ActivationOpGrad); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 3bd3f0bb94..7fbe4efc04 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -831,6 +831,7 @@ struct SwishGradFunctor : public BaseActivationFunctor { __macro(ceil, CeilFunctor, ZeroGradFunctor); \ __macro(floor, FloorFunctor, ZeroGradFunctor); \ __macro(cos, CosFunctor, CosGradFunctor); \ + __macro(sin, SinFunctor, SinGradFunctor); \ __macro(round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log, LogFunctor, LogGradFunctor); \ diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp index 4c81ebdd31..bac4659e62 100644 --- a/paddle/function/EigenGemm.cpp +++ b/paddle/function/EigenGemm.cpp @@ -63,7 +63,6 @@ struct EigenBlasGemm { const EigenMatrix a(const_cast(A), sizeA); const EigenMatrix b(const_cast(B), sizeB); EigenMatrix c(C, sizeC); - Eigen::Tensor ss; typedef typename Eigen::Tensor::DimensionPair DimPair; Eigen::array dims; diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index ee8de219ee..0e5987ee59 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -26,6 +26,7 @@ __activations__ = [ 'ceil', 'floor', 'cos', + 'sin', 'round', 'reciprocal', 'log', diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index b78fb8a319..fb162f8b73 100644 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -14,7 +14,6 @@ import unittest import numpy as np -import math import paddle.fluid.core as core from op_test import OpTest from scipy.special import expit @@ -202,7 +201,21 @@ class TestCos(OpTest): self.op_type = "cos" x = np.random.uniform(-1, 1, [4, 4]).astype("float32") self.inputs = {'X': x} - self.outputs = {'Out': math.cos(self.inputs['X'])} + self.outputs = {'Out': np.cos(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', max_relative_error=0.007) + + +class TestSin(OpTest): + def setUp(self): + self.op_type = "sin" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + self.inputs = {'X': x} + self.outputs = {'Out': np.sin(self.inputs['X'])} def test_check_output(self): self.check_output() From 450be963feb74b591bb232cd2b05aac9b01b23b4 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Thu, 29 Mar 2018 14:34:45 +0800 Subject: [PATCH 263/314] fix sparse errors --- paddle/fluid/operators/detail/grpc_client.cc | 1 - .../operators/detail/sendrecvop_utils.cc | 2 +- .../fluid/operators/detail/sendrecvop_utils.h | 7 +++++++ paddle/fluid/operators/detail/test_serde.cc | 19 +++++++++++-------- .../operators/detail/variable_response.cc | 9 ++++++--- paddle/fluid/operators/listen_and_serv_op.cc | 2 ++ paddle/fluid/operators/send_op.cc | 4 ++-- 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index e73bbe7537..03b789f326 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -204,7 +204,6 @@ std::shared_ptr RPCClient::GetChannel(const std::string& ep) { } grpc::ChannelArguments args; - args.SetInt("grpc.testing.fixed_reconnect_backoff_ms", 5000); args.SetCompressionAlgorithm(GRPC_COMPRESS_NONE); args.SetMaxSendMessageSize(std::numeric_limits::max()); args.SetMaxReceiveMessageSize(std::numeric_limits::max()); diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index f318f8ac28..7e3f015dab 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -155,7 +155,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ProtoEncodeHelper e2((char*)buf, 128); // NOTE: rows is of type int64_t size_t rows_memory_size = - slr->rows().capacity() * framework::SizeOfType(typeid(int64_t)); + slr->rows().size() * framework::SizeOfType(typeid(int64_t)); e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); slices[2] = ::grpc::Slice(e2.size()); memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index 3b87562703..b3b2b8469c 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include @@ -35,6 +36,12 @@ namespace detail { #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV" +static int64_t GetTimestamp() { + struct timeval tp; + gettimeofday(&tp, NULL); + return tp.tv_sec * 1000 + tp.tv_usec / 1000; +} + typedef void (*DestroyCallback)(void*); void SerializeToByteBuffer(const std::string& name, framework::Variable* var, diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc index e9e2dc84ad..ea1670e56f 100644 --- a/paddle/fluid/operators/detail/test_serde.cc +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -43,12 +43,11 @@ void RunSerdeTestSelectedRows(platform::Place place) { slr->set_height(1000); auto* tensor = slr->mutable_value(); auto* rows = slr->mutable_rows(); - tensor->Resize(framework::make_ddim({2, 10})); + tensor->Resize(framework::make_ddim({564, 128})); tensor->mutable_data(place); - int tensor_numel = 2 * 10; + int tensor_numel = 564 * 128; math::set_constant(ctx, tensor, 32.7); - rows->push_back(3); - rows->push_back(10); + for (int i = 0; i < 564; ++i) rows->push_back(i); ::grpc::ByteBuffer msg; operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); @@ -65,6 +64,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { sendrecv::VariableMessage varmsg; EXPECT_TRUE(varmsg.ParseFromString(tmp)); + // deserialize bytebuffer EXPECT_EQ(varmsg.varname(), "myvar"); EXPECT_EQ(varmsg.type(), 1); @@ -75,8 +75,10 @@ void RunSerdeTestSelectedRows(platform::Place place) { for (int i = 0; i < tensor_numel; ++i) { EXPECT_FLOAT_EQ(tensor_data[i], 32.7); } - EXPECT_EQ(rows_data[0], 3); - EXPECT_EQ(rows_data[1], 10); + for (int i = 0; i < 564; ++i) { + EXPECT_EQ(rows_data[i], i); + } + // deserialize zero-copy // framework::Variable var2; // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); @@ -105,8 +107,9 @@ void RunSerdeTestSelectedRows(platform::Place place) { for (int i = 0; i < tensor_numel; ++i) { EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); } - EXPECT_EQ(rows_data2[0], 3); - EXPECT_EQ(rows_data2[1], 10); + for (int i = 0; i < rows2->size(); ++i) { + EXPECT_EQ(rows_data2[i], i); + } EXPECT_EQ(slr2->height(), 1000); } diff --git a/paddle/fluid/operators/detail/variable_response.cc b/paddle/fluid/operators/detail/variable_response.cc index 862fd26b54..f59c9b50bb 100644 --- a/paddle/fluid/operators/detail/variable_response.cc +++ b/paddle/fluid/operators/detail/variable_response.cc @@ -68,8 +68,6 @@ bool ReadRaw(::google::protobuf::io::CodedInputStream* input, if (total_written + size_to_write > length) { size_to_write = length - total_written; } - VLOG(3) << "copy raw " << size_to_write - << " bytes, written: " << total_written << ", length: " << length; memory::Copy(boost::get(place), reinterpret_cast(p), cpu, data, size_to_write, gpu_dev_ctx.stream()); @@ -152,6 +150,10 @@ bool VariableResponse::CopySelectRowsTensorData( slr->set_height(meta_.slr_height()); auto* tensor = slr->mutable_value(); tensor->Resize(dims); + PADDLE_ENFORCE_EQ( + tensor->numel(), + length / framework::SizeOfType( + paddle::operators::detail::ToTypeIndex(meta_.data_type()))); void* tensor_data = tensor->mutable_data( ctx.GetPlace(), paddle::operators::detail::ToTypeIndex(meta_.data_type())); @@ -168,7 +170,8 @@ bool VariableResponse::CopySelectRowsData( const platform::DeviceContext& ctx, int length) { auto var = scope_->FindVar(meta_.varname()); auto* slr = var->GetMutable(); - slr->mutable_rows()->resize(length / 8); // int64 + slr->mutable_rows()->resize(length / + framework::SizeOfType(typeid(int64_t))); // int64 int64_t* rows_data = slr->mutable_rows()->data(); // copy rows CPU data, GPU data will be copied lazily. diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 08b83375dd..9796fabdb6 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -141,6 +141,7 @@ class ListenAndServOp : public framework::OperatorBase { // and this will still work. std::vector> fs; + double ts = detail::GetTimestamp(); // block0 contains only listen_and_serv op, start run from block1. for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { fs.push_back( @@ -162,6 +163,7 @@ class ListenAndServOp : public framework::OperatorBase { LOG(ERROR) << "run sub program error " << e.what(); } } + VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts; // Reset the received sparse variables, the sum operator would not // sum the input sparse variables which rows is empty at the next diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index fdf3c06ef0..0752bd1bbd 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -72,7 +72,7 @@ class SendOp : public framework::OperatorBase { for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(scope, ins[i])) { - VLOG(2) << "sending " << ins[i] << " to " << epmap[i]; + VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); } else { VLOG(3) << "don't send no-initialied variable: " << ins[i]; @@ -81,7 +81,7 @@ class SendOp : public framework::OperatorBase { PADDLE_ENFORCE(rpc_client->Wait()); for (auto& ep : endpoints) { - VLOG(2) << "batch barrier, ep: " << ep; + VLOG(3) << "batch barrier, ep: " << ep; rpc_client->AsyncSendBatchBarrier(ep); } PADDLE_ENFORCE(rpc_client->Wait()); From 0ac43217ced6f84ecf3d0dbf90ecbdb41fc8dc15 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Thu, 29 Mar 2018 06:47:55 +0000 Subject: [PATCH 264/314] check whether scalar condition var is on CPU before using --- paddle/fluid/operators/conditional_block_op.cc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc index 337b34e8f0..bbe297206e 100644 --- a/paddle/fluid/operators/conditional_block_op.cc +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -54,7 +54,18 @@ class ConditionalOp : public framework::OperatorBase { "numel should be 1, actual numel is %d", ips[0]->numel()); } - return ips[0]->data()[0]; + bool res; + if (platform::is_gpu_place(ips[0]->place())) { +#ifdef PADDLE_WITH_CUDA + framework::LoDTensor cpu_tensor; + framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait(); + res = cpu_tensor.data()[0]; +#endif + } else { + res = ips[0]->data()[0]; + } + return res; } }; From f5da16e51b05ac88a9402f256cee0a101c58116d Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Wed, 28 Mar 2018 23:57:17 -0700 Subject: [PATCH 265/314] Disabling channel test to debug issue (#9491) --- paddle/fluid/framework/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a4ea74a6d2..8c8def6bf4 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -100,7 +100,7 @@ cc_test(init_test SRCS init_test.cc DEPS init) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) -cc_test(channel_test SRCS channel_test.cc) +# cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op From 241f3c988f87978d05a8b2f516509490b01b5ef5 Mon Sep 17 00:00:00 2001 From: Thuan Nguyen Date: Thu, 29 Mar 2018 00:04:10 -0700 Subject: [PATCH 266/314] Add channel design document (#9463) * Add channel design document * Update channel send/recv state diagram --- doc/fluid/design/concurrent/channel.md | 139 ++++++++++++++++++ .../design/concurrent/images/channel_recv.png | Bin 0 -> 136646 bytes .../design/concurrent/images/channel_send.png | Bin 0 -> 85643 bytes 3 files changed, 139 insertions(+) create mode 100644 doc/fluid/design/concurrent/channel.md create mode 100644 doc/fluid/design/concurrent/images/channel_recv.png create mode 100644 doc/fluid/design/concurrent/images/channel_send.png diff --git a/doc/fluid/design/concurrent/channel.md b/doc/fluid/design/concurrent/channel.md new file mode 100644 index 0000000000..a00a3325e7 --- /dev/null +++ b/doc/fluid/design/concurrent/channel.md @@ -0,0 +1,139 @@ +# Channel Design + +## Introduction + +A Channel is a data structure that allows for synchronous interprocess +communication via message passing. It is a fundemental component of CSP +(communicating sequential processes), and allows for users to pass data +between threads without having to worry about synchronization. + +## How to use it + +Paddle offers python APIs to open and close channels, along with sending +and receiving data to/from a channel. + +### Create a channel + +Creates a new channel that takes in variables of a specific dtype. + +- **fluid.make_channel(dtype, capacity=0)** + - **dtype**: The data type of variables being sent/received through channel + - **capacity**: The capacity of the channel. A capacity of 0 represents + an unbuffered channel. Capacity > 0 represents a buffered channel + +``` +ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, 10) +``` + +### Close a channel + +Closes a channel. Any pending senders and receivers will be awoken during +this time. Receivers can still receive from a closed channel, but senders +are not allowed to send any additional data to the channel (Paddle will +raise an exception if users try to send to a closed channel.) + +- **fluid.channel_close(channel)** + +``` +fluid.channel_close(ch) +``` + +### Send data to a channel + +Sends a variable to a channel. Currently, variables of dtype `LoDTensor`, +`LoDRankTable`, `LoDTensorArray`, `SelectedRows`, `ReaderHolder`, and +`ChannelHolder` are supported. + +By default, the data of the Variable is moved from the sender to the receiver, +however the user can optionally copy the data before performing the send. + +- **channel_send(channel, variable, is_copy=False)** + - **channel**: The channel to send the variable to + - **variable**: The variable to send to the channel + - **is_copy**: If set to True, channel_send will perform a variable assign + to copy the source variable to a new variable to be sent. + +``` +ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) +var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=100) +fluid.channel_send(ch, var, True) +``` + +### Receive data from a channel + +Receives a variable from a channel. The data of the variable is moved to the +receiving variable. + +- **channel_recv(channel, return_variable)** + - **channel**: The channel to receive the variable from + - **return_variable**: The destination variable used to store the data of the + variable received from the channel + +``` +ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR) +var = fill_constant(shape=[1],dtype=core.VarDesc.VarType.INT32, value=-1) +fluid.channel_recv(ch, var) +``` + +## How it Works + +Channels provides a simple interface for different threads to share data. +To support the synchronization requirements, channels utilizes a series of +internal queues, locks, and conditional variables. + +### QueueMessage + +QueueMessage encapsulates the state of the channel send/receive operation to be +put in the **sendq/recvq**. It contains a condition variable used to lock the +thread (when there are no available sends/receives). In addition, it contains +a callback function to notify a thread when the QueueMessage is being +processed by the channel. + +### Queues + +- **buff_**: This queue holds the data buffer in a buffered channel. The +capacity is set to the capacity of the channel. This data buffer is not +used in an unbuffered channel. + +- **sendq**: This queue holds the QueueMessage of any pending senders of a +channel. When a thread performs a channel_send operation on the channel, the +channel_send operation will put a new QueueMessage on the sendq and block the +current thread under two conditions: + 1. The channel is buffered and is full + 2. The channel is unbuffered and does not have a receiver + +- **recvq**: This queue holds the QueueMessage of any pending receivers of a +channel. When a thread performs a channel_recv operation on the channel, the +channel_recv operation will put a new QueueMessage on the recvq and block the +current thread under two conditions: + 1. The channel is buffered and there is no data on the buff_ + 2. The channel is unbuffered and does not have a sender + +### State diagram + +#### Channel Send + +

+
+

+ +#### Channel Receive + +

+
+

+ +## Limitations and Considerations + +### Variable Copy + +In golang, variables in channels are copied from the sender to the receiver. +In Paddle, the data from our variables are **moved** from sender to receiver. +As a result, these variables should not be used after they are sent. We +provide a flag in channel_send method to allow users to copy the variable to +be sent before it is sent. + +Please note that this is acheived by adding an **assign** operator and creating +a temporary variable that is sent in place of the original variable. Please +note that **assign** operator has limited support for only certain variables +datatypes. diff --git a/doc/fluid/design/concurrent/images/channel_recv.png b/doc/fluid/design/concurrent/images/channel_recv.png new file mode 100644 index 0000000000000000000000000000000000000000..c06cd15ae7b8a8c94d5742f6675e389081fcf789 GIT binary patch literal 136646 zcmZsCWn7fq_O>8MOM}uOAl=>F1Jd1{(p^e7(yi1G0)ljRh=4G3Gjw;?d*gG?`9B}t z5Bvo8-1pvl#kJPD)|v=qkTe=H0rHC%FVJLVBvf9!fCU3TRfw;E|8cN^|McPo*$Y_- zQ8iD4{cLzowf^b8Q8~M{P+i^kItx+vv>KmO7vx?BLn{h78sy2XoEQ%s;3HMZfS)Wl ziXQU=pCxi`LzxK6pR@bvTdT5Ntsa}d9S@iMj?&pJ&CH~IYhPI<)00}Kh?G8?A*Hs(B~y$o6Gki`E_|1=$vb9)-|W9>um=Su3G%T<*R`S?f#4 zcqqe(su zw#+@d6P6bR4@1qfrSnBSvG(S$3V}#0e@_)ii!o3Kbv+F-pmO8WHm66)Y+q0=5cpaT zTcp53Wsto}FK$DpQJh!V_GbULk%&8$k*9fTXpR!!pR&5-afiT=h3Y&|U7^M7rtopuZ>8K35yw3H@*XePJ$gJnAJ+k&t3Pm$c5grv>v-Ezp}l=pXm4@;+?*~ieN;oL2Cg$BNG>E~aBRwI@?7AUM9|NJ zhD~6zbxy_lryv`b=hEn~#u8`a$dAtd3FQSg3oJ~2IHW|n<&;mWAtrWQ!mTD=mj~CK z-dz1pEl&x5S}CoX3SdG8xvcAb@Ry-G@mp8PETXuzyxy0spxk;@55^g z`RR@8SQmu!zt;0tY4K99+7uu#DZzHhv@Jf4J3ZCVmfi$Ys(27bM!E|qfbjN{0laXu zC*DUx`qUwWIQwcV`;i^utKk1={P|69DUFn!npS3t@v8VPoVlBQM>deBYpd&&&414k zxw@&@!QBW66JozPq5h-tZJWKIbN?l8vCjhT|LiyrsURv)v}Xs(zQj4Xn|6^Rx7W3S zJozD`d%~}28++_7yng)-nnf)|d@=E>uR^z7XN|eAo~bSNUxz3Hk*W5UT=_F|(eH_|63+LV)c>nT5vIjUe22g8# z?a!L(p|cu^wqLP^ng*&8CaTP63n>y<++uJ`0p`X!@w>`MqQ>zb6{^K%#RY9hU zJrK{6?Pg|m@WyR@C)B*@{l2|kwVSl!SVR1P2%nSX&yn%*y34e_Go~~X(iw|?i!MZ@ zsdpzYam{}WJEqjIiQPvYA|^n*Mbh3%D@U%WKzco#G`Q1{#cce4R}CD*kV)mq4lp@~ z(QNjOp~Ee4^!AC)`2EM%Zo8JIpHnp9F)WNLZLCY8xx?b$*OY{{#sjmXqkHMG1x!*H zuG7&gRk;h1t0D<_@0B$_ZO*NIh}jBAFTr&&|Cjv_P{XM(f-H%yoYndVUOsB(m5=u_ zDRHqb-~UEL^ikY8JW|oMGS!!^*CV#@himBn-=iWR>UcUQWMi9e_3e-!3ibEMjchb$ zED>$iH8yeOsgU9UNJwxJukVr+8OdaYv*9p@Z)3&>7W@?=pNq3X< zoeTpLTaM<)?lsZg>RJl!ffM0>9{>P#02!7x62dDjtw*of01EL1<+?;$YedTs%z5J8 z%XsW<+_Z;sXX$gIrJj`h`>{H>FSua&$76#do777AbXZR9Gz$oM%f)HSq5gK6`pfq8 z^<+gwv!DN9_CH={11lQ|fd}`w)qNm7C@~2%1wH(Ja*0egx z+5Qjh$y^Y9RkA?*=bxLS_YG=_XZq<}GFEj^6^)KA%&XY4xkhI!uWIUNtEXIs+NACco55<4xj{!ZJDgfX=n@B=-TrqU5e=;CxMxe;9v_D< zy?PJTnB${ITdiXQtt@-g=N+}RRePJQ-8`@x5KjL$3_T;}cbek_o(rMHZXHZx;;Ew3 z7qRTaw^aj4cpmz?1(S12^_LY_VbrR0>vI1VLkOG@7WgbSMn|oqR;zvd#`bvko`ER% zx83gL&{-3H5c+1Um4Jtu#ng}+>%Rm+kLddj1SZ+VpRdtlMnMTN*Qgn`e-kM3ojT+7 zO+~~*$&QJ^Z;KagdQ|^22_r4o6R7LbNrLIZ0Sc&0YdsSO`;2hvU4>vi+@rU*3MtVc zx5dZLJ3rZU@D2YZzo75$FqkDQiAZp?D$RWG`E9}9?Q-YYUIhIpHsW(}1mXmBdlTVT z?}{(~!lGA{WIa3U4+iDV{CG>}7paWuuWTdeM^s<~mX@(kxYo7%b9L|C$>a$)`~EEr z7+A5a5q>VH>m;+lleu2oBz{8& zI<1DUbRk)E8>~GQs24rkhE79Ec#}d5H4~&1fkLxy>cqeZ#W~<_ zHKz=IFOu5*KPn$1u*uiihr-$zZm2jN#(28dPBtIf&#$6f>O(oR)VV~>F$&vz z9?Hq{NF%IFLkZWAQKbTVOFI8J_x^)1@n}c`7DOcH6$XQs4(SQL0fz;RHQs{+89W{o zv^dTL!t0j1`?Kj%3U5H!0(@f+0VK*V<8CJ!dh2WKme|O5UmNmC)VbHyc#dG{uvsE0 z8geM!qrngoVvGAc&WKkUwiRvG>LU6UAN@4R9h<099n4q9&Zu?8Bc2b44)sFA@0V62 zk~IEiT;qDEVvHbA>*2UqScf+Mq1ytj#QZL$#%q$g>EkBF0sqg`w)c;c89lVY+!F8Y z_StLVFq(+l4`dQBKb%8qoLnD%xy>~=V-Fi}>CRl;FK}aWaS!AvX5*k|J-gKqva0ch#INMDR>OL>$ zHjC57e}pRwX9mEj?_9(FRS3Jzav6SQ@rUF-4(p|NY3d+X<=%)& z8{Z4rWxpx^Neoj8UJ-CzCBuDeVHwV3%a!w2|CCW~ zb)t3hO6_2;l;h)h2S)^i*0cz*O2D$0NTo!r#^J_$uG;(OkM6lXor$OUu4Svysn(jS z-y}LbrkKPFZt?EfbF#6Z*mu3luZh^hU3D0TFLR!Ct;poSRAW2Dq$A*ibim)N8NU4c zeVE_05XqS52rh+~06aMP7izJ#p0uYYPcox!N{)b+nB_>;H$J!B9^H>uhx(-&^G}FJiQh|gB*V^%mCvPhL&0ob^=c}z@6NTEEbv{EJRq zKJ$Gwr~A{!A?TGjKKhsJ)@+{F=rnL=(Y@@&@(?Dm6>qd1#a*ph9wEzdy;8MStn%5$ z4C>gdwB0mF5BUp>GTo_1P`5vclyiFY_wVIa52qJhiA_ff1{nh0$hfpR!LMQoFh=3H z2G-e-eeq?M)rysdxELQ+wCyhhCrb=7TVDATayXRmJ$Aa>-pyB$2+4@Eu#LDxUQ1qP z6-63d^3c-J$)UsCUM&YewsHeNZEbC4mRGAn0oTXmlcnlWhl|ar1WvzL!Lz0^!;{DY zKG!51(_>ry>_HqD@x#1a_ue607j$MEn>b7I3TxsFUE*m!WKi<>9{;c|ouw;h;W6uH z`CpHy_C(h0Ol17hV32nX`Q)E{xhD-(!>X2zfCSsiLtRIzX8|GO10X1q?Y}L<%32ky z{Xx@1)7inNBCO(u^;Z&_BYsvSvKL&Goq&#hA-R9*4>$DuqI)S#vWp;k5q3 z!ooUjzP#}8@E=BV1bf?Y={3l*LR&Rnk^nO zL^Wn^Zz`@DkS&&gnJ`f=FI=_jyM0YQaWW5*ja0o_I5hDlWGy6N%=~kZSzn^GMcP~Q zNI*V?0~8Z!4q-F84V#)(BGyyPKdlD)detgSc=`Od;k@FGsdF!c)GEiG*xrFE4I4!l zbzJj5EK!!I_iMNwT?(Gs%_j=nJ&1JEpyHqW@N2%rKB;gcoccyg=rNv`9tAbE@9yp@ zvHx6m4xN>+YACa|jZ;I@vjNGO=RiUwTm?8UPI&vs$s!=%#KXn`t5-^<*QAz~mIe?4 zN6_y!sqOLBmbE-!XPoNv`1oso9BI@}K+;-&oGKu{;50`4CXOi8HB|EY+n^Uws%_Xn zDkp;rGduCwkDzI0Q&1T!F^|feA{tEtx3h*2C%GmA<}mLxV{Hxhlq(4Pa+)Pa55bhy zX_-i^fle=5twx7H`NGE^DeoeBML|rU^`HKjoF#{%NT#4wZOJTJJoE>*9p(%kn=e{{ z?|QRLf+jOlD|_WU83F+Stg;fPTLZ`8L+;-n`}?a`G1TtOaYVi<1CK)y}C}-QK*C;FBg=g2XN)%cFyEG{g(r zE`AB~qyneP)E?`0|A*p$r$^V-E_iYokHZC7^-?uV6I`h4(NZgi&!rt^BAwSVxmpw+ zqnu`?L2n}iD26|2bBQlX)usJq!;=D8GMt&}>q`bZ+WPkS8ZqWfHG{-`Yn}3xF7a;3 z86W5jBxx5N={0$+lp<&Z1RcUf2_s{mN=O~Qpe05}Mx?0KQM`=3G%56hep(x-inN1` zK*6)yppR!ZxTh9+jcz632}dr`xzzb6JiXSSk&jD%%U&i-d*SgEgF8l`Xg78;oZ2e4 zqbHQ(hx85){KNInSJ4`uD+gBn<$P&$auBBF?Wpk6pBzCSsJH@&ptnSCB(AAbhF{op z7K5pi4Jd!u&m8tG8JPfW;?>nyP3eLXdFq2^lKEGM#OCv^bd(ww2vfDo^$#pQ<4&m; z-Tnl0r+FD-W4jzRM}2G;*F{5Y`zK@&zN|XQ zk!vOc3smW`zWBG_7e|LRN!ZNz_3BrVWJ(G`(Mdsz%^q=%7$pNgM0U@2AYCqz@9z6! ziSzqno&xsfcPGoDwA2u_Kvd?}RZzSyR2|SHM+_iwSCJ%JN@QC*{u<~_HQ#K(@iw4T zXCo0+j*NpY94H}f99$C(XK@{r$F)@(N$5Dx;G5%(=SZs{H)p0>bLb(x)&yjyC81;; z&h!BVF#+iZ59jHNk7o)UYCu&f0Vdf8s!CrapsK8CE-Z2Ve2|3LW0SY@<7jTi zNzu;}!)3s}6Uq^W`p)BXFYWS6Anz)O?2v}KKAw!BoHhE6wJSVkkY28?n>fQcve7XH~mEvBuk)aftxx$;<@}?nxE6Av(uxPS9@!<#9PRQ`B6i& z_H36TyF%Ll#WB5}sce4p@^6_qh+zaA83A}n3B_^l_4r%qSEEmK0mJ^13-T`hztzZp z3(*LA`Ou$^L`4ntap?01+-dcu{H2-TVDP}cl)$g?bg;R4_Z$a*0s~(a@;A@v@D`L} z2@PJub>%NZN&@@?xTm_qG`9%@cxGS$9X-ki0=LBd=aw`8zm|~0l){(E%}D<^umn*v z=6IC#dDO|%7!q4(5#$OXSSG0ZOK=>MK8K1Tpk4w(#TNy_i@gXZV#eet+r1>K?y@%3 zka16%?8bU_YA&v(hj=N-)!R{IhxF&FGoIo3sGt8i>fk1azNhoT7roaZK7Q)Qwoz)6 zeKh90LAs-(*7w~848(c?b6*)8vk3oE!v#5jXeWL`+0z5Hv1M3SAG6jdnUWZl%XIv= z^0Ex*G8gHXKap%Zw6}aTmdNL={h4lU_xhP3AY*;rQ|X^QX(Lr9*ba=Iz5*;9U)lG4 zRdw22Gf&vnF>=@4ks}z>OeSY;U)u%PzYli9X=PcwA=m7H`~6v@v}fmcYtE${Vd@>4 zoX239+A(T-l_Od$S7_|=(mfjm*Y^gGGu(u}y0Cf6XQry)6L4NN5vVGlT{5|39{Bf& zYPDT#Kip5a@Ek60(y|i}TTuLD1B7M>*X-zB_6!|m8xnr6Sp21LtpPvcDDS!Ur8X*h zX4j~MTw>9D^cXyPDI{G|x86)Jv{{qfF8>kqb$RFAelOaV zgOr981jMk9+8>?w4JB~Z=Ijd%MxX_#I6jjO!z6% z>HCVt-}|PE0f|?U5x^gbjL^iBJ0S}!c{?^vt_ITdo@xA26>!8|Qlv{fO!uj*loIlF zSqxBsNak9qBv@25kl?<3h2VArgP|2S32QnJLQj3zQN)}ull7QUl z>M!2qhi;%u){g9*r^|{Ayq0S7dJ1*`C~rmx|e; zYb2nKhaj1i@+2D#3;d}p=u)YjofA`U@%{ER%QL_EYljH3*!~qUpy_5mu2{={LNJCp z53)@ye*|u3Ym5KuDt^z3Y%}dnVGXjA>>z`x?XI4Szidkw!ll=@KWj>6z8Cf(lyA11 z^fD#Dev2)|o%@7|hjaJ@lFrC6&Ah%hR`6-Ce?C)S&@&9O7**sTt>eJH!Uba$lpx%> zrz9+ZzwV_*-jDPW0aVs6+WYYZ^rv{sbGmD%?X#~41mo7XPZM^tN7fa=%z%^wkAO|sEf>lO~i0-R0lT|qN^3_w{?;DKg#mgf#@+#aQNMA#KcK9@H!FWu{ zHTDX#HWfFn38vVy{R(%<2VTVjvj*Uu4bkBO9X1OutTr8piVyPoVa1{F;~|*R&ykmQ z5v4h`hGAXjlU+HH!fPXyS7Fj&f1Ld`}HVq3(4KlqM$m^QGU37=T8P3a&G)D4v{rBH^1_>d+j7C2+t?2WiRP zP-$K5V)syK_|e9<^Ie|6FLj08+d6){NbTCh_1V5C9N+%h;crqCu(%t2eb zENXbmBvW%0Xit5=^ibKLDD0w`M-7%LD_R+-KNo^B!iTm;AfpE4tNvm%w05nU&a1^l zq2bVX;+=mp6#D-$KivpOoIvT_9S&rtHW@(Hr^F##!!m%x)6p$9(X%_F(?2-V;V*N> zmMGJ!VBxVU2Z?uLI_~(b>msB25EJ99f+IAfb+q4qC*794v;59=i(anbVQ(^|9*63H z%KewxWl;n`T+wA=#)dk61FpK9=K%-l=)r($olVr^71xlgYf)f4Tb|aQ>dGDx}B5FzPn3J$6BIKk>=RP|2E~4E@fKMZT zvIRZ%+@nfOM{pwX(IlF0Hz~&~;JN3=JR}|U53r{(JzEvOyB#mk_JXCR0twSq!C@4a*_@GFB@-%sq$)K}r0**HhJc~rZqHL(8m))7ho^0s1V zufmbu1P2aAL0pwr5zP3$o=`p8Xb>vailyT#n9~h~4m}`?;!I80^#9A*M+1?w#OcOm zPocx?c_ICqiTGb2qgDhKWYI|pN8zetfdKH`beQokx&0f+3{+i6{0{lCWnKaowhrtC z|3m+aGy(3Z$jt}1eHuQnvdOu$p!?TvtDawt=jD)!l=cQ#pyo2NWBQAC63-4j7G(M} zT|@&Lz?;dQM!~;Ncwqrfm9sX|Y5{9a4P0$_;QBexKNl|uz#Db~`%+n8T;2g$6_IrI z-S6jx0dK$#e&+Dj55OsW0nV66#Ko2IZ1hIXfYE0x#8-7HbnrX_Y#4w2ujeONxPd2z z?Ry&A709xIg(7Ss!@hqgXln{!Q^kgBGisx@=ckWmTt@%OMOzjCc#ZMRy0kzf;KVjv zpZkmcww@HifjzMa7`T#=-XsFQnvQSP{e5Cn9FXKqH0MfBO{V!1lI0F7yf=;FgEZ|IPxeKa1;zeEs!bPg4K_vuffSYOXIx zZ*qZO*WzGw|M3Wf=X)6^HO48BVFRKch6~q&f0j28%w>S%M}}X+0m2k{PMQ$@Vi^d|NB z2B%w7=|I;HSoo)&08Xip@q9}XA+8)GGh$#D?fwkizp*-v9r$Zg;C8`-I{p=KWXOvs zrGJ{wL9WlXd@<AS=i| zKbPHG>;Vn9-?CZM=Q-8W9J-64cBp`uEuJ;@7M0B-0mS<`4g; zH&F2G(GyP4pJ{=!fEPH?c`3C)=!nw z74XEowc{0vQ2vzG9=Yyn2@8(-jH~`IkUEMuXU^PTxZ;k3#U7x8ZFJdMDjTzkM0oWo zWuTO+k@(qWQUhnhRcUCz5h2L91lxOidwSo*LgKj{ml!UuuAT;6!(tgnk>FzM9Q37V zAetIFx>GA_lo`S$&`{Ze0vF%LHhdoqaFZV@w>(AjOg7h zrBrnI%n#%4@_NVRsR0eju09}e_v_X(m4slZYfmQ+nsIWDjrD#fBBCHZ#aHIRW3aN# z}&W&>l^^(}$o6bm!$IvaT9e$Y&axuUr_R#>%*~WMr zwx^mR1oK-c!@hp~`m>$AJs7CUd)q{UVDrjTnGIC{YL#5MKP)tqiK7@sj~6gAYWa@i zCb~$dXx+nNO)@v_E+D zo}rv>*KKa$^5&GD7zMAUYV;*}M(R4UWZh03sI2e@cV&nbn%4~*+xet?xn9XRq_0nJ zIyjWkrp9*p#?F+1@Jl(;`mf|Ytwd_Y(sKP~t|jvLKPvzm5=t<>h7f;^Jce-R&*k?CdNR61RXr7M)r#1_2R~04EE}$CM9JL4?B};9`BJ zTWVsW=6}o#4Ajx#Ynm^eZm2g7`AQ;3J++XU-}&24+MKq#MJ+9~V7|pAD$tlKt}A=F zJZjtEbG2yaC_re13cK;WynM7$w|=D5@9y%~I_gZFedgxfQF~lWY;1+BL|Y$A$~*vg zWIk8}3DAkj$qSh>jq=iRy~c6c+L_jtmafZ#d5_uU<#uD6%GI;9K#3KZOfIj3vkd}b zLd>xb4#qo4^>*6yh${49N_?>jSbDB9h&RiPivDffrfvj^KSJbcscw5S-=bX}oN*7n zt{O|1>zJMO`u_9h**3jq1y55`Q&H*R*RN!yDur?~A3q-dURf~$9i`IQ01!QJ1LvH< z!&s_PQc-TuI#(nf_BzFySg2*t$J<*w0)t`7qgfGp|{$Oh?+&C-K|9#r&#?XxZs-g$Ajip&{$w&o}WK zD*2S~5G8BIhuaI3`6jo$;jdr6@>#d0i~>kVwuL#s3RNpqxu~nJ&viN5R16dFIxX-i zxWB(IHtCIA9UB{?1Q_tt!e)v#7HMpr$-;cF23S8HY{#3f8-e9M5)!q({YX9CxB^z% zS!`{_^sjfY#+AESxh3>Bw5?HEjF+u3L@a{?moRW zntQZRrpdhbyQtk}vUJ9{I}{%9%@i7n_TLq`d+fxpnxSZAZ?WGtEs2@yHdu8_$+@I8 zDIIMG-`tGlWA}6H4h(?nUcX+X8_wuA_;KF#yJ)(xsmb8%{G5T+c8Wn)JdMX@qBwlA zOmi6p74^Ga_$|O)b--0~!zAK@>+0&VljJsj*-lrKMB*{pu>6W65ftn2PXdAWa0v*S zW{uFCdS}0M>}(D%Cc&Q`jP2D-Mse8{f4sSmva_~y=b$@2q4ajWZ_p@F%R%?QtXBW| z8R7HK{A@ebT_i1CUGDJMSRsFC50XA$8yh9&l?Y z=ZaYkDXoCV&CN|;zfdkYB@CU^T^9LX_~~98u#j-l;iM|-xf<&MKyg>MH#dH{w7AYx zvpF3*huQd6HFIsf1YFg>RqOQX>baWhP6xP;Py0*S4&tC`@sR!tI@4yy56C8Y>RbztU7Glv*%}mjIvoh}5*d`mGxlC| zs{Q^{z9BAdGzxHRGnI&6-?AEY0sQO1WO?gkV{SgFRylptZJ+=Ik=0ojvrbHNx{M zrJ~|q(bCvbxA`$62ERH;)=rEKPK{0-rB<>|X*+JZj#U+=dGK`8rIv($rLZWBg)x z6udvdNZ38r>=wVjl8cANCnu}hC|-|xx$RA%H3Ag>tAK!~pdFx~|E&Qxf_{C1-wF~g zb*^06XXHE{(0j0#Oj01vH5js-HG9bo^urPZGc)HS+01{{2>ac-rhwMKW)7|17h(>} zZLN%CWF7Q{E;rB3vLacs`6yLOOUq@^&#(M84-T@6)<=L3<{_Yy`~jM8*7w&ZHQpCC zrza=h($e1SPL&g)5%I+1G3lj!`;f_Dt+EbS$|D*g;`_vJk=G{q*gW&#E13RI@C3M5 zRyM%&8bP+fEoy^@Q9Jeia9MY4YK2`w8^JZ*OU#c>&dyC-?)x(fK;Pz93bV~mi{Z#D zA-|@lyQ`zB4B-J0q6Qtn1*0X%_@f{?60V1Uyyxa>ZRy8sZ&UhA{+q8R8mm-_gx-miJva{F65_ic%Mwzsq6`?Yae3m{NEohFkH zGwk`rMPX(8BcBu)auFUGc`JzU8o?t1DUC76juw=il|_w5r)KrFveL0BaTn;E0{uD; z>+yHw3N9PJBp>dsz$uKnDi*??^guv9-5k#BeSCZrPCr;QGa(jUs>sfd<$%F$x>Dv_ zA9XaD+9GlhH)#2qnaNdbpU+uQdwGB|#4E^Np3~6SX#ZWMFtY!-N%pY7DHSIuAn>Wu zpw;_58QIq|*10V%!1xAyVP`>EKnTnM5-gu|ud_{Fvw~)%UfP!YfX5pe(c+P8eoeLc z8f(>)dX@BV#8*OWPCen6n1oz*4HrOQCU{$wUCAi`gQ$ukJ(*o<;CD8}enP_1vcdZk zzFbF9Glkht*7yU}p8fJjI!m>C8VU~&k6t#HeTK`^;{${=JhzrBU}6s18)XXgIKt3W zOEJdeDKokhfs8sH39dSkhtbj4*vT3Ykuy4a9BTiAEYx*xs+_~?)NFUUk`(v7vIyDd z_XiL#`g1QME1P{<_T%;+a(&=4oK;m-li}MI)X|ul#xGzAGkk&@xNY{0tnW@5qjc;90U$>FsM`w|J7WL{=KLTs6 z+uCKu=(!@Q8eK9tCT2J<3ZJ<&5|=(@&_i-c5#qSB)i_%s4 z07S44C(z!VX~O_q`twvKeL6gu#Wuf8He03b;Ch(qmukhz-#idr9<3~ZXy`{wh_=8Lj7!Kz{y^NXm zT#PMiM$8W%)T+JY=Y$-Y)amagrBlTl9&0{gPL}B;x0PSGRjVT`G?)4K-cA_01?Z}B zmUFHDIg`g2)2HaNQz&wHnF%COg0(>8Yc}nF4bJL$My;dKY<`c;gZcV$tP2#!Jf-kr zb1Vi`zMMoBC&M(6I%*?Vr`_FM?*brlOlnl}2%(BK?t zxg7pj)w&+avebh_D1CJWC7eR-0{%kO27VDir zX)pOtkj7Y9)xPQb%<-7=sneT{DTraCd7Pi*LDx;RYdQHf*NCC3kWOJ@)FqFSHJ@wh~x zf*qpe{wRN|#Ym~qOr^xzEAS|aKO#A=&jB;v-LxS`g$J|u!y)mF-ICa{TAAEu%ijgV z?3Uxp=!c0hwVc}TlQ}$Z6(S=ez7-0`f$L&~9)v#MdGy9+bJ$JUeRz65N^13SnrVG^ z{T&mR(6=4z?rqtJDbPzugjcO5u=Q|vg;_;s+rGncqbJ0ub{ zG2yq^A4G@2Kz4kS334rlLao#vv${i!HC zC{D@qZdew7AITmHVYBc6-)T@ymYbYkTJj$m&g3ROT^}?(1`#yF05c>4+uiy3`O>ct z5SZEP1UF|Mzqu>LO-50Hz#N9r-7tQ4wo=@hWh0lzk$Ya8tu z6w^z&zGo+3^ZhLatKMFEi@PH3io`+CImv&% z?Q?8}mXwlVd(k+fF3NpOdj)MUdCyx3CW+PcpofsOMb~2t{Tn~__B0iw&|B3U)o_Sp zX8oTT`1VL4hb-Rg(iB?p+U6^=rw)fTLOk{W%omOu*astioTB1P3j!%L9?L+ulSbQ*|?UA6GZEM{#Xw#Bvk<>dZW;D_#RHJI&67EivsBQy!q0;T$UA=4o zqp!I)BGKN0P;tg|+=zx#lp7g{KaN1r5SL!+#<`PL%ysQh(3=oPq8Ayzh;Y7M<)hXc z-0#crQ%bpV2ZY>sTP@yqDQ(wNpD%A13Zyt~$B$=Swr?`IYwQn+yzim#$<~vbluk`& z*GM{Te*DDRbxAobzO=DNIxM*YyAya9oiCf#W}L{S$gjn^yGwMon8e8HDh!|UE96v< zhN$luf{A{mh==WLNea1I?U>Nb7umNI_p$EWyFrRn3RS2i=TW5MB6D@Z-elJ~Q|T|) zsh<-RO*pR@ zH8ne;`{C8goU9iPP1G0$G*y`rCg0z?jMZ;n9qsIv;s&7&`vfJu1NmT&{c0eDW%BbB ztGYpO{z^=h_eBCnUSeP}YG%UCsSDX;KM%-K>w%PCdvhdPH(f2(GF-`%`04S^k(>l@ z+>rN->ZM7Dai88u7USTh&&|z^uXcw?TH*9;ZaJB{-(NccHEhE1_X@*y13j8ad`wJH z^P!YNTGb*%QIroEEQbNN=R0CDj~CO11BSPQ3lrU8=t^9U7mp8jJAhtI{9ush1~a*t z1?uJiTj~R#rzHaWz%*Qjv>$!RqTt0snn;#VlrG04?V;q0$skTPn2uJTcY>qjZJ_}3 zJ%KqO=akVuGIDr(e{+`YbFrJu>9Sj58eeExu2W0r4zPMMUZ38uwKLc+C=l{5<`0ci zxb0mG=vMgN`_V+GGMLe&ao3Zb2{B})JU$xZy{#(7O!&|>Kl4Kh?X*}m%EV+62MbRM zqFIrr-5nNb9S}ZbSxFAEgtS*06ut+wpX^H_wbnK~hUDg^9ZLOTSBRV59zRGtN%tTT z%zJwHljv};L>v5F?c;F%7ISfqrP8hCG6|+LS&zO#^s(}#9VCNbqL|4}emg<&%^b${ zOybomWSp8znu2tlO5GR33Ya9RotC1{T z6LPv!1$iniE{%mPAdSK^&7GQc`vzz?(e2!#oJbJ`0x(*}}>Bz##GYSXe|N&8f2~ z{KOL+ow>x_f0qC>ovMRHQo>brb>~5Hh0Ys5QmE$nG2O@Od>a#5ndi!e($ITB8v&sV$f03A zig=1hgcsh?5?jqw#*u?^RN1R6*aQC--a;rq{mgn29=Z4OD!InOEhC6tGFG{+hFR%C z>b5sbGfhjR0amD9m1dA8DyPW%>~V_hU?@)2ep-%{&AT3zrEqNJJv}`=K39O)@ZB28 zK2S|udke+M8DDB$EqG97L5nN#wR@SZm1Q!v1C*GQDrd6%2dlH8CB6y}bYi}wHo$2; zSu^7+aVS{4&$q8_$)i7@cfpcrBn~NNPq2ELliRS?qN6hjG%fpGWn+NNeQI0H!bI@>dqnF(pv~f_N#R6*g)iom9Da$NfVOl6mDrQ?%>TLb&}G3IL?qgw4CQdt zJARba)SO-}RVx`kJzni*nhnyjvU;!I?ys*vsNKWZCKIE-)5_G?l*wgJ;0!=|tG~bh zGT7*)Miz7k!CJYYAgJSQWlofHD#L80%;J+@4X3pVN?yT^t>+u{si~)vV3SDizS7Bw z*?c72!CXTIJMWz>W{o|0$n6&otSEMj>4>#2p2Oihe&_c$0sFikH&J7<_~YtEv&ED7 zU2m;)h@xVD)mqE{ltzuqol=s`lB=V+%#yRG$#PfH3UG$p%CN(C3Kz>;vX-M?=5LPV zw|g8!2e3AH-_;$TQhzrj*bGq09dB)P@j2u!n~sy~{&s)_{;7nl6VHlAP=fv77e zoHVo%0?-NH+PyDI*=l)n3K#wyf1$%cu2QbhZ+3_BM(lVoGgrn5fcf3`F<@~}3e@Nd zOsQ5`Lpp%5SD;3fLl|%OL{xKr9?86$lf%cA~SBykmZz{|capg#{X`MLUx|41S@KJoeYY zItE`gB{c~IJl<~)CepMcy?!kf@2?_ynWn*qT~9_)NO$823)|0M7gf&c86GmeyB6d4 z1q=M8-s#s@n27=n#@oyOwqV*VL6iV0d>ZMhF5i=s<(;L+Rjl#ru%c ztr0~)O9eW(^pU>bXqXN1mX@%4ORp~=MaZKzCQft_y20=w81J~tnH3@89Qm!s1Mh7( zt$t|raQF=vkslVA#$H}>xIb39LdU6^+WZJx(NP}aekcySfW=}h47WTuwm0Q-JsiE( z(jLsu)O&EJ%QRD!C13w4mp;xhQdS>c7LtmC@>9yo8+m>X^{rf?(34oObf9=wUTkDu zln{z3q5Z7Gmu7tP1867g&QzH`!M%D#0fA9bBu9aD{$=v&BQs{F4;}7zGdM@4=T=?3h*SurMmWXD zPCz)XbQ=%A4_g2)5d`LdO~byr!}U@!A&KsLW=gO4LiJodjmZ2U8J~Rd15^@E|ly^|&1gezWYvhj+Ej4>oR!BzSQTW2suT|xd zPyry19XUE(X&6=FO#@E4k`jG@fr05-%N7>Ks|nD^yBo_B8#^==@BRSL##*quX)Z-|NgC_doFSD=t% zALnG4F0cp8rHQ0q4}|mdx+1tQb=dU5J2jrq0c6Pqu~m(&gr#3KQVkw_;{1W`ozhN9#I3>1B`tmREL^uM86 z=1jN8bE|JcT0!)Rm;02vWs_KO$$6NvI|M`C>Fc>y|bnhy7(z9N~SYs#k(2FMi zl0+DzEiX`UzBmfW=Jdw)-GN00DOw%#qJd9 zLH`=8u?+<9{wZ8$)nf5*Zn}7cKJmhudzc2_1>EjJzA6%c4V4 zDvmeUUJ}LJvHlOaa4ds_>?HuPQYRg9(EMlcpj*d3jQB~5Pgnhe%_U35^SkH;i;fLx z5|fm|`6L7 zy90ya|EA17<(~|p=ruIxfM$#Vh>aaG0>ZJZ2D~>Wz;kyXG5QYL`YcHX#j1sI_h94B zUfIuUTPz^2CyoGwkZd++<%Y8Lf4~eG;}(0C^UTZ7&kxfN37s1DAwG4w*9Da3K*KL( z;zGv(wo6S~p>jBr(J?PJZaR_1lr~qz3ElvhP#9~e4zF1KY~vSY&1Rq(uE#aiYJ}_S zsY}qOk?O_WMPq3@ggegq8+&`jQgh7v&`5X_L}Yz9jC=qkc~ru3mv+C)n8-fR=5c0H zy|%w$(GwY#Nl<@|>MpE4FV))>o_MaSqO`Krb1@f#&q~O()N&0gLlE6}9C{UrAPkw}^~lI|V0ty`K^fu5*_YD=+iDuTH>)YJUsQd2_Ge18enRq< zG7kaubJs6{^Fdw*sF)2|;KrF04-O7O8XFsaCh1!j{e((6(rz_|70QfagglPIMj{SgZdD(PGrh$sbjlH<^48^A^ z0Fz`L+00d82At2^wcHt3>29vu0X3sAoO7BeAgy-jk_Kj2^qXdpcD}sB2#lUO3f+@x&l(J( z|N1!?emDfQYPcEx+GFk@O2x}P7%EKWV3t<{!&NmAM^XQx9mS-H@BurDB2BLFf5>W7 zHt=N_LP#V?B?w6sRR-xDH$@57R)d>U;|C;PD!u8jIO}TkHoO%oTu&zKMSCZ$6f7}9 z2akk<;gEivqQX#^bDMC?pQDk&V@DcEzy|R(k5NY|EIxj%$bwF$QgnYhz-k1Os5(m1 z#fzl!jr3Cs5LH+cK*gW0 zM~j39>xU*ODE<1#+kMEK{NsuS!ZdhSpi4Y?^v$bqoD{xDT`pQn%MsO8Lkx*wgLY`x zWTX(vX+DS2(CBAZgOSf#rkhv$w7_?M(LqgiTC;h)wzXALDVxYZij2?V+=DEnZ;nYC zlw^-12t8qvH1Zf6T)8nD3^=Bthst$G7FNi3-t-39lp9~KPkvayF4|I4qIA<=d z`QQ39hO%%fjEWIOszp}C8s#-Rqu)EFoEm>^R>+}ASUQfGz)|55ttyc4?N=Z$FjH6i zbVSkp2}w*$3?)*~ROx3%m^Y*hvhBF{QXxSQd9&2kb+L9`V>8d0k&zJ#NFw(J5m~wA zaZeUJiFSeTVC}wNNazgKful*|P84N4g-t(>vDOA~>UB?ktnBR22l3q|!Z?H<=wtdt zP-SRVz_V&MNeiuCbe3kJ5+*q?hg+K!vJTYR(Bex;c^>blNIORBeT`9g|Cg09rgr6} zN*%35cd;7_U|O$fX}&sYOqg$< ztm6=piazIe!=hj=aGEXQ19TNF!^v;6gyhwd_nqdKsa%uQ9vs>=HUhxLZm^%s<+c&( zjCj^2g}yTuW?Z{lrWwdY!B>xCZ*O16Z!^ca4XTa~g%YQ*ak7tt+Y|2;qi`lhZZd@! zcLJBybZF8-d*Dkyg6w1hHp;+}Z=6$6DMZA*P6(OCL0I3HfKW2@?D&j9n0Y_V`h37_ z`vm)!;sJPTckeo_8_FttUlA-(tU1|t%|GQEOzj=&=P%O`46|XHoGHC!PUUH5db=a= zd%?k!g_n~P=e;_s)2TwAk1=cyBqoa&P=(E)Xl^ewc}&1KNa+4}3FA$nR-(x!$j|Tj z0jSv6#kHtQr=@sbwA>$42eV>tICtYy8!5yOFh=aWMMpMgdc&o~x4U z{zN@d?j}J6-_S#aQRt-PdT`z#7EQLGDznd>p!&7bbO=ujOWO;ZIza2o5#2N5UN$=WbUA6tpj27X7X!l9YHmc0ZPl zAcj7lQI;v3ILBivFQS+#ADtnTA1LQt;LFN-+w|t~S}&_Y&$94Ksd?szSdmJe{Ev%$ zfrppXWRLBEN5Av9h6uS6_SY`?lopgM@M#f_%ENK9WuD(cFC0w({Jx)LppqY(9sG`L zY~71+z7ZcaQz&E!9=_L(Biu%*(qyma)kL96p-MvX`-wT10`GZrzUcN+_gDz$hb<<{ zZfTRu0?$xg(`L&5K-q61Y^C}poSo3O%Q;u5X!;S7Gxfn+%?H1K>}wjc2Wn( zK#I>mDaS0qy88y!%i0OaA3^l9)9dx;4&k?$I?=yzy@zAjBt6?D-LcpUr7vjBkWY=p zIoDGBRlN}0IwDxs7FIE&cL|#MLu36e_g|H2R*HR)f+5>6xjk9$g_NKe;Zvw%%=HyV zRxrGlD~w)=PN^c~vXuC*GC@Mo^6&BT1H>PvTXd<(L)@U%*CsQ}#!)RrnjD;kG0P2F zGe~&4zlL%770C8e!N32+1mD*h9-x4RVFv(+;16g0VhL;hf0Z2$gdJqZ!G<5(TdG zx~WzL%W~=AHrCJZ8S`R8WSQN(Ecdjutth9}1>s!9)3e3dmc+M1X-a^x$D9w$iefv% zA2pb-jcfeLQ99q4M8m2>F{k&_ObHB=yXQJF!#FxozL@xqv8t`q4oG(UO8U*guXXDz zpg%6i`1~?%6*6S6E{`Y^Bdako;}*wVi*e_wqw({@UI}|XruSD_iUYbQ3$Xq8dF@{O zHk7S5ef`3rojFmf zx}m>;hz}VvJRrEC0(}HB>n}`!0;{hUsvJbqPCA>ZLWbQl2INCL3nL?TQQtFK-?re^ z-{c^s(`4BZ#H=FZ&?)|QvgEyfRmfK?lwYAXCyGp}`hnr2#@o{ZnnVpeQao$Hj{!R( zUD)dp&z2HUc@Baf_Z*=pRAogko4cVD~34l$$Bg< z{t_$9IpeWKIQm0or5-=>h56iMwS)8fQS^y;7_Umb{M*j$LEJT$-4%bTq+h@0`s46y z=a>WwTN+(QH%tcBC704ZKD%AhL7!{GFW#QqWUTVObLIbG zU5Eksg$qD`F|6%8)u$asjo|Cxp;-nlYKPS1?;v{Mz9nTPO!BwO+J=-%-lo&Dk)4a` z;$`mH>$_eDPK!dSw9K?wCx%Ta#X4zD2Yjr&-Oeg+pd}zxaeuoSELp;kcfb0*3z>55 z&xzl!FsSz@3RMt9LVw5;r0^darSdfvEZ!Q9!_#q2M_{;x$6%S^-Hz>I10p7ZLuwqe z`|umi>=n03|L!XH!bG@>{HWEZQQG~c5KZ0v{*WZgo%*SIC$m`Ycd}}sXB3C;IY4jw zNZjr74Q-J@I@3bzf8boiuMYLqGw0yh#KMvj zr6Tiehtib&7wTn-Ct~B9h3FHJEY)jST*&UQU2LW~^#^GNVA5U6V&uS^1F1&Z34q{Y4 zp!8!GObA8n<@7OfKpJ5bbBNJLU&99ZB*s7rjOdT}Jyg{dVM2yU@#Ix63PKQV5bGN6 z>es|WtFG*Iw;&|)jh9*IX zXk1{RU56+z=C%>lGb!{V`%N+ss)`1QNK=33RVa)GO7ykliS)aAIhzrH5PVVPACZga zm2T1&`D){P$}VVr{H)y>MbMER7lE`3>-|L$1#+JE@w+Y}uQkEULK}R$4e6TzPBjdx zSw&pCM*#3(88BJUYc*?&tfDE8pBssxM^!z8fNfo}HluP@5P=2`lf;{Jq93%A1cqm_A$;oA*YByh@iK1HX8g3CZOSQk>K1gW+v92onK?M>ounup{|E_|!wAOk z$B35hrp-MN=jZ>7DVVB4Tu6n0-IHL{;0;s*MybRB_-m@#H_;WOU6!&pnG}DHwR$1+ zGU<@2m(VTu_mXwdRRZLA1fq0V-j6riYd@p!OOh=?R!{s7_-!LuGX3{B-cr&XSQIa1 zNW`rLtQG6rnZBF7^7%+7z0_{9aUw`!{u< zd!BNo-l90L4Swoxh zucYo?JBQg$8x~n*1>^gh$tqvj3%M+h?RyW^Mit4qx~FFsml@J8@$N>=;7_6R-#fbl z3XzF;-gT=`qaN~c%?ReRr&!zg!@`%4r^cwGK_{+JDA0)E160EI-rmy&?E{r~FkRO+ z>Ih+uUB@K9e$|O%^avGuieWfuL@Ho4A*Ng_pj`66F{=N}W|0r0RK#UX45jYwpt%=m z`d)xOqRuoeTL&c#dL6aO5#3O+=+v?JSb?nXShMmw0^if!7zjV*H2eNepV(GH7g`q- zt(WB7V}4Y3A+SQD9R@RA;4~i@l}xzJ#H7;qIJ}ik`P#P$0G`fWgH<_RWnA)Kt)1{J5+MKQ^S~^-|*TE58 zYsJ;j{I2Cz=$GD9>fghz#yOc#yq8!>FbyKJH*MGV4 zemImj=J)U455`tjI)lqkPqP5pB`Ow)dKxsv<1pPK)cJ0ro$zDw12SSP9#szfZ-zvO z1?xCN?4_ja9Gx6Pcuvsg7@6SlQ;k;~pRG&>ji#T@_=xptwGNw|mbDIrwmr69ZM7VE zjND`_3mskEPba)NU0A&y4kO|~d@_B$krTO$o}=%LbCQ90olI ze8$;S)UpS|iMi6XV1FPqLV@PN{;=&!UvOH-&zdJZgz-Y1~QQXg_9t$zZ7t6ldVf zKkZQ+-?qw3$zbGrb+RaYy+NjQSiW;LS*&q1HK8bAGb(V@gx!6cbrXaWw}l8>$W>?= zIX*u5m^HvFFrv4ul80`l)&5n+&pZ_XKHQ(-w_<M%rxd7*?=UChK$gZ3KCE6=7 zLpex3zbrBI8e2*5`)x;hxqk6?ny>F& zbD|o-4?^@B<@t;YRR&R22{p0H)^O=f^zC7+*jqODvJ~n{>8%FkWDD@+*(1=MPI3GW>cmb zojz#%M4z+zd}O&=%b*edn`AIm_6{AR6x|3S3QAL6Ma3}*X8mU|2D|~Wu*JqAzKajm z>TwflogpQ>Kc7AxDn}A}uyZ;oaY>LrWVV{juSqhpxci_9_dYIl0Cy43qT>Lw0>F&+ zg|_`f-r%A#$UYC~bM6$u^;4e{M;CHFzS+)vvI1MC1nYs5=hA~Xi&AYd&?`2ly{PaQ zC}~$xlKl-HBr8rRikOdmuYNlt!d{W^;15D{9?p-Kq7&`%7(!Jg!*G~35D7do21kqC zPtOI=9_G-)rTqps;oy&7Da1ob3JV$YR$QO8FI2n_3}^3VE#fRKz2tKtPNKXC_WmNN z`1Qz5-W>IXxgUvBP|NwaTrfc6DfL9XFlDj>GDBlz_&ZAzFU}&8^c-$BYp$=|MnF!p zadyBxaZ;+_Lyp_>@X}K+*>e{C+6kwxAUwtlr z3r$Q+7?VKJ_~M}Q^ZZ<4Xr=)bYwR9elO^3u;)^Tfu1lqdNRIF=1593QpmySnD$`DK0_$4@nY=h;3qWCD0=K z*2DSObS%m6^_F4G+pRP^Qdv2<`Q-TcpKm0#A`m?C$n*2p^Y-TAQj0lZ8Os|z4feiw zS|8G^^B|~;!2P&9h|h6bn)6ysck`*SF3c|geni)ip&>oMMjGG%a-wu!79~TttM^C; zh!K>5w&1y#xVYEG)|T(;`UKZRF)yec>cpMwUdIXPS&?((ke@4IQW^FEq4yM!QrPFu zm6#Y9v_lIY!)Pv9*(`7AxmnIW4S9Id0#)S zV?S4p0nj%{`ASPmKMfAb+c(Qd(849`65C!+?*yC#tbc)c-o1WV%_0ZvE(y)-@d&haDv_ypCwHz0u#3>8CQX{KF@N zc)7x#Gb-Y&t~J$p{Zi=*PX1H8VC4W~xiZ6e6&6OtZ)+l8XY{=VN>raY?T?!gOw3(e zb#bm0g6~s!9gL=dEWS2BH}|W~QuUToU>hbJ3SYG*j-Zc4eE&u>(*z$L6DZ;qukK)A&WxS5%cp22H5ZRMmYtvDQ&*N&^%jJKL0mg5|T+3 z=wnYMRL9Ak6KaglB%Nf11t6y`l+iSzD&_)XSy?+&YPgAEqpoqzbCs;Ru+mnNJHp#J zkNnVQaO&Gb#>P28q*{p1b4(dBlxC!JS_hw2*U#FcC89ih8R-!xAm;`AmtGU02<+D4Ua_s!bD92R4yAVCB zoO-<^Fn-F}GOjk^`{Zac)?(l~ynnxI)q_4-O>K^kgHz=>mL*xu(tJLi&8Mu_Sbb|C z8O|8Dhy6w_mriMpHq%>W6ri$e0RaKSkRMC&!h=G-ksm4Y@mT#N`{>0fhmA`e5h7+uTs2}SjgerXFSe2o^IYt)Of7kF^U!OcJWQvbV_By`9_@o^U#?z5 zA8u`AeDH{T(utBne01xr_N`0fJKz9USChLnpJGU1Gb^LAU1SgvaE(EnK^KrE1zSl| z^)G@~hhC@0av{&JNMCssFL++8WP3>X-Mi$=`|eOrJyo>$Oq!YX^vbC#MpZT5YHJC& zMade8m>!e!T@7+PaqLzv+r_@j@KT(%kx|JMaXS)noTfivzYL=_x$KbKz4e&Lh!e(l zRipB{O|RYPzS=Gfx=t`@HaS*{`wCG)I{cxdCNJdoLUqSPD9Qc8FFixk8v}EJy zoqopWOnwF1X48sv-?cE6r6xhW@p4kXs)Fs85dl_VI39B$B1`y0M2()nj9RMiebS9J zFj`4w6gked_3Px<_phBIqM4dm$04+K9O|@8Z&l-ro?L9tB`s-AdDa7h=W&tXeVFgo z6X>C;8D?m=f)UG{al6o!Ae_K1`fp<&bOob7!M@oJpc%!BAi5e!<57x*_NM<9#24{R zz|J9_ylY)|@BRELo)3`p%s_P%lALOaaSR6|t|%lr6zRnvtGMW%z*z##AKrS!VcPYX znD2-4i$99dT*q_3b|_>Ae7%>SKcfs@ds(jFST4k9=<~Su$l$vYO%#|gMxPq11k(#~ zKrIw0f=8CTYTwbEC*F|teVsv%kJY$LT6R5%TjE({x{4&t^%mP^HDGZ{@@CS(XO4lU zatY*>Y(F0nVs#SMhHxCF79fzS<@D%sN#`jwL2Im+%Mj>tuI2_<)N&j)Zza?ecDmN1 zpoo_uew!vK_u~VO7u3z0=^Vqnrb6UtC-?>s>`$nPEZZ53P9l$*TYfFNzM6G!g!P)W z4?RrtAZd0YH+9bDzg1Z3e1nQqFFGEs=r!P_In^>r>rP(OTlH{Qq9xe*?AKJDUYE?6 z9Qii>i7SrTbb6^rZ6|V4M<}IrcB*RbYPXc|aHnC=vhZ-=@sziFT*V>M#7aZ6m64pB z+}Lt=*w`@q(#!~!PB#Eq1-O-QzDvk_blJ=Fd*s_)gL@_Ta6W-c--R2#Pct0iR3iI9 z)xS?=KI0nZt3c{#S01&-55Q>DL~G z7W_68!3OrIfo*Y>!Da~?kWXR|FclYVYeg(-=bwfsH$xoX=jpx&t6u5rAQQKDDI2Ph zU8X;MMnR&e?b+5)M5Vj!RM{P%In7G6t1V4H!G3QhIl+MHk2{01ko4NIin;1=bypSF zCb@}i&+f@REGQ(Hfq7eF*@sqXnNQS3;r9qO%dU&O)`)uk*F4Ve3vM1W$T%=LrAt*Z z)7FprH2Na)p3(JMiu{l69yjeTC(X>y8l#DGL$}0@L*p46l?l&p@5mC4mi#o}4IG{p zIWm-~wUlh8<>$~^-eRuq-CZOuh==5C{~_AEppU0+v4Bpo!JTa{`)a`9WXbJ}4UrAT zb0`XH^(gtsKGq7{m=~%Z+6ftcgXEiKcsmxJHF$L$JIbLXaTXId>T$QQS=mL`Zl@qT zoZ_|YkLY~*^rh=-F@473Xj$OIl)W8QHJ z!46{T<;|DcTxcYL&-jGwS3dTSi;2|Pi0V`H4D+n@cL$`-V3S@fi4+z(g4}@J zbw<`*3JvEY7QGR#@>qPA50mB_M=-L8FXCtV@#Gyl&_jEX0uQ4mi>yq zl~>!f=5l76#6pExZS*r(&UuuCQZoHk44an*zN8ShW(54PHvp39Bm+yhUo5( z3#0z(bLe@f2?-VZ$W47!PDK@~FID2jw$q0jypU4fU|A$D++7|N7yS`ctq)=FxW0|w zgq@*Rfyt7QTn}q~0*aWRNl))#-q*{E>YV>8j%(R6R0 z@iqhOVN95GYM9u#xTbLIjsu8OSaAN?t01U}DMk;{3kwQ*uY-+bC(t)-;C2|Ysgt^l zg!x<_um0B3(t-;=I4{6`ZuNfJCXp_W$ z=uOwgdVTpBMXSOnxan?3Wz$Xe&S=`o@i4`VSX{9h`B~=ri_r!TryJjS?%$VpWU>Og zNPNNj!=CQr%2xq#TNmvU6OZeZ`bz=6B3Gs29xu_+_D|=mq5MAtD;BN=85k_BB_z5i zl>|>DsHhW2CT{rR#K7^l?9XW*edfBHh~J~PS|NHXRDLf;rz+<UpM`nb>}^*=@X%a4cXK$(oZKM$v!k_LVt~N#dKBnwTtR#>SqM5WP+T zyTDBf!#)cS1hwO{BC34LKip{tKU;k6YQ6%Qm)m|RE?n2t{O?I@Z0spHe4FPf2{79@+sQeW!F7?T7}s!Q>|2M z`N#G0Q^sMGlPAc0F1TI^t;q5{N*n%)Q#1I*94$-}cGSw)Y)5>SY3?a)+Udz%Utu%v z81gxyZQ8&$G3}*6Zdq^v)@%^B^JL<~6mPG-Q#Jb@tL91YXD`Ph>s!N&u5RMX9}bgI zmX1Z+_Vd_E{mWh|8=bF|7~`6NTq)P&i4hmZ4wKAaw~4rVU{mo~+)qczc7GAKX6Hv_ z)<~C4Pp8Ovgmf%Cyc!BxT3YS2&cu@0gZ|ot)Bd@VBL=?i<#;c8dojvxQd8Gc8SDiK zVRLU8F1}o6j**__gwyfdSm-!h+|r10;DAp;&$Q&bzRzpPoA9MfJI|4kkoEy!;%(vU zLPX&dDEiI%pEHkGQF=OiYhPcVz}ni{p^~g@xGS~g@_aHCEp0I!Gz&`d6EWeExJVFF zhVa!Z`VWZTcG8RPiJa8Hk(6WK^)B1O!UEbXq(5QMjdAVSHR?j^wzpKXv@;~6$hs-5 zNZHxBnKj1K_e&wIDE-^rzU!-#HgSe=pLULk&A8#O(Jv$->(-EV>~?3VPVI!=_xHqiAuSz2ZA6!8;vGD*iA?8BKbS-<0B4V_JGn;+59W zI>Bd?aCtV)R@(=~K8#^(aDl?8AuRY9kI+WPD}?2H!W zGNs>qCLSDCZ&$l+McbW%3hTCry;VppDU;&x-mDW)RSs~m8%g}B$d^0l5u6wc-|3*} z<@Z?sr&f(|M(_AGU2$`0%3GfTP7FkStNe{JIXS3Ze(_dHY5+v9d;t2ia85B*w!KDK z#S=d2QZjnjG%l;WP;-!al<=EGfmZEQy4LOCVrBpUTjh?>?E3Cy;m6(y}h&}Mno~9#-c#7C(t1YMaRl#YiPj$LD=5unI++X-7 zA~#z*W`fem8kd;3$rpq|z%+YZmBpwJ=nXoPKTD&zIqSF725Vb!d7!*^e< zU0qZFR2}dh9vOC_S?h%m z$?-utoilp-?lzID8=d=IE8bCKP2usmznFrn`0<@Ya)KjdWO`JALUUNEf^#?h7-(Y zl3AKa;dihIT3w@sKNiXbPO3GoeK8cwc}kgO>6c{gD_{)5EmLvx5Arp&Ib=x<$f(k( z<7lcg#>DxUgqn<+k8djs9BiYN?c z#??w7I>WYiA0r_=ETVKzIgd?-h+*5u5_I8fLn}*5ORtETA!h?oxM^Kqj}PwyXGylB z2o~vHcL}hJY}^d?^<(KQHehKt))eSAdU#-A;$3Ersf15vq)@;I2al6;tLq)jUd|mb zkTdYJ|LD3KNE5!=-XK*;{r!S_o-Ik-@cDDts9S#(L zQrSIGBzu)2+&|iVNEg=U7X_CX(gJV?CC&;`=!d5{D4Y6`ed5)lYPZoz9U z=;EesrsT}K%`rX zG~lG}^^J*i#UBiyxFx@~_mO7s)GWgz z@75ZO&Hk`VRj?o7tRCKClE0sW!y_PY_jvz)wHL2EX=e!*u7yzeyq3mhKEqzO!-Mqh zoAzs_)<365s9}t)Oa3qV`g7?&@)IrD z*NmcyBwwFgim>jbf6ct4@1ZB#_UUy6B-aKQ3H;yc>aN|P2Zg47%<9XWyV-r8B0YIp z_Z>5fCH3_*Oy}dGkvnnpzDfz6l$as>$?+7p&x4=1t zhKGkYj1!w*H1i{M%srrOPRFjHwcy*zQjjvw{fx|HJlc>@F64Nzm~|WO(`3f@>Ti7c zSHCP7Xn~=^s~VV@o@8S=WA!wWF- zdEsE;l-)6%0!*eCO*!)qA?iLuw=ok6b)lpuB#^{DYIu0~2m|-k;5Lv6H9d|ip-Y`_ z{`OaW2;(!g$a zj|a}q&hSe>FIE8cg<8>|W~;A{1Mp2FgZPO%5HFUthBBpTYq>TGd>pEuindly#I+tA{3=xR(4igK6^*AC*`9?L-z4Ag=(AB)t2bkz(2 z2h6Cmj7$NyGNh(ymX8U&%_luXCZNarOGo2P`wH(0)XW^ikCs1R*3Z1x=E+uw3=2zv zYHGS)Oel$v-0*@8uz=M^HKjG6Hy^Ngt3|~QRYgeD4Y!bR0qyP>V+2; z(L$HWNG*`pyTHS$1e}9sQeHqZ>iI!%?qWr)bkT*(*GbCd?Mj5z3n49CO=lhNkqnAR z?V6(D<-}c-Y%05(mg||#%D<$EdGIIlU;4B@LIjfhGIW5_Yveif%E0Y0j+(}?K}DnO4jma4^&lfHt#K`a zwS^As2GeqXGlR4$tx2#0`v?jgX?=aYfx91fo>X94E5e_Tbb2tSd*!WPl!R|2HsyLd z!prILG;nk=RM`<$)eFM*%rbpwz@Mu7|5ixjR`Zx~KwMb@Gb6RFx+sx%(z%Urx)5si z*T`q|Hi|fm2JrF(#JEm5pWL;IKh`9+F$A%I!>OOE>tzZJVXO8pn_P6$?r)+)fs~y~ zs9~YJ2MH0L!IIor>{%An6fu=vJUN`SA_87s?+@GC@mn6+(E=0fC=7r8=0+nEzCQ;` zPn=fwp3PcUm=>tfOBr9jyya}McmXqjIl6%}?s2o3RNQaO*++Lmo0CuV9D~MBc56IT zp?z1DhFqFn@*{aRFvs-kVW2XaxELE5S@O^PAokmSc@1X>4taq<1nH8-W2YbXD?6;J zQzMAsU0DoN7mk-A_|qp_z=7FNK!Oun*th(ru>2EeQ0pqxOINgFdr1tACiR;kR#I=( z1{~D8zd3S@Z_bQ}3=Gabj4Wk!G+_!YJsuYq5rlY{Zjxi*<6mil?NpETY{hpL-`gc~ zK0dx2M0q6{84m{#XBWkyxAbS}{5=L8E=;egil&ys4H2*XwFa8$F5l|_j{0={a3KjXLT1ARZMYpcfT$G$0-Y)8tX8KLrXqt zW<0me9P!}LQ1gUsZG7Kv&cHT5jO+a(#xk@5+;i^-juDg+Fxyo3!ybv&e?DC;n=D`2 zyrLI*P<^MG;O8{r5kZWdMYMBb12+eXa;bimxpFCBL#o910z_^$;sZdc4ldvYoQ1LA z_U8LieycIU5&tTT8V&it-&6gqD$9tvU8N2oYd=eyit4fs;G;WvTO*mQ9A?9+zWH4> zF{}9w>pjAq)eKf+-l!GVV3b9IPDTUJf6P4m#6M6J@WAbtlrTZW+|;Kb$=j2IC*usS8S11l z{<#T)(ACpZM#^dat{C6In{7Pax99%`o(r7e(ti~?HOMj*T_{=6T&K(PK!w zxdc&+Wks6fzy^~y_++jI1PDh~x&x;dCo*l;-1BB2HrPf0ls0n>OkyEV=dCzuIV`A( z3fkVL0tZ5!f7>_=ey^lNPb57P11T;64#CXv{uc^x=hWZFSR%d=DHW&6Iqkd3-0Von z$jrrQsH!F~NS->r?`cnJWYtbk=b~g+Dpf=P^z;G%BAjNx22O*An|nI%U6NT&3^j@`Y}wAEN16SX6d z3hTDbe9750nhmnS7Iw}CTj|a(r5zMSqG&1Pmi-&ZXe4n7WVN!VUzqtq*vFj$DLe2t z4i!>7#{!gmOX-1|phOi;``?)70Oy%;AtArBxxTF}OZ{COgG<8J`xh{@aQz&6PB%a( zz6t7_I*k{hMc(M=(>vfKQ<3!Y1wzYgArXLIGgPaB_fd^FG8e=Bnt`$U_h}T(^@Rg) z%Si%UkFjrHz={!~Qv&<@e6JOOiD_Jf%yI_4sYjSB zH}Jaw{Q1J=5YI9bn~^;T4SF^3g;`EsL7}-|VR#V!g}>~ZHcaEGtN+mg#HN3_sB3Bx z6a{i4LIzNslI2dsyC*X7Vo1)q4&%0=NY!LysppCHjPKPJx|btOy7Qf08rPyey@^@`q$a2?!S)p7e%@ zsZ$bhefik-=1RcrEGjDiR#J@PiVl*Rb~3UI4^X*jG$hoA+`0mL1z+1?jcUtZJ^Ex! zY%MdkMpi5$pWlF&>&%9RhDvc;&tim`hFtlMS&k8Pg;+mSm{glm}eOoowxVM(L7g08ya zo4wN5_w>oEwGKZ|=Id+Pz*^C7KQPQkzqLyVGE!6LDwzmP2=LTUPBe?QksD&XM(BG9 zaipFikG|)QA%L@F{|hxYd4dm=ik+o@zHrS4J0t&x_w3QO@Fdixjp znHd2eFx-noa44`ZU__+eSSfI!y!G1MhFyI=!Lu>BCdmotgO3VV|J+MkG^YJzswKha z%gJc?4yiobzQS%~3-IwQ?GP3Gec(qj%5T+q^$-Pe3B4CRC6f;>LF`Oa;j&T84@weohs!`TK40|MT1a6Mc=WkXq7| z)^+5syYE=TL+S5V@;j6}^KJg#g89F1LG%rUq5hw@7~Y^A2k^>s0EzwFx`7A$>fc8c zSBylt{s)h5O8<8wf&b;>Vqby&BMEaPbhKn^%ku)w|4$olO!j?L0yuMF_rU-0w4$I1 zDW3}m|3j4L|Nr(wO0g4bYesN+9i14HpQrO@$9U~kj)^WJ{c|}-x%<&g1aBuesPsanN1X>yzr0|imVF9IwX=X}Fap0I|BH?$A z0sq+FbEN`!aQL5I1FdFybWym;8Bb3JnsK1FKuqHrhfm1;*bsOoGnreT9vi&@G&v4* zcG)Y|^EC={H8zY-cgs&yG&DUi6G&l^^}qiakoOTEe|tr8UcdThtgZ-4DmpnjhEseK zxjzm}cOI9COGuz{*_~9zc)Z;!cLdXV0OSCJL9baL|Iaw|qcFVGf6}(&(D{ekI@WST zvIU3Mx&@!BML>!)F0z3Aan5bn=g)xI&g)a)S?yIUoJ{#=m}ZK##ipk=&*XoZ zme2d}51!5L9hIKpG|b2DuqN?qU?3hykp`4R9;nRH9eUVOCn_plo2)JWG>dLJ?PvVq z_vHJE%_!*B<6=sK`t|G4eEUkxPye~KOv-0`Twdjw$o_r`%YWzBJm`ZZ^4>@yS`h4? z2!H|&u+3)#Ar86sK#MnCX)I+j4IqH#^TS0Q-@5}IlgIqOZabESKM+aN5y3P0%#LoO zSM2{xP~RxfZV$T8?(J^>PVy~b$&T|Ee5Qj0^*38-CSX4t9>W6QSu89p9IfDVUx1mN zPC}LFGbcepIXs&Lm$0xtAd&^jF6ecczEJ!#C|0%+lAC8n;M<<^?7;B0I)qXUdIZ8X zRvP#AgRHlNdS9^mg{QBX-2$j(02mXg$;sb{HaTr8@@*zS|Eix>{`25RfBR%v2g;Gn zj`2AkPyaD9_!g}_wORLmW{$V z{&_y(kT)_Lll#rWwS%T&PmsU!ZG7cy1LuA;=5lXF3B)m%bLIm`O;*lxfVsA>`D%$# z;?ThLw@G+cmI*2n(&_*4TnQ42LrZ(xVypB26tas8daS&Olm)OQXaCR_lRb)yk6-+( z5JaQU57En9gZFK)01>{Rr$VFaK>{lS+;mRd56#{GJRF$&m-?C7iX17Y;NhygKQlN7 z1H{a2PXNFQ$a(odSI&0`_QYrV&oJY#*G!^{2;(=vhOY(3Vb|r9vhu!8R>)$o+ z@36qH&qV%+uDIM+)g568?(cuq_)JPe{S9B1XTrQbe}Ay(|LYIlKIc22fRKgvX=0{O zLbMA$h4;y0zM_K+F!%iapLGHk<_vN*lX$i0L6lPCZu8)u4*R|=6emdSk3b#LSdI-_9b0{RZLHR17KqX>Bia&;MG3Uj4oH z_SDr-{oiZrJp`x2Qffo1a-%jTPoD6^ZT`Dv_=f#$!_6Oe=Q)4du<21kk)%5Rt(9e^ z;eX1>Umf6J#6oC5Ifdbexi#DiTF2c2q2P4F(!%ZzlmA`?O|;+zO5n$G^1q{+amQ{$ z)k=vrEcAq{TC?sygY3VHN$5X|Nl|O;gdwJfg2dYh>5T5#)c-E8qASbiJ};2}eVy}r zpb%)OG8@TAllTz%pRaCGjLJ#?{E7fGY>O7;zlwsquyr8kT)*~u|Ni|V5P20eH8q9O zD5UP+062m0Q&7+ca7Z)(aZ!2r`1tu?RmcTPfVmxb??rehF$U&c?6O~kw2DXI>r5gz zCW)1M@LCR5X$6G?yGvbUWo|tU?!Tq{bKP>x1jx4sv>kUd7d<_F9+05!{QC8a6`VyD z%-CA~j{LVPOqhH;?ChuaK#Lva+W0UTMIvyBh=hbx``a1@0943P5fR7$jf(yA$9$~y z>yyt)8_a62Z7tcE&XWx|bk_r(o>5v(j&Tr3zqM*@^&Wsi#^ChqY{t~o)RBCi3ZgUg zg8;7NZQFjmI;Hh(`9IJ&f%pHz-CMt9*$3aEf*>i4C?SnB(%sz+f`rl~ozkU*ba#V< zsDywZDoB^4f*=jj(hX-G_xbMqp8Y2rugjlAp69+lF*9q{S_3&OU)Hm&cxq`wl{_&~ zrA5J~ZDCaqtQ=KURi8O=c9!~HOV43P*tom9uYURR1qbXq1nME-H2|#{E;!tpe*;1^ z%LNJw%A;77r%wwUrW-Xct6(h9IdBWue%9OT%ecEA#KPOHUt3$7jp&xcpR_uC(tq>f z7E<^eYI3}OHcNc=d6o1j_xyoYz_%&gaGA4}Po{H+e6aAiV z=9~{X2W;@Y-E?$xvLJl$hDCUS&F1!91#j z#_jK%B#dJ5croJ`$iv3=tl}ImQ~2U;@EsudErR2V@3L)K&K(}JZxMets+-lo1I~_| z4tm2}UfV_)y!LtsC#n5se813u{`d zy!OQ+bOt7#f%<+a`ap?H#M`+OYz6g7i;IosQ*cxt@L~p^oScXO85>imOSgT>Hv7J4QFp6?`f9&3((F#++C;M z!T!mz?oY?(c3tY$a56uHylpjxQv7#rLV{$C`=Zn8v!KeSfl%je$ET+9HNZ|)Mf~^@ zZmF+&+B}SENdqv682V52!ZdY<5KCY8B<=*RX#O|SPbT>Bdu?33B z984}U0DC;_xVX3m>{9sEN>Wl%YMPqE|6o}*-}tr03j7HB$(rRt7+`b+cG0+hCTp#A zG z{<}5&r}uXj-RjPj*_-J6f$m1R1;5bdUlFH;(k2^*jIHS6;)1)hygVb|w$dM8AYFOA zmw2>>Kx{#sQm&+>JM(bg37i`svE|??KLz{R0GNj2@1=9t*uVejWzeMCr+!2 z4t$S)0tsemP}!BIBT~-~PP6Pl zwMhf&vL60eg=2Bf$FoEVB>QAziI9=Fh^6g)kSsV zJJ#n2&p`nUy^`H7N`m_VC;cZtmf`wP2wTW=3Se-d${(W#JN}4c>93+1Cl>lIv|6Sl zh5}S8gAA|93 zL0tifv3tTt%B?@}#p$u`uu@U{%R^nqb@iEUuR@0U%rkDf258a2R2o3Ydb0=b-ohbg|L; zon%frD5n6m$+efCjbs{JIAmbOh<%xd=%sC5|7&Y+DM`H|4*^2o!oEv|=;)+Bi~|$z zJ{Vk+Vs!$|Bt3!OfKiI@L+5#E7lOHJxrx>jIqf@eIC%E}?Iwi!kL^ypow^#_lZnum z_gw&Rl7_9bZEKq{;ADPtHP?4L4p7y4=W{DJ=@hE(&=yi31e4-iuirk_cqq88nTd%s zX(rFhW#r_TwGfOJZ3;!XS9c50r|wy27bq~(-KHTUBBEQ(c8jHX3T=%KB1b>Xm1-4J z7!}A2A4QO%+>yLvG&U1Azsy_18R2-xog_#>rD-s!4ios&mWW%?V=dGQ7>3K&g z21`;wAp%&HlHEwy)Fx2E(0qIKa_Cfizf>|F7p9j*Folthh(S~g0f5&M0c0-g*8Ev1 zaD<}4s9}za+Yh(HJifjbAh>LcF?u(>;dJ@79tk( z;oJOtH}3>QjX4bd8F%dA<<(aS+U&=qdJ>}IbY1(=R2Q{iROzgp)iZ`BghDV z9yX&ZCbjRMf|to;Hw-Agi#S3Jp3qcSNVANq6hu7L2cw@Dkt5fHSy(FjufQAK1N+Xw zI2Z+&ybs%4dvCwj)p6nv9~s)49XmnTb_S=4g(m8;1oY>}8;4UzzF#f0KZmJCEL)6s zZ5oiEeYipr!y$8r%f!ftm5@Ot-*t?0$sHhCz}X*CNw7><^KMUY#s{ z-unIN{9z^E8zGt}N7L(#%vLJn_=vq^l47T*2Cei5{LiKiCLmj^z?JoMK{9M`%*oBp zzK{YUu|NB7GxkamltSpz9lbHAWza}^VC^d8x%uc6-T<-SGB-;FJIcN``}tvf_o5duKkTj1-WIm&pui{h(IP1*1viy*c@@Yn+3o2TpF{g zeFN2~LNrV>pbY36&*%G3N=h%AU^{*@_z$6)zr zrjq9|gV%C|_wM;T0j-}$obYy|Im3>M+VDj}S(1%^M8=W)c&4VxMx5B$Ya`#S`Vly= z*YVo@KVimNG>wZ5CL~)a802h>dA57qZo}EBq zxOl)h4hq1gYZ!r`FIJACCuy;Z(eWVV#%nM(?8}!7P2RDE8$pN8-}X{CNqdVMGx*zE zB9fcdu8&h!l5LCVEgePj=GZ!;W*Gwt@l5^f!5X}*Gb1;*n&+Dk+!si>Xa>t@lsX}u zI`P_uVKO)(F|ZvSpjh2!W{62{WlXdYgb7HAEJB5LU7DAUmeNO zRr_y#US*ReXB5g#`k%Jg^RfJ==ifBI!d##M=6mR?XK*cxe@n)&({A94(RPUpqNdph zJefyRR8>t8^FMoS6EK5tR$T5Y=S`65+z=@e`j~TWBQxwq@m21`Fc$#7bweW*Z@z%4 zl0(xGgH<)tOJe4I>u?qV(Z91~KQu zTv|vKkW8;@sqB5EAu0{flJ=Pr4@-TY%pY6RsZfB_*>ggb7JEBTqwWe%#m~Wmc8CSz zntJE!*O#J>R5@>3_M-d%Y%EDs*O#e=!srw>tu&wG^a+Q(=dzqz%mYNVLpatidMo&U z;bvNz^h-R@%h$26SgK=t{6U7-;pg`SCO!XSn;OVz+#d4tAFDObT)B?Nb(hI&hzkjw zaWv0t*P`Y2nur>+6Q?0xSPwG**TQsJghD*PFZ?ATF>y*>Mn;d^yU%So88}KZP~lprD=E&stgVKKsZWMBo31WJ z-otGk|MoU9;3k?-@V5-r@+}qEur9->oYKE``N`!cA9UUGnTH3{#b@2sK z&bD`eaJ&MbrG`S<{+rH)TfC&Cq-=P2IJwdOB_Yzzz|>|_mznL6(N0h5n&Q9qLdX+A z!TL#uT0#PXGlYiVLrW8$tx6d8@R#xW$t};xI$OdNM+ZX6)%R;%r5d#pRpt{t%%E&}u9{J5o%d;ne6D!VvoE#=;rM+uS7H%vmLy~~pnT8c@Un};%B zVYgw)61P0hCuv;2&zuh&zQjkz!s-M11xAR5ZrX~f!LW}(#xKDB86mmLQ`)=!+m72T zYRrWmKNS^n6TbfW&?Sm!0$NITsZKM^Cp|;M%x|5kzO5pqZ@+z$NqOu6XF;C(PBQPK zZa;8M*sH0jdG+|kugVHXMwcP_4t;O}AR6BMUJ%+R3j0-+*!r*gTeoC`8qvSkNH=ex zU(Idq>ILd6C%M(Geq=>nX&d>6eXqR>%bxhZ>LAzCdo9@tZlf?yHM(re=NQH5-BV!( z^5d3FA0Bt*lnMh_{Ajx1p^4+zcl@?(>D~p;Fi!JgUEDIOY}!Qzg0_ zO-D*|O!1Y?FuLmq@jLQ!Dr+XvNeHUaC9O0$dhj6AI01%@&IQx%qpHmO6>4R=zaLR| zZv1%r`5g2r&ud!j2Uf3XG$wMR;~q{aMalosc~bqP%vM!zf z?HIZx$hFTo1!)e`6&{*Q)Wfsr>OTKTE)PlmnPk$J=;Q6?9X%_pir>>7Ka4BZ2Ow1} zwJRFEi>n);P^y;b<|?|^Hoio}b|=uf-SQQk`~wf7KsWxxCWQ4f#GdplxKJu9y|Xq@ zLOzIda&oRfS;I~eTaipGZ$K*M*EmAs4&1gPPOV?K*i15D8kGI(X z#L-(qS_m#plDP&tMPl$xG-i?beJDy4>HmsD;h|&Ny$W^9m9$zD?i*JBzu7F+jZ*#% zCuEoV{s=Grkq8s_VOi$LK*fTM;qU*$1(+4>l&8Ky%3w@z)~t0ILr@OTgqpIdDlcHj zO8fIzy}#T9yG0JiwC!ev=EDdr1~Pf=D29s}=;+X2eoQnfrz-2lstGlE*Y#dr3p?Z{ z0dWV+Ve)-tbrDWt5)T+aWk7v@q9?YX8Eyw#_$7$}-R#>`<&|LKpkDJvJgWU};*_!1 zd~$-uosXtWwyU%4Z3c2T3ga{Svj1((jK9o+fUPQ$!W)c4?D*~v>yPp-uFiM#IcA{p zGtnJw0$gAWD}^9&FlL~p&K}x&zV+qrCAIXkZ@YFx5m#iY!rMxYyiceNGc!{tiDZHY z$kCpgblNBmlLN47{A{Hq*<(_!S42u~GGelB(x|-7ZPGGL(bkfz@9_{^OakXniJqGS z?!ngEJmFW3_XlXrI}M1+$<& zi#o2{Fql6NQAWCybXPjtmJF+I*;DdhyL1+C*Y%x(&9DjSmnl6>XOzZZ+H{Jj{%X19 zryiD=JK4Nn^U^P0h2A{R*`FUAW*st=R-lebo-JX%>r0L@HXQZse$>Vd`yIyG#<&(B zSt?;j9P*q!RX2V2B=}4Wc$np%hbet809h&RMIUuO&J=Vj)q0ftA#L&bLTt8ho86Jo zKEv(}D1$_A4CFj+kyX=7@T&z!lQuxrnsigTT;3(Cd?Wg1rFcUQ-kBu~(_1I#RjvkW z^-qr|`ltycP=i_53{7fme15tMo{s)R@tLgQ93|m+Z1P2?rC<-|ftyK7y83SYLVyy_ z9zQ7=)rtn61)qw-jl3danxLR0F^(I(im7@Z-nXC{{@>-O@$bW{i&KmT+3)PY2%Dsl zKt?~MMoI&}=37Ao2@@TDNWNvc$hGA&nsEj_bfzhPlRth=-$Q6#cHejHwkEGTag>mY z9|fYVUSuz<{A-?fP=JVaRmC+0f`L}Ax<;DR_)@Yjbr(6Q^7;1e49)8(v1wi`?rN5| zEv_9v5MCyxh5Y%^!- zCDExHU0JLv++j9X!!N20>wzB%TiyHMTw)eQd6#0c-aZZURYyHaUt39R zm!Vz=>USI$@m8OYf`ebRT`SJ*0O5P?>bi#oK>SNxH8_^A~`nv}@fO6*JHAZFSR94O6FT~fD@`>mP9t>qI-*>r>BU`wv3g8x!II>#@*HiOJk=2D*HCqJw#tyd-o@WU2e{u zp94>dabI*wm*QAWa$K|bML?t&eArz3-o3x+!t)k-Ch<2S9!Mt2A9bP*Ww$#llq8Ld&r*5%{@Q&PX;uh?{ zhRP|Bq6jtoU7@T*d#_W)ij7&{CkxjgZ5$TYGG>9&ro(Iya-&3(`H zejAEw94$O^qc`8Z3F=TKt(;!gC0;4nZ=N=Pcn;^kNW|HblTm$8IVq)L+RfiMz~3eS z(#)3h+{95mpuTR9aF~#P2gS!*TYjGR?J|Ovy+pF4hfJUANCZn|d8+7*qf-gFn`uu` zh7g*h4Drc}x)Zz>j7Ud7#6d4!G%@A04!CI5Zf0m{>5FWLUHLHkxPhblzy zKLM$(r*E7!%Mh6I2RxQ@Er{yCCW3-E--e#*@kv@r+Rb690oyn*rVspJ))93>|6#e3 zf#l;bR>9Km1t^S??s|#Zb!O@I_18=SYhzNx^ktUwJ=y8C3`{nkr)^10yq^vOs_;Cb zJ8gSiSZ?b9p(OnZsRjBv3iy9cX93IP|%?sM84R1={J;9lujVilE z%A(*Z+hgnFy^x2#Hs9PiJ-xgA#nKAykR$+JU$(2QwvP8E+pm`Exes~Jm6ey5_ab)$ zbZ6pu%S@p9Nn%*JrV1X3TO^0x+IZ<;=K6ptN;K#)_=_0Ma--S%+18^e``%0)z!_Sa z4x(B29BOR7^VeYQE{o17sNExZ*;I4n>b2Y=l28#P+R(z8F;w}_hNXqb5pkxE3?M%J z8qDNnnTm43k5s`U$W-W85%?|4=eMMgvp8Ls6rlJcQSRm3?Lt+L(0&PmpXk&^@w(44 zyL4ppgC=<9S-P#N$;w}gU0=8zy1vXF<;XxK&mvyZITi;Py&vF9I_O{h4XLeLURok@ zQ=qzur1;BYjs%yjhw@F`SDm2j(r9#y$p;Cl{Y}?EXxbSCCxT-J-k8ca6&B%Pi`^Vw zDPnKO;GV4a<60!mJzSd3-j&<$2*x&BKlwB)R#rupY1m!fN@6A@eYAV&hJE1XcYhWY z=b0Zz#6l-uK^fI)1LbC;Az{6MI1vt-7`*e z8QO3sB2vAGO8$YfLGE1fz<{qXzOS~nre+X7A&2;DW0fyO57Uo=@lbTZR`fbWQta;N z9$XphEvhLFl`CSZvSbfgHd6QER8GUXB03-$1#Cbg;c>G26DE5TT_zThENOgK1u^42 zAv^4^*eo|V9Gv(q@^)^Zj8Y{2=@sa0BQ%w2{cPhnV~ckRjdcGFXd~m6_56LKS87C= zQBmT{Uh}0O*u$md55}w$&oXq_u7`t0NyPGUyHiq?k399Ul#=Vy9@DR;)UTeUrk@`; zAlPj_ee`AX^YcYCw6tZ`C5&cn_TTcXp1gNsXPb!5arvsF^f zY(8{2qM^z^@vqZ(H%*xoK#Jt|zQ>MniR+I=j_f8yzZw)*x9NWTcT0KOl zve@)i6rCRX<>Ud9J{9NoAO29ky;(%4MuQ%ni2>% zKS9r<1bWC9t|4jX)pgrn{vet}>~wlR?!B#3^x2hskn3G|320@NbCeJVPS1LEvJt=C z*dNT=Rm0o$tCy6MZX4deLz_m?{&gm0S<|vf2Y|;i_w+c$CF)0}HpRnILhGcW_=-pI zx+)`dKT47D#a;;wko+T^q0Z3J(IvB3TYj0Vdb*(1f37YsFMnMRMhR!Ux^C<9Bj+8@ zY}AIpJ#p+N8V=9jVT?59AFz1RF)MXvZ=PFjHjO(?Sjm+9M0?!4;kNFE zJReHfwtA5I*zxoVp4eeAbd*KAIM~WaJU;|QBFApyiW@BifD#)sjn>M) zVW}TUzA!T~gx2tyk!d_H&tt*>3q(l2hjMH#dmj~L{>2WtQ23%|SC_aqwl4Ca(Xr9c zzb|W_!7va8tVSm$CgS`UI6xc(VYT|2W22B?3S{zApVn8-Ri02xWYrD|ZT$J!T_2r= zFR5jLM|B?6uw2h|Y&1*YmX{j?v@DLdzlsYA1&>*C<+12h3$yEXii}#ujSO@JvLZKkg8oq393q9|-@b2q-`%(q|ER7C z04&G8N&NiRZ(3V;zQe~?^#GRl{QI~_+5OO+Q2x}kvO6&XiUSyqm6(YOi#}?7$@bPR zmHndnZ-tq1s9ukkCt%&p^PG;jONPc+VVDpVLxWUYO2fhO8BjH5k%Nk#ZY&};qmq{Y z&@csGB~p=UJ+oauk#&<*Bt(^pXa!pyA??!WQ!%BW6VZed5f}Y!cW1|&8-AkHSYA<) z@<&HYC-xpt{Du=at&xjg6Ancq?bZ05x)MLDCgZ?0p~3r!S)9m>+qMm{_Vl3gZEHo{ zfz|Tu*QI{dwQPVGP|DZO#F~3Ohva*tn4nAsYYX-;29h%G|oT-Rnw4 zW>dJ5xBK(#D{BVbI9bPHUMS+#iVNa$zF)JV7JN1Vfa?}evq~O(@i4zQ(^3i7X)aj6 zE#%ZZ6-m%;yZ{Ulg!BfjLk~w`yUQZ$w8tc%t^uvsKx` z!kMAW53MnE2luvL_pqVKnj!9-egkxF!9VNwr?QCd+1#V~=wl!Iq2>62`%1%T7Ezlt z8<##MUf7<%+D~zBXPc|xfjy*(VUZ~CO zr@c$?f^hH#ZmAbCTMv36%jVi);dA24rxjhKp#(t}6@yF3JLEpT88>4@sy2r%a(qRe z={BuC7qW6QE%|pd$LmLAbA9*^t*xh$$@IIKS;aictJK)r5o5w95BcmTRCM1_qrZeX zt0k4|b7(zs=mlRk+j(USYUW3%aveH<7<=V#iPjp_G}U12x0Ou@%B0)7)A+wpg)IZh zV;lC zlIoF<%D_D>!n^qmXHrUAI^|&n@BGvkNwg685Pu5{sUBt1W+@3APT9S^l7eTe8rqpv zqYn&e(CN{r_VQgdq=el;iI;YEZ#G&niGfK=S@-_ZDe#1~IlOZ*zJ8y@=)|SAjD2;R zQVjh)JEh{+8?9uI(5PO^%F3=Awz~NZO*1L4N0CcE0$%gu_`E#I&ECT+PgU7tqou6S zv_a)psYHU3rSaBmB%9F}WKGTM6_sJVPl$BqOqfsciF%$AalN(-7IQzHfgWfdU~prS za%nuvhB*4_`LKCn;!p zmG8I)t(_<&dX@BqkW1MK&MykYnbL6t@M-WFoS7{a+17lK>x|9_VlbI<2=KdAaI&0c z8rF8xAwBzwnl|>7aaimx@@jZor-c0-k?f=4UlUaZ+S4Ffv8$V#zJ8s!`p#R7 zr8;=8EO*$Aeo`Q>zA3dIEh?xmPPp%4o<$Z)?16D`j(1zyi z@_Rs)JtkYWrd8aJw3W(Gc2eegV-)qSyJ`FpP;lUDsJ5VfT7S7S0uELBn^W~0@7CJt z67l|e4~^4jK<~X6N0!GT>?Z$Szoc@KqL{RmAl`d%nvKn`{1%jxU+g+?XQN@7 z5o5;JleTLzUb|u(u6Wh{y?9lNyQs)qQ4Fb<6eDNQL15xiIR`*dXO4a^5U2^{<`Z5h zvPp(rqQ5ELm$Um;3!C%(&|^Yei`btVy30weR8&+(fMlr-#2T&e#3TCw;$i|CB@Y_X z2R=N)4mvg;+;3ZpEq;q#i|lr(ON-_G$}8yMmtZ%msq!XXyPL%}j!G~+=_j3!%kkDP z{b!jrSU)8w$TFECLVnwPdas8oe&KustSvP)F2VoAxx7A=)1e30ARN4M`=us8l~!y1>2l1_6>pJ^g7U{{@Z zf5#bjMe}SpWe9)dC-(qC8Z15L7K(56fC~; z>~1T_0r>S8CY@SO<y7RDmXYWY{zQPiqQ7P!8}-8H{y|x;^GEl}LPPWp z(%Vq7|8k?M%AV9dU2{!$;Je5p6z0wMX*yaPl}}K}Azqj zJh(*E7Q__&BKpP@ld92ScH^xYiC9&IT7JX-va9r*4puq*fx*P_3<}rxzm+=QT85G^ zhH3MwBwkQXI>T@(g5~dF>JNVWDK%X`d9#4E$|`N7pIcQl93mj$?$adAYJ+EOghQ22 z@Bf1D&TW!!#Nzs`>NY{tLsOYIPTeom{zzwV3oZ+zGIf5s#m2N4TrA;0MIU?BL7XC3 zR|iL7CM_|MK!3RPVmZL;*obF+&?k~tZz{Y;Qx9sLsGDz?y2^;KkVzlSlc3$%i!_wfPhd54nC2AuW7U?Y%;e>m znXA$Y`#g33n-#NrgkycDy#9s61QTk!^=g_~*jf z4>z9S8*3{11^%yjsEZ%6kXO%K$In3*49IUW+MR9^!mQ zBPwd#gP!t^%3_d1v3j{9##8J3Nuln8yBORaunMsW`Z?6dTZD%C@&;Boh7%a+ZRAnW zH!SeDFuxXCu(m#bgr#)$%6c(%j&a{G2@8ojKK@i7)6=7lJN1SUVciOmYws~0KM9U2 z&-Us)^8S{D5AtuxWP~_5cLxDt^U36&7%@F_wjLU{7&T~)oK{bND4B7nOk1`YeUer~ zNl+}~Mwj=Ssw{S>GbfrTuyG`RoFg)Ofm9;a5+|s7MS$10`m-N5YZl0nMy{YL&LA3I|-q= zM~geh;V)mVLaK~rt8FIx_xqGe1npNz6stLtkZZDnA6nDXqVGx`+J1fsYmGi=yI!-beB)0Jc#;wxA@Px-UXc9cN8eL3v>p@&p}&phIw z+im)xQGJAQ!?>qm8qyzu5FLnIc#`W(F`EIlD{?Wx_Fpk*qCNY_ZV&u`yfPj-@3vZ@ zKDS;g`&NtHU2FM9{scYW4LbtZK(6Uuf22qFbyJ;l?L`DK`SLEMgua6Kc#R;KDDC4W zj9<$o?Yxf2r&Kv_w3sJ}W=*6HZNGQ!44(dlNaQNuT}NDE$@VtmfAN#p5mgozmF^M& z3zR--{~^CRW}JF_WUQ0=SU?{VUsEuTwr(S5Y2KkY%h_gPA5b^MMD{SCn8)OUz?M1Y zih)C8OC4wEf~nHtWIO?n+#rLw>XW;n4+7K<0(ir?hkZ z#W4TNsv3(N`vl*vcKnyD3fx=(g7~Y!$Y(*WKzm=_luF=u!a#+I+9`xf-BCO z{)55tVg-l6`r_h3&RYt%KYtqEhKy$cQ0qS#K(&fH*2}y48OP$wE*K@aB;3oqCpjgG zqaG_(wprd7`Gl!WJy|*`_1=Z_q;P_-MNa$JZT{)}3xh++gwT60g^(-1pKS!%=V=?M z{go{t=NjsIhUHpCb~PaMmnb@8@i^B0F6EFreM%mXsdmYp_GE)70A(Uj_|4zw>6Ber z85wLGOw7!B*yzss%nt>RA}Ps{G>dC@$NXNK)@66^rnO%vJ$dPdNnTd~_lnq=_f}R% zy|oGBPLCzcqQM7tGLn}RzBv!{wmV`;b>~gohaXX zw8tZP4UeBZdE%m}iN98a`Dv!%^Wkc5ZeK8lfwj}jg0LU6c?7rE0PwRS?g_dUdu;}>hb%hQlc~+6 z3k~Z%<&sj~cysh0ZiggOh3N0~9|yEjPrAXbyX)0m&O)hE&YUEFGKV+vbI{%CZGzI( zRLFVm3&n)^fK#r9rLUuh)K4h$ZF&8ySP@Wra*CqbxDHqVpg|nk0Og~|TZSzTtOtHo z?zgS<*TWWRg4Uv@11`Lk?S;Jm&b@-_^Not4qG0x;5uUfSctD9cYR47-ruF*kXz7ex z_u-$rZ+2`0S6XZE^QWc+SF>yRMMXcge(gO+*XhXJxHRydD4KK=Z6BD{{ofJ+gfpiz z2owH~GiNYG>+f#LN#JUSg8hfr)Yeg&zkjQ)i1=gBfw;m{i14~HT@kd;&;3C59N_86 zS1`L4z`LuIT?K?}e20C;KnaBWQ@=^6do4k*@g z$u0H858&RP5AMKBP-pTCEbS9Cw{VwXyu5&aLf z`AuCro2Az!n0F`1Dx`~6M8!rP5xq`boMsGH`a*2$XXK(;j^oDs3>(21{@R%h}XVV07EF*sW-z{*#pa0m@p2DKMQV031AiKu<5*oLDw!roucesyPnUw1e+{ zZN-(Alu$h0uWS31g<~W;7e=V|2=d*!4^XM&&fOmgOGtd-c{FKViHu2CTLx1AtScPM zboocN;+?Zukz$wW@1<4Gef#`|k<2i|Ee|l#UD4y|T7+(XM<-*5cSz_Ju;F9q?xHXc z*PU(s4;KKjy@^BKUSw6G3W{>||F@~LL%PAy_}@MZaRS>iM{rY{5=voV;Sq=gcwg`+ zETNXZk#<;8PSdJG2fP!3`zefk+_B-5ek0T-PQbl#%`uF*Z{X#y3n~|p3hT#XPj+xo|hs8 z2H*4R$pLyHq09t1oeEGi(L;6Nq{1sFxr%k*O%OZ zmtN<(tfcqzX-^{kt7J2BWo2bPRK_@}*oTYeDzD#BCH##;ok!)oWh={M9B&^lb&xk| zG}n9MTK6D0G5IxiI9C#g>MSMo4Jg{1sA?I}`umR5az!!*aF+=8zhutcLeeF+Ao2<0 zr0f^3<@E$*$RY3yF-x>;Rb`k^KV~hMJwxn)DJi{!N~eE&K-e(3hw1Uhq9XI}z+h$u zHq4A(-=n3b^9*il2H^8;f^tbFM#PkrolylvB7&W8xEH|127Az0qXhZ}fAb*Wr1qOPy5b|veWWMD55 z)o&IQOG$noIy>=ho8My4R1qIv6sVc_}SSsudhuc0b86 z(23kZT{(fao*4-Ec$M8$1U~J^4t2NaO8>^Y%D0>#_Vxg_-ZyB%rtdx-Oe07B*v1i2=rjjH}KT_OoDre=@KBRI^#=}u-x-xVzAXm~>O5*Hc z91>V<0##4$p(IfjkF6BFQt}C|VEB5ISqElmcF0Lv9N{`^Hxp_9;mN&f7IG4OJu#4vq>cs^EYvdwQG-gZZTd@rzj|!zWLuw1sMgUR*uhlW!M!e*`RM z&e2Pto80BxnQ7*C3!GEU$*y?o0!aarQpuhX$U4<%wD5Q|rm-ZUJ8^u)hpzG3rQp~SAq*y>uH1JAs)zvf}kQr%YsOTx9xG#~(85^UIUilOY&==m5 zsY{87Uk<=!K#}SRjhnj`wE0dY z44Z!4DBP~i!s=$#QkRT5JUX%pRsn`+5lBL6TVXR)-K5HTB%XLF&2|ZCX2!=?t+ABi zgGMO%w-!Yjxv8*j@SlS&^G6SC7Fmz#T_;m?k;p$N_?J>Lvuk!YBV6{wgS=N(u=rdO za%ri$%Tt}4U1-a&alj$q8l1k}1>OH>6l9zN8R@PYG!8$3tny}^pSsNPl=O{&K5_hn z=dy(Y=H`p~%qbO^7qS3i$tNTvj1@?um8um=%F5so>N6&=V}_+rb_rS!W-F@yjeA0k z$IBD1dlFMvL5TYG zlXj_a#~I(hw&$8FA>GHG%P#jz4W+k4A%D)rhqmZ z0`-#-%+P3819R`w2?SoOq>@nR-85nPXWzBzyaf2|YbMdoYP!LG-w?pkGDu)nMI9 z3H{Id&-X0zx@~SLWaHfu~C%(1tL3Wxp(jaUS&@^STI$5 z{`}bzT0P^sIYwXa;^D;X(wA(?Km5Gm!&kI_IcY2ri`t84vepADX?hk?mjADBl@$U(ntDTmTzbwZ1s`Tdq&93YEdMVO(0-yT`_ex# z;n@-$-G>YPQk;m0!HNQh;<7U9Diww-ihK72P5Zo~w-2#`D|9P#?or;q|GQEu9Mh_` zwKY`%Kb%OT;|;a%jS2;Dh85$4#(9%PPh#NjW-nYHManx3^%1A*w;=kA-`+OU$)Q4A zG*5p#c0Ie(AXHjfN^-n8X+8~iMsl^^TM7|x4Mh1v)MWHKBEr$`p8fEr0NU?k;EfoT zlVcnT`fpq?w{(~WKTBmDvyf9tdaYCR$s_w!KZFE11LKPbb-vbJ7zgNo|ydz)dJC1Z+aCw6#Z9xH770va%k#f)gxE%dj;=YK!7n!XEdO zrn$nF;i2c@8Jq6uSt~GN7pG0?L5}>xj4wKJWF&L_A{1Uc>EI{6?g@bvmiwHdtFWj@ks;|l$6+4#)&2YTt3g5is@3nbCaI`su^oc^ zKFFxcBL9%$&k#1qQsqBG`NpmP3qAVEBO-{Z%<$IilkT<0m=fXb(o-F=q$tUUOFez7!@QN4l=KY#zJY6y?C9du zP2bIJ-y8n&6GYNYjE(Wg(Ij+htSOaf$x;O|QBzBj0$iUNVL~vRlg&UKi-l4bf!LGy zM#J@k|045uGEza*Y%^_JAMMC(RAgkN;2=_1K$Z`ot<+RUb*VeA5#RWcI|m*fIywqA zInu7A;~bd;ZW;&tqf^EI@BcA%HZ|gku255^PV-Ir&{Xlb+oAp0un^Q+Us*htX7P@7L#n@g&|m8J6g-h8B+D=@AkV@WbDg5wWK4mtA~x zh#h9V4y2#gj`8_6=ED#WZt4z~{rf3AbZ7W6VqZy9JPm#IaJ!0*X6EklJ70;7vHpF( z;2RjO%zVLO)L+K6yU!kkcw@L$l*~|nB!=C9f{Y6H+GvP5<8wwK^mKw@LmE;!nY}`8 zw#&>&llW0^VHkV;n$d2mjwvI9y0X3bXmgSqyiH0$nxzp*(0vu(z>qxJGEYBv=ihI; zhuD!0k%h4m6g14LGO;pCK^!1Q)>>IvVJC?Eh8hUz&k+!zJzbloKY~H{)1f?ZFJ5l$ zXns!4$P@)SP9QWUFsCqcK@AGlnUYE}!!mTf=a;DXsKg$4Q6m=zA=%&sER2ww)Ol_h zl7hNcs1@kjh3Y&dz>k4hxEkSNwCnoq3>G#xUi|*!;9};RJjoMYt1~ zAZb$ymrC#km@P3jHWrr{v!ki8v9VUhJto4D!Re7xTBpI9HI=4kOW(a>JrA&4xt1*;lM zFR$YhaPm z#y6%f^fk;=;k+xs(aq$iRuKEKsc9H0wv3Q7$herbORuVC}(`Qg>#Sy|t5-}0V&+jHplheeZcFb@n2v_r*I z{W3h9?EwvA)vZZL9Tv7h$Qz1ahn@i@W(#9X>s%J)KDw>q?DKAoSCVW^Hzncgq$tVK z%Dqx}Pz^+@QkaMF9Xt`bq09jnXx|X!8`bqSH6vKBuwA{PWf7;wP*MV=zz$V!*!Bwl zH7p`09~dTCr!ecP+`_%@04R)jO3eM zFRwk)^X|E1%Sk9fajo0xAfl+g#n5d712?G)v=OCk&vj8;{vIJnDYL;9PZ_*0UM0gm z%}UVJ*49o`Dp7d~18eSFs2(8G=4`TazJ~LepN2?a(i-X;+178Ws}Jo!Q9siK(c%#Dwya&vK+^aNK69~q)j4*PyP3iwiEl*tkOZ*|HE zv^AiKrUXI6SQEa7h(8d9UIpa}y1lr(1meNL$^e<1&M^2wD?*deQmjHiJ^k?T(o~{K z9qy-Y7GxS}B@=wYReMY%P-C0InC(KyZZo#_x%2ydwILm_asj(TOc9e`qHbyu%3B$AYkXWsMXcWim75n!cSimq|asD zY5!dra}69^#S#7*jOgCp-tLicai%kcvRGX}QckDUFsS1}fk~^jpjGYxm!UY=Lge?! z$uH7cpA?aNV|jpZ1sY7&cwGj2y|l4Wpfo+C`xYwd@WI+>hG`eDmdlNrT;0zOY}ZFh zu)tw4s6wZ_kmT-K-wWamM38om3+j$9JonAt;kE%_XcU~IRpFD}kN5YvIZ;Oa19sK` zS+N1rDkiv-hT!C#=LxthNCliTGyhma-R)ISOYkTMLLv5kPeZM2BjN;_hRj5k7w_9 zjQ1Pgul2))!sNcM>pYJfX6ghs=j_F1CzXf4r|cw-U<_pYmyrZL|F*uM)K|;`TNmEy|7rF}j)8${# zFY<#mJ>&8!sa_j>Zvjc`*>*D(eF&c*OQD7EOLhj|JFm}W<0;G>V`gV(5!ct(lYA;B ze++BD86;{b!&e6dK$%6c-ZHiRGwcxuWe2OugcE%ENe;D?g7Pd^xT99*DMIa|P>0sn zP_8i`Mo$aj!7lixTD`mz0~NI{lqJesfYa^v_ErM`u$knKNihSH5|4!!n3}2N4=WG% z19FtX*)G`VTLAKm?MtIsMq1&qR~7drb+bH|^mfbc^1tL0gt)+5ugSOMYE`a`y=#(ffR7Y`N); zp%k>uuOOfJRf+YZ4tW|ZTsOd}toM>Kv2!4SIwc|k<>?hl&p3@cQ&A&W zK!K}@6^!W`RiAW?!eK4yb9Oa84-&`BLTBgZ(h09gd2MioQBJ7@oN4HOfc?jUWe;H0 zlW5#>zX1)T8(vVQP=|;g{}iD`@&Hm5DdD_&3Ee|JR_`hZ~eVdZU{>n5d5FRApWAS`s=JxkC%VqNAmU z+$ll*qXttVX|uDljhP6dCuxFJSqh+qJ6NNOHZ%#qsQjv&N^&n>uCVtQkL^JZ#K<5Cfh@t|8de2U^ijT}=x}zS%O!rIMp>-t^Q21(C=Qk->7B#` zA2&BGxI533B$$>ErLg)YM@M6svs5JuPEIKH{ayoORHzUtQzVDnW1w0&NO69BAA*7t zz#%SNgYrVvy}G*EO)HpkgA$z?9kVB$$U?H%VX_DfqXFUNOr=i5UKNPL$vfisEd^bM zo(V&=0u1nGF01$Ah%pkibND1-9#1+*bpP1}2vm^3i~Y-c1>JTwJNpJQ(b{Zr@r+ip zZ`aY>9Iq1)I%udQkw1a4O?l8v$O`?J$0XV4&qsa&37f&0DRod9n|mHyFhq%=rxPej ztKgEi2*t_tGNp=0j<%Pa_7Vo@uLk?f0~I;>NF>wfA5~K@oj&6Hu^Ug{Fx_MK;VG~+ z+*z_6odN(#>LUZLRpJh~CpjB3ld@eE5his`x>=c;oyB*6Trv>Q4+gkq%Qz|v@AS|- zt1zyARBDz>@x1D#&70*kF%1?3B^8+x-Sz!IxW?5d$j%N+N={Y}i_FD^6fKKF3FM&L zQ9Er?i>!Lsbc5wJhi^$t{`D#FAh{~SP8-=Eq3@}uqo?_@vumCe5<0zll66w%x;d-` zNOsBH#6(;>Fh*#+1j#!}0Oe8S;Ez9T8Rw3Ch7f96DEprNDzG;jn-5=$qUDf5)tJ4`-2%QXITF+tnSBFNWoGc#$-l{Py#&@&KXu=|yVPEa)9ufRu#AMY}+*Y?r4 z(TcfwHd8B$c5ph&ZB+Fk*C5_^oMOn<6F$6qd|-fa%7qJi(B`Ep`fsxZo5`Y~5pa*^ zd1a}i9}uZTmBB-dX3pdBHZiY|&yuTE^2GSy;gQS` zBoS;ws2NAUgQdi(#Elo}o=vyzZIVurZg8M2xKZ>H38J@Xc2F36nPQm~<5vU)!;(=? zqAl9R&xX)wmr%SuGH0W>iGg3r%d2$POL#-XlUuN6l|(ZzFf{m;Sq#Km`JdLSkwC3G^r3rIG=M3d|?%X31gjiuBMHyy*AopUB*`!(M zLN&fptCA?r;8wFlVnInMiz;r zhX98UFO;~miWH3&As6&p=+zO9?xErZ%TsL-F!(bS$aRFSGF|obZo+jIf^F?pXWkWQ z5zYdFc9m|=UvtT9$bvXv6N$dz1kO_j{!M~tpFb(YFwx!6MApmZvUnn#hXM_1KjGCP z9kPYNhdV!qb3q9o%!v-*Lcx7=1%m0|25?QEtQg#z!y$WvbEN`oGt8+9#e$GHO$_6y z-5TIGD?N(o`PYxW{0QYQul%|sLfs_jjOps#ZqjSC&H@8fSK`QKzu|%gV=3nkIx;HA zZhKD^Hhznb1U~IrL0Rx>FVgrA8k85$|E&c8VF~46g5T|;JE7bz&pNx76l3SGz~)BH z(>jZag`=sY$+9_eLe}eR_j2{KguEq-T3L%hPae2Mcp7ewrj9jp8atAbsf6cq~N~R@zE|hX|IjD-*=(FiEY=^<-$iW(%wDPLj8b5#jyj=!aD+b1ObF(es60jg6yc6G!^AAixnncjh zW?>%M;lGZy7%mgF|9~W%YyczStGfNBV=&p%v(}=-@MXKo27u*PYqhmmq}JVC90pq&vT#(%C- zOf&$UW9>TP>%D*fz8~P+Z#FYA?eF>mK7zNOrC1&97*P>WpngUV z7Z0znQ9+IDhe#NjA$!$AiN++LxWad z5&9)Dt)R~7Gs)wY2?lNEcXS`3)uaFEl?$ z1!yE{8k=kBRLa!MfF%mn3Y>8wMx#C72W^4yeohOJRVMfodPZ>U{R99tJD@nw{vj~a zL=cN~%1&Eoz^5DF{ zyVzI@rnJIACO)aO_jtMQ3sFC4ez2iaNGm2ImmwcXW3)c;~ z8xUxz`|}p$&Ckk$2su3uhD5_>Q?h&&AS0=6B?&1I#D!Gl#bRA+aESr#)A>&Zm&K%8 zIBk+x1y_Mn>eD{1RH#g+T5OEvEx>$e2#C7uXuCopM4PFFD@@%S-s?<*8I?Wok2WY9E@8?J&N-Tzq20rm5RaS!i6uo1W|QB zr`J(aCzQVhm2*SW)2Zb}U%vk%R%#VEi1IQv1QU|670)BrZm- z0BJGEdNAQ1danCRBuK&$*XwiFn~wx^TK-YxVf>_|d|D;Xbh!B%73-SZQgj#6K7Oim zq4%@$L)|JlY|l-3Ur|`ZZO*?^3{gvY~5_j~^3J&N9CO};aJ_X@6I*gC2RNb;cCcLH$PAu?^x z3Lz3+#Xitx>#oRYQ>hbhE%UK^@~6TmBji^!_8K~)VDNcOTQNLp2c>7~*e zr%T;1VY2*+oJ>yOubm_e$R}jM%VGh&{E#4K4z?`uu^Iq@niZ?p*L2|3Q{U^f1n}o3iZ*0-fcOfGswZvLnUoW4b z&fJyDl3=3!QM$WDDl3*m&IE$1b$xa|W?j!71uXNb5fi6VgDi9kn!{H!Ach+8Awf4&>4h+e!VX~P#4klfB0+eA zb6MtSy(I!Gm)o&*+_W?|0qM6LiXPnW2yPx+f(|1U%JVbk-Z1NvrRK|}GC#A{7S=rG zieU--bY~nSOiW2ai%`gtXZG{l6~Kf&Uw}Z&u3gg>ws8hP>`@$zoT6jY2Vk;c$jJ`I z4Nfc!V(UG`x5R%=xAx6zALM{5g0;8+WDPe+Qwn<)fpak=$^5FRkUAF4UKgkcNRp7F z`n)m_sbzDr9p~$Uv@E5%XclZ}s*b`otz^t*Bii6n!Q(U(T$JA(+ynxgI9)6Ltb#98 zpa((jiy@K51Y7TB6$RY8l-v9?{mOr=NZ; zCPTF{FgQQm(`kw2cAGpiG;PRZs}YUxJGw|W8toyXu()}nNPRf=Th%AhA-aH?dF!ku zrcssAf_WO6iW9;t1FhAlPYyr?x(4=w;(9i$PuYXajDCAtPDo@uuM^%rb(|ds3aSKV zO$}Tn<8VStykS;N7!qB1iVB-A*eW;n_J$+rMsM_h)a;_5q9VoaDx20BTo;M@*{__O z#&OdWibo#^_I`t#89gi;sFf5z#1`{WVG3D$wV-FQ)5i3F^82TMpAi3F>^@W_+msV^ zNpx~xZu+a;>k96^FFhYiGL544`iWl9p9kE@I~5U--zO+AbX0S4zZ219P9f61wfyn2 z!{oDWJ_9azS$KoYxpqE)$=!{N*M^g{#6`Io{FMA_26}q+Z)uV6lruPk;;BT3|ME#{ zwlIs<4l`e>RI3-GV+e5b^S|1_g;xim=TD190sTz@rj4P*q%PgFmnS+HV_2gfU|8f z+YzID7s&Y=8p!PI?b}wt(C_JGKjahwH38T=Kj5M8t++F{l0R;~k@_W+L+kvY8)%}= zLM)tGVP?5**w??RIsE&AE>L*syX6e+`l53>l*0JhD^cjmk5(WC zMYjAlySEDG*Pw=q)hwgQ0PtMuy-o2Ec3y?^*!dn*93#cTR8;f}3yCt6;Pj<4LI0{) zDd{#sI?`vX-TnQ$dgyxu6ex!b z6}ws4@_|18+*xlPQ{>NqLY$H>EE9aMzaKXVx#G!#_j?*=8h*H5XP4i{SY(FDSZ}%G zYX}!JvNH6D^M>rbOVF>$lf?vcvq^4Q)Xg1V{FSbqx&0_ow9|P-`}nA@_@?h?g86pI z7*q#f;l429yRz*4jrB1F1=rS&{pc_h0fQSoPaOjzH79Vc_sbVO2?9+6`~7x*ZO|pi ztwo?_|>j9}Fy=6JX2&5va|^rR8W)D%a?hn_xU z8MZf|{d%bUKSfke=Q2N$jUxYthN{`lo?@cb5#n`}u7ZXyye(R!{(@bvKJ9CNeXt{= zX6(yA{PR#_anW*K?swTKUx!-X-M+4@B2hL`;yXlLwd&x7^r(mBYoPou1B=w+M*&`N ze+7l;97FbIcCZi3pKgyEeH$J9rL3f6PLC(A1ggV}6pEvjU@&YIIXQ`ZBVP2a_PhA9 z)%-R z!CbVY^rJ0}`< z`n43P!Z6~k)b`ROfWe9ksk*Thg65GFq(ZrCcjliR$I%m#h8NDAUE`UVD=hS+i`J$%hRk^72EP zwKC#l!oM)cxbrJvhfeB#0-|llS<$0^+ABU9`x=*~Ktf(cNML{+5dzw6JOCww?UX5Ry3gi}LM2c35C;n<}On8KITxNIJh^ z0y1FSvrPfl3DcDZnKx&ci>v`{!r0Nt$qn_m8)(QJ#+yD(m6P)h zorc*w?!$Z=7}7!G*yuX=T7}nfl*RYhP;d;(=r2Hnld#=;~;hjA8^1IytK4ZZSm1JM9J6(%9!*uUS!%h~86ecb(vTAW=L-~- zfxgr@`t~z+aIY{-Oe7=uisAS72y{8s@vY%Rhp<{C~%>i^+o z+&oggW+Cj(fIj}_^3a3{f5Z+KN!X2g0b~xTfzf|bW%F9@I08&AkduTxY3U@mMCFZ; zZ_fpR@>v@cJ$;&U4l6TPhGpSTWP<*(V&KjwDszkluWAXfZIJ4LDsbSWLMEE=j`oI# z3cT2tGF;tTimjMAl*e)UU8q34SM&BQ+u3`Tlk4Jvw8bl(vFF;q+Jos#&CJ&JcXkA{3sg0I z6y@R)2Mt%qmui81FMIFskfXV&$&V>bftV#tL2W_Fm%lT+vU30K`*+nC@Jwm|;}A0^ zXChdCmdPQ}&njrvWDp47IWB$fNpg*k`>S`QS-o z+h$Vmr@amUqvCuE=a=;*An>ivE)YN=%mLdwqo*J6vG4)``&LcwNG>g!SmY^~b>?~S zu?X`8o*H%f2^PfKc)In}m#XA7sF#BF8cVb@u)U902yeP$?upN0VkW+(mcK1v1M%q* zzx9zIV_EzO{&5t7$>-h|zxA|l2wGH<5Ium??)5jh`2@EkQHv0COk{E_`+Gn(lDq_j z><5rg7Y)%i@4Z@GnDtM=Um=Wu-XY`IpZ*a2tum=R{7_A*wP&fJUXPGx)+lY9oUKv#%v03|^c#ep zm5?>n#J97U1fnL8jiwZI;zzTqlJ;owNL*lnJ#1o3j-P}B=+4WU%}785?= zstvEUmlDx>^EWK@;=Vo+8R{}6GOdzvm|n9Cb1U)xLM1{|pg`;Kq6nk;EMaTXYHQ2Qd3sex(pNhU4}) zK~M7mu4JXVK9`K^wJ8jGt(>FNwxW1ZLlTMTW9}f2lNWZ=Ou^%S0Q*=U9tw_lI-kq8 z;g>$gG^L+DeOfm%Gz>sRM|T~p`cxGzIpOnWo&;Dj>$x*uds z)qcHcyRIq!O00yGIEUQz=Eu7(o9}ZxJA((Cn)QCRd%t*c?^_;1YwkOu6w=%a8)k$i z3tKE~RaeFcE*Ww=scw1bs$&AF#t{mm8(fEvBV+vXeeQlA+EYCbbkN-)m+pGvUT)NO z;TR@0VF^TA{YPE+&)=ZFU5X{=hgn%$dtLyD(b+k6_-ER$sw!M2wL<6y0O)cgjwEgj z4OKbWs)Oo!DX@xhgX}pus0CdddlXvPTP=fsok$5LRIiPr63}Gw5S>e zGpmRoLp6UQg_4ZAiJx>en1kU##BokjeRPf_7#7GaM>#E3)9Jp|(5QslMQVHj9#g(D>SIs@b*)xP9e2H9)Z*TfR35F9 z-`ouJiUQT_YV&fm>HFIaO_oQTbk*MLxMYIK{(psKNAQLOi-4&ibt%)WFPtq!oZ=#2 zO+>%i+-#NRnp$(}O}o12!;+IY^M(YXF&+QlV2TM$Gcek8Dj5P0D+7n6SjP zw+l~U1^9?b;nywEdgQ^XK|oDCen3C1kXRXO?W=_I6=&-N$ad?*AS6gBBJW$pS#_*{ zlMl5^-qVmJjoVFldJY)oxm=@LdTI$-XqPa*OQ-`XbZTJSmVG`1{00jYn{AXcrLKV6 zpBN#RTR6Oh<_cB85lDI#=9kW}l^*f0*@$ZzYSu*%b2tj>l%1;(_d`yGhnNXWnCiNb z<}3L<+ShO}{nng@^QfK4-&RUS;n9;U0Gq?fKWZgO_+}aFbYffAJu@pS51{H?X#i}8 z4C7Q|_@R`9ghX_0t&^0WpGa?G2;Wl24m&;))x+H%;u7mw@)7OBVqeB7zvZVQB(xvQ z2bV@;fB$i~pHfsBCHKM4>%PAa%2Mpazub27A#1~gS1&J*4j_Ebu~1VFJMW_VFYR5D z0xUWo9VvNLny0OwuG$R!n~K)cGMg*Yu$%B58|5PU_W*tS1%mo%KNJ?SbT3vs{E!o7 zZ34{X*`FVCNLw~c>6}9$XK()=_n$`7+Mw?}`^x|(i?PC8@cDlGODS0+(VH(q%Zavu zRh}qeD7jiUmw1n6iTMm$fd-Wohp3n8%Xr2YP<_nI&K}QPGB13z_H_h{u~gOcT36Lr z^1``8NN~)NU6*`%0Ti{>ud>7O;R4S48^&mt7Y8W>_c{kIk%(tDKL z4Q`^30`fJCBT@Hkd@k}1huVOt=}Op8`$#HEI!b?Gs~ zUr)R@`#+3&fD}N_190(})i0?RxIVvJ(AGPs5>hC;`$pWp^w)KYv`|BP%T2P1QK#yyjw#wv|FmI`kye5QgXDPVR1(~R z<1y3ism&E(hPE<3fjt0Pi>aZ)T>qayRk5y&(?HPA$x2+ud`ML(HeN zi30I&nGEx|yhZvYvQaMNtwS;A)_f?WaWy3kz?$(m97V;qTL{kUyMi^^ckyMX%@G(4$sIf8@e+-6OkxB zO9rC;F!(g&KGvB17=r%`nC)=?*mP4k{|8viHo@VS(%0WV3rSPt^^FKtxvFt>brSnt zulYRWr^$PnrX!yiGz=%IX)w=s^&V+dH~oqiQKJY`PeJ z64E3r5{X1@di6b|i=uON9|fOgtOtc6vImS9XnRK1@XCF1oKL;Kn8<$Y%>Ta zye3b;@8{PSA~(cqBsEM_R8vW|Bfh^NCS(^V`>Y^~s+C?ag>xM5M2)LUmqM%)G^BB4iZgq*?O$0`Jdhj3FBox}Ci_=GT8%Q8Vq@iXZPq6T`SY{Fds9hZ%AHgvDa5gOw^R=kxv)wla0t0ak zB(eeW-ffGimz=5MU&RKYW0;7U z&H#4Vlaxa~N>h|(?Pznon7N+nFIEknK>TrHE(*h=GF}$=HDy@e*OC1F@MB>?X1Ud4 z1WlpE=2{kxBbRgL^vQ}&*%)a^&WU($8|(2>892?n00%3te3q_E-TKe4at|=~9am7j zTrwveJRnVQ|59*hd=De!bsP##wm(m$P!c)Kv1vad;y+XppQ4&H&l*O+F zQMiQ)Q6NTxDOiu?qFdlG_5IS@8~w7(A%I95T7xEbKEo}X&ayI}W@q_#u7Jl~l&RV$ zJfs85e|9-WtfJy>XKhIWLVg=wgCa`9s_15?T6+^n#VPmx;n${ph3WyK_W4kv{#93C zWnu9pk+x&wfv~jxn-}&ToZtoC0sXE>8*!y6LA4YFs#I&EAt9fA&_Ckd+lavP>&PB2 zg$1LU2yCc87nwKskuqgAc$JP@zGb4~Va{lX4wJAW933K(LQNJnouIu(u}SkFr-1E- zioqMnv)c*|p3PZrb;~54a2E4_nCi8G$d*&TMARmD@BT$^A-V%;)M7%*oKnayv6!QD z8533G3ETxtE3#-i5SmYj9){rUZPB+p%7-up2BZRD(UubZ#%HOQhZ+N~bh5a?0u=1g zp`l1c@zE3Asg3(mr5n@z%ggs$K=!Z%YDcr)AqGWupwW3CL|Pk92Iem!Y9eI#3$%=2 zcr9I%&UIIXtgKmtq3~%UUJ2MXK9zZrih0FD23h;O>w@tX+Hazai6KpHng31-5Sd{J<#{`;dg0+ z)iN>?IOqRFG)xv@SSJ)5({+u`k;wchRMa`db!}$9vc=RhctvSgqa+o1)UOB8-(tO zE1cl8X;Fwa#k%q@JZ{|`grFg07~S{BZltp2?4SsebE%x=xIAy6FQr*==2=CensdR! z3DkkXg3wTz$!e9phO|thant~Wb#VkFgxNu#DI9K{JYC=2T^N{{IBo`uVC2KDp?OK= zRC`CqCJ^TB5mHdJ*WzyFle^kEIxchZ@p;UF?NAAr5KfH?pK}J5rFn&HV|TW|2Uj99 zwWa0pqlhd?I(m=sBtowo%C=EPzKNXpVkfYNwf0blw{s+Jg1iw{V37Cu+SeDA?Bk7e zeOzvq68l_Xe3#q3Q7wZv`NF{j8n-uR+96S%Eq1AMVo;yt}~@dwL8VYw~@Y-g!gP> zZ94QyCq+cRig)K1Ay^70fYPKfA}oTjZwyPY&yC%}7_MV%)rc|`dBfhi*gS2TLk7__}vsiml@C*dIpj$LJa z7LxOtUv(gZhd;vaMI>~ZKT-_#g>%h$y6^`8=aV0?#2fHim*x{Y zF}*y$eR%#sMW4UGvwA9-q05W8m^x5jc_biH?!^mj+slBTu$$fVb6a2AJ!i5rt5={E zlHPc~bDi-U^9%M|7^QB`fy#E5Ee94ASBz1kU(Xv}bCQaFD^X$!-<#j`->$r$J1n*4 zmG|wZKge~oel(l2dc*egX_R&qRz9*K|J42w6WNP0VSoNsaev)<%rP9o_kl&ee*Tb36HSS>R+GFay={fVpT`Ap$7P5xjo)P znW!g`Wu77rrnIu?6iC!P+XQMybqZE#U6APKqX#tKP%qw+BQ)UVo8uaG;`Strqh^Ot zr0b^pj6`L-;){i1jzZ>6|Hjc&b<2nj(BF)~1UDU#f&N}oIalO*@q=sv)T&YBFMk?` z324DY#vlVI>IXSOnV~Ou+@gdWmZ%MgP7-2szvdAT5Yz!w>vsTj2ZKs#y4B7gANGIq zkVF;91^H3d;s^y{Yfae6X!y8Pw1f4?JOs1~Gg+;AROMn4a>w6arZTxkt@Xs9H28?> ziMX!`!SUd~!0}6#2sP7|)xy0}?6C|FNMyv+dM?2vX??^`#f;uXe^7*z$1?p>rWF-e zGto#>2W}y(iy%-#_8GIBz!tg*Tc^v+$~4apWPnpC>4MPTDc}g>eaEW*3H7hr(P_<# zc*173+x0ZU$c;ZSft+|5^?&C5v$eEZ1J6~20&ZMy#^d8{ZSDO0N>!)AKiY2SbnL$T z?q9KgPJg~L8U4iXn3N8r3ccY#QEh2(oUYq^`;dx?3@^Qcjs0o5^5L?{?NDBSlV8w* zBQ9^CLgaalcv5hl$SX^S<=Zy^tF{Jv}4ONsW53`qZaRBtoCHY%cd-XN1=8 zICg)1osHCfSPwYcQl_Pwps7~Wc{rw~C)xh#dz9*Pahuzs?lHwv*d{SBx+*;24)9=9 zP$txk$7cUR&M>e_lrZRsWzKy=lXwK##7a8znhheT*nU5zI0j8L{ER9QB_o|kWIPWs z4coSbU6UXCg?@hfipTi(nswi8?_Wu6NhY4#)oli!MEk4Q-W>QIP12vF-*Myw$YvAn zkC(Q3)b@(WC1C|yC(|04=TQL+D!Elc{x?kz0K}Qf){q$?L;$0O8VLLFgi87N+sS=F z;!lg^Q0+5|b8;(v3C!IDpGZ_~Hkt$>IsX$`Cg0G;oNXCgRNyxc>}Nw%iHTp2;o~A8 z6q9Cn>P~-!aOIftY*bYw?OJe{QHzBHwF2?Cto;v&-bV0Et4>#k?M3sud*I^+7auu8 z=`lo5kKf&a;lI9uV5u1$iHwDvjE?xZe%*C|EWwxgYinegXpf|S?MK}Sh2MO3J>i_NL_qh zE7YMa)>N!Se#rHan}PlP7!4VEir6~Y-I+5asXX4Z70H`iWnWTCo?C369ZbXMGN zipJM@1ua2{L8W!-J$~48Z`+&NHX;YE`HKVU>-y3;l>A8-_6<^!$P4ArSwH&8QpwC{$0dVq2 zMx|zo>XFFjtH0r^$PebuLD{ZXv~W1P{fX}|gS_gHj--h7^yJOxrO!l`p!xa18Z;w_ zRZncLs6+b*ILw^AhsGAsk1=nlWKg?hxG|8XpTVxgYxNqpVu1Te^E7hKa;0TQ@pfan zr^hV3t?hOYBrO=y&j83(#FYS}#~RK%q*mP;UWIf9!7v0myn`5uv%w?m`u^C*N2$eqSxB4T1c(`<=dO+z_+nt zrFuW*fK@Kz`}gmJ>g7#dix{r;(ilF&9nu{d6}+U3q(XMs9oXqs^00>`H`<)U+^buK zcN+vy$S~9150#u9Mb{SEKg;)Zp)D8`i!GsKpC~fu1;bcwuiwhEWuQ4ClrS*3Z}nmY zyZWwued6lb@M`01toV;oPgpPd)cwuM*}T9R85cG@xyAN3%n>UZ=dsGEY?!B7;D=gUYidL z_fq3hQnr_~CWDchXGfEGG9NDjZ7SW3C7W|z)Nt&st{xdOq>xU&{}^OZaBlnRRm4er zq)jYKlVr?;geNdML%_|)8%9$US7N+K6pD*3##Xkld9*3$Yj*tgl|V4^#42Lk_;FQL zkz7jOT~!M!Mpc8zIsT1|tx=-#QP}UQY7rVp&jJ$>yjVe;yPEFYO5g+5z5*zC=jK-3 zp~W?5`!C*a=MIl*;kN*=_IyKFq28zn8Ai=xYyw>VC&_7p{xfEQ{;5n$8Q6^Cu9Ylo zZ3W33^HhXDrQIQfX5E3Zh|uYfrgTrtSY(0^mFu3gE-Ig*+Z~CX=0i--jUgCd>-*90 z8NND!_QMMx$;z@>r*09r^uE;Pc%#waty?~U>efYQu(K^gPP*&He0(5^cAv^AE6u!w z=bUlhLLngwGyIDqNGaxnJyUVe^H-7lBiW$RB3<}}xdLNbGks2(C7SHsBxLL4{xC@c zNl+3Ci3{;KOpp2i&T1r;7G&GRV}G(Vr!5$7`&b$iOofCzo7bu!`P}a`yUQNy8?-noV;G}3!_#?dCFAAJipRJ-;Mp8 zU%HTbPPREa4##tqlA8T#Y#OXTNMILRUaj3$8) zj;puC&v8&cx3!69a+*oCB+<#*Euo~PX!w|e|iiO2}UtV?ifc9Gj8TX1gT(J&fo`pN*+J}g6o1K~);iIRqPuVeEWFp5z< z;`RQg?oIq%K)Jr?~+|7twQ*rA$o zn9{X&LDqFU&$MFUGtS-B9qxNBD*;~`qZZHf^86`AoDXUH_eB#Xmj%rDeo-mKvizP& zIXPH;1B~3%ie+guI;7t&elI(U%hk~5l*@n7uTnvJ((^2#@D914 zSXgF#GP`k$a#lc&JnQVi;MqJ&m%Qc&HK&q|7mlhaao>~=yEk@*6nQ;QXnmX=81?ju zGyen{pOKGv!6Aqp573`+3WUO_4O3vSaTD~yw#!^jCGKVIf0Hce2npeTkf^8e#;`}L z685DXoisY5&}?-&zQm`H&D+VeV({}f&wwF{Ou*~NG|914Zl(MI<_PfbA)rh>!ok8) zX1!q7uYr^qcib!krI6>*<>j0WL14eBpb}FXMdzS3s9R*!Rzi4ZuuP+{wtytQ0nVBu zvwjy`jo5+%Y*ca0jd_^^qe#7lbHq!jNLy6Rjq*`fEK4fb@0G5=VW6$J2WcwtHHu~S zx!H@0i}7z4Y?IKmuLnKugV6=v9EiIpEh#bEQsaN|z3e9Jl-ZHu1IBfG+OgL2a?kBP}9M(O@42D)^FL{H*6*SSa#oQ{7Y@-%NE#V2obz)FyBjUbNnp);QJ0g zwVR*+zF$|L{VnFr$)2sfRlCvsMmA?{ z$w4o3P4&nI9g#jasHS`VUijUU^FO2gQo{^a*|03$)Fts7wkZpm{m_0y90rz{RFn%xY2Z$W z>r3N2uD;)XiXe4Ox5cn#0%ylOT(Um51}IAu$~AvIXa9yG*JMbi3XmUa5IJw(+dm6xG7?iPC0m7-*e&N&wTjKkIF+ft#Xz1%h&q8 zh6INJ@v8ACd|GS1$mTEEGW9)n#H&Q3#GSvN%)W&8ftZNHWK9-imnnITa?SEbEg=W0 z7rCa+@X`9K7(?ESGr5R%6Pn(>9%2Yh{kTyvqhQ@`!5d6pz0k# zOj18X6}(&@Scp=2Y!_YiKfiWcG~S&~rV_c~H^Q>I^c<_@eHHZ3=G%7Udt+_GaUq;K zwx~Qcd6*w}Q-O0=e`?d(wRxU(NxBfC2>(N|SaBk3{r#Vw~>=j~9#L zIZ_z0Rwc!62hyJK1u6yCu7U?eYU2BPRZE`tXD(71yuo(-cfoim$-4_b^2^LVyqcm8 zq*CspVlMG=Cb}oNK0)sbTM^bR=u`j6P%?88m=O^%&;V2w31voYpCGA3SJi~y^Yf2M zD|Kc^l$jNZcD{^=apYATqjmhj)#f$50KfB1dsLB18g}QUByL<%ouii2bObJtA1nV8 zh<}2?V5_#kM*XJnG4&RtX>yb)-LClL}R{o%3`G$>!C7ghnnN1GE5k7;b0rs^+{{N8m z7C==-?fN$28qj?vw^e>2B#RrBmWtyze_US4f>!vlk|K_QTl%aIhPH-2S*waJYehStSNb26MYdov(SbA%AkFcIFlQrv>k!JpqE`PF} zQXrV>MA2=2Lc6*>Z@m;K>;5T{`Brq?k0V&!{N`Wlc6uO-9ck2yu~-Sv;yUKs_GIym zPvL&~UoAjhRy%lg`e_ptABW$r{X-tj&2qfxA6qX_xH!F%w;*tiW|?3LcX}DDMi5Bzhd%UfHhl?9uR?wMPUl&LDiC*oAdko#o24FQa6nY&~~v= zP{IR&e89n`hQ9X#SxydU0}S=rJqlL3F9JhCI#5+xoDOb5uk7)B?oyX#*nenrkoHwN zKt(u~2nSe-(tgmTDj9(0I;?NGFH- zfV_%rK6vxz&$fmkQ~lr+kT z*A8vK%Ek^)$iXN7#{V9!`;AHbm%v+o&v|H^zF9GT4UZGUp7+NXINV7ZhSJ9b@fv>p z4>dQby_Ge;Yz)yX7+4gI!g>2)T#nmO7%~b@y3e!z84rY^^mH>FGo57gV5Fp|{~hUQ zzexO7GDf7gyC6P+cTJog4T0hAq6>%9-e$=*;4R({_M@e__LH$(kMe^Xp#SMDK)f(+ zCu`G^Jh31~hMJGC(ckCwV!c&-dTN*1tamz|&T;6;;&L(i)k_2ewOR3efQbjemrn7 zBgcQ$X4{c&k(z|*5I3Jm_G7SmeO@&rw(T1l%2h9uLBBxUB!L3^Uqr~$)SoS8spOo)KsaFt%YvcCU|Kj#%9Ns47}2*w~VioAKT^aEy-SKl&?RP7v6n*}oZ(+?pb z9-QsDQIJ4wVnX=1(drZ4Yk81GW*pXn)duV_pW>;E-75 z*<|H!31fIR19|cORqkpQNAbM7DknB?vts%YNp@B4w{NfLDmWjCW~%I&?>z;<;~@`5 zq#G$EutVPR36nsCB8n6uh;SAF>2hU#3h&p1pXf`({QZX{I_ zK~T_G%TcApU$eV%kwI}!lAXfcZ^rMzZ+jD7xwyQ@qv8W+Z`PIx)|{*$u@Xs9wV9!p zF?pV#1%f2mCp6EX!)1X4!8nY@H{mp!Sq9(w-sj1EM%cato>pV2+~ox%Gl=|Hh7zhh z__dzSD)zrLs@eV5=YV^sx~|NzF3C7)R<$Mv!M1JFtImDZ+I@|%FRUIj2K zo&yT#8SmLrRhLvXp6RQHuv%Y@2jB#miT?KO+f3kR{fU%x!%1;W+`pzlOl~oGX<%+% z|9pfpAb`+H;jmQ9eCqp;@jkJT+M4TE&a+Ft9!CgKZRhjEeA#%Cxg|I- z%}6Y10^X5`{G6O*kBS1o170saNRFia-id z4PlZ_n45P`vtkVh-cyiK(A8$^V8Ez*g@P>AxUow5Bz+r)!C}`GnJn&rjMD4tA-rg_ zAJ5ifs>Pk~1kF{{*FYNXvl=%l^Mpi)vzWGU*0vr*29IOun)As0-8l4mZ>PmC-DsPu zXa@{71tK%bvCf>t_Cg|!BH2eCo_B>a^YHFZcMTA7l30m*cMp#! zX&ISv4!d=M3m{aQ%Ky~#0A}s{g!qR5lBEb3)UJd^MCg!xA8sVvugmpzNJSFqpcgTR z$=8QgsHo2$ylUa${FlbDO_l8#_Vo!!*1t5*IfQ+mGPb!_Bb6e8^nHK3Y#`ww@cJz< z%%#wu{6-NR8HuXR8i#mFnJC`qQ~6)pLKQ z%h@76F<^`OS*whrgo4>I_?oVi%UYMyU*S{shox}s&OxlL&o01zSkd}-y}9EhuoNhF zGP2oL8Q>9Xqm>{aC@Y?=AOyTCljbW&vZ9cT5gSd_cHg&z%uCf~$Veo`5y5a&ZanDW zDe*hl^Cl;RzE~EK{`rPps#|{giG8bYa!WnhvqeKKAfH~{v0N5d;HYJABeGu;p)0mM zASdZRM141YiwcGVc}gub50(ea!880R$Xk~P^0=QDod70sf+WZH>C*C zpSZld>;kH|zko?z|J9VX!#A>^%kisS2AI>L*>-Su^YFND0aHcA7a|0=1kPe|_)L|R zm3zRlO3(3Ns^ASgG%~&eiAzY9&;-w_PYzo!AuI}1z!io#de|$llu&TJGww6C+S^8CtxnXArkKynQ4;n?rrzo*XOxAreS z6Hlj$rWMQ3qTTmDcjiawdPXar{t;*Y*ZLeQla%aCWU+E{xD@onbT7BYjP}r>7bvR|>F?W&t;ON`R0-lSDgU5pk`lIa#4^`z_jy0CrWQpljBa5S5 zSLI6ecQLtuTiD42d%ejf480E(3k0a1)0Lwa_>eyLi|Tmjm+8;T&{F(sE@Y5Um_t$*rt)QJ$?KX6_^{Xz~Mni0t z@(D3XSwq`^u@_|nU}S$@GtE+Ynbw@-)V}}x9>5vKuUWio`tmmdn!Ef1SYjXGoS~XF z6&0YEOK*ubV5_&gccLGmD}4fPipNmh^sQ-X{Z77t(bC$E>M!KOe63#us%~oFoA~A*FO!uiU`c{-uKvBklQKFHTw5CR#V>&oG z?(kiyho@(Bcu2^;YK{;*im$KlkugWY)(x;pGEnw0 zGqW5t>1YA!m<2F!RS2H;5HR@OxHdt+(Po}UfKWRg+m1v}%kjyI-OUo0Rb7zljN34$ z`e0p_*mkbz8uV6=W$|q%7xrsPDm7~Su1JB%R~c9n*VYyaNh%8l;HOaB=mh2Xw`8!p?lX{4XU3a1H%=#+f;`>~a1(-4~#1u%DO39(U;~TbE$gAWXsvi5?^i4o5 z&_WffQ%^8F>3L|V378;l^E=_v8?=~R(}>C4ykf{-`dBvc)z-8cSy6r5#wbAdw`@vf zcCg?u&g{|&yHvOvkZ8WS0yV5Hpt`6VphcuwUOyZl8KT{KyB!@6v=(ceFZYGTlLo5x zB=!~#4^q3MO2}8YKq%-E7=-7Z9**Y*;t^)0<6lCvd=gq+F;GHC4qvzF-oj3truCleIuU@6n?%bg$DPO>m4?BKJO#BweQaeU;qY258nZ}*LqrVvPP+5o>(E^ zJJj^wurELR;mH&03w%ahzPfD&O^l(JR{fvjZ_U>Qs4=Lq+R zA$(_HaJ>D-fJ7mA8pJfy#H?9ML0(0YM)I8`ZQv9;4oyE)^cR68@A*%6)8dLSfWy_kXYZ?El1vKd}5H z{;VPC^QW^gY4niAl9s?YSIhUDQQ8ScQcR~rK`haG&V?vvkWFp!%C}K5JZfzswIE(2 zW$sUF^i*>v=ZD;IW^#+CYEV^;Km(jLyTBZZQsG@oGiwy z*I_)Tq3i)L>9jXhX*JHma9B7~yTv(^zY!eqcI@7sm)z}>>-_3gIm7;e@KG(p z3BsFaVZG?&G-`)ZroziCNA;4Mue8VaSYMb}9Pan7MPIxVmfB3YKHA=AaZL{(h40@D zE8u>HI=??%`h3oCmKF7>yv+=ipFo1swsVCIgY|S6c?JJMZ*?^>;q6VX`qi6OG|P+3 zLuox=5sCpY1~%O&4ILxl<9X|d>#c>L5RU;gzGB*o|v%bfuMIS2Il9pzk(A>JXOvSaj-8HYpe0oo=4kmTrC5e21w`-OErS$W zN&|QUB&VisGOm2&iQFAMD;RBhUTa51pu{rg*!JJ^iD;{x~5v_DmdjW4MFyX$M&k zi~&=ZBdYS`5Pv^y;sALfNZ=*nru4=p9?T>BFb!f6$3V&9f}JR-1Tp?ML{tSJba*A? z!i9-Y+PJiUNENY?u>z&^+^5vI>X9Z&*Ssp5HTK2mz48D`oO^sTc@c> zdEK7DB-MSbJOXfe;6o*BBpIHfL+?ybfrJZ+A_^>8WSFhvWlgNY7S%E&gy0jgnV75b z&{b;x@ZbhJuVv;yiQVHUN~cL6)K^4b{x3vF7yF^%BMu&B?AqXr@>iqnBJ=YP@ zaZn+nBCBIS+}#hXBc!A7*{FUweI|9&8`uvzMwFT@p%U2FY~cwaGoBV`y4}6H*(mT)1{S*m!w+ zV9E9H&%wdjJ|BMg$5faz_rbyFBg0d(P=&lE%;b#ylhfIh?BUstND}&Np0uSH9D)~H zhW^gGnNAdGuI`UF?|JeLJKD|m=iAM$>z^kRk8bw2YiK4q$^ zNrW7(v@=TUw0J(#I`6Z@;f~^qZi!!eQfvEO=2O<2{1W71rBH17FQ`_nC4aoad`NxoLsg5{;>`}EMPsKzl3Z`w9) z7%C7OrySG->?x=~7QzHv`v)gSqmTar4mr)yn=aAlaybep;GVRRk*usCLW&17&#QAL z;fBd7ib0HAiyTgGD7@hz9?I2Qhs;uhNG{;Qk6A`yOYy+ebVb_ir=fy4Xyj#_<@XNd z4^R@B;1?X$a@)Vz4Q5dlMso1D1ZfVo>z}2DN8pue3;)`Em=h*i-n*{5YsE9!vw2sB zyEdDV8zSnPV-P!2aPq0e>uSt*yl0KDwv;V{&ohwk@S*cLbBhJmYMqu3 z4}ZgdcVC>*t8R?@@&Moy>H?!6F{{-f2lb~p_4 znxXHjNnI}6*MxZ6SMzIcSQ`0zFM}gerMZ2dt17$3MlXuxqZ{4Di>YQy=FP_*WNmE<=_(fAmv)it{Xts&nudly zmZqjZHo)!V5F*+Pl9PF45nFbur5C`d1ZoOI_>tz>*AerCC!fcBSwEG?#&D@i?sVqQ$J5{CsRG zoU3aHZeu#i7Ro#*^RcNvsIQHUI2@689Yn(xrKGFcpmLRk)w7#Ak5+s$9^OF~<6_CQ z5N0*h(SfNu45NkLMrd8GZ1%N@t+fcY*rlRkOIZ4=opleKz2{{3`8$c>GL&`4Q4dK6 zz6P_$lhjuov0O~^J0PxGD^a^AW<7;%5>z^BVHaPpprS)Rk`$%(hI8aH8(wP>W6jC+R*~VOBq^(W z@e4?-%ul=A=5Q5VzT0Fjc0lVn{engWS;x?Kvrf{r zU%FEoQ6c>-_O)1(V8D5wH!Hqt=sDMIF*J}e`_5evi8se6*MM8>8 zbmTxgT9p#NFr<6`3BG%njlg|gA#8;TPp4gvUKO1qM7y=&y zNFsWmq{7@`zZC@AdlK26bFT`<<*@)gnF^?uvzeG|xrEloCO;lbmuDZQEAETVqdNEl6qY%1 zV7Jb^(b*FO=2g^JpSWom1^V9$3p$JS=JC$p)hTeoipGSp#}vgc-h{oQ=qcObY1~RU2ZQZp^%|Z(-_FI)e@p<@Pq)Aleo`!E zVED-S4uY0=8gq*jh^ax5gX#^COoeL^R5XRz6i%Qly@oGDQY676+Q(NcC(xG5523Oe%P*k zuJ<6m$Jo`;$axJ|k2Fw@FC7Un%|!AJeQ(LfZ~umMzC>}9{c zK;3u^#E_l=r(#54@nwyw-y-(0kNS&Pd5tSA|3bSLN_&kV$pO-!>_9p5u`&6^E}8k^ zVWj6wsrzGjn2M%@oOD9FhIY+eY2(w{=4?y7?V8Uokj*$GEKcLTD^)$ymkpR% z1pu4c`W!)l93>b|{`_kd!l;r%2)Q|us922tM?mRS3&TboR0Hso5w0-b7m%=&5DMz! zS2*=DwMd7u77!MXz=~jt;3iFhqSFVI5BH)U-`Ds!&@omp%FJF(|FHi3Vrp|ql`Gux zF8F2?;{y!bWmYgj~U3p$*LD!Y5TCv}U(B#r!u9b~d7-ZkhX1(X}=#t9*%+s8LF_UXX)3cO43*}+| zuZoO(K%~*1vIF0nRJ@gb1hkXX1ggsH_oKedp4K4^nT26 zTZN<+QMJ4ImY*R$##d;Bt{Y~(U!A&uTIFH5;xH35GmFd@pwcD(Q4gyAKvz2I05WjcJs`FnbH*8 z;0C2(YOpqiS(q%osaI$v_T`s49+hd4I+E1_t1F|VsTXtV&0?)j@T72dqr$KR#%ztH zo~7F=ISL=>ZBtlHG$gvgtWL`NA1$?W?%-2PM+gr1`}t7;q}IF{ecAyg|Eg<^@Xzc4 z+E(uWTi_P$p+Y{5h#==0mlGW-bsi>j%tG*LRco^0>Q;HDA!Y>Us;;S@O8HewHG+tO34QO|vdnkJ zTg`eoy^9O0_47VzECONKWtk+jy<)F=i71*Ia13OUsv zHvNi~2(a>u5pOyLiO#|^>0&@GCAh;ze;qkF6R@7EOk9t)5Lpjn>jSLrK)PBpDVTm;NPg``<+#eWBf6gC8c!_XuIquLjSQ-}FOqv3~_%E>hRk9{^S zu2e7rS``u<&Af~%rjD}t=F@&Tl@eADbDhFH%up);;xQY>ZuG>=?)b!f zdBn^%xH^_pII1N~TT)W~mv_0MI*!j1qnl1F1MRhGz+19MQ3Hrr^i`z4}2P9a4HT)N{-rT|zxK9kq8f0Iu!2p=^cUG+NVTUL8X zSpdQyHHA2-qn;TJ4U|%7(1McjFY=B=-eDrH4~;xJ&xWcuF&gl>^aFZGgId6d^oyjb z5Y`a_t_FphH}FVn@%vb|2ABZd)k~MWIEm;eX)7&bYadL`}3%Q>vSaFQ6PmgG!dk_R(GLk>h!Nl zQ)v45q@!SPT0KtQ`lK8)pIQ!K63@T$%QjZ1ymjmAymgQ-BHhYA_`qAQ*jG;snmY~P zn9P((qw$-ymZbhYsTYjY2mHFEVg(wfmS3dyNGaDzNw)9|3P>_^iI_TjF+r@H^E`2M zz?z9tr0d?o@?YiqliyNC_9=<|tBovXWrvwR?r8_G$YUi~6Qn4iWSAxRAGD$bScHCN z57M^E10S9);OSfb!KgpN^@!oHOE>+arP>q-ce6yr#5z>7c4bd6-0sJ#ACrDiawXko z=V=^Flu5|ZjP)ooo| zo*7n$ww#Zw*ZyUWbA(;`-3^^?i%0O^WnU?77vj;{h^Lkd^gk=UVHpVRlJ$g~#1p?( z#IWfgsP1+&jFgi1>IJx#Zf8gz(K3Khk32Yr-f;noy-|Ea1D;;5h<6wo!2J9in9R?E z{sV6Ckm-ZuI-(g0bPsir08E-|gT?GUn6_oQ$=z7;V?=udrdVWu0bmbU&Nzv@m)k4a z$AFQ4#=p`P3o^9mYk8V$$xITEyLtSPoXO)k`k&f>(qqf);>6)V$-0BQq@;<>9cUY% z{%gT2$RAoW74vM02~J~ke!e8XFTQwAZe8tME1IeHkrfjv8q*ma*mHH9n<|@E>Ru=; zw2kcBgSAz-VWz?*fOvaY8`YMi>>MMH`lMD_mZLWN0;*?m8mDaw2~`4Tj{uZ|&eh>; zcu06SJyYW7>#hD%O#&F0enGi!^#uhcMWFJ<5A4-Pz%+JV-7P~+YZx6J@D5f2#hFX6 ztvpK~d->?_W4JkWciN}&oawk{eJfP^(IQPi%wYpf9#Xn^pBgjmG5T+EqKnsj0eyfP zFsNLJsN~Vq>6BY~J*QOur&u^#UWE{yna84>WU6Us&uAzI>d#7h+w7ErQsXvOl%WOQ z!Y6Zt%m;hQ+3Ix;z+VtCGCX{1PPx5>5hC9|H>VbxkPt9RJ|<8&QZ$G$fd>~6oNh{H z0i1Q+jDT+^<*x|*Jjb)cV^@0svZMt|^-W}Gs30$I=0py2v^(EkY%EL9W!D+Cysk9o zsT2Kg4eHtanCi=$(D0Y++_ZhB*CEUV)V(wMNKvXIuOlguogEJ$@8NimigHa>S1 zPO{ABodQc%W1+_9-L0*Q6i{tU0@2jrwkAVT_MzD%qgH)Pa6rJN$HT3CO{rBO`6u_r zJ~Ze&@}gthmX;Qq^Q)^r4%-8>w6ckT@ot6M!8d~78AjoVP7 z(%+g9s2sYa)i9JtrZOD0Ry}eL+nS`XgVx%f=h$+xj!(d1fU(kEpVr--r!i7uLB%^9 zk$HnMs@+C=L?z&10ZJemZM08ME5{ATZzc9dO4dFc&`2n+wA~Uudb7rGUA;?mX3Q@> z<%~eUfwum!P`&>8U^*Td3u{p6BpSBRtnOrUN*xs2=0F7bbObC#jwq-I49_OGL=TjV z+rV7MSHR^K!o>|N^TP%(hT0p&F_koHFhMU9RSMMIOUKt49aJ*OF)ruZgAKsVk~u1I z)G3Jmq)CHRKYbg!abj-nM-}i12?Fhd1(nJNwF^Z&RKjN99aXDT@{S>X&}HB7_3D>c zGAPWCJzZl~8?%qF{1mL_+PO-E^q$|Pw9>FKQ&vr-_OhTvMEa`(XsE?KYa6^dRXDh~ zqOi@)w+}ZZ!0gpAkMWhZX8XsVgT_eMSur9}Z(NMO6nO^=d(9Zd@$trj!1LTytF2yd zmI-0A;nioB<6`jc@oupxj>5K1 z!_cp1anmEe55TaPt2KDQ$6icxr>+?R&?wLAc%g;`NLJrq1e&+{u4Vx`xL=GZnNY0V zaPY5xs%)_ue?G7^UNI?_L^d?6_+E8g8UFBK58b|1h?&~1uiyT6Kr888=CW`t5MB6X zLfPKVE~LZxD5(xOxz5c&Z(gssuCdg3qc?)uPe$lU8U4r720_PtL^W+oOH0tzi+M#` z4oX1ANH-`g^-5-HiI`V_~t+T4TSLa;T;150~r9q znM4s>z?)Y`{VpO}bh!Z< z(sbm+0s+k71lk^nLqN$D2Ba6GO6N;7vNqc_J~4vHj;DLeR9;Vxa7R%j6%8AJkR3y0 z9D|2`p_Bl!aJr@&X+_{$C6zQG)iO|*Vf7=z#~+-l1+3;5i%s(K5Y9dVj4)LYEvZ3E z6S7nkS6>06-J-(6!rUM9@&Swk{sUDzh;5?Tw=vTr+@BZw`*(*5Wz)+4T^{7we_aD3 z3DXackMWpTSVtI>Mo-xIA7M(t#mYI+kXKdry)mouG2Ol1Vx5{UAFJRpiI_`0D0@OR zJ^_@bY~bQyGTRXBj3sgoygNq7kS%F^TwPtMXWGP(lw-uuwb8{Xv`K4_&&I&{Ybb)X zt^9TmK(`|%kSkp^crh-*+!WE*fa&RnZB3l|-O&_P9F`IqQDl1HGuw@!ZxYyd4GP1@ zT*So00KFulHmKZNzxiqG7Z38(i>dI7+MBoo^rzK1=n-Q;^!BqA5u4j#C<(OAJnAjy zD^WSE7Gn0k7Lp1F^%4WBrvzQ%j~#i%8UZK~Y|8KJx&9$8pU#g%5V_(ZQg=LD%flHA zM8T%U`M%a575}S}n-u6Tj#^voz)mo+O?^Z%CD-dwnFx3MHv#0Bx9e8jvSy&;#Nrh;z{?+!l+k0GY~NAFhK$Hojpp(Ken>+*b4S$YU7~Y8DMO7y2}_I8k$0ggVPYhFWU$6jYFh9I7DH^ zGA%58Lz+xWQbeJRpDC#Kk-WWav|RUB(Qv$nmIg%YQ&O?@*81lR3ezBJ<&?}MDfrhtr=h!AVJpS!0a54KBvPoJ%tE>prP!vG6o?=NvfsrI*yor)Dy*xm>DNV>> zC4pE36AM>ktU4i;p#MlZhkc<)nFVkg0Kbw61}FIcfqk3dRR(7il(;|{BDNk7{uvQD zI$rz)zI7=s#|vleL-auZs`m1JWSPEmiY(e!2#s^IQNW4t(-?*~yXLDKg$AzJDTj)B zn|Nr^TDHbvS_V%s{9YRxiC<+upttty;n;-*F1gjq{I*g@D^AB;oSk{WMB{TRa6)xP z7XzD;m!W{b&kEpQ-yg_{#TS=JMshDWX}szheN}9j8#BZCvChhzES6nXIp0g))Zn{? zwcgN3e@;El0G&ixXo*O?&4qKNQc8b_nUvRC%DzrX$tW~g>3C|5RLS!9bdv6JDVa`U z=7tUhrvG?(`nmKF@diaP0YRD?gpAkeJnjyeBbOw)cE9if(#ed7dQqAFy?9h_Q?OI; ze?#SS69~pnt3_O$n@Yjml( zl%%&KTbUX@exYoF`Jie!k(6jkawF{bK#CY9wd>uF|D9*3Su9#0k_f(nOVyIkP&Wb|S4rzfyC zyY7bN!o(C~iacVNp7!DCZEJQq11=dp=od@X)z)f(qooIEK_Bx*zvw?8hWPB^5;~;1 zSEYfG#=iiA5-_m_+3I}u3Un6{xb%RzlBto@~p9L$;okq zCdN&s=m2FKWE8}@Lh3*D2OLY*ql`pCI3=}lRG5;vNRpT`GL)>WETL?qVWe2`#jhzv zmBc)nSjl-A$QfyMQj*RJ`-?nJ%-CstYI8`}b-=iH1r-&QwR)wFIsA7|IcqTaC_W0t&rY6MJCHEsr<&D^iHLxhxwp~6fvDe4j$Q~^z(=qH2&z&NNfM{I zp6`#C2_Oy>nB>uXzE5rZVI}5RSgUyAU&&+0VRF}+`y$g}XcZN1VO5XO*T+R_T6g*z z^ucJ51?WDnZh&Rm4uIR1z4oTI=$pYl2cS7r%X%J(0&iIP0Ce7L^PT>}YS{DK!@Ur4 zu?UH%h={glV1U<^>yEsiMr_C2j%Vo85VjHP0B#VFS-kZh6d*U zr|Ac;>Qf}$)V5&}Ot|uvaBdcGK`qug9zBPJG3QS=daz(%VSD1^;$DFwS2~B->)X^% zFt5J;_XS?r;oV_7gI7Zk-D6vawS%h<`5GA5w+HkAWx#uw{$RH3J8ynwn5BP#8z|$Q zal0HV4D6g6I5ek&qqhFAG|K<)RD%;}IyJ3iu-S7#JAsbSlOA zpet&-0%ACZW}5cYk^y6qSR8b8sEpZd5WE(8xVb%mB6z!~$mK-LT-V+Z294>XlEqAMtU1tSl`FDh|Mdm9i*`*HB_+%* z-RLX;JSx;_b7NT<8~Yn6L)H3@fS@HT1m?=L){E>Nbd-S2e_vpTm?R47JKokflsFWk zmw?C>;2)-eFHUfb-(>?(VhIowvb?Zka1iNzK)~x1h3h0;hcQ_XMA_0n02J)d{ICWB z>N*i8c=$XgK72kX@LXqG!+nCN`WMsCEOnr?@wbbA<;U{y4Znp8NBm|%QVuHIjIy$_ zX(Hm{>gF@WiuHiN6c1jtxvq|F1EKrwzltPa?f{pe`cV9o`S=X3W7zig_5wI+nm=)K z^N$P+G@;|;yT71uCfmunzoHFefQl5}%RRP%%rdRFarV4CI)=%^n{~U0`*7d1L{yf* zP7JIh3IhTI1B1cpAjD{X(bUjmBmcU*bP5LPPAmBS4cb%%0KxM~TSJ4Uyu5r>f&w$8 zx!FBkge;Uam!?1oKNPMoKcAvRhDsQ9hEv2zo0@Ge!vXFS%>Dl?f_uBP+jn6EFRM&Z zJ|=qnrD1csH-czGrUxS~fPxtE7d}1R0@_6Qn2ekp7ESzH8W;~};StUyPJn^!<6Z|S z)!V%YepqmypwQ7*P$Ir*#QX?JYWdPHF);5bJ&kKB<`=z>5O_lUaworGtY_812oHe& z_ZA)|*Atq$i0o$Z1Iv-6ImG7?0Vfj{)RAF=#%3*HP0)bVqmxohKmR z#QXu-0U7TKY+jo9t-x?i0aOVVSN;K>z=7}cKjI8t&!mTkhjVA~$N;Y$DLH#PaDK77 zpBp!9V`~MV;45ph;kEy~)b`QF2A(~*@1=;Rcy)7q4XjE%kB@L9mzCyZq5gW&85RlO0%kiRyV@;gD-c=&S)= zY2Vv~q^$ z@VK3lSg*7s+N^bq8p?WJ9g4nW?r^FlDpS=4J+&c6FPS%h8`gVVS)8Ar=YnSjLkPyM zA~G^kQ9t3)9L&vaI{}MBb^v}=OPZLRj0F#eXvnnnQa!EX5yGoitX7#k?%m7JW`GJ> z1lqDJtlD{Es761Cb?4Aj65x&u)ntdNof-*MY-!oASt0rqQ9;vmcSX>g#pxfucokERBJ|A3eR()l)n zELSbiDevQZDhMP+Kn{?!heBGuVtlx`x-v1w*>(as%}ae6H{Qb3w7(x(5lBT@0X8Nj z!xHWfJBGJ0+|j!9I^sSXOs<~9J&!xze}aC?m#6iP^0QW24hNUg?6HWc4`8z=!#jh% zl^u7yz9JET9D=_I!>mxYK@FAk<2P>_v=AdzIY+*4Jg!N&JqDbuQ^$wo657A4o@CEMD zLw@9f?l$yg-3wy8Yz=-I#c|4{!o(EH1IsTE=oD?@zm^ozq)*tGlGSj2y-7^EsTh^u z*G_|JB+ca#z>HmKvWGS1=zJUMr#I269dk4r3k-ZEnhqB4F{;94odwc$6lRL!tN%(x?c0!q3L3hpq8)62*xT4B6ddCCSboK%%$x!sU zblWBJ;PJM~tZrneUwXImm znWNLL@Td3&*xfH*Lo50u=xr9LA7MKl8P(GKTFenf*sT@w|4opj`0(#=mL3FLyx|qt zOJLkQxn2c2KTf2JSqfd1Wq6ne=u8l8e0xvHUJIF}r|-h^`n(a()*y!}qb)B0E$6G} zK7w~0ZlD1r^76TfAftb%gvYVeROspH9h{xL_f&dwx@W5tXp)cC1_wiln4#`oE<3S= z?4Y@vBO?@_XtNg|2YB8T+z^Yk`d zI(V%jJ=b|~hX1mC28IX#j0gb?9dcOE#%_ez94AY_}1 zFwk`b-0in8Fu4x{Gz>)qt4N9p?8njZ%8bzctuGfg0_z(lrPT4Sp?oECye%D4y~OwA z3Q-|+=opxi??MJX-_EbD(!FN1`(G`+= zsa9}&Oo82p62=_-_jVlcRjfiMs1c53j3l&6^CbV7=i0SEa0_tH{Qcw{7_DmY#W zs;a7Z=^%%&NVC{Xs#D0DFI1abaNs8c{$C)lpx{G8Lb^)~d4r)CQRq!LaF_c~+!QmN zTdYKe-0_V^#4GhB<`O5VLi92NG!7qNp#9=&!ZG|O7kr2!Pze{vjEkEJ2=Tk%g5s~z zAX!%iquk6>)6=cP#+ctChq}=`>236t4c;z_8Fia zXI+4D<>(Bh#uqFoodMbp%*X$^1e^XpgEzrb{0W9j6c{qNhkcazR{O2oTs436$uK;MsolF6%LMLj3E=e`5?L^ za3ym=p=wZANN5UxWgJOaSnP*EO~@5PF%^1?Y$!Sm3&QxHw?KI6-&ZsS9tu0ST68;m zd&qClc;EskD1%V>)E|H$+JCata3lui2pnarcJ>GvV3Yh(2U5PqCHtrlYO9Ofe5yj zut(gj&VN@70USX}N=n8fnAel#a{ZfLMyU6?9UMu8K#s^r&d7)g_fY|~UYGSTid9Qh zN}7=abKOA#RROAFhKoNK&y(Dz;l6>b`CD<}-{ody8*ne_C@74zOqCL;RFE(xPoGLuTByj#3`^Cx zb~5(iVG5)Y?`MXFEILHUNP6) z(DS@aF*%evsZg3*kEheqgO{D}9drX?Q%lR|vzwc$7P}20ZCwF&`_0~7*))!o=&-QT zs_-ZGtia%}Sy^jn<5~Q~f56uVgKjviD~I)ITOd;i4w1sbC{4;P4UGr;2a>e3 zv}>ryW>jYp$cPFH3LpbJ4!?n+P!K4!4o!D=lTD6{$WX;fY_hVl<|as)r~p8%h>}tq zI~x^JEk%f|$7o$%_@6q}>_BZW#sw3(q0O07W? z21z%n@8293Ow#J=DhjwpLD&0}Julc_6|kLDpmlSrME?Xc*XAJIwfjdemBi3|2%OyG zLnLy<-N3ap6qd_KPd5b+#Ylj{bh$n|3q=8RJ;p>SjJ%vOD&&sJSG$h{*f0$t#U1}2 zRc{?u)w;!v0wOIX4bmObv5*F77L9a+q!J3!-5`rjDG4b>1OcT5rMpE$aw{lENW&d` z_xYZC?)mdP=WMs@o%5Yz{IabrNEoo%a&W~kh`?DvIQr1%>FMdI;o)I8?>iY%|!!*`Ih(%qK8}(u0*6>Q&p?*_|aZ$Sz^Z z-o1h4w#qfu8`=U1i$HE$WQHCi<-b1~XGgTfz#ayK|=r*5zcz z64q~t<+^X+2{uOQL&56)#o5uD7D#*ImH!Xiv>aHNn2XofWDQBFpR5Vld8=>ls0b-$ zRYWXYfudQetXlBzAPYUA>Ta;rmqIn}d*J;z&a-oWO)>TsX=-c7>AIin`JH<%Qq$?_WYGIWUrpEebj#@1_l}OwL2H}gKvnR-_LSk|?2KRoK*l2Y zbZkWwhJv|%=l_M}N&c4aG<=F7;`s=aEqm4=?Jv1yBeFogC8wo~OXe}Uy!TMg`wez} zZQ^nGbB!Y+Z6oUI>c;YN2G{p^yZvDcEbZ*9|1gi zq#Mca4+pO*PHVA5{=e$a`T6zp`a%>+4shmrKx10qp~Y{M;LzP|l%UvA?;w6<0!|}A zc%-FarX29aGSmd<3V9Kb^$?ega6@UT821@U^#JBYx|B;`xnP2bxgg;HJ+#f`M37^2vX@ zOV)GuibhSU47g}ygQWtP3SsYU3%i8Dx8;CadfJm~KpT^+9y{hFzzH5-LDdVVfj zyyngS2?mJqtv&*FbYiVP?dP?28yNTrK(TYeR)G8=c*&(-PP}rTzggus*tQ7!*serhC;rhDxwUbCw6HP!(8nWCO##m`k#n&aOyPL2awygMzFia25 zH4U%<*>~^WdDBRE6!k)Awwl90`uMllsKx!kEPiJ|R~!8H9sNMMo(NIKkHsG?8p|o! zL`jj$~<~KcS}HPEwp#jHtN@P!;P)% ze=h-x*{0i;#|iS^Tf;ScZJzU-4$=KOLLBdNIdHt=;K_Fap=RQ7@W1(oJ6iCM2}(3L zJv&P)EMy2eoPMxPXP$r<%@&Ms9LZO^UKCmKfCpu1UWGd6t_`fcIMLrrZF$#4qbtf* ztUt~p#Y{J*7+kU|ofu>5YJpR22iL8k{cg9w0Pg8ueW zZc9Pox(@*+F)LYVBb;zzpAqC=ydc8o{M@_1mg?K`QLnNTj6s67*Z9vVOf1w)z=BAz ze68z-mpjwuu%Dx_?y^8O1POy|5ioWFq?cXT40WOrp|;Y;w5in|bgMt^qg_zy3EY}$HGCz>SRoI9;lsaWxR$1 zTNZ)3mX^i8=?7LhK08a_DDw0@-7AnU!UK3bVO04EG{sB!k1jG<{DxxZ@UVLxD5$=` z7eujp$lCN;qkPw@Jy1eOXip;|r@;m}H3geW;B~S2y6AhdHL@POOm$QzOk;xhoR?99 zW~zjwH5GLfD`kp|Y2i=F*2pQ8o|Oa9BrK>$p6NR z6;VBg+s!0Lw|T~BDUNj;dRJjx2k;^QhG*r1w>~Hj5%U{DL>fGO zl6_nRnBDlSD-Pq7yl+MdAR6(xPE&yYd0Pg!jPWy;!dlM|xj^ z_JT%IM1;)KsFL!odF=JxumJg~YM>K8-?Aqp?)L|vx70Y9Qj_p3f`k42HAq5$d0OZnj8 zYpJMscm+6}CGWrq)$v^HiINeY%kazCaHBlHXFD1j4+nt%S-`ogR!gW8`{Ku7tLN%x zHNekV06P+^q^4>AwxRSSFSgrTf^b~zzYCW;TMCg1TZ$%TW*l-ZBg4a5FWW_#D(dRM z&KgZG^$aZoM~sj+i!lZR_kJLk`ES*ta3ju|yt?}EeCfbr)CCx@;Yh7#e`LEXI>1$^ zzE=O%gWy(3L|zEU?-EP9d*8U2L?zrYQj@Y{tg4!gz+rATz!8@VnV+r2%cVl}Qq?8m zxqfa`C5!kD5W<|M1W(O>d!!;f!BPghw6^DZH&bq&{w$|NBSoqy7AZFio(Wbwq zSy~JX3QEgRU$@TDe6JYcsnj)F#2@T3&a`|9H3FBFI-E5879Xa_{;)3XMWeN=LZel6 zLtUMO@BW6C()oT*Myq^dY8cFcTr@1p~%Rz|x9kEQ%X?n{7e6$DIF?l$;Z|ac4)7HuuVbAP| zYjQA8+sCq^oAR5GId+YMm56DB%w!jbp>wBhDjc(Wov2BskN+5J5ejE?@a3o!K-7X; zrpP21MG(wMu$&Kja(k8_zcTVBr3{Yb299~r6AvgwQ#KC6->8Ag;TqH;6eL@iJbBXa z8t3{NRrUJy>(__=+1aK+z@zbM50v1YTY;DSy(}-Un;gAtLCNywTZUj$5~k%pf7;FO zW?el7H30=m;0Ip_j@wcz^&^R>va{@QW$Wrn+w3yU zeHL~DyS?J7rc5xtA@4fgDVm6ZaEmA$kHGPs-FBMr4Cv5pFpGp8m z7zggDY54IpD=S6Ve>l|tAZTfEIu;RipBPhf8Z9(ihgg+Zv6v|>7z7K6WK|nkZeQOg zG0t-|z@JvXZ(woSEr>EF)^3%1$)tw=wm`K`fdOS7{^T~zXDq*7LF4Bn58~&HS*7LU zT_6NQb3pw+{C#Ybo5ekz_NS0`I6|>TLoX32%$UO+AaUaq9hY(=^pYf+kyWMgbAC4l zf$M8V>9W__S`3cHC1#B-FxG%+>R1L#}hUneD#y}OPm_AND97(|DD^P}!|C{jg z?Y7odd-zs1QrairxpIwB-KUUU+{`s~g&?GDIUih;L)%+4HB>4Vf6z*6P`Jqt>5Ff0 z>V?l4gl#KO;LYOoo5i+#@lM_!uOmFSGUSqTld1Zm+pEo;QIC+jF}>Kj`<+F8PWIwp zvKgKs*RLOKrPih;Q_$TZ+CdCDn_E<=c8D2-UvQLBl>d z&yQ7iU`GqHztB|bYj`2ABpv6(PcqEG9KuZV=;6a7z~B_LRF~yk1p{si71-2XSt0s+9@{YqXG%dQK$a6L!=dI{; zXCWCL;gv*B#e0c4LN1J?I{0J^_}Q3Z4544Q*09-MWi+Pn`(ivbATi(&(A4$F?< z7iJny34-9oOat-l2ISh3Tb;MACx*%)i%H%w?KkiX@fC~4;e>^T+$J$-`Qfj~E^4b^ zjKH_ODdaN_IMH{|Nb{+ubK3a)+cw#78M^D4W|5{B8>A>EvRKa;<1wy)@nvQvvjrlO z$|9>tXK`t=6ee)ZUobWxe(Et#Qvdajy@%|D7~PiJZxshE3}$s6Q2U&4^73k$QPAo_jO}%a2;v!76UZHhveA#y zEgSzQmWH*RwO(9z_s~VKH&6&zfqQeXTPKH{tY=+Kp(-QMKo!ONy5U!y~s1Ke3oRl(9wugNmZ7EHnW ztqi;$s`e?Wj$Yl17L|c$1B$Mn~bVEd-RjCI~Q0p`|I`Fee#!IB8#* zOEXLU%kMVH*2qyI0$9Js?Om6IWqLg7VR`|5OU# z8x0M`_F4H)xO8Pm^`uo<$r>mPcQh@ZaE+@V_eXwG)zmXKdi+)-B8`*c{E6db3fN`l zi3JRmJ=fD&!t>|RHDogTojb{HYa(}rBUgk-m;&*a($h63@SH{L(p9Q007%9sVvvb+ zL4Qmac0~Nj_^s^(pdfXWZME`wSM2cXq~-%~H+-ac-C0#Pk1!x#=;9 zY6Hx7I5axJgny3M33Vxc(g+FEnl|bIqZP|J5dVnXA5l+*xKGC&E3KYh zcALa@V5G#zUfD<2ak;O~Kb|_7dwiC+_EVF;{CKQ(@O)_vr z;a8K_6;0i@Sdz)})nho*0bUbjk$+ff8eX9_$hWwjE-vm7G$lqEl}-NV9>b%9J6eKUsSAQO=ahet1pY-6O4(x>*wai zo6a)uP{tSI`uG+vh{yk^D(P~fVG2@Jm0=*x7iQ z$(b$v-n9K)g=74Jd-g}P{vbuL@Q3Q%4sZ&c8-+H;?y&?hS3HFqi(${8|#6~FRn|A~U z-3O=0@Q?VI`!s?DoV~9WbdG4_G-3MXB2CswHXnPCFB>AGl`oOf8&~Chmhe-r+aGU_ zTf|8>`0vfVmRPvAA`cAUP5ezYgT>4%5-G%}hG%4GoFH#mY#3L{CBjMcR{f-U zCa7@U=yJ%dbv9P9&qM@2-N$BdUr{Pq4hZyB0`4H;>wB~l9ue`PNZawhTX`R|vK~|c zdP4*Y+)h@7z`;$L<8vOPnr9AxLCIKrcp}(Lr62fL!Iv~c+soDCe({KKRp~1hX4@tm z;&^3ECNT%&BD|NszkPgw0IaV1nl{s+TBZI(1+Vq*$%%>QV8OLI0eG_hA5i+}w6%;<-W z+__M<+^*NnGQ3v?oh5Jl={8v~ZyHt>IpfU>_t<_?CraR2DS-Tw{*?lo!!P&}wk2;X zR(f$3x^Z3zHi*HloCxc`)-zbDzsmUx=Ut{&&W+wV&5l3Hkm(b4odaq^AUyN`M z^7B0S)e5{(3eSp`sYCAzGctxo#%t`7$wgtZ#`fqWLiz}F4le#2N7 z8-)dJQO_#B$Rx?=xR8-Lsoqsubb$|u@dVR>V)2)RId{|pMVDvjPkeo|!059f&W?Q3 z@9aQJo5uqm+b?x%%H*Ch5@6VzXcv2n3mN%XL8g4(6q`U3eLe zo-ijOX87jgVc;Dc^Kh1GbZ=q*^>(ULsH~2Ua6!z?p^6im97Nn6&4t$K+UO7dPO`kS zLjZ%(Wax<7OpM%7tYOre`g4{ zcvyq1rS;sqzsgqa7O4}9c^KbhB*IKYjfEd`8Bbe#602{7O<6ymJ1FMI#m8viCaJ!e zn!qg!f02i@VwXf3y`!>MH8Ya4AD+5KX|ivHtPhaO+fB^Te+hs6<$vj^{j2R4*iQ_m z7ovUrcN&Q0bjV0_bQJh0o$u) zjSl6CZaq_0#uED2X~NqYmQNZe1*{4O)H6T$Nly8qH{)!s8uO35@_4A88ZJVOMr44^ zY_M3v;oTSd0L{tpfxQZ+dE5g|F5U7l{8j7sefB>sAmWklCkjoiQIadfu^k(PTcFaJu07Mq1z4_}JdjabvzGpvq7T<0%M0lOH-duHXmC zr)79D8lsTM;ru{nl&mVXBNZl7g(YToS(U_^`}Raxf=^Nno0ce@ z?tI*l`5mkgUjz_D>f)k#Ah$4gz`JLA&%kuil%Af>YuWVZg%t=P(yczy($4@eIS8Df z5e0-en*|47HjNOx1)9c7-FmMrZQa~KmY5u4VAr7sduDutd)OIV3q&BX%*tY6XsuLJ z=*v%Gp$kt*AvWY1O<6xyUwtp=y`edrMblxY%A%C2c&|`{lAZ7XWw4enlXr8C+{&En zX2`{1md`8Y+uR!QmwSW!xkgWb)>znQwa9sbK4uqx56z3n%0q}*?io{M678zdpP-}? zD&Hl)O$H#&pRfydY^F>CdVL?j-0f>__Kn)@^(uE^x4F7}Q1rf5)6#02HVmo?-d1Fk zl9I9mjicbkVNc7~Rw2R#wbbCj`XpimvL`=(JaC~Z{vG7dYkA1qw_snfM>5${^1ZCR z-6pfc>iE=>%E_E54M!J~JyJVWzX2O(vRJVnVs|mM>u_gvALNb_FQRlX1<=`vO{fn- zvB++PYThYTA@v5_zgEOHq3(V_=$J~K!u#>=5*xH?U^z`b8oFS~oNWsyMcR$n4ioFl zeEN>_dd0iM$j@)AuVGoNVx!7vJx0Pz^ zZfXC6zDpROIy5yjXxIPS{T{flg;0w5lx0cFr|;s~*_b{tSm0@X9|`+)yuO|u#!xNC zzoFu)H(`q`g?&n0KXM`@u~g)eBb$+&R|B3JurtFIyP* zf6|_e2BFkz?D}NSbJX~wwxb(XHrn8+i$-Jd3$-qwrhDBc2qHJ;Q-3eany~^ zpV++9e#RRdXCJJ>RRg(4rOt9El*N47GkO6owcLc=_)IwH>=%RxN_bGqQ|5sPp4nA; zGg{1Xlexk(ryY|Ly(Eb#0!Ih)&ZVA{4^s|GCXzFp{7R6^qiodK2of-ZlGW%p14Ynr z49qEpG)(Fp+fes~LmjgNL(D&d4me5bC3SS<%5R=`uy%0rBW0Zu=5U9iR zCI0>`(WXOqLuwqpx)I)L3gtdpiZ9odgjTxa!D=+sayzO0409$4;!5 z>sO%Z-Rp^k^?wRG|TPWUXB~+P|x>ugA1=p|V4Vjl(u9WlJ@w~&6;2+WQNb7Wxdg7a2 zq$Zz@=`hC&R?PMM<5U^Vs3K+ zqg^c@UgWQk3~59}_EMOE+$0}T*wA5sj&1b0WlE)2D3P{2BQLkz-YT@G_HXCAYydo9 zS0D!Np&sRq_R%)Act(N&>Zu^txjGv=sHt|F79Swcg`OF#K=xJ;d* z{~3j;goS0jA)gmEmgyJ-NyqS7z7GU**ktmZdt$}q?tg*m41!9axt(G0?}=S^ZcwY? z-LXs+`A7M)cPx6VQQ9ynjey=JFlY>?Ly^^9hl$33P-tleR^F@@_pk5Qot>Sd4|^Tt z$i7SC5FqTC!Y42y-c;81Jtm}UJ?Fb$8WSD8u1^I!QH1!UPe!2@)fjo=z(l=d3`Ycu1z zwM{xP)TeWvxVuKYLRQ)0`>h>|iX#?}hxzd{VQ9>K_vo7>UYNwhqB`=#tA9h>qW@L0amGwyygFWf?W3!vjmdvTJR% z1ACfU#-k_pu3!D!RaI5hTwJPT(G_QMPt;8DI5cwgnWsotXR21}LL3e_;)Y1{Kov0= ztvu+wrSw=vwvXle?UHXy;cH)3Ba=qC%FQupuT=E$ah0sSs$vLUzU|$9uAJe9j$`Dt zqf3pQ#@&UMzt9p*S?(vJ!Q#bzQ*zhsb!EqWzvC@ZquA~E3+d$i~zyhqDy$Hy$q&i*t!E*k%=Wt^9wM9YWHR@04tB?OJ*-$&}K8cU5p1z~&c&gM%GQ^$r?xK$!EN zlJ--w@IUV33ct<3-5y)8SH#Uz$jWc2>IWl;1{eh&4-OAsjIe!Feu^O<@Bx-=)~xq& z(I;vPq=@^2EIg4^hP^fdYV7tOND=5}9RB-i%;ro5PPWe3wRJ+o*7q+waWE_MoKSjw zJcY$6@@dhJd12l)4)Z7~O9`^Uh^|rKJTc$cOY;_(MU#`nDNhYtryXbRK7z(@e zR2-WX$)dp&>2&gT_{F6WWE!Be_z{qj3XtTSsXzU`0OKJGj0{V#F6wh)Gx-stxeldf z(i|7{|2XN}#+z{C68$3-p3NvdG=vgvSV9@}u?CRqVTVnpsRX7neLn7e%i->jznn7> z0-&GBclDfKD!EPLk4OYK&zeLj1525bnv0jayP0rm1GyT`mm~LQ{DNmcOiDO3Za1Ki` ze?z{LX)2&l=nL&;zM~dY3#fbukQ~~$m@-x6#2#4aODZ5czUXggXuu*|+VSyHSG$Sx z3wx8@Cx&>1g-Pj^UoRHIvr>A+S5pdy_9t1QT547;{CBS#eLlQ5NI{pk$T(pgCr~7i zVA?NHK9&<2ia;7ujz<$4SaO292eA+8u5AGH1I=-|G_ud>^L?ypo@c1hEe;V{(SsxK zEl3!sD$%`~Z_tXv>An@se97T|J~h@i@QdTLef(b$V)@|t78-}Yh-f`2(RZRh8SNoX zpH*F1w9jfHz8|)SJzp-Jwd|YXe!lp4r|~%TTJn1V$p+IhULBTg{w+XbZ+d49M1(K( zexeg6RCHovia;UuLy3N3a|CTL=J&DEICqS7d)F<0w0-?|<_zmLwL8x$=32HN^Xywv zRejFVj2pzc6spBbA)^cCbA!ikCRD&f*$^~rqyFKw@mZp)yFkggthm+#}#C&*xTNLlx7Q=4;IFxK1;);FCuRg(PPh}U4t)Uyfl zEWaZd1pFV;zr^FU{PxQf^L6t|{7cwCy2JmddPDMgtR+)q(KBy`#rQ|fKcc&V9mL$? z5&Mrv+ZxGK6p_X~>*25RQSleH2>%)jOG&nV{Xud4(+|oz*$?cOr_2pcSOqJX{~OmU z4(dJVN%&++j`K4n)(9i1p0}7ilCcTX@ZF_0j(iziozZYGCKhNzb|!(pUaC&j_}bB$ z>+1l*t9^(vy!o;Ubk5$xl?Sr#4S#Fw$R=hk(je|D#ygVn@xIoevmDbYk>=2e=$L1& zK=GXK)U2ae_c7MUT#45xZ*gbht}#^(j~y#wFjXjf%To+YJw7j#*==MWup)J7zqwKP zSKKv+Ys}R4GkZdGyBCd6E4#IP#P8cZoj>RFsMRhsZhFO|1#iCa z9RTj}_bS8{w>308X(eXPInMBAr+)CK*5&I1fF)<|TDBfD2u-vt+-<<(BTx_qIBFUs zK&y=A-@f`4PR7vtbK>mZrSS^qDBXB&AVU1W{?$-if_8nQ0iwBVIS>0pDIkuTmWSNb zC)O7Jf1)R@NaP_`2V?ttFIC$g3L{Kk-{SokB<7b1VoVw1jQ_A}1&qXV zHM{J83!i2==M4$#Gukn2^dMA^NKdEI^k0gO6i9z%D%3x1QbIcX!*EUcKGbij^Qlc%@$^6?aRZ9z&0YBY*YV1GpCRgP51BlJ~T5>;O0TOs) z-)=D2$yoW>rlWIg=zcoouTNBe?=!*i>XX#d2;C?g-_0~7fql{xXaC*)w7ALPw*nj$ zLl)(mAmr8h2f;Ih4s{>4(?R4n|)$}-Wr6naL z>Kf|mE<=hLJ4Ost`fe?O5jD6B_*FbaQcho!=)5T0HpJ zgX~#v3311bib2a>P^R3c$U3ylXSu`cdF7ZS7wm07n+vgFY&bpQh%M=D!l( z+V?d}`Tm}H0thJh5tX*mv|^kt5)}6Cfr_yN| z7#78{X=5Y^9X34qS|9&ybbRCQ+E`xIO;z0o!lmy^0HqsaZ!TMU(U6g)8-Sv40Ww1V zeJPyC4I%JbIv6JL;@)R7z4z0OTZp5=Fh^T2(@^m%&tS2Q(7OlTE|xD!V3K76SV9^W z+VD3o`nzuF{1|LY&$m=CX}{)|45}KsLthbAk%fU)l>+AWc>Dh6>_M+5{2thQ?B$;x z9}l^Y70DC#po+vbCy#%S-b6;|3@o#lmoV1bJBE^xs(z8rIne6UAUyAL(YyB}ATh3h zGlX!wk8fh#`fYd}>A8wkz3~k$Z#Rehu|Xe^mx=cqWP?b|81+%|$hR(Yx%l>(&6o{u z0gIch@oFVfHB|mSBg3*4T02D6=hTp)^vV0q9|BCR0c*Ai98*&}X!fn_=(v)8zJ_Sr zZ2200{+=S`(L*^hifF@`4Uc(;tXpk+13VqCR15S2=RChzhsgiw(!#P(@UQ%A&o$Fc zZ*KlHNh*Jw0h_+kaMPF2arOsS;Eexm9h)7zCJph*)yhI`I(dztDHOGWJs z3=FYrK+H)X5aCXm<2)?;St_KG%vw*;NyFQPE2rtQl<2SM|NHCjD5^8!b>Fmk%$MDj zx?x}?nKU}@1l}RuyIfq({E>6(%j~bl)|Ye`7P>J4j($zMuEQw@og}6SV^Dp6s3+j3 zu8f0tj67^5_M0C#&v;L7>ae!#5SV%pk@Jp**n4ptxvc#>zmtD4$LHd3HV;Z*HE{N| zM;3Q_k9r`j_hJFwNG#AXN?GY(D9E*|Uyd~&9fSLFrRLJlR;bk*MKeQh6$b9@XaK;m_uMz3q#yc z>bAu8x1MSg{=EHWDZx@$l;%`^SehZS6OA5--P1S|4d1- zq<)Nk1Coh;kTwb==Rffvm`$6Ejg9>Ro5}@n1HLM&qbxjpeX5!oka#iCM(czr&VfbzxkK9TYWHmtL}6mY$4>==(hV{V!vh@P;n0V8k-{G)V-f7$4x7E0OU&bGBmEU6LCa+2Ftjm_TC`|f@@MR%_3!^iil1Hs&82yEQ%cuS%iuXnmJ zpy-RQr8qrDWWyXXFEI@i7|Yl*6a`r`(lzRbN)QG@_>n^gMOXO|Q9}&QTxk(6sW}qq z(`s9m^Wvxr8nYkP)E(wD4v__J_KC(#VBW{hl@8Ql+RXmuV3!2$tYHLvUuLPPsjoy# zBThCg<&g9D?t@h-EZy0sL5-)lOsJ+l_fD?5$mucdK4DtRFg5;(<_tO^wn(_^GnJL+ z;njHU9qG3+gebBI(+x!kKbR+6313c_+tp5FroW}T^eaErD;Ao|l{tLvJG0UKwE4GZ zto-x8&sE-gQVE?V4sq5{6R4g=2jJ#9tmw1*w|2>oZshV>!L}@bryL8T}`W zdaKNg=PdS{3Ag(utLno?fh z+zi>8zO&CgvD9QVBG_|7D7;aK$iuN%N zFq(OHSeyaqd*V62hEq20z9|#sY^tvL8gsiU`qycR^jLvxmo z@K%T>yyG&RJo?Huf*$Sldc!B@xh2P~C+?L*D%wY{;k6>tH8ssOvX*n+Gseb4|& zDfLf42N zkU5sP2BmoD{Kt{RxGp2Ix;W_kLSL#P z1XB!y{zueqLwZ`qfY8v3G;w}iUP7m!?2g7s35;(Oi*)^6_=xo~MJsDGF(devBykMq4%>Ng=DxlKAY`F;q8jx zjS2w?y4xqnJ9se~_v_~V>piZ$7eCHQi7>lmAGoZuBlDj*&eWKE0Z$6*%}qlY8*wBf z#Ibw_qB7_-MTOq;c%8M@|FoO#w9-~u32T*h_-J;KJ2tQj+tgGgC8*Ce_uwX*6`xmR zqSQ|)DJSb)^~_CH_*7ma`9;_V)dxBZpK>pN!eSJvS455nMn{t3db_%-xb8`L-+ z-5>rfxQ;@tj|})Qoj+pg-7MdRbFCd-a#>HCo0}^Iec0?ux&{qR->q_usxHR0(mdSq zTd~{ASB88P*Mp6E8gasb{adUYlWvd=I)Kz(cy8rR-rnSd72Zi34=ZTt=p`V znPNBU`8`#H!w+wSkKm)-y4fP;?u@ba!GwnfFWg8#Oa1_4Mj?_Q6h_{B|Fp41R7}iA z83tArFch;@E$~*QQiO*`W;L|q+y@Bw9SCrwI9OOBq$Wj6fc2$#Xm5`xQ6)A0fbAXG zK`GJm2r5&S3gY|rxI*O2@m?{}+X@@(2akbMF)c(Y*p=8g-hCqG5jiXAml~vay-) zYyRnrpK83T!U!NSKiIgW-j(8e`lT5n7Y!H|ruQ3-PcZj|lvIr}<#qbgrLyzfUvLZZ zd-As_1bMO*%?kH;!-pG?rphFCwE2df7$N@o zbUA`Zq?nCGn+?(Ult#t%O<|re;@LQ7f6HL{JvzNIqDoPBtZu#JduP~M!kAfT$}i1( zQ2LgRIR2`Q?NEVm&<7ekzxTTzU5;NKa`W<Sc#vm=1*J~Uykin^60rb=0lo73M#kFZ4| z%K}IjRP7n@C|}D@#<3G!*`9W}L=nsk(%4gb(fZ*ON3fb!H$;`2m#cj8z3tAlVE(y+ zkystwp7(-w_+~9f42?({N^-{~oA&=|0YvgdH2-qSqEa$tRxd&+sQjW!!VVQ^`mLEn zUlKdEQJ88Y<4oRWSQ#5(rjN>LBbid2o($X3a4R6MPDVpAYk_dME>V0a(Q?LvL}qME zsr~mY(9FRVX@Vr5iFqGcK0Uwzl7TOvhcd+;=F-Ydw!gf8o@`%^V=n9Vf;;Zd+SKdO z_eI(4RoU3AUPqx1x-uEcV=H)_^|*585p)i*IQdHU^~ zeE-9r)s>a^$#jUl0odvgZY781z4e>(l!5%iqRJyRcHNP2H2oyA|&*Yp{f0DjM zBhm+0%v08$;Uz4t^-ivLN^+e2XkFf2C7+J6!ioE2_6ek$TYlKZy(d+0Qj1nNm%!0a z>@vsNx-cm-nlUZoTRgP4!R3|d3s@v@LK*C>4E#;nz>q1mQ0xjV32);Zd~#;$v6MDJwl=>uJf_)|io?9!U=%KzhOzq81qK1d}{9E4=)XAJ{G3|;hu^6v$ ze)#dqq;NABK0J>35}wQ>OKrA4KE$U@dcRm$_cjfSt1`PIS5zN_T%THmS$^J{5Qd4n zEo-TFc3St5wA9j%^Pm&lf2-CO?3<>KOkzpA zFi4{}QaV`+xMtYbXfdCo(G#9ze`oS^!eFYIU?r4ayPDy4|FQG>jX(Prh5h)P7iWiS zb2H|~w0?S?!ri^`1k}fs3MJ!@6rS(}VVP{o%#4~c{^g|37WWSQPe{d<_&BU*qQjhJ z;oC2&@OXF9o`+W zm@hC{0OlDa4Z+{?LZ;v0G+gU_s?r^Uyl)kg`5Ys23x6G^Ik;JtA5->E@>MuZ#97aL zp`6U!Fqkozb#*_XW#@#l%mT;XBG2blWyMvzcT2uSN+~LNa-njqFO&1o`AIGo;RIR! z&-r1%<4`d8;&>PmaUVAx-P6DEW@~w8r{fQ>+kabrmgkLZD%IYV=o=zBcDMWUv>cob zHBVscUN0*vllbB8Q=a_GXVf9f;@=m5{e5quC;t<($7oMI)vH^?*HxltWO^ziF;m?@lKr{tui-zK4+)Z^c!X``$b+!nY4<{zrB9cl!KiCizeei*5 zILc3+(cu$Ep{Q-;yMNTL)oN~i*QF^<>kFDm{Fi!I9LZQ7=7!<)?1zj!v-MK;?>qf+ z_5~!k=YGo8?6g-|T_*oz9^0~mA?D)|h-jX$D_F@U4vJK4-kaV;u*r43xD`T9&q*{Q zCHqQxHFpyg>14qAy)3b4)gDvc+LG8nEJK9pt1$n=*O1yxz%ijd>g}S0~~?D69@q^(@tkCDJ|6KxX{5U4W*`br$i zevWG_=EI_`I7X>%F=i`P>D>L16*q&#I`O!*vo|+qoY1z9K-XP-e7Q{(am46MF&xDE zbI7QS4?ENAQqRm=K48$?$)0RHZ`wUbSYvhP!QiHCS zs4dKIcYb7#7L(6mpS6>A{+uZpA+>k0Ru^|4qnaW{)R}Oa;g}H%aD{moh~EJ0U}@mG z#1&dSuLU-elVc!zTLa)}n?|nw=}%Vw;L&gJVB38He?X2$JNpc5z59$7Bs4>3j` z9~b}raqwJOzfdvtwzwaKL%RlYDWA8-Fhj0vkG|^q-v`I0FwwbyzDSo7^c1 zv5K^t!+Dev5S^R!gz@`s*eC3yWyxHrC-O2(X->Zm4ZX(-Qd?=?y=sx=VK-~A4~*X6 z5EV7N7wRd^<>t2T)N}tPALoWf;E_FwIaizT)6ngD|D$iduKR0kp+4!95_lMT_5E~N zs0a2UG2CrWF=a2{E^TVAt3!Qg`8d5)_goG%g-85=*T;k6;T~Dr8-ZFXjBOJ=lm2M@ zv^6pEt(Fzv>!88%dw-hkI?`~D__R9P*3wBp&cXp^O{Yb;-0iptyP~93nJ~v$i6d0) zB40>p^+~x3X=fdLQ5<7qWi>}BlJ0ZR@aF80@{!7}NktTun10#DFTRg|3Qqq5@#(?9 z?Cd{G4HLa`x$I^7P1UpCVFBKT)XF9}c7B5|E&hzO7xNm(7dffP9}R{Tb`#aXXn4VE zc-PbM^{;{|mCy(W0|NtRHMO{}HKK+c>PN#xkm6JQsP*sXf8e`GQaXD+i0y^^P@?+% zUJZ_Mr2XlgpIa3~Yr?+rDP2B81rf4$IWTW17$gY^7X>nUL^D#Q6Z(JmS@hi1p1Q5o z$5WrZEOAaD81yUS#;F_Ckatfk2?Degb6#+!?;80Hg_Vk+(*6rX#x0DQ6k2{lpSo#8 zA~%jciqbtZyhT0}3zaSm(h!Syn{uT0V(nnj{v!@ku*tQT5R}ospZ^m!E|(W>LjEKyaoZ|n*gpKzec|4wgl;abAe8`Y7kqLt461dsVFNy zC&I@M_zN<^=bbcc`c>mE3dq#G;(hdjV_<6g!(n#A&rO%L&pgXGLty&rz~@cA)2d_-F{QvaFZ#TOOu&}btk@|)dL z^u0r5f~LfRXbm&QHa|^0)}lM;A6K?D%gO#`XyyxZ+R5X#!x<~keUK+bdrzmgHWs3b zFM51>2q)t)*Ye$r()`;dfTla=;2BTh%-N|vGyT@uJ_xM!dkQi#r7aDa760(ZTz@nt zY98|=dQW@Wj#nlKEIlVRRPuKTI|sSGJpYMU~pG>ZZj#-0kqo8WKi{k?fbfr=IIL zeb9WO_-}2L@c@wJiMf-TYn^923G#|5GVk7PZb0Xt%><$lu^|G5z=-a$m^7l0`#_tv zFumbT7{1=3dK31~5|)>2U3lW@^GM6$tvp(Fa6C<}!M@-CPEvPc$x!NVCE#m}fW`&s z#`beeo_l}$bOnqHVW4LQIrH^HDK#B`tsz@KPft%Z0RDdkbo-*rBDF&#X`NhyCX+RuhLibh^mZ-s#z-aRtAb`em)|sF;AOC-*R*TchzUTX{~7j@zMUs>$YX zVQw{dJ>7<%B;lO6Ce{IqleAKIjBVLJdSna?)x}2)SWL-Uew?Y{`9rqHYd^9jG@--R zKf^N=p>?_#?+YGrXJ9qr-2y{(AE3htq?sBJ2Bn9EVkQbJ@-3;wApHpteei|d%*n={#Kdin)PMfYU-RZM+2Nh=%|xVoR{7h`=^LC?nb;(j4H6Zd*p)fUr( zKPJVEEnlC?E}6*8gMqy2c60L{U0p)TZT_p{Id2P1rZxMp3G696{scSyx9@#IV&6Vr zF7G^!m)v?A;nP@3%SGlE9sDM8 zwwkM$qM7eRcp#v1hA!+EcOp{g5yGNDRP;-}3!%8K-?4TNr{tzK%Oreh5s|*X>!P!u zM?YHr^ZGe^v)%GQN+3t`uDIRXw+w!V?-sbn^@BuBO-*I(?5af?_kNG}kcm~hd7S(v z{j2e3&0d#LBRW4`i+KBY#QkWbBA6P`!xHP7KN{B$yY(2U9269^nh!_hV?dPv&#vCt zk1u3o1=X54DknGP-r+&?lxPpZ3uYPkRIYGnEhZ;`a1c{KyX^jJ8k9OdXdFzTnL8Q5 zG#alNMq^so_c%LcDrbVyV?-z-o~*Uz-HkbZaF1k{_uCTEX$V$ue3!NapJ~<`P#7HB zrWc5OP>3>x-!EomI$O7>`wjZnnw5rZlpeccY_38VK96mAUwC+a%K(6ulp51onhLIg zg$1=d2ALa?Np&Bg{;f8b6O6nM60^{(smM2qbCp~ADSaYa&YIlA?@wqI!L;JC60I6R ztkGTEhk|N6m$B13;y<&VV#l-FTjT&!Bo!douq2w&yn;#k0gC- zb&aopO;{QvQQnDK8aM{;fvYJ{5Z6wPjkT+vFE?kGnGbtd;c@^FDys8^aE_oFE6oXnNJ} z*~10v_}iqWomKjK72mKeGjP!S(9HbaiOIQsBDjU)N}I`7Vc#5l&%X?Nx%ljX40RFi z7JtS4XpLpjyO_8Ohk;I%NEX}SW)Fker4dC$JL$;nHH+ zDgIrwv}at(mOh#bMK+=aI3d>|NZ;$85BgQ>8_hgzF{Mc0F#t9u9rI}D{J1{@m}C6 zTYw8*#76|}r;-(&nQu|x;x-9dCv%}uQOEhe8SXhmyY>*F;|nJ*=F#&-ij$43fKf32 z*H5Ih;pD*%Q*mb0Z)^!(DNsk|(fRYm?0pfW!}aiaFQBgBM==tufHM5YTQ*Fl=kWvG zx0}NvSScHmy0`|l@9QUg9Ep#aLSBN1Qtdn9Xv-9|4~Owg6V$;C9m#QOWuFn|@+Xcv zvzN4)*7nY5o<}sEZ)q3#NwaL1gd|_H6P3OYLbxm=KgpYsln{bJ;XaT7&{WmbOteP4 z{$bm>SylZ5ilg&DIzQJkfVQ2P_}%DzHJCdFGK4;#RCQDK5A^nm*%YYvW$}0*{0<>8 zeR9@%zHRU7^?YsT8%8{niDK1*+n+RDWu%GvWoQQl8gi8hNUERl3+fdbdcL7Td7_K@ zjH&Q7HzL`Fx)&u(l>PD20qqLvSJDSU))QqZ(jobCln1_>3$z_r0ZCzXUOx<}wg1`I z8(Z0T)w@G3kCowhH02Do50uVQ-X(aS5g`$gk+W9=Ttj0Y9=-HaW7lztQ1uOm2TK%z zifRmazxCoF?=ce-sA|1UH>k5HJYo_W zR#g^n6AE#fLQ1tFBcl>b**0}0Un4oC0u|#C=NOAI6xsK?+FXNm} zl9>`o63ulySzX44%VPe~FSuKKq$j?doQdCcYsAY=pm!PEtBLO$Oiy30!CNyR?cIt9 zB(>xTzGagywoh|zYDiPRhybR$=M;20jwtzTe0Eeq?e)GCe-;c&H-8AWf_h`X0>}VE z#Y2|S7`5HB8Sejsvq5}WEW9a04Wf#dpXR*69i_r>4peA-(#40-vxGisxS}cex60au zJz_XV8Sbf9Y(YpXZk%!Em{R{}y`hmzo1iORq?-<)sS*0Jo01w|J(J6%P0xF)f$3J; zhEo3``_a10_bCIfdi3j!iqG2)PEPd}_TxX>EX@xSZf?6QNGl`&h(Nw1D+>A-bmtJg zuQ}P+EaMpmOM?bcQAcOxp*5NY3~U(^4#Um<+X=`!<)3Cty1c2;%qS*e1d=ysER8(& z0vWU@j7FtJ)`)qa)W;*BK^Ldo8a1xtlmz*a=E zH3y@Keix?SMTJq2o3d-A6aMQ^ju zf%@171K7)-fEF{X^lRC0T_i$?OGr?`{3^K>00z(QU237ut4~M2i?yh1IeHG&L>yG? z$!Y4P9w-Z-w;u-*Y6cZT<5&^_mF0Z`i30*>gxt$)u+3|8)4h5*v#+IZ~|$B7^j*k4NE2{~k>bK-?Cp%5uB>!NID8?EgamqJ3?CO~Ndc961OUcxbpQ`@UX z|IFS)!m!FlPfs5XoXM3*kaqrbYDqpMiEvh)g??|fkB`r<{ey$Hx|$5bRltIBeq20DMO?i?}UIE z)aSQ@WQBCGY`2R^nr;{i&(9}VcA#Ci#$>%U7Or>et%oPS3_oMaXjG5ZdOl%q=r#UZ z&;32S*hwt50Fke8t&bvJ-ibxJ|HZVF$~kTnN&yl^diohJ@V}B0FS+_fPC!n6z5)Gp za>Z0`=4O@xE|5^=0yNyyjF=c-3bvv!{-Vbm;PlzFq~BE0Djp&Nx&p$FQ=S{ZOzN~6 zD(ur-)J}zvc3QbnJH8-?KZHXsNfjR6rN-^*x&Q3mB5t*e4IY^|w^y`682@T$MLI?` ziIL|>`eWS3wvj09R0h2#shi${p zwB||6vKAErRl0%CMo6r#EN&#f=k0Ie6M@rU*UI_3G;|xlUePBCVQcrqD%*2M_3k48 ztp75MBO{?Hms7Ix-QL$MdOq;il^}UMk=TYC6kOxZo{c zs21AnJ8*afAAvuZx0Zqo1HQ~h8`a>SI;G98d&?>r-Oh;U5*1q-=#$U+X_EPUa|1=Q z*60Ku6ZhT<1hq@7o?>jK_RT1HBo2Q7%o!4{F27DbhSvSrVw_T9g-*@#3~0$^M4YG& z(!p&W@nN{uY%~UF0LAKD-v48&D)Gi*H{?T>f_UsGH%;fil>({VDId_N6sU$^AgXy^ z$W48;{&k@5su;x^7J=turpRw3YMdtxoXt>hu073Kr3=6bekc#yBldW4dD%P0;N1Eeq;1HX5=y2#s@WuZh$z{~o<-Dzs}#%O@VHL@;zh~{fGT%kcE zR2pOfiB;6Q+&W#c*RLZP2k&kqOuy|Z9uPKeRrE8W>FRvc0~K{`Yf({%NuV`zj4B1Y za9Ix9(kZk$BwU+8squOi3@EvEDXp#3ivThn2Up9fs+-%{)Fbh6!8^Xd?X-q@joUe3 zJO(F;9@yTNXE2F;E{k%Iv)a*$RO ze@Zwmi0wN+KW_*G&r%@h$f$vknZf>LRedLq#Y8eFpDmG7Q2hS<@#C%xp1-)liZ?of zyE|MU_~K&aY7+!CTSB3nQi7HP&h4A&<;C#3SgLrR`zZvNM$REb`>JnoZ0~5RF5;iO zW^Suc@s{RySWDyYf5)_J(e4@cPK^8PD^j4I{{Uq`P~u!U&zVGI-1RsG@@|Z)|KOEn}B)- z8Mp!nO0uxAvlkQ`RP>Q)Mf@d20y*A$K!6Md&zP9kyt)mkojrUfg!h$K-X3VMES~4YFotc&rl<+&P-OWa}amP4J%D{@PB+S2cb09u^mV z>YX)YdVkDk zKDR^DAV5Gaf;6?OxvOL)ZQ=B282sHw9w)nr0LHa3lBGJ{#Cm?$6{Jk&?4d6gX+|)}T^)`iESh<(rowL>p#+Fx_u~e=yMo z7!2>mSwTO~$=rNh0OY^DCWt*tgkg7Y5E2j=9)UF4mG||r_e)6Xj1unrbj4<2PwoLx zB<(98H7-K4Vh3D0I@QEO>eyE&J{Y(_`Wyi&^s=CkOZLhA*s4;Ncl>`eMlX@GtY!uqMv<jx>cbk zpUBFrUu&uo3NPXq+HOOH$;;Xt8cuc2v7`1j_zFk;O^(A4ztsLOEWk!zlhf9E;<|8! z3oGc|bASSRsgjM2&GFZ-UmYt})&8k8qH~*%t@aQq3EK8H*#7;!E}_*}x*s`g=zZxp zdbl=<2Tmx}jm`twgChh;;A*r1Fstda56-ipHg~e29$o<~nvE97Qe9UU3yj7zb*CXh zutBHZ#KGav05rryp%=qO`RIixHM=}#TM;kYw;6jOr_JO$v=n}cum3Sd8{&s3yYf-+;%+m%~tg7WjMCu8d}8Q#n${0w(|AT)8eu(<9yqa z^Qmw>MjR{;Vo`7!hYb2t@@D}9Yj)x5S8=<|O{bMS^*O^}(NfryvwG_4+>eBWE@RWI znu;$H^{g4keSoetjaVUj4w0rwy2iqjB8eHuY&bcbpvluqCPE z*~-$(kO!_p9b%&%j(gN!qj<~Z z2kYbzFYx|Z|H%5%0z%aaptD0}=s2$N2HqA9qXeee)(H)9+$Zj5OjDXY!GlAUZC7q@ zfd%3Pg}~E{q$KA>P%StJK+Iv$^xW6GulJ9pa3w?W|7ff&YL<>Yflcz2#L$I?pNs3j zoa1n>NWHAt<562tQIQCo-ZeKRk;_{7Qdw7icH&fFX2k-f%#7HS6j!srr>eZ}s5f`` zU$E#l1R$OTN+6{vqhdNWnFucj$IcvtA#1lT@v#R&IKC(}l2}GYbBVxi-v(Z8#rNJ` zGYU#dE5BMp_pLe_n6e)ppp29-WFJl=;Tw&Y2qD3BQwc3`;r(yq)+j}_ zjNe79M$!dAZz><=T$OL$z7_uKp|q~ZAvwSQ@#DuqNHj-Y-GL!G*5fVFJxUJ!@YHG( zx)%wxPH;9qmX)!xaB_~2;FMluuZ9fosK22d|8Vj6Oo)-OdvRwc^Rl0#4do$$xnqaA zl3kx2EKCsePrPg4#Q%W(*tGRfBH@Muixk)E;H{|m-NcIgnd4?LdMCHlZ-SWxHVS{da5KCG@1Z3aX4wjV#A1l{$hKtUu23*eee*#IY=s5~? zxk5-K_%=jh;o?SOVPn4sdn9^|H|hZ7jPErLV}1BAoC$mu{uVYg@C9Uo9ohvj4ub9g zIBpjxGnw5{nVCivAdVXe&rpw`{bkJg<}L|IA<5ml1txFaItc^GprbjrB*ywb+3~tqG^!L*Mg~Tn2y|@hcNx*qyE;(rr@~}Jk4F8O z>L4eQdLbFFmA;s+?zAtQ3hak0Ec=PS-PdE9NN#tK#JUG-wLC~;{l}(_d+_^J*kCgP} z2w= z;e%%mM6qqlpDZ3ycqvFoNPHSf;bJs~L9}n=E4kSgkn>z#Sz3BoBec1Cxqg=VpOu6V zOn%)3!Qkc;tf`Q;ThFaBQ(x@EP4M{yXoLzdX0kPNaXFcVp64VaCnRId_p>xNZy^P7 zJU}~BO=3Y|LB9E5|7x0w@rZQuNBsbD z{D4l{8vUEUVO@`A9tFvK=;`g$`=3Y`blG~D+a)C>1#eQA8_9Knr-FjNSu`OfqhWRd zDYRvI$jnAJZ)q6hARdX&_km}CkQZDGzi!;+`5>2zcHeFc7BX|l?DkYVi$6VB2?wqO z359@DSrOEQMq6`j*BUgPVo_;jsnLaWu=phabv7 znd29gZW`$N0qw-%A@vZ?XF&MB=?2Y!Peo+}lYPc(NDl8vXrE$WJNz`5epwVA7B&!S zYG!9=rwqfpFR&AhAkt4_(A1Y2ui;9RT78W4`GY0vTZq_w^xvu=KklrPj#m2!?Y#`J zcFeWC$ysm6VosnT@e8JNLyc(8>lN7{g5?Ia#9d z5FBo#cM0#`FTb={a~AIJ>mz_J&UDn(^>Q1`Xq)mtipc^7l~wQ^(14N53zj99TIhSF zh*xbhHJT(WC=@cVMvE><>$rhFdicuj6GFRW3%aj+*7!X>1Dtke22^2!0}vZC{*CJz zG|DegbTMcMpg&{*RrdrAHg?(HjSW34I#B>gJ_kD!b@;b2ga#M*(FXSVO9=8d%F4=y z%E~|PiHNkNLsRFSh=@o@d|uk~0&b*aXz-{(TU_x&sqVgeph@vlWputESgPKt?Xhvp z+n;V;n>+QB&bopawd3|Ocq{5`NI^!2B_A@#3}~`wvW|+U(Px8vrfarcUj;R=*sW8} zYgA(T_DQzyh8w2Dsk~-JDWksmEwcZg-y-7IaL-Dm&N41_r_7_N2up85bAcC-#fksu zB1dc4cuX7fwv^&HWf229)6{wfnpsFGZQ=H9!+#|=fdad2HG%&gPolVJj9ML92?ml} zebb5<*+NNAvAd5}3)}{Tw>vo8-Y7~(OOACps*u9XNiQ86kOiNg7+l!6K*-(Ml^_AYkzviXPrLDn&dsu~4n&`GIGT)yo$kDvGciTFq zeD>%9+g4+7>=$KmYNh!6tDBXjL`wZ52q}9XO%~k#X$>$b?1EHK%EhJO$t8%cg|Zto zg&?Eh{*+Kmcy|yTTv3pX*oFV|O(IEUIZL5cNGZX*#;MELPt4ns)Ua00gO7-}yKFMt z?D|ZBkU)Hsi%Hp~rlc?%x-Joe<{hng^X^xyh3sI;k>^)^Wk!EY@?m2#p{Vy7IB(tV zaQ1~a?`Qb06j3Dj-<#(pPNTc`ozTv0#&I$fgODL)Vd2$d@FuGQ5Y|n$ZIH!}gMl&p z`~m{p0_gLQ0D)-%vgA=O;gfmfKPxMR#%aNV4m>!4J|uK6m};6`EH(HYZ(ea_Uw7^| zP4tF4uzQ*;f{JZ8f)3&r!E~F7_;}X6()3q-c}Q3o0nF**plmK${oPpec4DsCwPtm) zGV=rq8lD3YONeASAmvGxVmX`>7;8!FrC)*^C>kNtuKae~B_JKS9ic_v{sCEYCLgi; z;J*du)ah`QvwI4K_~W+&q*b889uW;hU+o7B4Ccnfc3>c-y0>0hX)auRb#*npPhA3& z5@Hb8{;Ny>OK>r})QkzGne?_q%2Jm_X;nebly{a<{ zJgN!)&7(5?_ox^)|F1{oOc;ZqF*`_ndwEU6k=OMZgOrPBkXiNwNayrtM_cY!sc_$$ zwiY3@6Xg0NG8$5Y#M?jL{MFf(W|p*;n~O?wszwF*A&NfSnr=$80sFX&1~wUygCfm~n`=SwPl=T(x(m_G*TqXI|08&x zzHnk9zo$4@(Dr6aI{&vN-CZ@FxZj-o@@kXZJDTWx5u(;Komwc>STxIP7dhL+;>|Px zq_hFrOAb5RpNEyHneEnvcQuAEZwLBxpp*8o2D_{C`uN;Da7jcCVpkCbGZ2w49)d(8 z$wniYilEg-$7c#a-~T~>5Ay-uhfk?+m(nP6;VX0Y3k}U6w4@YgkeeX}&0C=`@rkTa z+)H>({NenUx-;Ik@pTYVZ?D(5<;`b_x8Fx+N?F2H2UvhmQHdcj5~bfQUA5YLH)w>; zNEEnwk;(mVpC(i}P=S?rN0Vio=zbJTQz2*nze0zoXr<6UhjL~YQp~)*TM7}`h?}#& zDJj~^D&T9k%lY5+hfr~B6mRc9L{F>r6f;^V5cB{07?E3$3%^TB`{G#+w!1QkHe|i< z+~eO=&|P?uRNc>-5nt6j73jah<9?TF0GU`iVha#vgu zlZJM&4NQ|9AxnA%HUqCu(x5hal=A!!40p2OhX$Hja!&z)a0%hx4}LtLX60&aZEX|} zp@{2^*8zvLl(aN=IrQvHg=6bP!~zFR=Nu2dP2F<%f2))B_5WU-7bHj`*kofPi)ynI zB77ggdc>$OHs@-S`jpm#HZ1ei*aN zDC=rBf7gn)-1&U+8JS&Ic!m`70ZC+j)4m`IH^Dk2ot*6K6)nBS=Jc|w5Xm$q0O;`p zOghd!nGTEZ5|WYWzIoWw-%kj$3_@Vx6W??ifVgdR06MuVjIyPbl{*=r13$MH83u2hSr`k20edKkAkWs+;it5nkkyJ;4qw zLc@DO6iRr?&!`lCdUjqIPfW7KG}!A|Iy5e|v$PH?eit_GVCs|naOYl%j2F5d8MbkPa0K)?)LJ{X$k5(g6O`fqD=)Obk$$mc$@>XCwEUm8t=HB3kDq+k zClloo8};w#FOWhE_A4|sRcTd^H}QKvJ1vtKzl}cuePR~bq155?H0z^ZRk5?;A(a7$ zzk!GYhtyH`K!i0qO%OVb=G1UhQ>Qz^^v3k;GYFhn4fgiRHk>Y}b@|*zMmqh(!jrun z1fH2Q&U2w)M0%G4nrw_LEb9%h&XZswZA1+e{OvuBYb%NNXvs#)#r2u0|ANF0PL`&Y zEP7e9kdk(oFuqrh0tEuIjYI@+2Yg?VW%i zFGCl>(uQYS;8W9gsr9nB>w!okn`M&fThe#Yn1JBs}kOc|k?>M;y1wC%!xxf?C zAMG~|UA~a^k-*s~s%n2XT`TJWPK>VrUEc+jLOzhAekLa+@g74_i#6m)WzPD8$KcNrgtlM%*N|{FL>W5X-;X&pZ?|o=O0W#sf`}@D_ z)IT#LHFbq{chtc7Ju6p2iy=Ifhq;048zt@0C9^E`xU=DSV5R07bBxA=`w@>G zsIZe4zjZXe$s>3yB#OqLhPF?1Q*NloXCBCkn*O9$a6jIw#3*oQ%qD1r+J9RVDnUY94-X_$BO=8|2HdE8NIPm(Gu==>I2pk7xZ3eJ)W&ohNv(leoj+z9o<-vGp^ z#^WS1;R2e_l0PSWpFN;nSRT_c9rql+>)Y~ajxw2O(Ye;P@Vg3*fJ0dmC(k(cpTB9m zOH{ua9@DiH)$T=0h4+U;?JE_p?7C!!(T5?=jpQY*Df5|6P=lYeoe|u0Bo+2x6f|9BGHL{%E`f_2@ zRtFxY$Aw3sQ_8#khw}t>?pRxHCoU73fGWp#qw@LD4;gY0!pNOypRLMsT~(@o%+yP*U>*NgJ!m z+9E4YT-d4Qxmt7IaiGerJrnk*;}pQxw%;-EezG=FXM4FZeKzeH-ZUVEp8mUa(Ba+R zAQAV0%R@{O$Ks|cKT#+?q*oRfn~Q7n$6tp(V>5@ot@cv*dcBGHvz*^g?EY>4{O?9M zzN1`P zE*lPXlg;xzBrKFN0^ z1Gg=C*RxTKyos~)#|3iB@+9ARUe6HHe#&x{?DUi^u~5gr3_ z*S@SzRGSXdj&|UQG=sEdqXv(`D-E{(Xi1In&Snq&o0t7>pRuf9hXU!qdOOMBv(j8o z`$sJQ_>InQVf|!Jh=hgLOaGu(1A*|>sC8RtqC#3&a?djvDY zbwhUl=C4{)B#GkHdOf?(BfCl!DH$gdf! z7{%S>y=*)xH2Um)Vk;oCl2y@$@P#q(ZvdDH35qqAjU|`w0op7aP+CubdoO{nJ}Kx3 z=P&i8xjx5##p>T9$w9;2WtnbIg7J^T1_JU5tx?SvVLuyQw2JEK8XB_8_4joiZ%l;# z84_e%R{B6|wPT?#?=cXrCMlhq9iQZWIsRr;n+~mX+ez%v)T%&G zSm92a65SkPRqO2#zVn<%oXbwX)T^scb7rrbj}{51J$a+JtJXye{CkvMt@1s_nJU)=;YOJ>XU;i%Nj3i)3&8N`*R{n67FlES{s_ZDsFo4o z%WRrgzeqB}jl5=P?g(9kNLLhy>?NDdt(y;V0n)dM9;kkB&sK08y=IT80=o!Rs9Pe>Kxb7Pd8Ybnw%rz5o$ z`f4n%m+WQ6=@?4(b(vwg?t)PqgN6Of*{KQb+L)Q`prwCZRb#8p-){>sS_Pprp64Nx znO<^&JO_>uA7>{%CnW>Xq3S%!|5eXlCDX~MX2pEsS%muVpbo^U?F%=`myzgg0vDzm;TYnDHUO z!esd$oi0~$q|$AQ*kP|R|Hoo0evZ_I&WbK_wM`dxDs76Emiz+8RER=O&d#MIeOf1W zT_#R@d#R3$mM%s$BLa!DqwxxU*Y)^@%^^1=THl zZGnXfXL9GA$Fx_e6JPAAKmo63>``mQ(Ms8Gu7w$n@47L2QiJ^*KWZad1e}O2n;h|_ zy69;Ic(tBin${;Do#duYQQ4h36D%HT>`nH@1uY()#T;X`)F+-t73P^PTX6>^cMHrq zMoC={v;^aYdtV#~SFNOVk;NoLxZ_d@u)Do)R@yvw3SG7n{kvnzwDZ|9dUd?Y{|Rr4 zwC2uqbVh>Xm*4;LQe7{-RzD72(ezJvC=O1dFlX%bdQ@$N4+UH|avRD-W+Jq3NL_AM znc&C@INzn5**!?#Sn+sTFl{}C$@)>u|4ybtrtoDwWYl*wv&bDScqCsyLe@0|iA&DS z?%XzS5On6~g-X&LX}s3^zZ9%M$&hV>70KdyFKT|_X;tG)rq_|5aY1bI{gZ7yd@{R? zny@YR<1qH)-w!7S(;mLcrRDiLQ7?t1FpANOiHS}0Cp#O27=ueIg_RUZ29v!Exk`99 zxW2oC7^tB=Cja<(qnus;rSzi1(izUHv4c4F{kS(D_PK=i#^c0g6$e)sF0CSZidC*7 zqes2lP7{B6FLKaK4qOaB{kxx)nfbdzM9H(5b=TpoP1N<(C-p6haCCi7uqCj*aTN`8 z9VXJo^tDIVhtPBib^}^Dh*z0NT{I{V{3Tk7!|9Q3qH4(#`8w(O6_w)_jZdxz!x5ja z|N5aUP9Wvvos!|aHnrK6N>zS7#xayZx4r3{YnmRNh5F>fGuiQ+fB8B_=8PEJUn}E= z=v~Xv`Gq_-e@Y$b932cnX7Dyds#1Z(G8p%BJFDW4VV+O8cPoYWPF!n1>O8@ocQU=N z>fc*u&+Gbuv%D0)-Ke$=#lbSXBSOP88n1J=y;J`$^>f~nwasT&OW!Umwv-AMTiV*3 zF4L75)Hep#7Z=!D4oCc{yiWcsfwpZ+oKVAdJy^}yf@1C;vkFuIjwClo!L={-V!o^I zjqqI6>)@M#0XiQ^{0cS;G;l`CZfj}pk~r6jT+~gL>Cizln(@_l9SMpp3Bxw1NB4e6 z;B5(jlQNdcc~SQ9z1SkMFnOt@{{lF1p!UqpFnU?*-=l>{JQ0Fc7W~mwgj_+l&x((u)PX^CMp5vLlZRR>feKmg) zTt{6{93@y5BXFvtE*aqRcA0#kdoZmlQpzZ-?00&2NI4}k`UBzolN#KX>`Lvc{n^SFV8qrTq7y{7*#=l-Dx9NuAic^ctW7apP~>Xl(=3r?s*$TOpWF^`9RI~66`D==MQ2eDTOUrn zyNx0i6Yu?{dWVCvMQ_6>_RLkyur~S4{CL!gxv$<(lytOhuq+d9Eau!ZrmUS;HSy2UK?k3>aqzu7&L_>(!@&cHTq2LK3GkVfn zhL!Q?#+B*60U;jS+ycrQhFEvn5%JjU^D92!vWK6dIhG zq|)eaFhQ?Yg3vc2KNSTv>dHyu*?q8vfkxKH%oKN!n7#`oEm0>PU zTgb`zj<)w~K1^P&RzLeBcjO0lVtis)(e7?|g(K?EH*en1o)6q(TWfPIRu+~Y?D_Gn z(x|i^9lfD_e8T1=XJO%Bc>u#3&a=4mK_o7KVSw9FX7O zzpkk%CRQhewtA`!)aZ(y0Mui*(Xk~Wdp?aPa`BOr*NS31P}0k*)nz%wgqzbDfF>3* zHzqfAogAb8d4>nn!KF9gyj2kEm7w*EJAo`BlQ2i(Z;XXhPL2sZDmK<@cx>IrY# z&FGoHp`#`vF;N=GoOD`RTFWXo-eyx%Q}~f(Gcz;A(4?(>H^o^3E{m;;2j17`R3Nr} z^TwpXy|nPgC0qb}dzgXYJ=Czhu^|s;>C~%8mw`8TAQ`7IdW+|IVz~g=XP4;IFf7p* z8X6ip07U|PtLOpq8=>s&0IGH1f(eY!SHhR^e3nTWgc8{W!yCB%&`h!3pv(<2qSIc%Bhmru{0DSprV zEW4=sx+0z%s; z-;0h{8f8MWLJ@#_>>x}m_ugZu=j<~(%O2cXQMZc(G1Xg z1OEIk+HV1nBNuML;do}XpB@PN_W&q<#ED_exO)p%`!&Rz#f1fN=k4Dw7u$e-4oG8B z=^wz_PHqL{;@f@AGVL#4X?MG~0h^fe8=k)JeflfB=L!m=;O6EM^8AaJ>@#{5RKkv( zK7Su|eSZF>q4%n^HeRb)_I00+O<%rvnH_&roTCyN6La`ulCfy%XlCTRkA+`UT3Y%5 zq&-B;0KCX~`WK5*CYq@;%0IXn!Xwb1Er`67Qo3Kg_l zM0a&_TSoA@Xe6=feC=5R5=`$i=4a`mMYfiJ`hM3suR>CfYd8Y_<Cv-6cz1qoWLu_|u z&w=2Oj7ujI`GB09Jjeh@zQmcLhkM;Vti8;OKs;e^1q>G!NC6}S#WbBpHS-sekYll!mNa@%C|dHb5r02snkA^FGmcqAlkih9|3uO@f;$?ZASpq$CQ* z0LNtqU4o=6$(NlTwA;lEc6NQ4ql5Y3Na5 z4tzg@m|R+#2DTpgfqY6j*uLmA%lMCDs8gt8cb9!%&>Gr-*$3+^(auY12Ey5Xc>Ps> z9bNcP?ErgEjf8Z(Z<$ZURjptzzTgjbA|NU>8PSNGE>BA+>+`QkA zYLmc2FhRT|r=(=BSs^!+p;1)bJpl?<^~p1AgmR}mEjiWkA5%)13(`i}2R#cQnVO7t zmr?{F^oQww=N{0AH~9>x3q#}sW$sRjPT1i^^SM~Y2nJYu-Qy3*wkeBpC1iO1ViY7r zwDp-6p5T+GCZ#W=zCf$MsH&sR<7*{}{P$-tppY<6_XkGaeOU$Ah$i_t(ovKj{(g{J z6-!{zScR*JRRvq-c$XVYbR%3tUvi-fg*a}z{izR(GjIGZ1TS^RwNXZyZyVobUUo5`M_WYBFY;>3!FR zx#U~o9ef*7u1Ik@y0z^;{&pI8oE%Shw?(Lja%IfcP_$U|20F=g=C2MrZRJAHGk{MinG6c~-t$VxHU6cE2``9*x}f#svIZmmqH{1TJo)putBNy(P%t+nSMsiKY!t-F zF2NA$T)sOpF1HWA*tRLlVRqt%O28fT+W77R92A7iUP!m(vs;nh#O{c#@8bp#%?p>I ze%)MM?Ku?v&DdP9D@j{>t5@q8(^p&m1@V|NjjC5&9=*)X@l5upGl;MkuU>s!UQrBb zviXBFN?w(Xj^C~#ERHshf%+J2Eg*t#9gR8glDa@#JRUxoQqFi+LTNtW^w}(uVv%0h z@qBh5MF1xC@N*fNVAr5M0i-|pA3r_lF#{og6^xfZ5*}(fc`w})ubLpP_V`AmoTyHa z@AIVBzfaY6X;GcBK6Hax4W%N2+;gDT=|kj%2v1_Mz7D2@GktuPxTmTsrPr4)+*W(4 zZVqEc>^*B64KlCMY@Xr|(&re{aweMShv(`6b-|h2a@yfhY^X6m z3SL&U5C?@6of(?WKWc|vCO_^Ee71gU5sTE(;~-lL9v8cZ*_-joD*&7(KxYyR?9nH= z6M6k7VZc2vY9nL_B@~Ijep8AvBj$qd_TtT( zf3nPg%cM*EiINuJtBmq9IFGF}>8l<#?fctbvF;<(!j3I1E$y32cF$nkk$|sUhEvHI zTN+RW)||@|Q#g|Dlg;Qr; zU#>bYwn%kEemE&1>>*=eSZkO5^$?=r(W~7{bIZ+*RP|C#CWj|ff|5(vIBRCwqz~<) zDinK)X6$|jCtCGi(WVNSKftlEq29XIr4aTyjs3=^(-$_$TcQ@c9Cc5joXjL{!Fg}5 z)_CK|v`c#$aX;18rsUBz{YMJn`<+ke_-o(Ea?MP}$gg}ql+V4ca@|cGm|GNcEY&iZ z$!ujn+nDt@@nP*t8`=CvzFKg6HDsQNNuBk5m1=vV*fH+g0Qq!*Ea8OWq5OX3Y)k)B z-Iorgv+o6ySQ#?2sO6rYJ$rq2^wJoWb0)Deuj=wld5}eei6f|g=C^6r_YY%j@+SLy zE+K-*k=2npr3^|e1dpRUuij^5@_SxSvI*FG>-oojKOOr1oc0Wdu^@2T1GT9UaLh?2 zgQ??0brSLxSqjVqp7XAqw_bJ(xpHkEjN8@#x$K9_4`#}VYtt7`{N@Wp0e_Sli*M#L<8pXvqQ#weA6Fsw; z(#H|Cp_=V>47eA6E-rzov&qkb!>U`mx(~C=)RkDqXw^UE3iCbH+Dg7}AeF&nUPtbIKWbv9w+9fl3#5MIXkMC`EX^786_vUzR*39 zHD~td=Y?B!@%>*78M=7nVskorOAevppU0*n4VYiF2wZA^+SXb^Nf%IRVSMZE z66CNW-9N(+Z76)-#>QZE0cF9U(#a3%x>b|d!fC`uH zkdU|_E#2KAbuV2?NQpEGk}BQZ-5?+-4Ud3ycXtcYm*(Ak$8U_6e;EVDIs2S_)|z|H z&sqhAuQoWMWTAYz@pf{-BHuijmHv5w7XcBj{X9qj1SKVr))1fG48u~s*rDU~vKCyW zl4@~k0b!+>eq46s20RlFo;n<_^TC7>&aZpmkFmM^{FL- z14YiSz)lpA>s@Hsjgp*r)E*owxTNG)G)O6$l4E9O6^oE;^cqQZdFYoiZZ%_cr^3n; z4s5pW+Y@JEJ;AcYfsl>g8%qDec8@Kt8YoD+T}G9;&N@AFOOsrE^xE`fpE6=8NkkrNkr@Fc>vB2fE7-|67pUNBk=!k7pkJ zt05peL={A*r31?iX3^n1X|%f4)zpD8Mn~{9{9Ei6DMmz46g;I$byiqPO(HFsYo)uY zFz9doX%~QH+5G?u#D14CSJ8{!>ivz6T1bXbluz-%0^2DiRYA(c`eb;qzPE?NMXnTX zy}`o~fGpVJ_?G%Vf>6hefmt+8JYV##!$yZC1+nTmJ9pZaL38wPPBemPRG#|?6Fm}KBhOBzvH|ItTK zI6-g<`p?MV*@lATE3iR{YILUDw1w%zG=mDwFf^f-(0 zmzndm7+7bTn6T@KYUn|e517PQlE40$nm)cet?%-*vx|uuIY=;_-+sv-6tlHG7K^(g zkH2GHo?;1;O-o}Q&~}u?2x;FOrHf3Ou2?SnIeUE-4f~As?}my-+Do*~)8zxJo#YH3|l%(W;LePD!IR5Z}x6@l3T}NL&1NGENf(&mWqyAIs}a04^-dMAtfRNpX`9PZNNU zDy8=w@Sx`^-jk^F21$Vc5ND0-eU9Rnck;nY%S&RT{iCYt=39b#NR{l$Nzv*l<_G2> z_7Caf&7EMUh8+&(T4r zn-c~5^5t1b6z2m}5RDVq&+w)_{{EJQEYxQ}-8QMNDEF0PIilWYitE5mAMPZKj( zLs^v|?za>=llfKdR8a|b+ztdCeV2@wcm;GWSe}?=q ziqyLQu07mDP!`fuzuz`=cHEKLFHob4j=;WKZH<7M9O&1)?yhchI3=EQ+IdYQMnHq~ z`!a>%vnt{lRsZpl)M18it{`WQdoNSO$|nxvB!UN~H^W6qq}RO-9Qk?qqp|HsF$z_r zG!Hkc(hTHemkPIi2ZDp)?{v-SXWiW1@}YAU`jJF+)~5NhZ$L-Jr~;dezl;$Ww;g8f zQRaY4MAAn|5*|+G}xod-Dnn z7#Ml7UI`JwRaI`k*a(`xDn(O|N{Ri-ZW0@*ANx~HUO08XdM*|nJuf%kh&46jYezh9 zDU1nPslPAn$9$L1j2}D(hCbK&iyh1Ur)j*EmGBCJ7|uw)3tSk>SMHshjGrXwk~kRx zE~cbk1dF8(SY7v=ZhvPafsu*^^cvbVVo<2RtD6*p5tCv)w?`bZlk4>ga+M2+oxk|$ z3L7fc@2^g`tIOdt0g-q=LThEW#ciiv9ci_F{Vx2@m4nA z?g#+H6`n0M*y4Z{SQptAX&1LHldu%|`- z-fU$8QwJcdsAXhi2#W3%sw7JR660H=WcAn7Y^t0i77%ly#d6@SBlVM|G>cNl)&0R% zW8ZbRb6Decpg%RFHTrz6T1Z$}JS-%n@BO8x1L_9u6Y!Ed!0MH{2-+MP+D5sX<0Pd24U)+8T^u%BHPRkPRIM`Yq>y zWX6a#VIEPL#%K=Vk~$`?!UVq^m}tPJ`_s1;!=HqZO>A&*aFiosWZwJgWb;R1OL#aI zT`BM?TAl*cYygd^hpCYh@c4NGch;`KjQg1sATrbL1RTPw-pqjz?NjNC{=IU;1}=9% zq{A2+A5UZs*I2;Km-dZ?<Lyj~QqJ`aEUDlg+e#KTZ1L?wvc(Bjns|_wLF6RE{(aqm~T!Xr1>lih;GfBjq zi`RZB4WJX{A6sLY6X2PgCF0hECCd}a-@YscLS-&!?YGUhzBT{8aR*wb$JGf`_&yVW z-SzbKt44eQ!>w}t29SR$-HMJ>a<^N!zN;ZMegBRcby+hp0r$?Uu%ad;8czhu(7b~d z7vhKVa?-!&*j!8>IK8!gD`?)X`ru@h_&mnA zG;h#+pvU^~3K}l0 z*iS9;`R64#Y=LEn`?NOZO|P8@B6msp0^v2Qd%KSpnZ)}A-(b(&`%1fRRWA3TSeWMS zzo@{8Ja4bYn@e=P)YRzwV(VZths`0Ql5-=X5H%J09rcQUweA)o;+k&1Cwl03_64_% z?)!behqShCz}rUf_3Kwmo2ehgQSWL05E{ci3ze0Po`sT*QV2THhrW>I!Y}tThjail z5#{PSdBk~}%%k}C2Hd2+VDT=`&Jx;WrI&?KH)p#rE*>89H2~GsyJc#w22pG!Syga7 zzB9A+M@J3>{+>XBVD|S_X|2(MaDMzDMT0w0sB=BMZdl86YCcNR71YgPf*?`pdKV@{~QjjLmx@A;w2{yVg?Wgp-Bi=|~6WMn?;C6{N>0a>~kDHUXdX zsn5%e9-cTisj23&GB#xasQ|Ga8vHzo0x(CtXK*lFeVoWu(a}@fv=a@2hL!~aS+B>g zug`5y&d)6}gBQB50j7jjSXh{WAU`(>_o_iX?#iW`+Zf@1BXjln*GnN6SgW{1FQ zO+%Mj_M))Dk#RfXp!f<$9G7cD0|&m7La&UbE@@%u6~%bi7mEzrCO|EOs}@cHZ$E5Z83LL-UF#h!+&KJD3hN9T4}u%;OdsDs#*@? zTD=>AEgsf^rNOa2>3q2}8PLz1sb>djIwGH{X-{%*1V+DE4(sXAhl$HYWRZ0QR$dhR zU)#T*!86wze32<2smxffskiudtn}5A{W}|_h_b4-I1PE%-YL79V_*JR{@UEpAyxBb zo_Xh4pE(yfh0GR+8Jn7sQ5AQjbL2=6ntqK13+Qpo{SQWKA?amqS64m-CO{T+rw~!F z6=~6R_RC&u8t?Q&Da!E|3^!;DbE7!6$Z63j&w?;)>T>&W>|)zb{0A$vKIzj^nUauy z{@6fraYrwLxv|ht`inr%+KC|glC!_W6WVrm2OfNUd_JN+_suGj+dcDMLH}g#-b871 zhJ?suZtcI+CPdc&dOmyx_4Po9r;(8nvrhSfWdX)?B|}VBoFlwg6AICYdHU&rNP zxaDaZawrrOR%;>gVdiTs)*S)rPz+@!gVwd!@;4|2!&wYi3D((DZwY$cG3;?iGBl?rJ*3z2V(l))fAkm0xk?qWdI+(g zADg;T5s%V!$(?I`OSPNGr0mT1A#dW0r?JwW!%lcjd3^duK4h~mdb$dF#?GF>z-n+W zyEwWu;rREnzOWthY*K1<6jIOsI?+?yupE)3=wJ5Ghc99?x#5Y3&YS_ActLRRHM4to zxU>x712C-*TW-XdPt??gC&1(OMOI!OhhqlQn?yeA3{(y^X9!>0BP?9By=mdC?ss%5 zG69*NJ~JPlyv((d^>CO}B6b$?-sQTeo*sh^z(R;~%as$oq>dy6Ud+vL5sq%WPt1JE^8D6gFTB(BX*w4Ygcj}XX5c; zb+t%x#BR63>B>i={5@>F`|Tm_#o8JkCLz9rgpc#8&T^5obY6F+lBxLAYCiuKF3QUi z`G-e1kYYf3CK41EHbCyHnW15QsvEJKgMO~btVYbCt!Bt*w*Gb0KbK>S4fsf+#HcyrNdftdQw(&SYxSFn3X!Lo!S&K`HV-@66 z7Z8VL8E{-VGn8)?hjxmqxd z!zN;NXfKDWsrBDB?f4ISzgtTEmKdl7;rBFJ#bV=gn*ryLD>RfO9;=tBk?)LKMZi^! zW|c?KGfU(@!5wN3C1HAbDRLm3VU+8gJdaAg+-8C;=`Tei_djh@Azp`sQHJiysOY?dIBbz8wF`~lY6DpxzP>5DDs%7xY+OYLKTXN?XO~b35J>EGXd+f;~!z=D#Ca&ujf`a2I z#ju`AEf)P|SC;;rWuAQ;IizSLe|)VY7wrGvt(v;RNSc(kEngN+3c2KL5^yg2iFw{C ztS7Kqa2D#KpsRoNw&;(`E;$h9GQxHCV4?p`@h8k|Ni}M{WQvbQ+;vZ8LLwaGo3@f1Dt<=YP* zZtp=I8c?*|8))D2>B8Yk(yhcsk48`)k@G}r+42VyaS>~s{rZA{8wUKIHmseQQg#+ms+tG{)gPfeo1K@X9Pc@$NAP^cn)+_B5d-p=1`ZVY)hoI2En0)#j*Z-?&C~A>` za9vl;?7q>7-7iQ<+}Gv8!otMMzq8z4F7P$!20SzzoI)YjJ$jIR?i1UGHDs?(_0k}x zrS+{AIK(Sa`SoLCUz=7tsgYw*zRS)wx}_*S3&Wi9liOygQtyXL8>LT54BFE(Kt}^N z%*V#rqyw%!{GZXRO5;QG>8stXN>hX>URHQJqNB$}Co3`p1h5h!O+)b&vB^}G)hrv= z$p1{c$g7>~)C(}+%-+lPjt#3Q=cXF)2Qs#V{yK0vv#P`M>_PSGi)RrxuKS=D4%jHC z%Lpz>`2F|4AV@6&STfz&+1brKt+#eK(?w`#M<&{`Na1Q4*r*cfv97hIFU}4sEUe6O zDdd>4i2il_-dXbVvYx{~KVie6RMM0n*f6r-Wnn2T1z!2G?Slg&?WOq5Ya(nvz--Ak z1c3oJb8~vBcU2Jn)a60EC^b|8!T4dXRedjM)FB+Q%ijf#>wVhd_w=!(6ZS=ozd4A! z*Bdv^O?*V{8v3gfe!rmGeg?pxVPRlAz|Q)5Zba*u=PN#B5G#L06mMZ_$|WKs0LE5;L= zOF>fX9G{he+v1-aINIj*PaCq|Mtg+`{^tKgy0)jt+ku#ug6AE9{o>d!%8jSurYjP7 zw-iS;&JY)N@IND2#~kyBg{WFj_6l{h9bE$hlb_YqrTu0a82$#u3k!e*v;k62x2GVg zWm5mE243gOag7DO-)Aqr4kl`1qE-b>3|~0kPj78!4zkSh##{34I2wt6= zOy|sdxclgEu{adVO16)TI1~j0{0RZ=?Oszu!|~6A9ADk9&LA-p-{7$NJ5pxj`u^_5 z2wG1_0#^0KrR^kF7rSc-w)<6jjRZwtpY;1b5(5UW*-MQO= z>p2^%k|M#C2_GjHejX2$PVlNyfVWtUr9xV5!^F1yoiXZ%4S5NcH33Vih(ZKh%O1~sK zTf_F^@2mwdQ}wm+oBJ{20n7ofEihR1f;1V;>gp;icqc7P#+j4iiNBohLyX4G!FpdJ zF*0L%6GBU8+|VRjl`#bq1jOlWk@bWu3S-CtZ^vz6Kl z>9h8dmuDBhvoUV-tJ7NqdNh|Nd9&u)I@9$kN=WM`ek0oGW+=LQ5J9ZS+~xHj=J4o+ zsNb|%1=-o74NCFM(q?961|Y^BNi&c z0?G74+|VvRB{O>d?{w;49~IrND^8;&!R^7u*mV|fR2VJjp$I}UGMiL>E8S-@S}^%z zrGobb`5I^6ZqBw27vpL=zr)L_EIc)rWjXA(2`*K5KP4KlJ@xYda?6XPgoG(D%d<`s zMA522c(g|7)1eCZr%D*cQc}nOB^w8l`dr8;D3icq)geP0y$mLvY9yZdjgb#wt)s=^ zf>lbu{b|T5kA#-CGkIW{UgcrNQ(H?O%+1Am&Q`{QY4^be?7nE-59Ip#PAG8#bl)5d zA65rnD52q~DTW0_S=rdin->(`rytYwE-rOHG%mdAC(=MgjbfYWotyJ`J6mN@3V3ZZ z6Xiw*3*bzu1vrgiAlo+6-!9=81Cf*hb6a+RGWqE2EX#|E!VL#nPuuD0keh;c zNb!!VtV52zJ5g>vF6O42ZM|tIFLdTFR)b#y{`k$+>O$9UMp1o@nYhxIe5*5Cfuw;)F-d zo$;#}MG$0d8MwTWQA>Gou{A-kjHWo4e^3RaPt?3~kV+kSjBo^qL`^NwpMgzt4NODN z^Zm{Mg%$&B84&+VA5KBe&=x;m|>4oIo+#$6|>N;TE2-6;*o94gW^XA zD9Ef$!21;5)$fZ14m6R1&XXNK= zNV0{hD1@t2(VMgO;=d>k?z+YgmQjmgVrSvv2~vHdzyq6}xGX9!&xp&Otxf7^!O;1x<4|3z)f5Mcz#CB%-TOlufQ3yGSC?ao59W`zLxJd9MI^I zlaiv3j0aVqkGTYxf|-eli5bMdWx7g^j38qtwO2mmg2?%y?GI0$Id~rUzWx0T3H&I^ Ms>+m0z5V+C0GBe^(EtDd literal 0 HcmV?d00001 diff --git a/doc/fluid/design/concurrent/images/channel_send.png b/doc/fluid/design/concurrent/images/channel_send.png new file mode 100644 index 0000000000000000000000000000000000000000..006ebb4a5a4bcd32c97847e9fb7729a740255f7c GIT binary patch literal 85643 zcmY&<1yojB7cC%2DJdY`ozh)Wf`mx7Al=>FASK;MNJ)2hmy~pOch}qazx(cej?XcU zADkU4=bCF7EGHxW8UY6Z0s`Xo2MG~*2neV_@K+S>CHSV#;s^Kv5yS@(Aq6L`{Zv>d zg|4Y?SqF~fMZ_PZ0gJA%sF08th{sqkLUJ($=b0p;@c5|Ebga}+2q8gK5*Wg;JNWNn z{Osh}&7RV1_0Rkz`Y6;boR7}@AY6|I6PE`0&JOFghMgiiBR;e=eI76=k2@ubKq*)A z4gS@JQ)T$BN)rD|nLKoqcu-)h&!CcR#-#4a*#(ls?}ednr*iq_lLWi33Sk#cO63az z`;*M76QVyFteKNU*wLzv_jZsjbePn?d%Pj1BnyY%^o4-)`sefJyO%g|iS2+sy2p;X zrv9hNsk|n&0)>K~FQc1zikjNK;Gs%moMb0cd0j3@I>YGqG4!ME^Ds+~?; zN5**;HuW0)iEc9^%AyYZcAxxYKK67s=M;b5ozmc98$PO2Z(6N3#6_)I6Kc$+g;k_v z`>RcNUXT226n=!B{{Ry%?+vsd#6O?b9B2{R1j?sVC5+IntmgP!S=!TMBNNsg+H0rV zopz{@!j5~XYyWF8`5^6gr^i+v=)aT@bMz?@1gK|f%0HK*edG$D6OgUC%-|eU|A1R^onZ=Dh3IP z2;!g5G6IwZ)?l=UM?+G&>@ic*yId(R<)EJ$B5X^{Ut%S)F6~6`@YMqQ%(81^&g|%S zzA;FpTTGa^HBVuW8F-%0b8O+Mp(4D1M1lC{V}%V_1Gg#bxu?wgo5js8oO(-9HrCK; z=D0vBr-LS{guPY&wWEV5w>w2wjz)%QpW$nR!goXJYj>Ike-82q?H)I)EB3^0{yicD zR5il1l;fRKVN)Qu;&%CpVwHY7O`(rFDc%%LR9sB4)M{_VhT52FmReDr5z}Jo z#fmLreCUKm0?a>G1$T}N^pb+M9W3#^Qz_aU=?_r8ph87EAjDRYEK7|{uuir3soDQX zdE=v+Bj3bP-O;EY;_n^suN`_p!*MDyc~;v`AifMe8L)anBd+SNK0UoWkA*;+N=VS_ z{fTbvdQLraLakPHn|hU=coJ%AB1Zq;yIi7Jh3Gu$UitkL%X(BV>TWPAVy0)4lxQf0 zby>6>uR!1FybzFf(HmxCBz#JHk>u%=QL4Yf{jYo>AbZfQ@^sQl^R=ByEw5#jLhG!D znCT}!3F+fomVK1^DA6KDS#tEQP$9{^KBW??hrfnN`lqyL#lOZYC=ZQCk&yd%`Dln) zQRjJ3PG6;xwXO zsZQh}BjnRXWcGf>Cp+TDphMmE(TUSU_>Fq|yT$zs-A}Lnbzv{BROk@0gxn|gO8>>Y zxtZ$Qy~_~|Not-Ds|Fo+~<3gGC!FY zGzH|9uib4lqt&Ep#bznx>%A0s==(~k?7u4X|6CLT`V^HaMknp8XIs1dUaGQCxj5|n zvLtRBh1149nOf1&b>O$*&t{@gidM6Kw~8c#Y=Akdq(9QmmoNWPM@ed0KsKWbkL%8L z;$pLgDPLm_VF&)vdcUlgq8(m0O#-HLxGh_Yd3=Q_iDaQ@;u zI&KDsyWsrcXUc3q_5X`%US4nE-$*c3s9aXQOfgn(GW<{_N!sl|)lU&UBcU!^(il}V zl+XO*!iP?jjUWu>9}k0o#6qWv)JZFMmQ!oM;q!5?d}Lrpp5*%`Sk3QSovxb7T)lR0 zB?tBI@1s}|c=sS|CbmRy$`_gqaFhECkQ&Z7Kgi{MxBRJDol;+Rw0&I7f`eHAp#l9L ze>qlxig+`K=pcQ~KG@Q^Hz~#Dui`-`d6iSgdBSNsgetHNYSkT6><7J;H|?)8~Q^hY>l zDIQ}Ri;w|7s_uYNY1?*-=sAalEWW3j(u`qvX%A)#$x&i2hodqkjP z(&TIA$0HU7J0UMJ;V98KUCbNU=xM(Ux{h>ybkt>|!udxI5YTUgh?MbU7nsR%>|-r{ zJTA6{7}ciWCMHIUHHU*&2u z+))oD9)C9cq5l|lAR5>wTly$LyI4SOW5v7c9iHS=n13QY#qG^C$y#74=E`dsj>{JX zFaI&@NPfr&S@tXXLPc1OZG}7fUj;-DkBucTEqG6AmaIGJ_s2J)9QuD)Sz-UZzlK<= z-s#B0tUqkrW0n3!{KRnH#cEw>OxC~;wF?ujtZR~3=gm%R1O)g@d3{NYd^~Y&yemh% zoSdH+i8ES&oN<<5#{Rda6Dly0cxG-db-3ki6q@Fo(@Cr2P1H z-k`rzeEQ}U>ch%6hm4Q^ajqAT5zcc0w^biSa*#%&s#3?&s0lme?uVRi-F7rik5Q)3 zM%Dtua{Ji|ew$i4I7-eTs&D_%t{!!iQW{?+#Eh0yyly;|g74W8+wIbZ<>RXxe_@6# zC4wF#AudAfUxhLCpDWG7I}OfqpDG-b7Zr%Z#L~*EJ3f^q^SgNR=8)z7u3+SqdLfRK zaf;E8j<|^~E-M#LplP%?;cRzxsF}p-nrVeZm{NSQFveDCw!*wJTg=xWpYH9R_pxp) zCMDscw#Oxg{o$pL)v&CoCt5M?-OgKt2)Nwn9+|P`)VsV-q1~)^(KB}UzpZhYbRn9~ zJOsv!ScPv=(3mY}-DXebkX$yZ;`0q#(iDE!z^t&iLZ5}z;#TVX5uUAeAicS{8(p7? z&U-*SuuVLetxo>w@%DW_weh)H`qXxq2Ol*n?|GT|GE}ifqhz}D%{Qb&J_%p3*Cet1 zzL!T_f@g8=T4Zf_>5m+#jgMRTU(9o-2FL5@Vg#D6EYVS$q(o>#CvIxr{%ciVKep=a z#u0xgDzrmn><=KHt1-(Zwrm!6hrRq!AGQbI9u<-9@E}qlkodk3vm~?hmVGB2C7YA^qUsh_)rV)>P;`0;;Qfxore z&z(5wLSl7av74gdt(fKJRG3o=NA$@jm1%-V4l=ctX+~H60)tdPSw&wY^ zHN%xICp14E(;Y3dDQWpsb=u5Wadtcpp2TwoVt+D}IQ|%?9R7X3b7nluVVk#LYi``( zh`XKjV{K>Q%h6agMtXKWp$xgU0O4$@-V5@1NvE&4#o7%r_(^uzd1#+l&wa0q`>>?8 z+h;=Cv3#*Ze`0-LGF*_(#W$Ry2s_{`Awv!8C5bfQ4}^bR}1lJ)h!k_!QnyZjS1 zH9NJhSY1k9mau=~CQD4hYu?5xw5-`=r-%|v_i@UjAzVf7sr0}V0UnX~L6gtkVlpwn zomB%RnJ%wzBJU$Uwly=rtjtX#;)kZ{m!Pas%czN&T%hO!CWarRo>!v1G*$)IA?x~o~56;dzGle zTepOh0bSa3KcttQ=-j-Q8DDXyhP&`i`G&;Pd|HQNnJeGuET<)%9pM!bDM3MvOQ78s z?6D_WCV=7z-Zs8c2vhqXjxmGg{b_-1*I*y3Op1v6Ww?JZlVQZ}2RsX<+!|(>ZJ$kn ze?&<{(;arZ*VOfHD+uO`rK&+J!srKjz@gkmpm6*67OH!``4RaNI&S(G%fDf3)x+{KAAJ)d~4rgp&ZI|3ELYT zyO1(k?G1+M|6+ja+H(Ih@^m!MnbhRTjYo(xo=%hbYCx#s&(L;$by05y2VB&lsBQC` z%*A_8x&*?BfG}4N5)bNf{bbCE2G1i62h;eH9KN|Hx^kUIj`l_hYqOr4##g<)uBZmI zsKo8B2qB=xh+3}-YqcIDUvj9xKWAhl^5u87d&heA{nf>vr|}Hb&+QK^P4iq1cAwrI zegYSdU^w5Tb%!RtwYiGVGgwH&2?42ImT5$T)mO{&&|XsJ z;*T`8&sp&Zpfu0&fzzr*m7H{qth}D{R0;PpDvnal0=g#tx z!|7V^Jf*gvI5`{lO^w8xOqPfe{~&JPLN@8BFEY(vDm?m)d(=pr9XTDoq3t0Rc!^I_ zuCs}^*;0JK&A&RMYJX-oSj*dv%{7SBlq&ZYjxz-Ux}qa0Q}8*$m!^C57e5V+^ zRxx_7B{Y~%cwfGHrkO@s(Rcrw{r-MSEQ!yV9mM#}5Iq^%o8yNFDn9zAUY|tezH7>j z`G}*c#M9Fk0RhsSdah4VX0>?rS0qd2LrehP1>I(b4woF>PT?o2R*}^|(X|f{g!`e=z&IXo3p#}Zy;PwqenIG(1d~_B9eqc8O+?rd<)9S3|1(D zbPJDM^E~YnZ8fDs`E9IrPHnEuj<;$pF21c1Fh^Llo1eqD+5F;fS1srJ-H9V%Y_Gx> z@~0Y_cKnM+5OM9?_%F`p_nf%u84c(%mVfIcJst4w{;pkfziFYP;YDn8(YM{*{A+*nxfPYQFt5 zzAaBCGD9-ERvAx!@}NDz)&ppwgoQ;IroYux5HJ7Ce9bQ<46EYLsWomP2W>GNC9VN*c? zBT8YJ7OGDLenh=D`uv*ghD<1;Vxo_^VZrdhz-4XbzC=v2<8t-PRtHq_O$ltzOZT+Y zn=)f(3b=WU4rRvq?}5NLFAU)`LqGkA`Wbi6%L+Q(xt#uV{K)u+Fhj{X$aK#{*o2*Y{vI|&IhXB9D3R2Va0ZDE zbV)$HMD$QTI%&=J*M!npJXP=~VKH$RrI$aT<;&5wf5+0#iB+agA|(JMP9_E6qgt*B z+X0TWF0#A?cgAShf`TpQV1Fjbc-xn6Eom{3j^!~yu75v%k+s!wqu7V{+W{q#6YLf2 zNHhowi4qCxJr8q|Yk~cJzFxAXwB1671R3wtNK{OQ=76h(-`Fm&&gm;r1cNAT%=2b- z?h$dI1_}xiEY?YDFyBx3?X~)OH_jEOK{7bc7eTLowB@Pvr~U%Lh1Og`qJJI16cA8Q z0EjV0^-uxenJTf>ef4^L=*6p4ldFq!1s(%?1ae<{<+;~C8!9(CnLBD_&PIa|ccCW` z0qX)>nlp|H{yAAANH6oZ@fyx;k4X9PbCtUE{VWiU4?ax1I?Kq>0+X``&IZn_) z_2nytgci(@0EQRphm7@H>UrtG-BW5i?^+%W<$o&Q@v=dkEcGbSd)x3UrZ70y&an7M zoAu}TR)?k%H4UhCttfzC4aY=hp7|CzK`3OmHp~7J-|y-9mud;=$~ustZN`Tbk@g&^ z4k?<^tkr97zrL*;4hudIEpr0z4%Z9*{#;*W{Dvl=N}#;|!=Ye$5`3g_r74n#81|(S26dvx2H(oMKW5xckuy9z*M&L|*U^p#Ab=KxMcF`R3{Bx>rWq^v%u)R4uSi=-~8RuRo zwquay5E~M$DYmRIpZ;?$aSE$>%Ag-5UNIyD98yCRcz|N|C*)@j;Pq+M%_&9o{E|=NI1ux9Y*L#PjQ) zwN6w|y82zaz`V(4*4XESQguA~9nNCg@JUv%LUEBVpFnOwOOKRU!Cn%-lr_F^AUk3ISGkgQhGlfhtIH?;4YA&Xn>qD zkWaUKT7Dlo$y;oGLBJnEw$(0X$=5{E^Lc~K`S*CVu7(PI!Lw;Yd8ee{(T1)@Q_9NU zoNnxN2BWVZ9WhSW?TnF|EH-kb3V0Iu%)sw|BKxb@IKdw+AdOKW9w~ zh?Yv*XTA0ago!lcMdn7_#2=eI!UC!iZQ(M&4YFm4N^~~=barHVlf<+n=DW~3T3*x zzJ}YuhKGkg--y?iH!-2PIG9tuzS%7-ktkBan(+*Ufc^NFWB!0*mC%uTyO&{%S-8=p zrmto^AgW-iQujS$TaffghTEJEwPr3)$$6MyeC4bRi4I2;xM`4=moyVHK6i}E#jfbZ z-c)Wz2GMMp9;{}ALs$}B)k&k&q0Y zFHpejh2c0Are&_n97|g)uWQzY%uVBsH#B*#;r(^!Tv*n|OUAUVT@PJUb{xkSCepHa z82jj4@~J>r6EuT1^KowVf@HN?77Pgs!w@;Wz1Yha@_p&!Wzg1Wy(X6QPA?#G?bqo2 zod;X!*7mj$x7#&S`kveqJ-bZlUsTm80?f~GmhnYFUR85p6Q4WnB*R&ZY>;+#!Ymz) zx_pJ@xc1Y2|GNa+;pC4q3*wF{PC^PV+us6P3qOsG31ejo5VLg0}qTkZUCK^&O(5G!?K7if4k+_9rTmE{zZ(qX z63xqe2*QIFOP@8Ou^pD3l3LBgeASw0y05!D)gAYm%g=Dh6ABq^u&=K#MOP>|I9Q=T zf%*$f*uh*4dak)ft(7-G1Taj2Ncd={Nm1X4pR9c_eSnfey*|TK6y>K|96lw%=5NL$ z73MKu%M+KX$YEQ5viiYRb9DcnW4!olXO0$YR2S>9?-pi}^_Po01mNF8&;(=&Gi2dp zE$3rWZ?LhcMG`*C%ljFfW(fHj&DUCY(i-qS-JkiCZW>jJWmyM5Ij}G&AV0^EOrmA^ zz`MRpM^_hPx4`7V?tXpJ0;f3mqF?gD^vu-pyw>Adx z-0^fL1@HYIO-9+J3so-TaRHad0jc7emu$}(it=jmx%p!5rGu$=)J_2`s}9YoL1!4z zVeF@4(zTI0Y`)^`twHDykvwf{d(fcz5*wzoBr%r9CpwNyQ4uv}7aWt8DnYG!0{Uzx zm|>Ex<6qz5)tL@Iy1C_+Acb_RynQwvoM)`PgYf(e0R=-rMAPP`z3x2DgF&Zc>t57- zM;$Ac5SY0y>F46X5TbPqgWjnNFlYc61B4Oqke5=B}!~PN?6IX+ahfPtLuUdDkyOBDp3eYkO@0kIp22k=;*J@zX=*hZ3q zXRb?{SyU$IW_Up+ z+KQ!76R>dUFCsCNH1ABy~QB{S$4vV=3SL{v@8ISD^Lt?N4e ze`W!ISp_laz52V71U%ITrmZmM3LP;!)ofoh1E95aG(pLiFJJssrkB?kbOn7suXj*n z1Dy{f+_OLlJ^(qtfc~GH1&w`XqJx?#GCf4TqCk2k{Kat*p#J}M0{QH)B%nZU`?#{Q z(i0sWy%*%g_cw2Q_W;Ft;1RnqutQY^=aHfMqkLj|y1u=|JUcxNkt>`;y?-*O;OhO# z41@FPWy@HO6s1Os2cKl&q=6}G(yJ;p_a2j~pDJ{5Z<@5yv&VNV&_8jb<}BQ~`v%Db z6^)jPKSIY)B!KW6%ggI$KwEThN;qUh4A%=JU2%V_AVXguBy#Zr3M$Ff#f4o;N~&=B zmudxcnAjRFCgx8BWaO)FK|#kz@bEvDmvzb?hM%s^&+S&zD=RDSJmwx5si}7?Y;4M@ z$;rC_b84%rr$_o;SeOAPtlK>?G0`=6n+i4cJGx4xs2flM4osOD4^dJV$J;NcK#QKw zT9ga&7gWUTX=`hvpER)_KR-VY(eIDxvlz<`u2-!v@Bo;E^6^u?f=tbn8T)t?skmSI z^t8&tqj zRp%`Fj#anlD=18~f$cGCJHPzw~1kcbx zdE($Z_v-4Z5rB%7c02OEGPzQrK|$AtB`uFhVj}Pi?;v6&(!5e&Y`r^~MJm>Nm~bGV zY$zbw=(ZN^ZRZ-Bn(qAk{9du)gkc;8e~nP9GO3%P*;0O*()JwfkEJb=ilcjXeSNLb z2(Cg`G1EAI!_h6d7CY z%eD9Z6fNil!I>-oNNGM@O6&?CxMWi z&?H^DR0h3Hkq}B2N1TMuQ!9r9=Oq{g;v*I|F@n)dx&C-eObkX;RFwYaU}DxM;aSly zt`FBIYpizLa-VH%POI|r@;It3mzrCirwW`=BcpX3f`9?VO+kp83JR_Q10rAawqO^? zR4P&v0QOB$VK|~xWx2FCv?-B$3y%4_^h*bwT9Lz1Uo@pQNv6V%3bItHkRXH4%nxwA z8SMA3^#M|N;{yS0OPsNSmKhWjRQJ+%WdW2zCaxE|Fl7E@r9g)=lT0qo&O=f%G6Y(^ zx>kv=;Ch9actnA?2XySR;E9Obfmw_ly;rVjr6nUxm#O*k2DX1dbc7R2atPa=!g{_CQobu)t>~=UK}lq?lm;a-e3~ z!WP8ISsKNH`>{snvrBFFi`buXFv}zj@aG3}Q+CJ8ZS;o2Db8(&q1N7H@DiVx8lHt9 zW#iUigc8XOY%c#QBl;jBB&43%ckh(JxfD^Y)@21v zd}fNCH-F@SR)d2g0YHMT90@gws|_cN!ue>iX;&-^C%BhRKe60$iQim_miY&a`_kg-nG2o(2TqJS0gGf2~HHP3}m#fcDtHfK)aGO%GkFjJRTf zLLD|1ma=~kbL_Z{ew6}^&kRG^pzqc3N{2#≧~Mf{j{)ixb>+cKdH*JUJ$i#>as) zcwQITAtJ%SNeDO|YRL6{C(oIKOH_S)u8)V3*gmr`Gq-;Q4cjdONa`TIhEC?1y}?8l zg(esKRdI6ZWOjQK!}qK_7>o$8kU8jHI!d>2y;F~;8v}%ie69|u0Ctlas(#he1Epmg zFc^X+U}*23nGZZc!F`_@i?YFwg9a=Mjn2^_Sd21hX=zBU=i4MiNvYv)F+d)Eh8a~` zguf3BhLVbm7V0ldLAf2Eg1J-smdD|j{#3E16cQfiakYfBpQ=9@kwKdVRoNik_vmQj z?r+!=EuNk<%EfpG3k~t$pQIuwxU5Ekp_ufU_4a$1XfwWyzD5pIGH(D{IA#QD{RZtV zcraU~4K%CPtUrLs78#T*n9Oee4I3NV5|7K~YgjG4zx^c&N}&!zrOIY^x`1aI`msOn zW?mi@jb=S-7y)0>($^gQfw)mn!EOT~oUcdA&UWmUZ3(jqV4}(ZoKgCm{oEVZT^znz zSshOpwy8vYSxQ@==}!ztVq2k4@6(P z)lPVq)nLkA-Ll_7sHEEmHA|QxRawYE2IAUdUY#EU>yU|v zDBAW>%2KqMrb@Ka#{groP;XzU+l4?Jkr=_}dMO9QpTbTB6dZsFUO-g#}5!~M`LM~^8g`nK4X~v>FRJH zbrBgzMf@9rpx`i&%Eo*{gmH32M8wJE!CaD;7X-}bNQZGQ3FUe_h5o z-hqM85DePQQ>{QF2QC{0I4B!H{L%$#(TU);x3|k96Y#!W-QJKz{DQzhP2!D+#UM8j zNBnl)_)XzdZlWi%$&S}&+pBZQe z#)Y)ci)DcHTn`MhSyCi<-T<100S(d-&OkIoU!75i70bABRTo7L2Y6P_1t;o}T!Sr$ z0&-$#Lrp1hvUGwpp*83eFz7)RJ-|tp;h9PF{k^uxL6VCd z;S(zcN(o~=84;7J*_7z$tkd&z1)aT(nXfYNuMrTKvBSjd3Pc*FV|`{4xkYWRQH>q6 z66YxJ%K!&lI&&E{F-ffW` z_B@j`AZ}3e&OctUJX1A_RNvryRGqUj4p>Hb&*rMl=M;u5ZoOZ?`o%9hD^lm+8;+#q zG&ml8rybWy^SCu00AX?tZ4@czB<{VdCUF8jNgOu2CA-bzy)z)d-U^hXVLT}oDvica zf8Jj2i*}fKIl6yqaevZ7Sj5E{RiB?ZIa&JKXXtK30T zKb<&j*b1H5%Tfb2qu)A`zAs;Qdz^G($(VD&raKu=%JSd%@n0{$$o)ee&<+el0$4;= zB8%bok`hMy?cd&yk9U^3EBiEjqadCFrYzO!biFTFR|g7rpLE%*ZJ()ZaAC!JR%5<6l%X@Qk1K;onij{Z<79PW4JhcXggA!O|g8g+1g1jXU zsNpw*{+I*F9F{-%-EUEj#->2fOX9T7EQN$eH1O~{znrri4vPt80W*$oBkiB-`b~@Hs;UE-69dzRpsd(_MDOEmhT|bd) zbh6%^t0_S-ZU)&YmPRo|MdB=hDS4>UYDJj%w!J6&L-aN{)*E@wUrMw^suclF!fV#u zda>yY$DBSo3}z+E^?@Mt#V!5b78Swc-wgJ@i_Z$HYv^f!s3r_)e}L; zIKgATtI*bSvIyg5CAlt6=q`fGVezJaV+8oeqa3cYc`E1D%u zH-Gf_A~Q3yADkl~GX1#N`1mvvT%FT4@R7ELQ>82$PXbV243Y^q|x?&oUef|BJH)mUj92AI!N6mLK zR9SgG0Fc{X`I=kVq@Tq9Txf7aRL^D9Zq|59yWi=|(F=-B#Nip;GPnlO*-lm-$+W@i zP~}8seLrhsT*Kz(=HdEW4Olp~;9!6v5?ng0tI1HK4#?r%308P%Sy|8jfEd%RdOc%X zioEJ*#MzYrV3QA#e%Sz2k|96`fp&-8!>#?w z({#EGSG9BU0F8R}FC9u)*6U!xkpte`v-^AC+yY~a6)grWL$(5@5Hb;!@Rmq^_B&&u zIvtNm2fE1&T8$E*e$ECyv#}e#mF6k~_s#llS_`R`&1_n(wi;xYi6&P^p^uc5l!!+S z(-r17Ia2YQ2aMG8^z>tC&0Te?M9as0e7wB(PV(~d2kopQMy&D`219<3{akG+v#oX6 zbapoetMtM*bIePPbSBe63WZA2ZyVi6&uRo8yL;+}-JYZ9`R!f_qWaJJ3Hh z#X$@XS-^B}CEW&RwRGAheC>~=g>svQ>`=QO!zJIQM|AYHRWF#5Kn7m-$Kb<|o_D-^ zz587?O7LcxLPFz~k7`&2;B~&rvU}jD!S!dHZ)SpCLnhU2D(5=rz`tJXpDbU53TUV&eXqgUSsum*VKRG;kD^%35$Wh!QIcP=Qm^X<|3_!B4o zt=vX4O7URenK#t$Y94x{Nb4AvY?s3hmQ5WFX32R|!5q)5wzrZbzc8iShL;D_=mPVj!`4n<|pDJ=*33p(V z^Z^NL@QRLJ!7q_;sC4<+rTp4VcsM6Xjv%(ESVN7IeV1{AMVSP+SCUtM|9Id2x;;v- z+4>B@K%F&dVAHb54c~LPuIJwI@#WjMZ>bc_8-A$Zoh+%M$R@K(=grN{sR!BEb|#h> z3MneIeXD$x8+LYwFwDcrCESV>K{~{%bzPQDf6D82ZAJ4b(~l?w^dw|go^GP>uDE20 z+^nDjD6t)C-3a8FHO2!G3SdwToE;n<;aFScAM*GbByChNmrDFy86&rvu)2Up6w(1*$e0wfV)u@u;u)!hdhN z8xLcyuNu*bo2t5^~XEpQ~-^bUSES^Bt zff0CMYH?fZe%;pGIAy~#6VPbec;Ig_cRxI8^tk$5(-SLpIZX2#whbK^HkdO8SI8_w z;KirMU#tbtXiov!pcBwux)D6eQ0cV899&XNw%T*_NjGx@jFxAmLPof)pkzJ-HtbXYWZM!&$ditm5M(tiYns94cYjS1HX1Z{ivR2V+Ma)IR3Qt#i9M zZ6BY8g*PZpPcJ>a=QjOA*M_Iw%qU>Jw#|xYTpYMX>tX2oKp^gQ*6e4A9{n1JHDfE? zQ>&MJ69PT=E35IC$n{FD+rIWqmuZ>+mqfQMwE+3%1neAMdJQtcR{TvLQGEQBTD5=H zSOnVO{dq#xmb0(1hNVZF8RFoHp+CmpJ*(~S456M9%?72Z;I#oo(Pv@0+SEv~ZbiJc zEtwNx-O{j@gnqgc@<}8eOQ~-|6`bQ9x(|yXY&%}w?Ag3&L7gQ{P>noW!wT1#gJxOE zxHpgU$Bb80X?s<+UvPJ~iLU)7=dn>X%G@X6tX_5A_V#7ywX3aP6ib_~b&GhTP0Iv2Ue*26u&#mDQkBh~52Rw>P_*7Izkv@Y0K;ncDk+wB zn3MNGfh|I3L%mC0s_Qm(zqQ&1IpT}>J+iw%@J{9^iEDM9pc_*av|jxDMf=uQ#o?0g zVd<2RGQph72~U4>`$Y@iaFjlQKave&IzP5Bbx8F14jg-g= zJXA)Sva9Z4&*TU5^R>bvUSNxrbv+E>z5cgUjwb`kf$)SLkZ%~} z@lfRpCyVQI=U*UK)h9iN2IUu+U))D3O(spwiZvVdXL05*1y>#4ab6`L%RnyUEjD_D zY<(U!-DHJ9Vg*D69E;~k4If)Y>d2I>30$aUnv`>R6+Ydzs+qv{@kHSHk!^Rh2odbpyzu%=B>~ zvkR*bID!04!FXv|E7mkFtI8RN<0Vxc7G;L(Ti9+q+jUFpHVj!HIW+j}xuk7>D<)_zr2zcPT;ZXtdk3 zelbzeR6tQU_GWjTVT`EncTC5Z9qFR>COfVRaO!l~G9rE*{xmFLSuXIkrT6hrXljcw zVyb2FT&c_HxKMJL)F!G9V>@;xi&XI9ZE&SJn+o3=kFYWJC8tulq7J}hNc5=Dxt(;FOo^y< zbKj(o>PaTH?#vFyi5@LzVbPJK5D`FyB*%36F1p9XFlf5Wfq9~|GQ&RRD^=R?BrfeF z<-XD!vyhI0ry8HB;KM&fbJexAoWoz@JAQC3ts}MlQW+d<{Fp8__czJor+1c4A7GHnVFeNvkg@e>`skGnw(^q4;6i(H9gT^HeE|UsK(9yxZ z+r~?4P`qn2!@Of;?4WfxZa;E6J``=-plZ=*#Z^64uWTHj^F<2M51xNyLWk}Y2kQNP zUEj)wsaUMx{1LEfx=x>w@x~}x3mx%zovhYpDhv%J8_2IzOTY~++RQ-o%+XdUwV1Dc z=V*(j{F+}YDR9bSoelpcOm7h_CcZ0Qy{5Qy;&sW%V9Qwx*K^(>5auHyl6t<-*O^d% zvzU^Sl6U<4T6*fcm62V`%p7RTu??VEV>mcC7&C{P0xVh=We^5)2zmB)T=5FBWK;u8 zUx5M7MZTb^>~*q7K=O~~f6nzAal)6-`nupCNY_x{D2EK!8~1DkGnZe9ag;!_Wdv01 z+}qKz{HOyOnn6lbci-RXb*pe#%wNxdVk)rjn;!^J2^Q`1U5H>I%ce~L9}_a!^GPGp4cJGQ{6l&1^g9O=vAB#qnk=9*$M zIm>5tNQ)V|fc0;;oNGf&_dM;R?A#MNHK1zjgFmwB^hv~WzB)NUE#B}wSY7ZwwGcum zqhp!!+Vt{rg?ly?WP{N^yE&sqa7{)EFftL|&gfh3Dz zc^?y*I~!oE{gP^K)|cpN>lhX#T|t@LUY)w+ewA8U#D4UrrvbIBcXI-95g{3wKOTmK z@qQSwU=AsqvqG-s;PwSLZq4&?@BSW_(8T%=9uCg!%=ykZENFNrGwfZ3pjV>*QmeK4 zG7Q*J%%8cr$d1A%hYa%8%^%EsR8fZRVJ%bpZra?AFJl@_mp~_xbboIc)gTtJvPq z(3dIGSt*jw_oEjnRLK8Mn{XB`zvvJx%01WJLL9f?j-2Iz5{dDgWXpH?VA&{`n3y&I z_w@s?>-KWNRNXVsD#;Ut%8WYz_e2dB4@hqS&=vfN>53>5BW1IdBoR4$`xF5?#o&un zpH3N%UiT|*>s66MXI{aqtStYBz47t!e83{KU!++4EuKboF-P#owMuooBm3KY5^A1O zSPa8atT&BVT3Ty7pC0G25bLyLrk0xyZ`9_jK@A1^M+(J}0V)pT=AkA+g3ZuxIH>z- zz<)z3F9luRYf#q`^=puQD{hz`85zkN-*LjO<0&qj%weN`K4&F{PWC^u08FyvnFS)} zyoi&H;*0T*oT6)>h!ehr4%Omk&~EWD}Iv^$pN zp1m4RQdZ1=GXL||WMpKd{#=da)%Q;oUuR+f^NkHUSd$l^gCQ_b?)}6r2@K&IR0$%g zyO0RRbuxD7CKz0UbMB#YGpEJ|O_;17?*qsD5wUuwu9k)(0sTX^&Sopw4LZj8!08)R zYLBrQ@wBQ;6p3vHchIpIwHhr)oVAt`OZFC<8suKa4>6vcv<=KyhjfFQ0D)7&p-YV>qL zVB6D98(Vm{+yRdp%I$ba2d6RYxhZi9CKCLxZo>%$7(^nS!!~xta}l9Z&k~)R3}r@^ zRQKU(3tKuZSNXywiIJybOB1ZwSln;~vlE(jL z){GnXaYHp^nrb~BxcTn(D}-kQ4s^T+2L`QoyTCsAfL5+czc^e-#$?c-T4%9cPf#A1 zF4qq|=0Y^rGEI!HU)wk^>uE~a5Fsh}C}QkTg+lo})ZM!Fobp_F(Na*~zD;le)ESBv zBtjr3oJNyNEEbJKv+Y)RN_C6J8;!0Tig#iO=J=h?n2T3`7}m3I=IxWtE{RH_Q&Ljs z=+3&;6}I@U^@X(XeYJ^RAs~>1O9CAG*3iJRYl;`lDc$J%#gmn!XW@!SzQBUq02Dl5LVm@0z{USG+$g?J8UXsF6wIX)so^bp$(88>EGSOH;fl5i5 zX?AyQ(kek#Rx$YrQpf}}3$j&B2q;touiw4_lWlTL&t8EL0An}<2dMVVUC>|aM*0<#hgoIif#Kg&-i0ponQzG(i$Dy zg6!}RrNkbX$DLfsl&9711hvN%Y#=(kLXp~qg`i;T`ujkn6G<&ezBgOR*>nk-6iLK! zvAf8>El{j@o@1s3CBv5&VD50{xihyEj84hxGy;VePTq=}4jewv@!H&Otvl>}IK8G# z4v0q|G?{ob0f>=0qzGsuBBFej!}^4?g-Iwk71ee>7(dPg4OCxENc;)-&p$u*C34vs z2&<{7Im2R5U+97}tP>@qKA$1SL!bl1ZQkcId_Q^@#Y#j)Wt7zS)x{HZBGUk$7dxPF zWK*y0`A~JybhXIh?CLrqDJ6xwXY-;{{RdD=iUmnvaT;V9-D1O=IBzeX=aeNcw^W~* zPrT6|tA4n$GU~9{=)9Q9ZEqu*TArr|=4cxCO45Qn0BXQFQi&rD4d5z91z5xeZ4fCG z+==HfT=)a#SYA(x8Nz{kK)GzQD@H|jBj!=iJ70pHgIyquI= zTsQY%KD2)C;ucZoLJ;bCIrKqM%psHktJh#HlIOKPi0tbdhNP!!<7#U!l_ay7V!rcy zbbGpQSnDrtb70*DmW1*j;k@`?{}K>s-Hc&BvP(zq8%HAhE&z40L)f zfx0$2)32^?_u6jd0>G4}4sC~zSivi6+9*{mKnTn6q}}B!`s3D!64gQx)Bk;(Uu18e z+?S^dqId5;f9>t+Nq4UNw1#3SPZJ0PcsC~pC6u`$%6$fGXOp@<>69O zkpTLBsnMYv>40Fv7XvInv{<8`#9cxTvxjyV$pDn1vy#YNIy^m1*!W*8sLHI7jfOu1 zqspR13lvyOz)4Lz)Enr8PfvzUWy7P=%bVO zqo3-`Y(J=b&=;hojh{qw7mS>4j(x8Hn;Io4DT|gZv;8CB5BOwU6GAOykw-y8Q;`@K zcfA5C8=_GgC2RETa-mdza?@C{nD2s_*zJSU!3LPg|AHQ++t?U|vDNNa5<)^km!$lB z^Ae9=Js-+-b1%n1nOgEy=hV~_v7Wy#@2@^0_&3uQbh`kkj@+A@8#f0#JH~{>#6o=) z92MJ28A8K6w+EnINFw_3zu$>9LLdgwqn+X#q}Aw475y$yLqkIw)Uwx|oSdj)X@siW zW-1$-BVCJ1PsJKgf?bYc*x5hU6iJ@iZC(uAj>z#jl| zP(?VOon2`*Shy)1=!Kiof?2Al;EM@1F2FY2>m9WL2c|M`rg_`A3cH1iJc0DoRogr3e+l!QY7 zTXyGzf`>VS>k!**Bl~)v#RJg*ikav{ z^zBRjSv(AK?jo=^Ftme`xvjmS|KXbEUCUOPd`XXx>zkYKgV+CGr;5p{4-fFag`;O$ z8S#~-1q zpZiA>Pb`z4e##)CxL7E3B*W!NKt~7NCvbW~jGr9;-H%5vFCBjtklnRF(YFMA=tBbo z(w*s=P_SRBfsvwkAOa`}1;8ql0nQQkJ%hi?Tz&lP7iVtVZ2*_`MMR73DH;>~*L(?m6a#q#>vxmVQQXp$ zZm*Xw#qP-YXAP$lQE1M4!pQ;VY>0mN){=nh)d)d=^US5o8W1*}FM5+$kAEe~!W?iYnc{rqS@Hl!*RW|v|>ngnZz7P!BLm+x;qXnijz(<#!otm)mpQC zZKl|PLb3jfsZerisfr3Q|6K?>@rx}u$^Y6b+Mj)S8lDO9*_{XW4&PrX00wygAXcwo zr}>7Oq`AQo`hkAUgVyjLEG(w~DG76_j9n=Mv_=D9MNeXV^L8VpTw0p3XcW-={5?JVr=$$FW7FkRQ-ogDp zo50Y$5)g~{#}>Z^fA?R4UW-FK1lV_XhX1=1r6b~2@6QlRNKye7-3)WNx@&?Ni4C5e zC3@8Z^7RrOzyCia2MwgNkf8r9AjE>ehmw*Cp~IWi9egxr4r#Bf6h`1_nB2rqh9J`L zP?nmFXI9daBp$%-M0+z1Vneghqx(QB>_U(%JBlYVsw0{wJL+4shPH}@ z2=n-V{(zVN-G=Lvc280RO7v~K#a$Ox5j!YQsE>ENX}OIx94A$`1(#9w-`1UWAE!P9 z5rV#QURC%}*7}BsdioLm`nOx7{RZ_h@IICQrm)5p6b-}!Q!&WuG()7^(&$F*=h4sT z0SkIF$Ir)L0B=TC=FNXc2=otzfR`K3BxZ3iesTl!=|)b4KtUk^Jql@**6UE`SP^{| zwl%?l18TKg1qBQAfb@O58lfjm68P_;@f`HoE_0$T$&wZ8J86l~!$5$3-kzJ1z#d@- z&X!h9r;zN9&dx+o@tsP(P*6KEQ!kPrOXnydWy4=*2GO96g@py#^Dp;9ZkJpnaI$sE z6++vW0T#9ywl!6)4h~M2c(v4GqmO&WpC&0|yHiT0wo<`q_<~J~yX3Q9n;YpU-jq0w zcyttM3`nX=oLpSq+$DF|Y6;2_s8U=M_g!0m1^B?X!v71`@8QK<)v_G5zEVAUPHK3A zKp@ruN%;fNcXuT~HB7N|^c>U>-h)e)WoKji1~_>hNy#jiwxe5FS=sz^ZUKRv)}{Je zZ7r>)+`K#&Elo{fhWMVm>FMcSpbW$UbC#up!@33AeJzy=vr((@6()@0;^I0Wq+9{5 z5svm#`T(Wk8;7{~t1Q1t#ZN5z1{nL%{dX8i2Ss-M6#3_fqa5hYKHKIPrFVO{ySruZH#;vX>FQ07w zs#0FJqj15JR0#2B>rt@)fRV@@lCx<0D2MD*&(Yio5fC9;fj{Og7R=qN4d8&2cK%go9*;;$;x{8WFeE0KrpW%rUm1|`}{7YV54b5aI!=-bP4E;exfb{lO){m;| z{|jJ0Wgwn60lr>y3?Pm z29%)@a}zh3Z&I8f;TOFxEPU`dDCl+>Bni%b#~X^E)OGhY9-F)<$h(0Cp{kuJD8BKs z=`(=G_kikA?WB^%;yIS?VBrWj)?|S8QZtA@PjgdbkPo&{^caJ}2ggTR1dIa5^ z(0np${MlsJM;@A(INS}+yJ3;s9|PUb-q-RU9vt{Se)~WG{AJ-2>z);m;=Tj$_nX-6 zpVHji&*E!0ioXEQdm(P2ahL~YyBX$do2GF=G>2YZ<@HSfOy7W$H#d1e%6xbNIF&1a zI?W!WYm{|?s9^-8?X{Ugmadb}zwCh-B$ytxSt}wcBjaxm=GcAo-G5`WPEJU;BV7*8 z|MMwe=w(v9i%Y{70r6kG^%3{CDUOuI^eP^W{?nm^o@NIxi%R*U6B2gw^7C8Xfy9W_ z>(9{1U8BmZ(wGc?se^-q<3+!{O+Y|B4Dk0~M3=O=`tE7b6Y)^d(!PG(pT^4yvdCl4 zv_4D!gYZBQ^E&}1nJ6l1YPSc9amxxRJl|J@7CN30!P;H!iFlYBaG-CsYKXs592f*# za#B)i*+fKY2lJ%7#KL{&&8M5*dM_OO`tGzbQd|x$-`UX6(9H!%&u!zgEL4600Rh1M zdplmfvOVPw+ z1EfDzw~VFvW5AP~OjIka=>ux!V_-D>`s;yxVH2pP07)`S_2a~@tncBP_Uq)qnqRCU zB8L^!pn7__63P7fo<;?edDBbH+oNH5AAsbxZj8qmG*E(lkGeK?S- z8JJ<}B&9E)Mh71-bYE|6fXSIih~3!?Mt9=$?}YX-(K6?<_uir|+U)6#`SPqikzGIk zzA9jo|3uY|4h%S300)M<&m3pyCExkQ#honqpg;wnuFINF6rl50Kr5zY=jShEMF}>3 zuo!1&!XP6f^D^J0(B-0dvDBImZe@eI(0dsLjRD{pSeO*I`49I2BpS`;(+eC^QUN>G za>7Ybf5}JX*hHusMVzR>ne^!&dF87Z^aAYCcsJNQcX4&aoyaV|D8$b{2(*W6w4yop zZ@xHQ5~K{^`NcU56ADvtY0Zwu8^ka zBQYx!1gHlU|BH~SQwpTD%L#LFaS4EfrTajVtw~3vgHc7LSS4*8*?(1khW#}Hz0oV>w+-!Pl{((7z*^H)C4^M7b zp{Ap2`~oDictAb%knwzF0zvo?P(}yZK|#XXRU9*2I$z=cZ%DD9^sJJ0k#hyUp|0jq zxxc-8BzLSiuWkX^j~^KK2fFBgS8}?B&H!AwG+K+@zV&DVOvr$G9Vp!3OsoepG-3& zNSO7iTQ5J5;<+wWO#YCwde2*f5~(gCpiTQIixIt_OME{}`BIgL+t45&zaMk%F-;ENbP z88LVPz^reTk*K8hL&n#yKHFf`d;unbv8Zlf&YsK$!nb_`$a5d+S(C?_BY)Lkz9!WD zM)PCMKF}sA>JFZpQr8~+AWKQ7%J@f6rMRs#<}U@R@}Exqy1?rLbyiht;9&_Fwqce; z&+aXJ~iZTuj@!> z99CoiaNLet%vpOM@Mq^7qwEdJQ)Jz0{&@MPs}Xl{G+aRz07WoSha-K!xP5t^&?-ARS{pC z*TxrSi95^D;V)M4Lb;LP`4~o%b7Hm+;@ryL-k`lFT^DHl6;vPWOf`eK`KfESw|#9^ zG!y6gDHsSGU57y(mv@bLmTlThP`$=v&Fd0J9}{d}ZvmyQSmss58vC+7QvOenSM!C{ zC4{crxHvfMTtJ|(0TvBTc*G&aQ;$3E-@li+ zSklzi&dtck=;3QX zva-JCy={C6Zom;lKV61PQx<-@`_OGZNvzm#o~vxo?O~N>o?iSpN7NwDUvlJwT9sPE> z(IoAK{(}6?aq2R`g?1n))nf(?%N=H2k zoK)mp!txz1AlA3Kr~cx6nMdl6%2DZ)gjINGB%VV;l2=-Ex41r+O#4As4X7JlVT!i# zcT#XOyKu`-u4#r`3>L~S0sX`4H(2ktcLNw3F_n~+&#r+5=wpjjdkCb)mJV*Eunt!6 zt(LCt3tn$iYi3`HQA4)-3Mw~Iy*ernqc>i$eNa~E1M%OzI5k_%(|~P;b3E#bHwP^P zI0Ul*xg|K0BhU?e-Xj-L__>DG?)G~y!maf}jWq8Xbgj(FqE%!$%zQ;GGEID{4??1H z;LS2fWV+qW>_t+@TZn2mV$IBzGYR}F)Z`vRHyD+yxcK zFGhU+{&z!4{9PZJz3XSIdQ|q5><+u4z^Y4X% z#wxRka=;_CJRaI4LYlKjQrh`0AYk^O@0|_l0 zHLDR7pK$*#&owYbZ;rKd_oyn?QQUho)at3u6H9C?@St#IFOhFr3usbfn9~oPn$0~1> zUbY;s0VD2j;4F6X8Uhn99IOl5qc)3Z9g)&J(EHfSI zWfc{sKe1__6?{=Yg1+q7pS?{;3|UaS+rD-uXaJuUl{(#XeaC^u^6`@=Ukl~S@_Y=Y zZ~vDCSoE)MaY~9;Fq*!nqYOckpKV#*UP4fVUCc|GHyX(9w9sf6UjW$m9J*tuCYRpR zd<+I6XssQ63XhOPaa`>O%{vFfaq1vG_~7Q~xM3f6Q7w$Eb#2*`^}C1mG;p3a-<)sb z0k0MDY}m>G%Ux9MBYj1$bNX4`Armm{*V`Zv*qE%}(T^MF) zLJ*HIG9WmA@w3A1QFNVp?j#UXT8AeJTLM9g*7^mII^?hUEn@fpqDuwh(D^F0=pKsD z8J4QVgjCu4c9bBFw~w8*11nm45ftGmvT?e*;0Lk}vVo+7-aN^_^I+d-VwaZ>x(2IP z8lU~oMmtYqv|9ORnu_r;srq>l5 z&DG>A0t(cHMg|5}!Xq4qp$9h`rHQ9xWL1cn%)KvXSk|e1i2-*wjP{LRC`MVB?|N4^ z_hX3;f20h)7kx(lR%2u1R!_nmzb;@gyonXscz6EEZ!G@aVuK$j;973aH;$P*!i>F) zH-_VcyIr+qY}#PcE(ve5vt_U{yk>4Au!``&GVuDrG+q23qqMcWKfgCw31lhOP4xlB{#W&T zwR21nEx<3<{LVDip)pYyJD~0)Qc?#$s}yNiO3u&c=$c>PxltTh)r@v5(F)siK~3M9 zD{?a&<6gmX`+u^;!BE-1)x90UFcb1mZ+yw5jjrX~jLsz}POe@O6^+ z^DMHmH^*R+emel0w*(P`#Nf?o!1wOB0*0kGTc66u^yOuF!CT_cZ*k(Iqrk|>R)39r zTfH^O^Ic80 zy$%wLO6y0Oy`QW;%uvR;DX0aLV5#)C;t8`t(sctW&ufa4`lK)I%UmE1*9wJLO!Sr9dTrM>wDEQG;Nma42^uJB(91ThcGX(LRYq z1eZVAF)Q3D`OeAiU*Ie#D+l4f&n^g&gJ>2Pm!Ir*soDEn9uyN6XQZZjxmjDUOp7~D zZohrI3P0IfYV$l=|KT;z-~aFD16kUkc!8$D#DzyDRbzO+UTsRH2*d}BcKm}8iOSU! z6)mSUH8pWqE9@Y$_p)W_b;>wdqfvjR@9iiOquw3vuootEaV>*9(tl}BC2v`-nNeA= z=~COy#j%L9L@>0r z)L&fE?!NF9C{}QFgofM-_srYI6<>#ivi;s&!q8;%fy7XWv-pz;o$|&h;0u(HlBi#F+Xo% zg|oj&nE{6ujF8)quIEcvT6*fxJapYFPPi^kScRAc zlb&Hxt7B?DI=N0>E;(!K7-yELrB)sf-e1>yuM6kR8zy;`!Z6=gG+Jbqs%<@IDf{}@ zg8zp9i~fAkHsdBto0G$yE@W@LwM~?roG(>R)CL;Z zZJd04VvQd-nlwYZDUGRfc^!#O_BQj<)4#m^_RZgo(pOi}R6#5f!AnbH?6aC%dmXbY zG@X{6ZPMmECHU?`t<@2J`w>P~Urgv7t6ZsVsDgjAaJjQKp(!y%mTlaMN{(r~unIHF zq=0Y?9-cTW=-IOdpoKP62ATYvxTYNxY%~6NUg6$m9Y>8l2z#txKNFB(DY@8*WnS;| zZ?|=ge)*2Dr{L3WU0hwOeWREK&v;8WVvX<1EW+V7W!Fg+0OLx0jF-_28q3kAq@)N| zef$UuW_FItc8>Gyr1feRj)?B`=*fzq#(qlcuzlJf9HHzg=LQ{;ub&@i%fYBaTy~+h zC!g#0S&(Kq8A%UI{mFAYj9x*8VywFxPYlc0qECy{y7ZR$MSwWgg2dH+;Lv!(Q*S$O zgub@xZZuKQb{a{y@T$h61nJ#|d~l?O<;K1wM-9?fhOM%>3V4)Ehjs3bPCX@NcqiJ! za-q02X!sS;A|4jrKQOT6c|59a`FiWqS8p?5QGEpu<)a{BiO5gS`c$ikYeGsWhnQf3 zWpLVcyoMr@&y#Rlx;Nu`t)oo0c(M6e<{svmv0)eC$%5#LUzR&sG1rqqjzG4#*?BzL zcxgjIN++btZ{%F`eG^cu`IdriZ@}1daWUTe0w0 zc)@ul81*{!W@w&7pQz_7J-t^6-Rl_Wz*-Mjbr2z|?D#zR(y3>u$;r!j5lDk|cl7i9 zUiL)e$H_1rPz;$g1JU*l*nh(89k7D&uvh8|+_J5TJR+SZ6m)dEM+9;CZkzhSE7#)$?Zng6)fk567}{a_B$vPVZsSpv-32f~+%l+Tw*7 z#lKcyX;h2!?3eb*6TH(u9=0&_@ku-Ha-UX(dP!FPWf=~z%GQN?QnyqS;Se9jZwP#F zIvu_a4-Y4aW0JkhnR9Ix*iB%1_C$h^sggNYbNXCek|T&B*Ff@9B2x1-^(zPSA(sn9 zZl7ch^?K6p*E0s|dso*jL7-8<4}IB_@dc1*Cd(}J(^#oZ#T`V}Gl~!P{L}}o@DD&V zoZsrA&>wVw6>5Y)iXfI#V?)UJEvG1Qi`e*d4Zi93ZqKqz>R-i264ea~26E7C51moM zs6Po8f12;^Q>`BCv)k<_%8gH$SrEU(AtQwR^SuA|CW&#{)n-`Be-E?<$*eZ1TmMp9 zE24y_^pU-VG{OW@7f^!1An=oj*mGnp5-a2+`k0eLUY*e+?5%#{)Y^d1<%fH@6jBg1 zz2uWuZSqs4UEG%s?JCahT6sTzPrCS(xsJbm7;lE0gt;Na;g2jQJ9}v$fK7Q3Ew8o6 zg>vqJtgMj; zWFoT*28hfImakW^wl-cN*vN6b zJJ@TiMDM)#^QNQ_+3FY@J#e?l-X@5mzF!=>6502e+Lzha1=w`7mMM*&78bbVC7#DHe{z7dPDHp`5TvI z;OLVU)w`e2dy<@f~!k5P=oAK&yHLFlk&`vEHQv`GOkXrkX9|6+?$ z>`KB3#6!Bkm!8Xdskx-zHFO8=jUbjXi9xVZ znmYbjkYZUbU$E(jyLot87s#g1&58f*ffA)8c_GP2Pw9;ADEx3raUtXoF6J!x9)?Hp z0c0To3qaAxLXZa7g$kF)3a@u(um^JvC<>e=s8~kL_+W2QbU%S$U!8^@CocUr_oPyl zdxNc%UEaLWr#x4&>AO7H4P-pi;1y2A4> z`6_pd1g2TAOtDkdn@Oz-x?BpEPaSof)QwfL6laq-UqdR&fSG@2Hm^HEu#EU{9yA&F z^zajeDi86S$iPz)*S_fXfbhTvwBL9p3xk9<+8y0VRi?EjR$Zaw4cVK!;`wdsl=m&_ zo5GA4RCKuAA{A$ErA+9OH;6rGYA+O~Ki|Fn)*qhDF|F4TN0vI_w0geN9*VQa4@%V$ zQFH|{`5?GH`uV=1VD`3~#?sC<=he5W&zwKum-uoawAH)BuFG`NEDoMZrcFIP-cKNh z->2Fg8ksJJZWe7&Z(>uoKFu#yk@;l$XLlJsi6WrY@>}wz`Q}A%j9KP9+e}7KVMlqJFTkN=#Ot5kJ;6H1RM5)m$kiQYMW0oW_WWj;4#$bO_h_N2jLkTY*_o zg(WE^71P>>NgyDiz3*B1Sz4fG9QJ}yFLwhQV$|Qs(p47UuE{~U)?L)Y0Vz+dCCgU4 zpY&XNHo%UuT#2gUIPy@Kd#?O!d1GvdMaXa2hazvgf6VKgG|;z*PXgc z&@7~_!Tu;y{>Lo+8Pg;%h)jS&l$_&Wmaw0`2OfMh@H54W>~~K8t}jj~xOAq!fSQw- z(t&@iiL-=pzBfssO?y7h zBJ%=q8c~wH(piqtOP?7ulz3`wewM0}3}h}pNtQUM{+hMgey)1W2=M_W#*8-C=g(`o zaMSV+$6_`lb5`-}rFjGtBxv?LJV+{ry{}_JuO|6r(g?pKz zXT5enunt8$C~&8+RD83yO7tmo(4PlFj_L-knaYG=!kn%n2(FyJ0+p|4{B7z8Gsg1~ zw~`AAZ}Lc#PTq)4MZR}Vw+hQcu=r$WOV?)p=QWIO+>ov+j|r>QR@WE5PvcqPZ;S~!iKivvl;ir!tCse3J^o;TY?=( zu{i2Tx2jlJ(&YQj!8JBGsAB!o4^z;j^lV{%~w<_x@wGpoV zbxESZ^=_XpyA-y@$Wvu^B{ux()SaJ9KHHF#wsPp<3#CV-wKI`!Cyf4LNAdPP5FhwB zNNI8!mi~*KttJkWCFI4r<;%DJgekp|oSG(s~=Vh%^eCbBa;qWt7VHz%8FVtE**(5crSz}AL1bG{&uek!y!*H$NDb|h5Q_IO>8^o4hE_1@-(ESfLX4xSlA6#dM&gj7Y=Mn-)CjDiWDyT(u#v3D-(Z!1PbSTG~Z z3zC-{iJe+N04aaBt`!-Xw4X4l9FQ5jZib}BI=h!PvkPOG!WQ4}H znOkv>neHY5Ty!Pn8xVI|wU)k|8Cs9Zo`?rQM1LPs=4Ku?w9YBg%;K|G`-FbIBKPam!R|9tb7T7_P!2--TbWp@>uE*}j6{xHBB~~= z>wEVgT|_PkPSIf8!~SPe_h86|DQ47sPEjK$e$Zv(QD7VUNE;H`;?ahgoKStdr-QhN z0}|-X#J$DWp|3v$a_S>y7K!<_$pr~CVGjry+X4_pZo(QU7WO4nyQja{$ zB7_~>UWenBfi|(4pxN*}-8@O*)crRc(`?g%(OP7|Hallp89ke+=bS#d6(feVyonn; zi^qq_RgpJ0_V)HJQC0jxLTw*`#G=O0Uk?lc4*e@bUk`pj*O9zGF@hdu`4>~NTNQ4+ z(0Q5PFktMEDG2W&ugli;VC%!BgtObT2u07=I*oBTTjKExJ?#@20n_@R;>-9PZ z7Z9eK6l%a^l$AUUYB>I}8+kLL7}yXQihs!ge5X`uL07T!%tUg+Bel()uioh>XIYLv zo1jmBMVL{OaR)BSuId$2;0d(6$)99{%*;oi(0nFZwRK2A0~H^{_hq_6J+R0xKJzQK zgg4y8jyLKZ)}TW6Mand3w@YTjR|K`OQ}UaAIRz|K=pS6(-$GXFcRm$6{R7HhuJr&f zJ1cU;L%Soo*xvpbvbtRorHFy*dT-_W5TIv!8*^XxI5IA-b*m6`?u5N|EMcCN>6e79E7dk9x z+z3Sz@;kAz(pb-sqB0O4Y(E=lX=aA9Fl4tK!=VH28#6Z$mi!}9N9V%mohi$6MhOz& zUpkxwB$M5TwijFLfz>y?=KrLmqy_*3HvR|1k)I_}4~h0$M9-5g0!}7wI-0|a;-L1y z$B}q!%an6*$$QmrCz_r~OM7J{NG%J$P0V8W*hMWBoWD8F{RHDyx9R#LrUQixY|{>Zm{# z7etXAJA-GP+=cav#+Rm*E`zQTWk=Db6TsTE|8)@}T82k1H|5brYliZuTsLgt?PO4tRi4Z zKG0q$3x_u`ds`Cn-UDg`DRUbKgp%GQy7_Jpjz;_QtK1Hv9B|^~{N9J6{!W^h2U*{@0~O?LPsrKbMU zA2U7^u?_&0a|?hy{iwbUJV)VtJ7v$D)7R6~H2Jq2N>YE($^0(~qnD$J7{(h7P2J0^ zud6#_<*`*T=~dHXs;>*UCZ0yACu5ulL81j2`;XVju5fE{7;{iZ0zIr zlJVf)Sh&f?x|hG0Z#Rr)|9mzEPq|OdMQ+q0a7uVIzxum6{35f4uqc|uu2XbaxoTP4 zu4Cn|S4%vz|FexCd;hmC8Vvg<=l9)&yF*W6D|18&Vwbv}h_0+zTG|#kGrlZf#couY zbe~n9U!eKlCycRfU|hpV($-#E;Umgmzl=SOn{`r=Co^0w?=VueQZsT7`Wo`7mDcD= z`nMGLAyYh3qA=?WnNGYMaKxO=a6_twLOG?3O#-dPtsiAmu}|T;CpDzh(Z&fos$Zb2 z7|x)7K>JYrXFqaYeQs7}y>_yl@-y)c1qo0FQJ^OYy#zL_>dNc7Nk&~hBtI;ev>sc! zeR{P_Ie(-1*jmAj1vO#n`>lIDohVr(d;hD2j!0F@>8w6z#rf@?gspp@vwTV==q~tD zJuvX!Irxzm?=q4)LIPXC+?+MJY*Lg=4Mw{PoiDj|yMJsYqi~J2J56?2#rdMn&xXY4 z#;|5Pk&frYtQQcRMVq01Y;CR@^6W^@*(WmR3%i!zy6MlTn-}NAGfJ(`IMk#Drlv!Q zY&fDY{=#64H?gWC92imvAn=&CO0w|M?^iuU)SP0UsQLZ)Tf z*0$}zSrQK%`f-TEg}fI(xA`z~n)_13u%Q}gsmbBW*cr-?3Xl%Jnhb?UGQ$bPVE1ny zVEr!(kY(LEqY+FIEKJHZAfAG;UEam(0Jkezso<st8ljR_(YeDEgY`UDvmFSyRzG;EV*SXPqWNwW=m(kw9);k~PeJ!RX5bQzCA zC|}1)XsT-R_y`QGWxA`Lt^V4i-Np#>x}$g9VeBX=&~pFFH7q zT^j@|ux${aQ^1YE!-q!?^O_YzJbbae)jMFtqxSm_^}`|*_L)cpM`=2ik(kb`RLKpj z1<6> zE3%d3NmhJ)@^3n{hF=cq|7eMf>pBH zViby>7Z`P*s5QBldyLG91&=#olOtv4ORDeVqv9|-bxcP8uG*H#_11*>o>Eq4a{3bU zvo-dR5kE*#%=k;8yhdrmn+reh(lD}wdf|SmE}9{+jnGa|Bm*i_5R4yHfV_^u2311D zE`5bDHa+$h1X^ca&8^^Tne`ror)%;gvd1HM#gp(W>o+apI@Az%&>+;P1 z98|;Cth!!INC{(u!r;^CXA&0eW-znSfDXtJ(dAu&gPxY|s7bRz$2WTN(yCQQiDX+Z60CFt7$$U4t7if4JbIHNSB!h;ff#KRQqIrUKVe+Nd~x9f5UzK3VdkoB$Z& z33)A##H`0E-S)fGTKDRT>61zh-=;|*yL=oxB}(zeW-5yN9NUXM9eKaAwN`lg$sGcG ze0o4STpJ~(STp-?uh55cW(MGb;bPN2pj3YiehMu3TxqkM^(;D0^_YFOu&ky>dD){Q zNi{0KhE{9iNd|t_jYwu)ZYh&kT{JeNugcxB*;=*vdo5?NkFlCPa?NrImr-w!!WFrV z2S4(`zQ3BLC1Wj7EezMrvzg|RIDu4Jmx4ki1|5ea+u1Aqddc-H+j3%hv5j?wRk(DG zo6OQLzKD8+oPu$dSf<4fhSjW6i>u`35W8p3+LnM<9SU*>O~x;=m54r;LDA?vBZ=u^ zZmz$5&ebgr(8o#?9aXi-6eeL`&lmmW;S)M}QXG*ya|zEaVg*Rz7LNJ6^WV?Y?5TsgV~x!g%S z(+Z==IJ8$qS?F6HsCB+b1r2J(L1W$PN{xbJKoI(BdwCtLfvkLI%Mw}A6{*^(HBPYV zE^4Ae&2iA2Emmu&Z)|UQ(4tBbt9xNI!s@jV%;>ud=Gz$yRZ=R)D52$>&9@mG7zmd{ zU40i9H+$=ofrr~~jOb*{J87Vd$stMmP>3yYEz25x?!W0Y9vn5{Q3vt>u+N&FvNulF z0DAtFrl#iD#}6OM!<(xc2LLhj3$~L&Sa3jHJ!Q(c0!%nokUAkR$@nkCtvd8~ zrb^qnBSy&avy>fmrYf&7e(+!T#w6kybIwz6C$*zc#4x+j<#zJUA>_Okt+-8*=RO{5 zIxT7%vx@~&x#P$z)%+Na?|3#s=V9*s`A801|78>*%eA>4OobhfU75*c-XrHJ0slj$ zHbS>JRtX^qdt1U33QcQ_!AMFAhB*Q?4faB3g zaRtCAM`IyJ_aHp2Rio6%H|XBqCwd@Rp4u?w<-1AW3oCWlJWI>pz(z&`t!eHal}P8D z$X_5S6@dDFXPod-BzutjA^35|xq3d(Q+f9@pf}4&Q`4s_btn(;IwUDXsh8ub_bChK zO#R)A3iokjw&<^NYq2T3Ce6)T(z{K!-?&Xde)akR{qiPpX;Zta94m|L6-HaXW)eIX zVW2sF015}s8+Mzv{yujG`Bsn1erdkan~x9M{s;gMhGd8JVfcff?{dk)+P3j$UwVRi zUOgzYiI?8|X1GR?PdPOui=P!>!qdBz;Ge|^jpHJCh6s(IOU^1nHhN+TTZQ~q=ul3E z@FNUS^wqS5Luc#@^r{-U4EWQ?OZ_XTo2#J7fIsCCmrc@vDTN)E+^dW5g9lO13y1;s+5N}G`4r1{O=x;=Ha&q$ipBzKg75Yb#EBvOG@1DT>^b%24Gpw2iYgD^T^BTKg$>a3H|L$J#EbvC$4AK)hI2z+ zFH3I|@Avn3bb})FT7e3Vxp(840ia-3#{ZjRh8F3V_ zeKGOOMFv^VYQ~2S-XRB4Ap8(o1rY?@h(LvZjSC3@5s!5z@lPU%xO5+|qKS)W!X+P}{J0i^W>V+lgWhea~zTPsR%C6fQ z-gI|J3JQV>C>_!w0!oN-&bB#IX7=g?&hfTMA=Jx5_?LqtE1g!$;JC@OQf=SwuDIvnBeD!9lGg2^z zH`Tbi_?W!?MKi61dd<;t@}&>PcxKgC+Sdx9noxkEno9#T9YTjKcF&o>T2P*nnM#|8 ze5?(chu7E@6A;>&6yA)(=&Z`i65b!0=_I*R-S`{n4hCgELWySE42aZ?&(I_m$oNkn zFb8AdVpyW#BUwZb--{tl6GYHD3!4#43P{Jeu09@o!#JWl^$f-9UEVf?WZYam81gVp z7Ag0c9~G`w#V;-`{RUqjGv#bz?q~@#6GGg$do{DJ$FHh-5a@_+mj1ul8x$}xUi2z9 zuRmsNQ`6)`~Pr=h!XZ9yapmi8D!BrSuUYbR#aeH6Uh+u-WO703cP{S zOVB{b9H?P5FaPGe?S93=1qk9P&8Rm4>rx-apanLS%v?bp}XYtfO zi;_d4ujf1uIi})=$Zt)R_;#U!e-;uV^;VEPqt^~OhGVWmBJ;s4wsfGOL`XCFh}9_JcQVgdslWfr0YW3aDWzh= z=Y|fl104@A={Htza`{J9A}DTteCoc`~Jh}-5Q{LtB(v0D(2?rr_ysRz0PtJ{u4q@ z3>z)Yii{%qwLrJFR{vnljy`Eg<+u}5&JL7-M)Gm(vPYDQARYff4r5GHyU_a%43v$@`d4xOVX%CSq z2)G7v`vN2S1A=$Wf)hueFT<}4OoVO~eEIT=9*gXeakK49%E5RQ(^o32%%>^MbTu>M zhm+$)&C%0cDIxy`aT7*%GSFjme*-?ILO5W$Ct z$Hlt{^-z?uF9+2PdeqvWkWk%Y&(anY*ceZu|+V&1-Q1lNx z=oKZo)BKsc)27kSI~p^+>3zHNsA;rK;WonF3?qcKzO-lz&;Iq3#A?hHHZ-X=dMjdN zEE++kHhCvcE?>ySKs;;F9N{B^oYhrbsO!IF>cd_1zSlV`t~iTWDjY#thNN}8r{`26 zC+-!f!j2j72F-4DvG^Bd?rkQSL?V~~zrcE!2T&@E;S|RyLxP7o$ z+R8$*YAu6wmLtM)pPFeP%@I9Swn1_I@dq4I^Nv)vOURHyepYRHL9_CK+&k{yUYW6L z-kDAP3f(&uYin!1uO}`&5;RZCXG|pGzUm;kDGN2e)ew)IArXK=gN4mvFwvNFK{p#AGT?HiSH%j>DGOQ*tjh0&^lK zH8l$QRXXo65xN2kQ&atv*jWAxKN1N`>yhe;xVQTj*IsF0k{92SB8a(Ox}MS>^dzN? zG)IH`0zv0`rXc9t&mKH@wa+VIlI%y6M3ZS~6?N(DkbU#{M$^mojnOl?sXXN-yv?Rh z$a&VHR|;2YZzPJf@IuC=z(klRq(j_8!XRAu_48*dMJ1)KlD<(F2VFKu<0Pa{=2`fM z-9nsWD@2(tzt&y|Bgy52TaE2zXwe2mm6sNN0?Yb~dU8B@L60$vb1kNZm?eUMsg>X8z&$|lj)NY4!LDp<4si*d`vQ2x+ z8%HWx?u=blRP}IwnzvW+Ix)K9&H}}-BxebU$3(k20(sEM;uN(P`;yT^yUnmcM0wI5YW!v{K?_2_KmEf z<}-IQhNF9~Lz=bgDi*i{RabDjELC)-zsgB&sk`WWVK==rSBtCkQ{{2tdSuz;Ombpk z`404Wwuvl8$)I@sUlIs2P+Q*V~QPOSOCAhhO=akf8z=X z$IY9Z|UzoZNB z(#~w1AHABIny;FH!P#R%0i8-}4R2$PvfK*og_o6go_K%gqQ+0aSfw%YV9S&$d9~EL z|B6Cccy1oh%Okt0y}0{axOrAGsT1I%_6y9ZTexrK-4Ye%g-$>I? zOZNT4kz(E^HWjqa{WFqtg!wj>{Q-C+8W;8%f6$V9!vp>Q9H-ab5QNZgj;Gg~I~m_J zGon}nTR2-;4C95YGZEM1c)RkYI40n$Rej3M&5h*AWLsps&qvsb4#*;}-o48Y5NW{> z-sLh}-F>VSk>*$(bE^Wt(>;~3c;`I4Y2KL-ZS0HjaF_wdKwaJREU_!`=R4QbPfcNg zjc>%$g9;r!n3ctcntUDOze{MR@}ti(J1B;C(!PO+2p)B~0R(q0Vq#8A{z1t-2?z*U zB#sti7|}}iO`6B-12m7Ixps!V4VRjGB26x^?*83~7jmzSBv&bu$*Zt>iT%R7FxqfM z508|CtCXH%mXylwUu^q2*!atr-Grr<47P;m!+#K20lQ1^pJmj0R_+2XmsUcL>@ z_8S+;Gr;-M>vNyqlD8BHajRdx)}K>Q{Y<+#;>DXT7W6Oc$@gW`9?<^Ad2{ba=gUgp zz7MipF>Ff5vo&xArc0$pHdz3}QDp`QPxStqCarn!yoZr$bs~kvH=FQr z5KX*r?4H>7wfu_Q>-7FjjXz0oQo{WikTR0>L>KHxI=F?g)G z{uwZxE+8Y&1Cw6`H@=F0rdLuS*D@`t&cg{$f>{!C9R$2Hyy|9>8e}6gzdM;~CTQjDcwC_+z?2G8M^xB+aW^)e*WT%tgeN5fXAeeR#Vn7H=UXiSAhZ%e}9LN7}~r6M47@slYmC_CJP8I_|{w-iy(4 z8@F!WMBimnbo+T|{povN^b&=T&1i4H;h0xP<8 z37=A)`(MB7&m6m=YB~7_XZOZ@5bVGyfC@%m3nTSRZES@}+!~r# zYJOcc#f_I3Zo>8Nu4%1fAhUAsxgqm#kWLvwvrPLjO6H@TYK*k%0A1yekB{qizn<~L ze`uT)1E6ypxJ+hrL3>L{+OR}Y*EVP#3579;6qmn$KSyoSS=LO$CJFtK<$c7Qdgh?) zy9%x4(GL!#iF2hRyIB*51Y0q@$eEMuUwKs=VxHpo08zQ7w3ds57^%K{fKb!Cd1y4` zS3^-lPx_mVoXozRPq7iLxsf=is;Pi^53QCZ=A)wjEzH3mY$G&_hxgkNG7&Yhsg3Pn zhg@u)sm4(tS#v}^y0^>uRjNTn9I>T{pm~^}KTCqQ+S*PSl;Xa#nQyjS1%dy`)#H0u zojkL*uj0dYk;@Irpf{0dZ*TwQHSzr0l46Q*UqAl@MK0P8C5Eerm`@HNxQ3FgnA{;w#wr|let2lA=xkuf9G*=;Pu*I>t$uVe_Fi6#ifBD-WrOA*eLGc{1*ck&x-@Tlt z0yX>N)_dXM;lIOqh{)FPR1fE&JiBzs;6>jbRQ*U=RlD-+H9r^!8}tA>qy^Fozi=K4 z8k5UB%?x&7V^qJ*Mz+hPyAl(ZAhzhB%;)yFWQY+rm-Lod<)rv#_!ZbT?&22yYHXyd zsXfF#@v6sTr!n@7G5i8vkdMC={rDp{*?~__;pj!kdCNz3KD^{HPE0t7$xv6SzDX3t zSU^;`%MJQXoUM|!mJT2HJ^o;y24XF;62sR?xniZ{cUdT*< zcv;&N2dQ!zkTt;kdyWz<_fRy$!RG$k#T&yyBL*f>JE9W!+#RZaay#Eo{oJ&%H8bl{ zBXpcz_^qQ!jK$rczMxe^wt%Q6?UGztzHE2Si0Ija>(BQ&5C%OzKrtqL26qF8xq{J` zhs#_a*{$1QVf@Plu-53!H5AGIir>^QTx%F6eceMhO~TvuOJ3et8sxrN4cVkz80P3~ zmy{TDLpTi0{TLtmt%z4Om3MV?IMFX5444dQxFc@SP}A2c-hEbg5$&EX*SuPPSg441 z()pOFQ=s;3Ha>SPp>}oq-AMy}&qWMR;vDHWF_VD&`YzJ3wku50@*6!eMAEPtMn&a+ z$LQR@Be6efLIf+FNnKr?Q?Qwz*5tBK^E#gF$7Z9;EuXTo3iJyNrN_Yrk4AtzTsD+i zK;M1otutgfB26?USBQV7W^0gNovQ{fPRdyd&_@|~!M4@fg6OnFMr+shlgHCniT9~P zZT7&?5{Eto98@qTZ)*}3YR-< zW{=Wua-;-di`IE)SnKWMQ!Sv$+(K%egy%1GL2M!kNo6KWq}cW{^o32OW;3cRaJM|^ z_=b>@IW2G00cqPpWjka&2c`%xm#a2 zs`AXF?&QRi5qu#Z{phwZ7 zn_@QjBFvIQ9(LGMNl1!Lzz1BCuiuQG&s=c`bO(khFI~Fe?KkT?2%1E<88sbYlwRop zBC4LR*L?ZU)Cufz%?)w!GNZ z??We~i1XIs%%5_b@3sS(U2H!aSwmTQQ>0VWn_bCXld5Fsqil z2<-914Om{|V&LwvuT$i1gV6BvqOHZI;;Y-l+D;n_t!8o0WQ!eu`7r`ITpv@4-Ml+k z1ENgqXid(0Xq-&c_g-J;Qv6@&dq{4kKxjMp3!o0UK-ZS%0}YL#8NHpU<9yAs+UtWv z%?UGfCUJBo_tPIr!dE84G{pP+8oi3RxN?kkRzDgqE;v5{l=m}%r>a?YVg%RhFspQQoy*2UqSq1@2+pBl$5;(zZH24 zwFQXz<`x$SfBW#M0bSVWMXRt~A7*{$2idrL*9EmncRMHju(gdh=EKJL{aEV_Ql8kE zU17K!hM#QC%EMf!faMK+(Ypp4ezI5U9${&yBWO#V?||t0CyA|EjFV;$By%zPDVYr){VHxj z^Ls($<9%~*+G%RXoe4_uBXW=*nq>69OGzkrGN;9inc%fx!FmO6A?XakUcP(_TO>yp z`QigZG-4}?l6Q5N+HZ2AWaX_+NCvTE{mmGxQ^CDueno}+b8!)oa?~~9KwVSwE;YCG zmBki!`~)iQmZXxX?LbP@*?DXm^*qd|JlT>+R8MNMz{$xe%hcTb@<}ZQNk)SWA_cEz z`G#`dH{oyGq}7)gmZ7E~DYLH$FMI^hy4AFH+QS)rk7a|NJ5M&u4! z;wAM4jiBbL0oR}TDxj8QzfcM>K_e!+s{Mypk1}~33MRKKZEYnMzE0Ra49YWm94SaM z97og5`r!|=c#NN2)SI~z)^nkeKNMnXw|I)=);N90S81yI-qkqvGgOk)G@mrOj-~Cu z3mYj>p6(I%XFdag8{LQLQJ@@jPM)mtB1yJ#ChJWYjon-n3JWNjA^;g%57)Kf->a`) zy?R0-wk35xH#PH8w`SBrg-)h74Eg}oY_bG^mO&VgKb;;~w=4Uz>Hh`RD`eL^8h@qG z3)0?SfVj%0sioCJWHz0oWupgOzB~ISRF3nFAp)O1eR}EzhQS}Lt*vWboWHm`^%*Hb zoFsZ3b@mc_NIj%N)!l&hU)&(0V8X$F(_9#WY zOZxV8(w6QN0!1$!pwFlRjKx=vK_kK%>cgX)nD#jX4f47LT2N*Z9|H%nIq;*w@BfhA z5BKp7$JdRlogSro^;)^z;vTy+Goz!26CgIL46vrcLhvfA(ZJ2{ycE9oZ+dS~aJ|pZ&;MRj z^d}lBB~Fc6RvedPpZ=Ue7Xdr2H8!QBSl@+kp|4KQa>4Ojvj>XMa)8n*G$-(8^{je7 zLnEPnl$LGK@RZ=X;r@s*kA006YAs;va#zZVeSkjG5uhFFZ>g0wPlT z;l1Cp{I!5#Rc3+=x6!y)D{aNcZ!%t*$o>BL;?D3E+*lSXtvd$x}m_J1GG|G6N!VuHt}a z#VCs`#S$vJ-u+ScXQJ;7ZjS%pt2AhS`K<${OC}(d|5Z~%!}~jM_{N$OhWZ|Bas}~0 ztM16gBq(GxfoKOz)nW`uaC8mGFD^qfr4VQZoM-;{aeU39{FlMAm+7!3c&_g>?KR*h zEPQuAz<@be`2I;ot*jxOhuzG=f<#$KNxZDG@^lMY0|!$b+i%$Pj|9&2j4jJC0- z>BoV5_S^XQ`1V!RQ&TfDor@nQ@2nwG1lFFPj$UjYdjcoNTS-Z&2~?+?ic3mrp4r+y z(b3gC&gWTXG3m_Q(ol1XD8+)}#ReBACj(rPk|Mwvq+k+O^(2g|=H3teW1li)C=80t zHXcgoo6pmtql>CCGXpW`_&;OaHF{|h#xn&j>Y`93xh2WPmFEOj)u}ve--?QODnPj( zM1M`ppFe*d1XuQbn9mENjyNtXptnp-0XEA7-Ca44)zmoP-wu(YLWtwBN*+T)L!waG z7p8Z~;aM(uM5(?LEJBb1DpuTUsM=Nu&+_jCjw)(lRj%21mCn}IQ9z4h@+Ekde}tkh zCs^}d5Tw|Luh)d^;*}wFrMS|m9}$$k#(JS>OT`y-dJS*H7RQDYuUz5d?FN7A#T9FG zuIT^%uWT$%f|Ue@WX3CV<4R~wHZJO1Wq&`KuS3Vfshoe`NAB?9BT_`!BVSYIpYP7b z8bs%@M1y<#s3m@aNlD)+SD8=Bxvt`-76IzBXJ}zATxyb8h2t zQqAAL9z;*&ME$n%kGt?G`m*q9xh~P)ciNbaMNaJ2WJq7m;rM@je8X^9$=9L(c^Tc` zpAA-mh%@XGibrt0fq|kcBS8J{o6^^XrEepM8V>sM=z6y9v;X&le*gTS%ttHja$zf3 zrlH(4O3sMpnJhsGH5u_k=-E%)3w*DadmR`q%Mp8-J-TgIbcOMou_B#?kk0(AwwGaIO(FGu>Ge$)ZY6zGH2?6%I6b;PGA~GkKCQPICmWw z9ksmdb7;f|%I7&u(=HRwmqD_sn3Z9j)(Z~LuCkFQ)Sd#K@-IqW9H}?~@h1s1%6clH*~A=#WtV{-P*U0fXZ=eZ zFz>}mN={1Zgz9<;iDC^1keWkl_HrE#6F;1b+`cChGlY)Bsn0=aOMmpbvc#jm%f2!G zpLL?Jb4|aZ35zE7IxS)jC7j0zq5D@PViAP5XL3O-l8`~T_l-4*I5k`bQt1OJ4$ZW1 zq_v{ALn~7e!tvR&FJna}K~48mk_CyToc`}$DGhqda<$ngh|!#<3uT3@0QN&IVgPsU zWrA@!_4Ox3#sPZ=aA+k$@uv^HCj&IylRyu;r(et|?E&xn;=u5~dtQoPmw67U#V>dl zVfFwfNVJwSng>x$&a(h)jc=^9T2I_Rm0ckf=Og2TV9RMZm8U1fu{#Ns(@t>o<79*r z`CS*B&xExR1oT*2JS!}JHy-Xc)T;@YNY9J)MJ*zvv}gmN;l@B#BuJ)Keo|vYOE|jF zdg^U<$7S8}jA#tYW0~Ied!5*yMvCQNmI!Xh_RI30kC(Q+v8|N=IZftUHiHkCqN?xz zjfCggF!p%K=8$<$h*tCHgvaQea6_RsA?!dUpUIG3>U8qXdVmndGAKg}fPhLY04G|V zAa!Uw?R{XtXWWRp12wiJ;M+C@gnu~vIPl@~f5YhfKQAVQ-}ygd*Q)I_y5^#Q{cZ=A zQ=|x_7+mF8AyH1;t)~^B#O^e#W08m=`8<`ZhTi;KZYN0QAQA{4N&SsQq)-1@o9tcj z_P?nw8TJw?79)~fJlA3Dk=;U-zL1njw8WN|=TnFNm<#E{GK$@b1dMSJ*_s$I9_yGpxu7X*@$McUv=wJT( zWb6Mt*}C8mw08cpC)X{0KKy=ve?Y>c;_uh!cA(#3`xf^}<|NhMS_rH2{cowHo3g zaSf~cY4|#5oZj;+?~P_ZxEs$q$9AWkoBf6FNc-R4OCRDYY@r!24ExN}yhv;EAe?8l zIf36qaYFNUcr`aEoIb#8iDoTF7+4 z3;Gmm6p>t=L#6un{OHt1xX@~j5hwhvX;G9VFHOf!ph%yi$d9H-S1Nn`Z{X9TE-dM6 zzty5Sp*rs0@VFgtIQAj(zoTTn%*o8jhtwHc8hxgU)mb$31nsTh9GwU=$EwWVX~QkR z@;9pehx}Jr3uIq%-lc3;XP}jhvQL0c?C}d6^Bc>Q63U1Su{m zIsvIH?a|2I1uBzG1C-?K{B6zK*w}UOdwV_#1|TM2;Lrt{+@c^!@2m0j=|vq-#ioUt zy-dT7_}RE0HGivP;2-Ne_Cy0X?d7lTd5{RIAECbMA-9fV>GpSze%<%zP?)-b| zi`{@)fbP)bG4Fq~xL|Vv5D<>+>h7-2udY5Xq>h?g{=N_qx_mV#!v;>qB6r}KB*Ouw z7Y|&-Az(3XSlQU(EVG*45KPBKy@Lx&`=aT~N9HIS$T}ab~8br$(S>c@Yu%I-mfW z7#b`lvRidvj$WJh-u-uutaS{*c{>Y)Wu1)RnLHUf$2IMeh6_|M*_ zGie_bJkdd2FkORDDe^#OxO1)1)l^5CpI!8fLm<$Z1iS2~)l;7%vlcjnHK+k|w*%Y& zqfhk?mu!lTl!|U>2{w_2^Bw13{5kpg8j{gXfo?uIz`9Di2gItk~?+Z!z`1siHY|qWahWjG# z+%Vj4&@OV!(JRbN7tu_*&DK8FLZb{>0&15>K2D32ILu}_MnpkQ~k+2q{ywLKAM`^;#bSUR(qPc zfkIWq1IF3;W1m0GZ%T+;{mv2`iWx@VnE9)o?g$Pp7~S`n2)ogt`16wX&I?Nj=H971 znK!>r8vn9_OZ1Tb1_3=OYJ@mkGj}xXGhl?2{~6&VjL_XyPTZ=iKq3{TchpmTI3H>Y zv$3}(T_v(hot>XA2nHR~y6WnK6Er0pyLQ3t48XoJD-ph}{s`7XiIWWb{JWPcZ{Lk8 zICm*?Q6(S1BX>_Isp#%J&+#L^lWh!cAB+{wYs)9*RUb7tw=T|SF8Eq5d)nxL`lW47 zCHMAiAfO%z;(o`xf`*!tufF<{G+|H1J{zfztPeisaZXDur8abeuiYSW{hu<-{?Fnt z*7fE*YyiJhLh+ea)(UDiF6F!4Q>}*EXEx92jR*25?MYunri+8Xy9Mim558D~Cf`bp zii$3Q5UU60Co6A?SH;8y9)JXdfmAYD z^SK6U_uco9#eF@N1e>Et1GPD1H(&|o?LAMs&YtJI+P@SH`pMz6;EmoKX?f3av zXUs=EkU%q*s zfALfDf(IOe_~EzFl9%8j3)aFnmbV)GB!_;ye_w|>Lo$J(z$Pl{S;MQ8Dpr03?Os1B zva?s3kp%SbNMKcTyA(a3QZ*kNy7hcq`e0{CyPl?u-&xpAhP>SLtUUz&aX@M?LB--TnbeS5hOR#}j3&Ds8~g_g)3 z%K+CO-c2Axtpvv`m&KaOV}laOVf?eIB3Ml6XI!KNwqwOjm4BM)|GCCyr8qud);Zi$ ztbA9hsjclc0I%3wY`c}kXRL0u3(k1SE>JW!<@uu6@k$#Uy}PAN)o<=yLq)j&@h10ZY&w(aG$FeuI{?vN{qbNncFC}9)6VxKxlB>S z>(D+Y6SnR2?E6Z-IM}zDHAf~5039a)w}2+D3XM!l-yuLU635F<>|JSGa={kNM;jv0n}zYlLR{S9ow!gYt}aRp|T_uWe=6-x?NDg z@4BJSV&*3gtPKI*FO#D)bXx7IU+3aBvWl?Su@zFbluOPn9T+ia#i;uOX4cV+piAibhOXhn-_;Il8RPV zpRvf%T#=f!R|xrfP(Fd^Q-}_b8%{vMC$*apv-#op6NIv2^&TX9VkWoXB*Ykh?S+;B zeZmz_Q?(u8|oYV@ZxiUv3@ut@+l1iP{#j;>}K~QHJ_c&>DWx(bJ;>_E#-x z>oy2FzENTyLnA|t{OaRh-t@|qJY0mF+;;%LE7@~N5m3B0@pWVASs`N~IkybqGhmIq z`Q+Z9v-1sZF1^OWG+sUzCo_5xM^Q4wJcuPZChtLy7J}QAhD7Kt`@vlt5LnZO2M1H0 zg?)YArtLpK1>qacA10Ng$-o3w^|x^3RvsHOG_5dtC;Ugmh-G(xH?`JNymPFC-ixxMB{M zN>1!(ALaSv#bo5QQ@Hx&iov@V3!TGo+j;&D#T}HNa~-`~%XsuXQ(Mw157_3LoKGqb zFOBvH%w-{5MoFcheLG%PQ062w{}Q6jyc!m6B8ppiuq{OX6=xUC<`2uR0a{F9Pbn4h z08r}^ByTM*i%EfTxb^6}pMQk*F_RT1jSx4ti8X0Z<~TWCg~eVJrzg7uDtbq7bsW1% zAwg95Tm?Ph((NZUWkPsyi9ftxKH|W|g>PMCZ+7#s@^K5?|7=lrW;81rkKOUuhr2z4 zbI{mzO}Th!r{Kr0n>*_zJ-WGGP^>9?)8e$NLheg1=2obI&Grt${WlxeF|&A`kwmX8 zZx4)4eem1-k3wP?&&bcu-yuQDxMG6O8QP^jz;LbVXRUE3fAge094&*W>-$hQ_H5&r zVLhU2wgcI)O)puYLyzpLqkPM^cI;nUQlnj_0v7ak&&H1m>Z@V z4#+Da5wCO25fU{1;(J2d2;5PeLkN`3x|g4PwMpDdk<5K+y(?tIq34yRVNs27T!ZvQ z@K8vxxwEp7A*te4i>T7^@X%&tA(6{HuXbM33{7M$;#5ALRC%>$a<|U4*|mLV|KMRv z3s9y2bO{HNio(VRDMV$|aP6&z1a!s3DDVRcC%BG1sHv{5-FbbMZa5Z?l7eUh$MSU; zMsnYX>Ft1Bt7ASXK5_9W^uEISjnW2X#13#WO`v6!Gsisv7ga*1W_B4Otr)nzkhaP) z$k?2?>5qrrh@I{V($*){FXVBLX=bLZvfh~=IUovIgYl$<%mPK&RPDA+jB!MXS%_I8 zNkEgnWTWn4{+nIRnG15$$3-fl9usl$V}N@{jN<3N;)$2XSzerTQi?!FX#ka9CktC$ z@gO9BQF%8HcE9mbgMuOv@6%5p!w0zCmI-^=P9ENXxHb$apA4&d3Lw;p< z1#K%HRsYy@k}ZkaJl0d1nuURN1q9#4hcdQ5Q`}d|Fi<5c?S4u&Q%Bj)=R0{ zPd~G4;s}Q_HGgZ*hyw-4u(hM3BTd?sHATwb6gZwoVrb^SqSMlz>gnt2TZSD;5Q}|A zU&a+OR`i2=1lfl8tpd4%Nr)|?fL+}geV6=>2$4|hIOk=SsEW7b2g!FHu>GB$^qrz2LTl7@IdW zqyLd}G=33{G@p#tP%hDCJCBEl8SfDqg2xEBse&B`jgN)h>n^`R+~+=={vg%}$bv-* zY=DAl$O&y_v}T3wxpH)_0Pov|XOmTm=P|KOw>R(6d{R{bx?y5iA2s(!q)N0jLJE#X zjt(Y;i4RWAw;j6P_GQ`C+p?t1CCM{wy}#NvTMqpu-8We{TuX1yyjlx<6m2Rm{{3m` z_@R~8dwE_%ZzKOdZy&RTGNn8hMN@ zh(%3FsXfDE zwM52p-*m21yaplSaPlCR>3AcVP@79i90~x zaW`|Db1F61*P0??C(+EGu9G?UxpiGx4V8e4F=oQ2p0T=*1?SCDU70sWIpZdbOw2w; zLyTiRPM#XCM(|OfvnseytFloe7|=Rh5c70HwN~1t+M-r@kO{f&LgOcS44R69X4}0t z^^F(`2t#7uz>H9;n}VZ753mRDrS8|)-orw6JxKC@i(zv$(daHoEw(WFgFjZK6pqBE zLOSLr>##NiQ}{1$hSF^Z^FVrvX!jp_%Z-x6xx&XzRP5j<1S7TKpN^pF-QqUb4+ zCKD}c@voWt*oKpc<0!oUnH$d>-LEv@7)n@`zDwpryWijrk}L&uQo=VjjdXxq2g05i9y~e>cD<)?66s@!RnEsroino}7?Mb0ap_gLysnOZh zp_#_OYD5Y0Y9irQ3X)oo;591?B~oNqSr;J4nSOBiBC*&h%wkpid@qmGC6x+qcOguC z7Y)g97Si|{3y~E5+2%n2S<`TRFM%p^I?a75CU(44?Df1H2EVn|2yMLgRq{v;R$`Yx zTEPQI@NYwt z#9S=e)Akw9yGa zJdkK38?`a_dw&h>>d}-ye?Y}S@1+pTMsvk|Rkw%0Th~kwI!!-&vPv+25JjL$?OHn_ z;5$xycBh!pm0t8h(o+k~Tz0RzhJK=I@dxwNwa5KeuF03YP^t3xoJE&d1a4fI-w%je!Q$o_w-5GM=#kbS#TD{{_e}WJYuu-PEeMBOP)1mfWp{{Z{4&#YD>u$tI_s03 z9-(l={9CAXi=#|!?Zk$<+%dM^1Rls<*M8N|z-UU<6q_X*CEEX+@$&1yCoXPBNJ4@q z6r6fj1xZo~-5h7Tkh#?Oe!RJEvACEd@a zHf_!#*$>vTd`b7S*Y=jUo(?NIoE~acBK5Ed;h&zU6YBkY&MjpV}HxX zke2XI*4<2|(x6Z7jrQ*&uSaF0l}mS0MO2NiBloh0Q@AHa+Wp;@pvF;sqxJa94X0^@ zn?YeNE(ufI*>30!6@vVLAPg7W%|MYJJbure|jxO0(*oRy24XK0>nhy??rM4y(XaQIW93-IAaFjVhm3(6yGP zSSxV7Hoh(}&v02|70qsk-C;WCR)>n-)2tRZCS!4$n*NM!t%d&Q%mk#z_;0ux z`|3r{pABGy7A#%U!>uGMNXZ*#b>WG$78IY$|yG`RJ9frPY<`*8QS%e1GQIfPBPm0~qFNA=k9s(Kwck>0Mt)5o~JGGCOl@tN<{ z#|F{*>l}DYCEUEyQ&p5JfkVt-XsQ>LCS9>*)6fv+W@$uA)^>Hv)uvPWI__l`;ajm- z1k~KmH5#|7$d7(zjNLckqbjg5dHbd+tVMu6)k7_DnAMu|D${pu-aJ@uwoYJ8Kaudb zjr2&J=C_2P4vRVwxqYg3;hk!)qirr!{VZ`uZ!&-o^f&8^}ZIVLnqUKGqbblG)QwJBQnmr?$S)F(_FhElUoJ_v4J@O^6F zJWq6U!qg54pu-UqpdT+?$1_>J(>b3iYvC?Z@*)naK5PryUsU^p5kauD`5YFuNMJK( zs}Lp5mgr+5nK7yO4E#G~air#II^Kn!AI(&7WlZ}^6fmdxg>wX3-1p)W!|1i3Y~?wA zZAuw6**A4XKEHkLhd?fvmp5qek zH}}J>D=`P;rAc{tYQjv(5C0e=tA40CywERcUGFo^6tL)`mpJ~@>4c-#mQP%LqjQ#D zx*s>Y&h*>LbVXFm;O(nzrNf2b*y+n4=GJTYY<=B$v3tbCmu#PMwcYrKlW*Q{J?M_( zBVm%5jI}r}p%yt1o}u+WDs%mmuJc;SbBjjrqY*(9f$3$G)qTv3yKXpwXVS@(i?l^e zQ8stSl^W#;tOAq*Or@6brk|SVc$3=YsU!rHmPGBn8pqycBKz?25lO3*bQEJtls9hu zI_X_GZJh3xo&7mcgyv`wSG#PLA4SoDmW8CL&dW;UY1a~m*8|EwdoZive3rJM8TK|G z?f;t2>QADeXKYfE<7b!KbLe*OF@CGC`9Xu!#q^L$d6-0SV}lkmU?5r_KYom!?f8WC zMeNb1dY>yRW-_Yyn=+Lcb6fSIv>t!lE-gHL&#ZY#`KoZl*0jvZHHp`c!frgSyZ`4_ zvnXw;4D4+XNl9Y>{rAXS;i66qIKnC>pDJcTFDZM zRBFk`KMU1rA-)WTD%&q7Imwg+&Hg{8zA~z+s9Rh5Z~*D5^`wOQbuc zJ0%VcN00_-kd$r-xtsU9@Axib@Pi-hz4lsj&3xuF-4}+LQH~fC^6g{!^-PP2+1>n0 zB%PPZ(2bU8`68*tB4SBytesuu2YI_PMYEd4mAGm$5vJeLOGF@jb_`c`T&~otcUX&` zUYPr8OzLmpggj*~vLcz*sKqk9h_6(}8zEJw`0ky?Qm0}%DT!y@{4)sV-H;#J84mQl zrY2UCS~UhmS3!~|MpcD^ZoD^e^IG&{ym4sUA&*NLRJeRY#b^R`;WDbdPnRf^0~Fw+ zA$bhGl`s**V#kQ%h}yvQ7~05n@XA*BGPoX^YoDw&W`JgRFtaTeF(NB|g>eLF*xdZk zWY8Wdby!{_KU1ruqR9aKyk!|gAU8JOl_=LG)hAqP@Y7$(R|DCtiaTwzG! znk_|}IRI|10ZF9s8JTDe)c?k=Uk=A8Y)Po1I}DV)1RfI{uB<)W&Dq%=%26yf zI3!(!fv$0lZ9;&v#9@wKu9iP3Hca6c9k^u$%Z?^CT5mv!6V7M$9&Vk?zan!6vCSjU zHSha><-Aha;T~f^BzRL21}R2BF9aiobEWS&5}LGh7yLJ(?XCOUBFY2TKMrj>aA;c+ zQUqTa>&P4l7}pw*30vPHh1Nr0quxYw1LeTo*9HL7yXZrcA z+90BlVtssOFAN9Ne6l-%QaUP2oC*Y!1J)l84tAt*^W@gdtEcJ%1T3xr0ud(@mdu6A z{i$9X!2X~$Jz|>u5OqE<3gH+S-qMEnfyqfJ#ygjlIc@6g7^LTcE;3O=gnbMJF22P9 zYE*R2gssdd*dZoco&nd$b}LE|tXnWTFO5O+#G5m}YnVbJ6nr+zzxZ+U77EKuT>?ux z55Q>4&Ag3QbMkeJYS0ug-?JnJGU+FC=_A?}YBSZ-w#^joNo@o8f_(Hvn`T!)xu!`S z&;NNv^H?Hd`dEt!D->_mGx`dZYP^}oWLbSfG9!5!6QP8FhR{rcfSIiOyn1J=Cff}^ z53?KUeY!d}iV9+iMw1GV2(EW~gN|@6qkyKvTM(|<>0u|AbD@c3Hzbcqb`RRe=;j2C zO|yoU@k1icwLL{uw-FeGevM0R`s^Bl2&tWrjKA3dNpl~LBsx$*`ke66rA zpwRc**i$g7+0#8Si2E4t4)s*nl3NDKBNE*yduL(V zI|V{}^BMcp*x!~Y#tcuRtVIWaTpK^{=LbEkT*kF7@P z1}or5&*Pqwb3d&f7^r462@?V}5s^m#+Tq=3n5bQMcXt<*la=-G(&6F+4~tIUjY>^L zwWNf>&Sd!%8l|aF59^6;LH5olF$vEMj@rb9qToDoS*nMHL=F+N1{oQdksCP8alpzo zoj1OSt586FVA`A>H9O=LOwrDeAUqNkp2*yM(c1{YQJjh$Zo34PeX%5zgcQU+Ae4qb zkQwU9A!~^bF;M}))N%r|g`436^f!ZWTLeJH-fD{jf`UBeB+_@F4rVAxubu>0An4+3m(8MnX&nf)RqFj%_+!sA# z>CA<2j~^L-%2XPhqn4iGSbhABLgI*;(2^XNNEJvyNMNzFO*Cw=$T+*UfdN!m3JVia zsQ9Gh_oZ`#X4`3gU{;;T60V5Itz#5_t72!@wp2tS&y%T!5;Pem z1&oFtM*iF)S`u<3AbYnOncrnimcyTHEp5SNIt=C_#fJ%)DH0Dzq5lZt-gS^a{n zjEZT!wS&y;4T-{C=e{S+GlhnoUcJviT0`kXpjva~UW*zlcIj($f!X zdee(S?w8JFXtlp=mFUp$NgD_OmNGIjjBYEYRE{>omsPR*MWK9e?9|YkIAB7pd8FQM zvJPo-nf#=1C=*FJZ{3anC;kD(ivNHR!K;<`_V$YM#UAg+El8*|?z*M1@L4*KX?VZ_T;kT?j=8nP5VSj=xBX*;a4#&gn+d-5#`g zC4l<4|IJvRlpg9Olb>qM+}BL6FV_Tnf6ihyM%LiR5V}8a)GCEx)eVxYy^uv;7QvAA z83f>ojY=7w777SOBJarA3@tJwGcL?!ScGFX3HjtQB#k&sd@@uptueq!c}(xWVa#b^ z1jRXJNT&Q33$UvYZuvorzMFE-WT_0qot}(2fg}})#1ajkUU!j#A@&Boj%E*iU1F5L z>ufzY0wK7Ma0U^HFg7l(8A5bIK5%S(YdiB#*~$RUF7gQN6s@~J1H%EukHd=iUSj@5 z9UH_AK|jZ|R_r2W76l1&{GJ~GV{t_rNM~%VA!saq>5ys1>dqVnHF1*qR}Tl50C&Q_ z9$@;FN4!%$_(Zy8HmFEo{xqE)$hl~v1ZrC1s`>l)Xd@>t&m3Ws`JTUlRFzLAgLBGf z7;?&4r$;Msk02FzPMPQ%%DOGqKMqTR3}8Ji&ai0CxylqG64jsKEg?)qkwsmXW;Kmp zNC8S+aJVooiyp_-P%M5P9rh@4(+?f9dx&GQwfB5AZckQWRUEr{VQ>j$J$Xa5&!%Bk z&FH$N{#TA2?=9I1W(ON4LrN2uX!)=-3~;yMQcIkkY?6)eAcu76o0NCzV?JRFf3i|o znFuN9IysZ-saY-{*^znz#%kmu79fp~K<3df+Hn1^>YHKk*&Unu{f(AhAZ`<(bIqeq z;TJO@iz@eTBv@D=%=PXh==lWBYcCYDP^R(7u8ijk*beQ3wiCC`Kt&4|Yh@6Ze1 zwU0>VU2wvMBC!_)I(FAOZ5mmG6Hc^5f9EY-OGsn{bd=8%#|v=EzgHyW;58GjE`+1w z3h2V_wM5!bBP=s!`sGE$kJmHSwk-^Uj_Vzo69rt7*a!EDgY=O5-QRmlXMSh{)Y4BZ z_q%-HByibH{Jj7eSQ1e5w_JQ@c6jpg%l9Eh6C&x)vMp`Q;6p|!g8UYKka>VJSWvbf zOel81(OZU7`4f-t4P+rA{%DCm7~Zl1-%maK$5ktX;WB(MDH{>Db(|Ma32v%2`Tp>k z^qYG*6A&}^IW8$@t?4)Em{Px|DA41JV&evzfX_uTnGz@_9>Vp{w?P-N0v|uiV+|s% zQxB?bTO}Zor>DKnwLcc?Zcy@R!)>*0c-w?A5Z42-K=r>Ue1nTE;({p$a~X%k9nF^h zxJir2r<+LyJz!U0b&JY!`u5JbeMoJxArLPowNo<39 z^-suBh@t50T-(MGOCE94Il%Df<|7g#|L*)O8Yve}3MB?QhPk7*j1gz|*u5V7VblYB-g}CiP^Y6`u^4Uk z3P|KKxUNsb$9M^F%_!yxxLE#i)KBntIrWEX>SGuR!oNc3HQPYs2Ez|=oLSURs%f(Y z($+-HO65r?R6C<9C^vwFbGRL;9#U4P7tzyKcHSLo5c{tO-vPHWF8+tfjR2M~5Hif$ z@|eh!cv%XwRb&G>fRK;ac3w6=4A$^*OvIKBf=d~ERL+z7W*9|+BSmOohZCqKB0)`o zjv0c9G}=TQbdvPMCL9$~eX-hraX!Yf&W1u$+O4-r!9|}{q!@e(EsAq6(kc8`Hwi`= z(o-)s@kOA0;D&){UIf)XDZS1}7Si+tN$ZG9fWnBA%K}PVEUaft>V`oaA|hpuLPi|$ z{E@+0ViH;DK?)Yd>uGhETX6f;e1+J9-{3$!3YNt#}-oIf;XYX*_1BhboV<2gk8z`r-fN z8k@=hR6vDVz8HmZcQhyjpcN95=Z%P>vOFC<$`w-2}(>-ou zD9O8YR!ZoeNJ`S@P<+xnZQC~k1IR25;f6&r$X_w6<0i7&t$hSGJwr*7d!pdhF-Znc zR&wN8DGj5xK_Y|aTq|zhD<6?Koe#3r`McNk*4qY^994-|)w^yNj9Mc&UPDf3b0oT@ z&pV9*ziMa0`dSJZwZc0n_~D0RNXUdb#K zD+JiFAvt`Dtd>w!Y7wlZ2sC{vmcJo$fES*@=)oa{s#aKr?N z`@zh~$eqt=p_tuUrn`|ge0l9;+L)`xxA{WQ*M(}TIh|TSKtPC+GQ=-4*O9-K4<7>~ zDKjMnmQe6G^L!A3+=Ok-8#{sQ>KWMxYqo>zHJ3G0J#9AbTkfl3WSO#nD+oNYllM!H_}j#TIGiSrMMmkkgelT6 z+8Aygr4n?@!5zDkbuyh%KEE~UW91&cjFWz9o_LyK_ha^W^2-jUUs+2GF0&UH1;wz{ zc|(j`cYAb&FHzz(a|D#=a6Ews@MLca?}RXiL)h+!ARjw?49yAwf{5v!)pEc*hk=nm zsj#r=9Lp0_r=`9WXfR^+hetJx3)yS0w7=bPmBzR^hid#L^&iK4X2u>iK6co$kh#9hb!` z<1cn-ek}Eiko4oYg%6+iX-h6Y1hwKEuk?vw9V7LVVLp*B%M^OP{VenRBdN97tj9F^8{elK?_SswJJAAtJuzJ zki`Aeqc9h&Y_jwloNH69p256u5GX8cvTU?55^wm7OL)LkXyCz)Us#bAUJf zD6oslWSO_=G(*QY*$(aDM9(;EvbB(grjt5i=M6LN(ijS9Hx<)Kija; z&`+yfBM?^>GM=gqg>ly)`1$#boNn}PpnOyAZbu9O5=@exnMHvgdd%;0*=ce>_ABD5 zI->lCx(y2y6qHbGFXL$}w*Uyx=L0;n7N50bi`1fG|0yAL@ zkID|V@qtpBkn0KAj9+gd(vs4A{h|=Tc?00m$Xp~qxJU(|!8TNAmV%YkLo6AAYf}mo z@|vz1vKoCZ%)P`?%05&QXRi>UO2we2MEJ#1H0Cu|6gen(t!!VSx9aDL{_4;5`>I$dFV&*hD~Fa1n$YpZS_Kv3bU+$7C1>Qf37SZ9>;#hxDq zqR^EDJM6q|697?|3Z=gLCg^0F!y=4kpNe)k%$q&)DCYJXWVTZwDC%5tH00~kPwk3O zf==0j#1(LNY8?6tC4EHFJPGaXZ=w=BGwGkONDXh0VFdd&B>C#&&?3I>PJHE_3B|=b z_#9>w?2mHsC?$Z|_#{cbl5@(RsN0&4Cd4(Hm6VBPhBi8g)EHaOMfqM>p_y|fT;)Rq zniuTlXXWH`9-=1M0QN=<&5%EoKR&B00f5lV zfCld#5#I7qQQzDgOiOzLEg&0gCf!R5?^HSsxpO$W`s1Rrc-$M05%I1+;p-W3^L^d6 zVjt9@vPAmqAn{33gk}0FavwkY3O)rc(FJHF(0gzxnZh%7 zAs&5y_I7uN6yP61F(`c!BYtyHjDv(BWfG&R<>8R%GJw%%0s8Yq2Yw;>D(eF2&$Dy(!(&N-;gBgt95 zk5)%4(!~JU>eF-sVhqA9S)EY;<;|fP$AJl-$UxH;rW>H8Xh?L$f84V9X zwSqBzk5BoYN5ms~NWRk>gVBe=Gq(2h4xs1NdWG>u0r=)y@A;0CIiVU0uc>ClI_B#w}e=KGTS(0cq53ClCsMXxW}FV z(uQ3NuZL;xx_j%rbhT>)7`L$&xH>O|ktyOqfw$g+VWHQdn|;`EKfw$g0FFF08T4FK z^$3)cB*(WZr9;Z6l1w^)Jpg>%ojZWWh?$PAEY)x-JX+Z13=kGNX(BED4y?_PD0~w` z3~PT|fIJ}q(;=Y7*BBD#j@j!7#Y4EPK@lHi?q2B}*(8iqA*P1bJBOneI%$RLgYJS) ziq)cRhQTOYwV?_-7P#SrC!1%tY*=H>)~I~43JRk&oEF2sNx7|2r@*%KJ6Qs*=0`w^ zl6+)n2)9-1i+Nonwl#mYq(+jpYWf<~e*O4uIs;}N2Rjdsb{@c2bpbd7U>M{G`E~$r zY<<)1dDDZ9g|+0y#itN*K*fgbI@;m(J#Ax7r9BJ-MS=Rr{t zQw1S$J(MM&Ejb2GPEM9QLe43Qy96pI)_Ak^;;+L&s@mN7s)b*km#z}@+t17&M~8}ONEWSjyPFis6-?YOV96wf;3v53KeT<1t~ zd$v24YU5Na7oP>q=4YQv%HvN%0MahB&f#W#?sT?+fB3?FV}h#sIB^DTw_iYA5|V34 zp#yZ%2NA@~>OvCP5Kle4C|$LsxrjgQJ&lG-dB)r|{k8RkJbl4ekc?Uc-V>z|ccCD$$IP+bcWb?V?~~Hh7uzykrL8d`b_$Bk zfS{n4SN#j+1&R<7)r}XISTNO}Pzkwc4uRU^Yg_sQ9WCwmw4g0ni2_VaCz&})OxBUd ztNE^Gddz198PaImHu?eD)^@r!iXS;FehaLHRg2Z)Tr8A-6zv2Zen;cWmht=C6d1kv zf{IgHcU(4EdI!je@1=v{X+S z2vlLS;^N}^2?+@uN@F4rc2uRwSg90gsVw>8XsLXHyFZWa@Mpxtb6AKOD`pAmw{E6m z?K3LZ{RLFtFU2oRR%`eaP>QN((S`S8F4q_l_nL#;JNrP_(eA@|0^-{6gKrrxaY)mE z6&4WZI>FFy$(PojFn*2h6p?X7t1d2?=JrP4<9y5mzI`*6%qDY z6RS_veL%&G_mfSEn5LQ}4$A(0IDk)W=0*4_1hdXap@Z}BKr&F~bta*wAQ=BG>BYE_ z>2d`22gzBH*(?N#<-a(dcdE_V--I3<93a$nm*jCO?8m{bTdlQAVOYPTnZRT=^AdV}v7mR6a5L2)@1H-s zT}93!Cd;@LD+_c}&DO*pTaAW*T@TF~T1;cnC>pE)s=RWzOxmMBC+~WoLRW3J+1z%P zkw2CBN+It*px3W_y~~?j6c7$#z}w-9kFM=%cQF5GvN*v)c*aGi3=LnldsPjb8$dto znzm7s6R$xKF5pX&aqF7)bJqxZA^Mc@j%+&=+!A48Vuk@fRW$VUy*EIQ#u%XewxR}k zL2DRXr9$6z5s3!n9KsDY_W@QPJD?#;3keZXOj%i31ecUl^lP+g$1ze5d^-ggVSC{E6q~=?Xnj*NbN~p(Y;)c;EB@8w@AWQaipaqbf(%=wg93EQvxP0wj%?NvT$!#lY8*C(w3lvbtkIz7b$&ZP);jdfYrH zFi;GU*&j_V&NE`Gy8x0t=0F3%y1Te|?@vJCexf`P3{2vYzc+q)5x^rSS4-j>Ogue% zcnNyAwzIakw|@eXa03GnNUcN5uVO&aAemaZecvwNytLnt>5GlV!^V_ZKH>|}&}N=##qXAaRSB;gf*JxOL=nQ^OM95p(4Ow`~ne!lzfeHh*B-lfMz zLz)eoqIHUq7rxEIh*9+vmw-oX7#6iP!W_xb`d|&SCFjJRzIkyCT!ZpvfN(5V47d%Oq~v zubh+fSFKel*^NY*t&KB_T?t0D70H=-IxGTQ8x|c#r*+p?T0VatnZG|jKd1HdT>nZ; zWcNLMQu&fDcRx1RduUgp7;tk|)oW8yQes_JQlVX0X&*0_n{|I~M#gTs@rUTkLj@i6 z6^GqHKAmA}^xn?MEXcj}9XI_%ZO@mro`9}l-YXxgwtVLAaS8^i&`_%NBHh)I5K+n*d!B15 zp)3=TgtG9*yA*74QO#0%8hD0)=UNAzZ`9wK;|AO`^FKBowpG17iH`r`M`hNU3B*N? z#klXzx%>uS*MUqlSw(rJ?C>IfH`?3^v?AjJO`(xuY*Wp2-&aDht%axlPTtno`ys)< z)$n=ju9~m7g8zIy%G3l?s(_46cZ$uf}JLjBj6$DtE4^$G zqJEBJx){nC&Tb=b`0n*+a#$I@Z2#(B2cRi|Tde47Ge?+}`lXYZkRr#S{>PwoLFkCOvf@a#MWp znd1r{)j`c_z6}k%cZ#2Hlw2D78C=RSHi@G}{HeFTffVeJQKv%U)N^*BM&)+*)AM4< zT4LU+%i}G9&E3K*a+Ch+=QOEIQm-^IyuQZo_o}eDI&iJ1)ptERnE{h;D9GI<)coh< zWS?KNSOfouVNa-cUgPfX5Z_BZ{n&d|wHRO2sV2IIrw-)^ONs zBgy>wbusu?Y|!5|jet}vvq>tz%nOg__Ffz0YK1^o9)$>(b2yEbWK*T;>tM8Hb?*k2G|&Vg)6V)kT>+uPYh&0+gv%< zhM{2#(T$%hF|TGgKASmu4z0#WlOg}%UY@9ZsirT@>1o&k`M(P!F1cT&3JDz8n7x80 z=0e?AC>Wcq)-VcBIA5vzPUd#CC=TDcJkDnRgA?t9YF5s}#q!!8=eGGHyd#$LLL?73 zC0Sc-VwNt{o^+}=zo|H$qcxg9ZW2}h?dbxKO5JA)zsfZj{H9Hd6PmJro4i7fX&e)C z8Th-nU8`R1N7B>T`@6f|L<3pn`*>iV zWcyfUP(d!jxC1P{AIWK*gZ%tFt7;VA397d^2So%;=Hv;_vJvTD&eD6>kD15oevml1 z3)1(TTlnY#ab$LT5=sd6g{OI;;y}j~A3(#{^g2RJrPyLq`1LOV_3u4#jht|&+eYAO z)$wNb=G?YyC07K{J>P*cyfTAB4{a?NuG<3XMFy9c!lz3PPPxIsx94{D(#YP7!&B4k6D?~EDJ(s7fogbaAEcKMYzaI?{fzRO7ev(NDo(Cyh z%6{Nge1g?|J)wDSij!Mft02FkA}OAdn3|>HuUrCLn!mWuAk*}#y=pK2I=Q$hRK651 z+@7$r9D?bg=>>yRTc9Q2YfRuJP%ODZZFXZ!vpk{?Gmn|GV`%#L%_s z8z~f%?8nOcct`ZfKuRRnr13z1z;rSD3;kC3^7p-jyWu8`Bxr13Q<;ie@s<};2cvft zj1uJE+nW>LPTPTDTz4Gyev9I+N*Dq57Bn&e)(z`)>|Fc(FWqaq5Euo8p7% zGZ**HonuwCRPgMS%RaHKcZWKv7Bu$awJ3FS1Z2n9bjlYKS03-|uueB#Ivx_;ABq1^ z(L+B9`F0zUg#?fEf<(;I?RnoaN_`I=TZw&lVqk zbCWZwRAh`iEM8X<9wLwVia+fn3Ou{IKv?!NcLTh=;YDs`H&6QjikArB300fh^ecog zi7mZr#zG=>3okny`5$`5rY8Az>W^l&cMj*KV)J;p>XDPMt)u!UnfE|RBR!z|)lWFE ztE!^*LbCh>iOs!m-#aJS_1XgKqQPhPF(2h<*0uo?xE!gj2{3TkBI@}1S-MJ9nmJmaQUtJDkwFVx&+F%7TD zCmu*bu~QB?>sGh6v}u9@Qx3`N>Bq_RzABa$S?o_~Ef>i$LMP4GUr3kgKKFRI%7v9O z%eh02Lxu;UetUs@R%824F!o3rk%0G{gizHD=)At?)+1)TLd-w>>&*Vm_do})r+4H5 zH!YGm_T5?`fsb}<2={+8hg|=*I^htLtL0xGP}--EKJ%oDWpjoNqmd1Y5md8|RaR<$ zlV$$J;g0y8+4Qt!Lb9wkM>Oh4w?fgyS*@G~6ToSl`smzB?_(OfCO8ZK{ja zJ#J8wu5Hiz?0U~}bHA-*AZSM=%WLQ86^W#lp^#`{VYh?MEu)gL<;s3xrX(?*q>e$a)54 z2uJ=(_eBc^Y`mQGypVV4DXVyinQ>R+A1{)Bv2rT;stv9eJoOexM88^4cEKpcR)%fU zBOCEH`~JKI;sJAIya8fYNDm*+KQQHplRrLg&A$BH3i?;r@cb}S{HoZFc;^ZX-V=%W zf@|`)A|_3HWxGT7V0pyjDvE6_x7&A?v)Wx=`N%Sw=i6ZHOA%gZ>n zU-}RdPkg+(7K=ZVYBO~>R~n0x%#Hu-e5&SQzeWT`19KVlazf(%z0DShNdLGFd1(Il zz+*pOAT~rxHxp+yVfWCui-Ut9v4Q%O@gUzq@665~wxjBM8#h_2NA~z{e7-}#>^2AK zE%|3Zs!a`c*t_tIyYaVx!D@kv?gnRLo)bK-%-4GUsV|ZZV%=y+?WN>_hxohxu?D|I zg$g0Sx4FK+G}|pG+nbcXrTlH)*M1Rqp3Y(vEtbm{2K)4UF!zi1dQse_;>S5=+_}oj zHTE~1T9NOwVlfHT2Fbft#%=EStn2m3UaSTZ-ejUhTs6PyTeIeX_FpCtbu@qU(S2bU zVgJ9IL{bm86MH7p?t5Zma$yI)A4UzK$@UROs?Q?x^1+HK$ItUTO=ofBVjgmFyie~;|RxS=NA`No)9VZYp{-6hW+18m@4pkVqo z7B<1~cZ0SfU(*jz{M|-1U#P}?}+*4Wk*$2Ye!-eNZ z?N-l~MbK55sGr?dBwDk3zklcRCX8^8?*jT@X3HD^*wgP|`0KSpW6<_*1h++V$*FQ3QTobqL_U6lob{tlj1Y94A za&Ii3pk7rta5#Qjqs_3Pr1sRaLtpW?px?u5=M&TW_WFK7CI9PaHmCLWwtq9Y@ggxC z4qH2I(#aP!-@GJ}btMNqGdggiVPO&}S&AgGy*0^U5*vDrHJr3thpN<7IC0{)97lRGY&tear7sD#4H(`Cm zv1SY8*wGJAT;{{MU-Ko}J=%|50WUCLKF&Y1B%(cko~_Wn2F&d@xXMlOUX($14G7c$RPI25Jp+3 zn|hV~Kf@|0;lzvC)KK9UF{$TX(z{WbXC_OdefRqcK|MB^4*(odmXq=} zQu#AX;AFS^tK9zo$(lEaD<~Xu@-fOszI_s7*sf_9$`1VCu^v}x2{Ktm9*F#j)MBqO zn~2ot5>nF?RC4|kqwM&6Hmk+srp+miLrt$e`j$jgE_Nh9I@$M99r((Mg|0I$(Vc*i z1g&1@5l-G1TWb8-DxeR@>p$q&wKGvNS{5d9pJp<^n@k;K2 z^qO|Niy4qXCPC==_4(7zFd8sW9MAasr!TJp5@{^1+RzO$O2jz}VSW`5#cSUcKvTg} zNwp8P$?)h0{YpS1zLu|@vC7)5GWbDb7WoyMOr^pIBui0R3dS$8+E6RVu$f{!O4SHg zM=Lyj^hKVpzJL43PP0RFN8HEvE!U(ivh`G4!dRSqAVCrr7R!x2MJw)824-HcN|>$E z*9-{I@@y5$|Bx8(UlSEyz})8CWTC#IcYVU2Mlb(%Z|k4SImW|T|3LL^+mB2YT%|A2 zUvDai2T;$hqsO!e5d+1<<{W=2eR=D6Elrhb8uYRgUyp2YgF;R;L9fbDamsn})ymO* zyDbc91|aCNJ!{yhu9K?NsU>l`H$xnNYWuoe!VejBHxF1Kb0dn|&Cn2W@0DU6V(+!L z3uJLwfgh2<;Gzp4nbj8pr{9iv$8km9o{NDo5!3%L!%&D@&{v|@Uhb~P`hDKUhw&c53{H%tZ0C801w_yl zFsro^K%!^^*Z`pLSO`q9m<8JEDgYuh7MdNPd@@(9&)wFmDtD=s--4XMex>aD96eF6k7oE9^f32rmn#y%1fX;GN!Sj@eP0t=3u@CnTU{ME z_PvEczY_0d3E0*cRL|$I_!WspEN9s2XjX^^pjger30;BdCZ=UIhHciR(m=U{4wqRc z4%>RF$x}BW8XHr$XlkYPRbtYqhNe=vU^_XXuo`(ZZ`|tDrnrZ){MD!PIH8(|E4jl7 z1bf6?UU@e@5E5eISPYa;m%ot#h)N=(?eR)mVuX`XmskALvuT;XxbG!Rtiv89;i+X= z(HROoRZ+gwJ5<0gqAANn=gJ02(84Xt6{IA~rbP(7knwzQU%%XaZR#Dhek^J?D%G40 zNiDB%Tjp|he2X*m2Gs+Li4aJ@IA8cc;u;20SIbn0$8&}GuQz6BdZ0}f37piT%**;-2L|`{pWdvZCWj@$M>7Yqp;Xdu; zeq2*2$C|VjJ`MD>E{pl*btI9Ff@kYF%r&<;J-^>-(S+OB?J^i<#6>#rn6|Y>}wR>AakAiBM zVAbMJ?<@TJS``Amv`DYRoYDY)z%L=<4-$L!I>)%NUyC7SzEV{C1caQw_x~uFDAJ2TFs$P)>AGSy^>d8xWb z6R}BH{y@;gG7WA-uLUBe`K^z;qv+hTGpVyN@3J_EvvE{2)_xetu8^zB$OwC;90{~( zy;I>QuP(+|BAl0dZ*xr$u_}{gCEw!o&BU_Hu*XUG`TVb~6jkE7=}Y2k1jQ{MB==(S zUGdEGP|j)HS>iQ{x>~3$4j56ZSN0#JxxetvaBXdcdRIldgnivr4?8Y>-O+H~YrGCVYA*;)m_<<(lL7(A+H<8E6xIZjZb0mr0|m zWFd?YxbJ?)jI(Ut%v@x`;S3vU?I};ZCLV4M{v;yR$0i#*o^jjQUCV#hvp<6Ty_|ma zz7s;`_>NZ+k92C4H&BTb@|APt{^PC-I#1@X6n|)8Gq&44H`6Y4)PMD_4TSy)AI)?V zb_HIo2gK2YaZnSRJtsa`COeaC1Z~6f5({K}JigCOzH6*mnc349S8oz(sUgltomBO| zP6FKQ=Ks3chSB6ff_fkvan==(i{S)ccsDFP1)$E5DX!>5$V0?e0B*8k+v)jTVNYb2 z@@ZAT#W;s!zX*P-F;`?_3`A%GW??2Zr=o}Y{vrYE<^0s`45rYBWc&Bk3SD@t1+Na* zBak=lJeE~MtDz>8^YX`ZcZgkxNssGpNV6AkNqv`qldo)MAP4i zWq*$kL4*==M3u5{k4vQ|0t-nBDd!5jO*`8?xe<^MJFzYG0|eVXy&u1N`4{`}VYZbN zNU-hVT&VhjU~{5Z0Wkbh|ZuIN6lUYH`Db z+X?&CV!I23IDuz9I9JDn)HCPz;|g*e?%`Us=NvKis~K`C*{a^;va^wyvl|r94)@;$ zKxU*{E+c<Qf-M6NLv)%VLW{#(wC3y#(9%b{I+@LQ z#Y#IeJkw#SE8Dw%z2~G1pb@#J_ndrrV57uNh3R-q z|9z&-;`K%eM8JA_E*hE>|0YUZ!?1!*QbM_*R)X!ItJRbUfZi`gNQEtbOdN@Osjue< zR!Fwkv(O9>nAMuLFuk{36l2aK#5L;ij9rgE%8<>FtYCLsR;H9>7agV8kLCYZlq}k* zbv;}DQ6;19UypJ2ImaiDwTM_CU3fg*#!8+;OaYi3VrAH~P<_DAW<^Bj_hHtPgox%A zbA3Z6-YUJmXn3MR%0

BUPs{neiA1S`@MNXok2{8?`5KBV$0SDOYK-*1Qt(T%!AqlxSi59A~Gjq0CQH>^Fg@U8n0@)(S%?==ku5 z>l3PRM?f~_UK=%4Mq7t*aU{>ltVUW|j7KLV3R(d>@3lhfb z)JJyBM%vX`!qHAzG!X}$hl6e z6}G9j+OEFrMv50Y<~~#N*1S(0>xnn5k1k^|&hk7S;mmKsKufE$o+|X4o^C#VMAoMy zZ1KA;0`g55psOE#cLaw4mrsJip7AX_nO)@uiJQPm@FE~2lWAPzqbp}$IVI*U^6PY|LE^*dmZUtjmhtr!?VfC`idu2 zY8@pIR=K$9HfE{4Qz>r8|6HP3#i-^Z@b}2K7t=3JB`BKn?Rk@`BGKa3CvkFeFgmiY z#VEwp)Y*9)ljw*oDdTvg|J{Z|mg5pEqaYeRv?zmX{arFOaJr-P>Qrm{UQEfK_Xa%3 zU>`fZ{}sA~-66?p`^e8^zT`x;KO21V*vn5_LAqnNZMR=zSTg_;ih%@ISpu*Rz=|PG zQvX}?xDbnZ8zO2fJg9THA|B-&Eztd~2yO|6R5d=fc(#|t_m}OwP+1(1$mN-rB>7;_ zt;qfQ?<)T-nf&M9b2U^TxF&_uE#l8{ewngdenBePDVc=%X-&)dL3C~qFXfhlczD8yJ#}3*+&@};?JOux4YLboKVrv z2l7->Vxwz&!P_H(uzaug6|LYxDJKDN6U(J1iPNZHgW4PkOD3@cYuxhxVS=4|(0|VJ z6)38eYsKP=DSRT*EgEv}EzS6ohy&v0P@ty5#Vp0rP^F5Yr81=o9lDh8ch`X=uEYSo zMl01wojVe@@b4*Q81f}7vDfeGWQ}p|X-)Ee^=s*c zHXs>nMh1``m|$AMe7)8U%C>5~pOr>Yxzu|n9@(**JQaSqh9Y4tBX+VKHWk82y4OY9-GZA$TA>!>I2+5_ppPcU10!ov3v$pe{kmVXp`*1>iFFb6G zAgb|EHlL?Y3BiQG4sFp|=#wWQ^?^I2Q6**q5s73T?a0$FQ&0uLKa zzlgiMt(dyIt$TXie&F9$=v}9}y4qxe-D-48bgGb3?RE=kW+TVbOni5_tk`BW*;2+J z3RTF{_B{OtHFOe(*d#TIYH>+pzsI;bM3OBp;b&&tQ|v5w4gRO9YYdG0%i2krG-{kQ z4I10F+Ss;j+i27@w%Ihc8r!ywiEX~W{&(Nq?tGt_d(S<1j%O9isIC~zjW3v{`pkE$ z-Q0s8zYMCRjV-^k(h1o?1}^KMNH+uyTz0qiLdKH{0$QU1F<}=*{R@euQmHuFILtJAuATVMSO5h3AZ`nR>JQmJ1>G>Z2#F_b&3EGY-Q z&jG~vXoJrQIzDhgN$m=!yPyKO$Jnn2g7>QBZV$~`AX^J%>>}lZpYAvGuNCs`&p1!K z%&f{8DWwX=zl!R90-GGe$MWPJwdO7AMC($oyEW>Lv!?LNo6Rc3H*VZ${s4~S8IPF` z0C3DWU3R*`L3nbx(ClLDG%RrwWD8aGNxuXT5%CzCx!{X#<%tpu6#+?f)md}#44l#> zyV13X-?aK|ub%x1R4KozyM`O+28~!}imwfH);cMDI#J->HlD%T{M}*snNqexvS%$Q zXb%|k+d9U-?bKpU7KdN=faOo!HL2ur=1%>@R~ecukYrhOnq5QdIYMIv4j1b2NHXni zQxKbC!6(N^R9WGv^e$z@Qe4icExi|5RCWmq_i>v6cM4S71KCt%3Hs3^ij-wBgXN7~ zw&XTJh=3Baz4!}*x4pK;+d7}fEcEN{hP zxVh_2h>|EldvuB-G0jRv8|Pma*ZRo< zpNbF#n0Yspmrw&btqGwl{-mPoi^lYrSeCrOCffETOzrKu#y|$$wMDirdV+OOarll5 zQ=_Vx*l9H%VdHRgl5BKM-^sZP(=;mO=7Dio6lc~KZ0GcqW#0%IVi$+aU`qqsH4fVj zZ9oWmv`qr;y8k3c8@=ryg+e2PLQ@_=8VLa_t0rV5W>=*(g+-Z?Ib(M z6CQDj0 zCe5F3ozZLVO(9mZ4CARyl}7fU;5`E?LV*j5Z3j*K(} ztc58B?u?&d#Heu_zn5n^n(`&B^3 z=K&0(z?1uV8?dguM!5#=`G;7oD7!bD5P-WMDUdKfh+_8+(xu_4GC4Iqq4nA8XmIy3 zHO1V#xAon->2JhWl!BMLS5W?lVQxQm;QD`1nJH0;gzNH{>~PRtY`#+#QHYJ2!VEJ! zY-gTUKpHY~YB~r$VP6GnVsF|&?(zcdgIH&QVvemouI+NzLvDAL%Y7hh@t|1L|jqn|i?AtZzM2SX*XkMzLC?XU#ruS`s1WL(8 zfXcC8s&vei0AurnZX;ickOw&_h35gQLwb!a| zaK4^5cQY(Nx_NNI$fS>)&jY>u`l>x=)*`YGU(`bo4v**h zD_mg#EASiAWvwvbPp%i0!G}v!U<=0Ui_kjiBnMF{1qMyEaB{`TiLz!~j7RoFb1Mel zD`$GJ`a0oukY7MADzC@Hft)QW|(CMO{;n4=|@ykEpv;#=AY|-oVjNm`;!u_K>?lEL-s@nV55Ag zpxd5p-@=OsYHdhic)OdAwyo7T+5dXxwS!(!ed;EVOr1#o<=d01n{{%xsH|zv1F&e- z?BQ^o@MS26TP>0h`T(i+C|#0zLQbl;@{?9A?c`Kmf!(s8Lr@DJ_@;dC#Mc zTHZ*A?(wqO9di6;Y_z7xASm`@m&N+DO_q8pRU9=rMme)*i?`)o9p@)-86n&c3>9m1 zf5-(N-hvwGM}q9<`jDn7#F-J=78jPZNY>?bG|{pC^6ip!Otbd8WY4oea)~mXYrDvwH7_k}_9YuRkHhl(L=2U7@4C4$6;H4+%oj|j_s3ezmYPhs26r9yIO!D2 z-uW|tMo=VzhU$LuvycAz6=6|%v`4=v=6AFcBlGaf5dg z<~K7$)0%NkR_q0n=JQrt9XS<=Hfa>CT3g8!{XT7y3;g>wi+=G})%l^vRZw+@7|%}U&Q>zxia z;EJDZ%#$K{NiV=xTPxWVGh<@&j7m=EwU)5HvyK6UMr2xC-MfM6T#lonLns4?5Cr`M z+Ji#zX`z|OD2b~m)&y<1D=I|#DEjZ!=P16?WB4@4{?sb3t%iNdvY{XmY{vlR=wxp1 z!nYa!R0Sf~x1U4Ihi#$FXPZ7BUt*z&PGz^9q=aC-!7)yq3MEgPs}Wa$KJRJsuzz zmm;+*nClN@8&KSG&iF%ftSwC$42T1jJ0!y=ek{VzP4?KnN4ZY>lGz`bCfYBnoz2hw zBBp$F=U~|tZg297bmGL<*IlLhqBz|JDoND@-T&uL5^JDu?*Q0*=d?dTSgcqeGuNnE zu9X9%fG2@aIk0mDNH*Ar8hiy@B*xK?XVjt{yvWfjj@>l+ZB^~-2Q5K3N*_MZ?^P!o zdoNc8GSENWus@n@a4gjs(_zqk5-;P5${kj7;L9g}k25(90hGpwsWW zCzDPU0+xR;iY0GvZJ7T&V^@C2&eZgBkE7Tj zL6Y39FaRj#Gxk*PFt?ujE0cUVl1BX`2avqu^~@CDTU$d%y_`ZT)I>#%rJ&>tdO0Wy z)nX&(uB!5MY5tG~R<|^A8f5YEd(0-`GHgu%EU-}_UxLNs+H7cGAp7;@X*yprsSs#c zt#D6%d45)N>jT<|&4HCC*Ut}*htmZ}PwKU1DDkQz<;QmSoDecX0uJD1NZzmJyz3z&ti;K6;Pla^`*To&vq@tU138pF3{q~x69_qzF zTL`BaCHO#hV9ySV+qoW>$2A4cisE->wb0Ro}x&#LQGHf!u3_I56m;^eQM3PY&fSLm1r6y-;$<%KSS9{V_Dy5&u zN4AnJif$ADsuoEz;M0qCfCK;9c!90|dL}@>K2uwZnQ!4kwe;H<`cqJ0af^phYST~K z@QTvcFA0TQbOB@7L?b{~3QSctxooB!Fe*1Wpbr=v2cwSzOqU||_4Q?|21XNC1AQ^E zgaO|ZbudW*BCerdp#VgT{6oYFl)W@)zD%_AlAi1hp4pxDmj}!YPS*rNTx?V^e6}*_ ze95-(ekW&fDXCm$P;V0|pK zzU4}E@WIx58mQo`ev}QEPr?EY;}P>ZE2^%}gD?Z@gC*k2C{i^X6cm0It*ZX7D~|Dm zYzNDPEFMBprN$ zW|Fd53;_@j&*6w-WfFNfO^>bp-5WXhvk=9 zhWi^mi*-e$*l#@Adsf%)(r?&J=0sV3^3tVD=p(OJz7Qb$M>4hn^5!)rf6qz$Dzs1x)!&i#V->SBw<&t6GN`?9f!g za^d*K#qnL^wCME4tC+1#3q8l}dPWF~&82}x+oS8A@|*kVJo@Z=^J!^(NpbPKtdA81 z0P>oQD;O_&6SX$KKF3q~X~*Yct))dIol=+n>$x&L7;%5#@cC(>oyEZp^YZ2bsoqT+Xcv=<%bpdYN5y0i0|HNh7!c zhQeBTtTfK?17&ZRo2wS_1NHM!dR^4>7aMw9-Qmz8pT$aJ`r$~|n`@CJ&vT=>GkyKe zhuVde23Q4`1tst zP~On%V<^k@^W^Oo>iO5a+f{y3eWt9aV{zT6Q%O(6^lUb5K4U_rP&^UvqDV%d6!K3D))dX4B;2BT*ppa0K$Z^CRx~hrS#Tr+U*hvuL?M;cOVAdY_D; z>)nFZB87}o5!kDQqc{0c5pSSOkPZ8k*De6N4tvBPae9h5ImCV4_6n=UU%|vlpgjG!!^lxk(wk!FG_dWe%vh7}_BUioU zNVy<>^X*HjzQgehl*O8l+eiOii`N3(v0UY&SEK%w6XmKBd*Na?xze^3MXEH;5!V9(kCa9II$1t!`t3Da`P;Hs2V%qg+v3jE@ZdAfX8 zi~Ug>O>E2A1Z7?z)(!^V&VuR1^{#0vB{6Ya#CA=Ek5LYuugzmj@!XklJVUqVY7XdA zy*?5l9#C=c;(Scg+3jt$P9IOq6Ns|+gM5xrmYru3X87K{m+jI1RfDU^loG&C_*V>R zN5h%cjt4Ivzb%k;zIQH>Q#uNssnvJ>RqkII?J@fP6HoBP#(ih|YfjvoR<0y~76G56 zhG&4=1wV(Bx}Hga-t6KK^{|3$q`>_LI|DsuFkQ*6U;rKUg{D?Qe zVFu{lChPsCxOX(FEWUaB+#jf>@;Gu5=QcV78zUfHG{TVjiQdfLa35R_-qc#IU$Dbm z`E8;Vd8Opi=rn44)1krDw`-@^LrEMoiYIsF*d4*O{_>sE`%+Df|NA^WzRRNluTFGmP);d_sS#h<+g0CAnK_$?{b#A z=5h1}YE8dRLwea2_ZMyZ=by?)E|vD<$sDnKSW^ix27>*_wp#5El1!`3o|5%nQ0-?7 z`(NUBzwp`JXU+5RIl*4BqhVGvU<9hc1ac9{%ES&EmyUcXOHh#53;d&VLsr+Yd^Q^WDK5p)5L}xr_kW&i&NS3BbBTbvD#wgJJ&g0UFmw z8L_Rz_yUF`d3Z}Rj~5M`NB}Yv4COu)3}inc#_W8u^rf8*tJf19@168gL;&d(t^Pw7 zrPR(E9D?B_FjaAGGzs}8avM>S zU2x%%@PuogJK_4mMYEJ=r+5t>W@Hnb3?1Ow1yCs)q${VRxZ{0OjZ<; z`Wv}?%?->n%JyA&a{U>jyp1XH_g*>ecFg7OU#yMT;uw657(^dxPMm%A9Ao%)s zfX<)ug3~(7ChYdPMvficN9lv zLoasvq^~5Uzgg$`dhPTC*7MpQdn-Lr?d;?qAIj=?aIA(DdS*D7r4{ph;waPs(FS3Rxj_0EgGIxFy*=0t-D5tI zFZ03fWgQ1ixBwzb03qZDmTqVOGBEdQ{t5(Er1?NC> zjtYn;Z3D^%FAp82lT$6cHhzWS6h2&U6!**~Hc-`{onIthDAofycfY1ooh-WM1yP@m zB;Cfw#*WwJEA&HD491PavQRhE=01e3zW7^+Zw!D&CTulXqC3)#RwB$;! ztp}C3{l2q|dz0Zf%Kg%)D^}~s(Q(&^w0rvLX&pQDK`*Vb@gcC9CXWFH%n73^39jiq z>>k|X*1-ZmH{jRh+F+TZoTo-nU zdHh}n6&B_9s7NfQMZVf-XQQr^Vx#yy$?V4O|7I~{e2j?%N`D!7Jz&g+5 z+;u19bdKL&)&zDu$NiOQwAhY~NjEDM*ZQ|D=i7}}V^_}R=brkwMq0r(R_6C|kN!pT ze=b{C2+6V8ndeyqZvi=H&eSAD7Pz|?mi=IBsFwv|>L4TQ#Zog}T0LUW`fyFd!YJ$V z(nX2nuHKqk9>SViC8;ZZd}nFN@99QBPQ2@`dw6}vs^FCIA5s!PWRZ8k^7Gar>Z%{v zf2*IKm4HKcf$j;$X2!JAix4GK{{V?et~pVr6GZEO6P+>d(K+`Se4;@WdQR_W#iH%; z65cvmcKNR%3(VyaondTeH*4(=#K8sx=xS)FNEO7DQ_k$QRyL5+1>u2Mh(`o^XBK41 z>+Q0}qk?ueTJv6ZvK-EP_-YMZr*T{p_*t0C-s7%I5Hj$ko%$Ch_4Oq+pVg#4bwOVC z2Ry9w;xITT!dprOfYd-%&}4l#@W<0Onbc2<_G3cH%!V{2=BvL?U>S~Pi(y8L-d^k&Z2 z^f*_sT%OYPGOkqUYl5PEzjof~A+J1T@v~S?A@9*HLX@X@CxeOVx0wZTFK?A0<1;X_ z`Hgv(fm;%I{-*UiYQXQR{~WS&Qb?^j_qb&*$B-74hLoC$UFm*;>vnm}igvru=5c#n z1qFFEnBA^Fkle^%yT>tixa^*RneCV8j30l3rWthQrj=||6Hqtk;#_YU@DO^&Wo9u2 z)+ejp$MqWY+M4D3j;4qZ1@v_$Vze>h5tk3U^SoFzjjPImU`D_+9-F(xc$oLtA?>(Rw0vXo;=%VwuE|Stj70LA#;#t@D&sP zrt{5;@OY1qfo;k(0K1oKPB8NK3rIB%!oFO*o|@p=R(I|rPCqug?ZWR>{M<9Vpbhx> z6yxoY$+C%}C=92Qg;?PPd%c6-c<+8y8MO&Jk~8Yol`6CAh`wC~7%y*}@;~KBwo`7L zvSrml+;Ka-FNgGGywHjzX1-xXhN#P3>Kq1x7Glp=uK{VY#Hi7ua_ZFJ_D-P+ayk&Ws?l1j|v?T0+Sk??ray*_UwsD%3+7Q(-7 zNrenfmTWJSiSt%&GhL+|KkFp1bI({szcEqeSh=`OaK)I{3t^4%Nm|7B;#qxr_q${J zz6umw(!02R1cO9)^o)L_(l;T7JZNkv*H<&f$DER*JA=gVjty!%kWS#UI?upG2XeB( z+2v>e)(L)f-Ni!*`{}T7lN7_*S{vktRK&z(A)^kXfJKiky<{G5;~dLeGm%c*tZ2w@ zK4%ckEm55}Kh|hGbQFlWnPD^edCu$g>CX;HS;2&AvyNWhO96iXo1)!q90-_nSWRq6ZLQ~(X7`B z1WZ(@;g*u1U<9lq4z;754hx3Oy7!0H5UC{c7h@xnFG)Ugl}qMLEr%knXhY8vYBBHP zuI$#@x{o`Y^La<=YAB+w*&ScspWh+{#zC)UyT=7rF5f|pXxDxh{*ji#q9<=udb@dY z&~r?d49RI=ChI7{RN;gws=%OGg$GvjE<7Ae zzv5yz$xmLlTL|~=W$~I!a&lwUpWnIZIvqQb{hr+Of7i?Ck!w6lDFdzR|GLKvZ`H0M zt^JWzMqE0?L;8J}iYl~9^OEo(#QY@hw_@8r?l9qG(bS`b8Q`zu*YXk&PE!AvR}mzTwF_8JZUMEMlt4EXx8Xx z1++<;OqEhv)YBrDP2*XbpN=>|VL~)qFZmZ_ub(%3;JAsI#Kgr-rRCaGrpm>@+Uy*A zFjT+GuZu|FjxmXgkBVRY1FZgKOtXp5?t3b5oo$}`ES#H`l%^~d=nn4;l++R_lH=1M zT0>+)mI>S7&?>{89a^K>(;jXQ_%nk|_#Dw2>B@xvlIBaP>eBZM0j(nsmBdMN8RPx)v#XFxGzy6+f2$cw+RAhh-P^tdc$Lf&+@; z`t!Kf+Ng%BV}DwUN_7p?Rcp6JVe3KI51G0AKZ8HChh<-5f6#f-# zl;0bT-0M76ZLC>KK7A~i(Xw}G?Z1O;k$mj1U^RTQw9*NXmXl+73q?V>L0@vhK^n)) zc+}7RJf87L_ciwX`f@gS)z$eC1=mdaFWasJ389NI0z*i2A7$h{EuN5|RObVOW@R0* zyDWpd0>)@hpPh3sA-kkN?O>>8h-FC$vYjj&xUnz8-Gp{?7=Ox$S$wq!(@C2n{^Lyj z>(m0mMCaB44?P0XZNGGkc&B08?cBpg->sZDLeLa&*SJ=XOk#^5u)oVo>{d5z3DIZt zD=VpDUcW`SQrA38sz6mki_X+u_B&arvvXV5Fq>QnU2QhXhfe44#v+Tt=}9e2{xolT zl{;$qR4!QQGwd^m-0QI=mu$+9_~F{J>d36q&S4Q;(-pEKoRT;PaIYAw=Th;>l6!Nl zLYtfK@(GwbEBpKuQ!ozTB~3~Hdu+d4Sg$xe6Zt)~cdVpT*y7GoB2-0|yRhQIAJjp7 z($WYEwI*y%R_nMQ0A($iC!DAMqT6ZFCqee}#kC?};djzRhZ9V59ULCQq# zG(Llsoa-n4U;JooYBqHL!ZRVpC9YjS6s*$}D5){%>GN-#1OEIR*AZ5kzYEM`@an~lfUi95hk%oOtSvN z87oEA0ujj|_tX}e>)UmByvWb;b}jbtvtU93pVIR z1i##eEtF<_qvr9B3|Pe2%4~b==}vD?TW{V$nwT6x1t1+1EiJf_kx>;SuN1b{4#W4l zDwD~B-n2ge>$lSq+D`?_z-Kf#cEN4}8Y0AJq>LERXm7;KU}AE*>L()%##7$Vz%@Y= z3MGd2NFG{L6S8!s{VhHvlo4};zt|typ9f-{H2F%X=Eb0h#Wlz0hqy&Pf+7H}@YRs2 zWObN2y{LPjHR>2lBr(a7ja^`K7}C_Kw$Nfb=7w+s7db~vax}F8I9T@KPc-WErGTlV zm*j{4M12A*S|5Cpt!aSUg#?n@1rxZPH61&B;cNTRo!}Gy{n6<{R7q54VR6E z_EM?2&;J}N0J2S!VVh+VL|%uvYot|V$>m>CEbq~aZ3QYd&SmvY2+xJtOU$6{=yTv$ z(1?o8^6?cw-dC=GsIxfqQTTOgG+|tsS`{W-^ln_sUp><1GL%)u4{s0fTN0b}KK>%_ zi1U2%E681(tL#5byWagv48Tz2c>E1uO;*a4n!<71>DbC8M%g@+EgR?)%Keh#F}+@~ z-8{LfC~W14Mt=84EWha0Z;Qb2w7@RavH9nb)BD%0QgL8yyr)gY`a+xMQ)&IelgQyl z&4T-gSYwzN9-aNU19Y|E0eF94{flVemJE61O|@~aYrO55Dc6T00rIxG-o3uGX8(x^ z{-am0{0-nu7KKnuY3BljtST%g6dPl0rcu){LH%Ps+E3E@7_>6)t^6KGgV77Sy?7WP zO*npky&n}s&Pwj#8F`>q3l$;))WSbut#zKnoSG^o(^unXuc{j8=HRbxKt#4Q95=DZ z)9wTeQujAvJ5oau{}1DXb7n2164&^hdt_hTRAg7er0&f|tlL{>n!!eMy8n5wTr>)K z{imI&4GAJ~WqdFhdhKtS#i?BHh1VxdGdqlU(Hp3!)^(vDPnz#&4k}{n$;`~qYjw!U z3k1r=AeEhJ{l`HyPnD^`#TGgfJglsrr>2;{Xi9=X_%0WK!yorD?!y*kw=Y(V|RnEv7K1- z@qAGhr2VqnB)nwg-o*Lxf+psRnUH!(r$rJIY8CPH9+yTIlvneY)yr-{kqgyJ`~uH! zXw#`Wl}?QDBfQobhuU1(a-W4B-Te z6+#r_am>f(!MWi{RI))7hbH{5jSzq$I2VL;^eIR2O~JXGEgr!D86m2w%&N@ISr?Zp zkA8A{R-io4Dy~qX848Z?VGt8{izXeC!Ei`mjojHGDQ9NmM%6BcpWgZFgai=zA{!y_ zd(Y&a&q?l{nbmq+Nd+qsL?376(4M;#r;7)RtqS3;{e~5(SdoP@)QHBQ^gVctHnV0_ z)l5B^t~WJPtz0!nThmY=&6oIGhhY*_VRX!F7c zZ5IybG68DQB9-<2P=i%jULY@e&(1wxMwtkO(Vj-?N61L6YFBA3JCVFh6I@mSHYxZZ z#&-^8p{~spwgX3wUo-52@PECT9}Go&hb(n6d>%3u&xjbwW}U)TiA$VB<7blG()Y!w z?IezF5=&lfsZ3E7aQ7j@uPWe~vDi=CXP}nx-#p6Ep+SDPuvKC(sFUq)W1|LHyD#Z? zq9j8@F%k|6)l*y4R}Y7isK~l#AQPRPtDiZ+tx-Z0*V2;)|3ggvK!Ft0ksuTgyQP-z zOaju8u@gCD&QMRF%tO+`2^*VsNa8~_dpb>mFAQdCqxrV&F-GNVc~A% zze`H%17Cwi47tJ+NF{-iK^1dA%U7^(5lYV}B-v{zUgzJKL~`e@rT9y3LOYPi&wYc2 WWK+fYxv;^2kGQamQ2AHAfd2#1QGn(E literal 0 HcmV?d00001 From 52574733a62a397e79546180f04fb3761b2de53a Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Thu, 29 Mar 2018 07:15:02 +0000 Subject: [PATCH 267/314] Add KernelType switch for IncrementOp kernel --- paddle/fluid/operators/increment_op.cc | 9 +++++++++ python/paddle/fluid/layers/control_flow.py | 6 ++++-- python/paddle/fluid/layers/nn.py | 3 ++- .../paddle/fluid/tests/book/test_machine_translation.py | 2 +- python/paddle/fluid/tests/unittests/test_profiler.py | 3 ++- 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 2893ab7127..ec2e641679 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -33,6 +33,15 @@ class IncrementOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareLoD("X", "Out"); } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // IncrementOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } }; class IncrementOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index fbfc383d11..b9a53eda91 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -1362,7 +1362,8 @@ class DynamicRNN(object): self.lod_rank_table = None self.max_seq_len = None self.step_idx = None - self.zero_idx = fill_constant(shape=[1], value=0, dtype='int64') + self.zero_idx = fill_constant( + shape=[1], value=0, dtype='int64', force_cpu=True) self.mem_dict = dict() self.output_array = [] self.outputs = [] @@ -1439,7 +1440,8 @@ class DynamicRNN(object): def block(self): if self.status != DynamicRNN.BEFORE_RNN: raise ValueError("rnn.block() can only be invoke once") - self.step_idx = fill_constant(shape=[1], dtype='int64', value=0) + self.step_idx = fill_constant( + shape=[1], dtype='int64', value=0, force_cpu=True) self.step_idx.stop_gradient = False self.status = DynamicRNN.IN_RNN with self.while_op.block(): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index fdf4185205..0332556f62 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3307,7 +3307,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): name=counter_name, dtype='int64', shape=[1], persistable=True) if is_new_var: helper.set_variable_initializer( - counter, initializer=Constant(value=begin - 1)) + counter, initializer=Constant( + value=begin - 1, force_cpu=True)) helper.main_program.global_block().prepend_op( type='increment', inputs={'X': [counter]}, diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index de72a7c3ff..3a1a0859ec 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -83,7 +83,7 @@ def decoder_train(context, is_sparse): def decoder_decode(context, is_sparse): init_state = context array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length) - counter = pd.zeros(shape=[1], dtype='int64') + counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True) # fill the first element with init_state state_array = pd.create_array('float32') diff --git a/python/paddle/fluid/tests/unittests/test_profiler.py b/python/paddle/fluid/tests/unittests/test_profiler.py index 49ec9c9020..cf6fe14a86 100644 --- a/python/paddle/fluid/tests/unittests/test_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_profiler.py @@ -33,7 +33,8 @@ class TestProfiler(unittest.TestCase): image = fluid.layers.data(name='x', shape=[784], dtype='float32') hidden1 = fluid.layers.fc(input=image, size=64, act='relu') i = layers.zeros(shape=[1], dtype='int64') - counter = fluid.layers.zeros(shape=[1], dtype='int64') + counter = fluid.layers.zeros( + shape=[1], dtype='int64', force_cpu=True) until = layers.fill_constant([1], dtype='int64', value=10) data_arr = layers.array_write(hidden1, i) cond = fluid.layers.less_than(x=counter, y=until) From c4886584047122b5b358021a6c21977c259142d0 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 29 Mar 2018 15:34:36 +0800 Subject: [PATCH 268/314] follow comments --- doc/fluid/design/concepts/cpp_data_feeding.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/doc/fluid/design/concepts/cpp_data_feeding.md b/doc/fluid/design/concepts/cpp_data_feeding.md index 9c44dec4b9..aabc1ba75a 100644 --- a/doc/fluid/design/concepts/cpp_data_feeding.md +++ b/doc/fluid/design/concepts/cpp_data_feeding.md @@ -113,7 +113,7 @@ To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an e To create and invoke readers, some new ops are introduced: -### Operators That Creates Readers +### Operators That Create Readers Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as its output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers. @@ -168,17 +168,19 @@ while_op(not_completed) { } ``` -Two important considerations for these programs are as follows: +A few important considerations for these programs are as follows: -1. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader. +1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables. -2. All readers exist in both `startup_program` and `main_program`. And they are persistable. +2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader. + +3. All readers exist in both `startup_program` and `main_program`. And they are persistable. ### Simplify Configuration by MultiPassReader -The Program configuration mentioned above is somehow complicated. Users need to be very similar to concepts of Program and Block to prevent making mistakes in their code. To make the usage of C++ readers more friendly to beginning users, we introduce `MultiPassReader`. +The Program configuration mentioned above is complicated. Users need to be very familiar to concepts of Program and Block to prevent making mistakes in their code. To make the usage of C++ readers more friendly to new users, we introduce `MultiPassReader`. -`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several pass training. It takes the number of passes to run as one of its attributes('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches the EOF, the multi-pass reader checks whether it has completed the training of given number of pass. If not, the underlying reader will be re-initialized and starts a new pass automatically. Before completing the whole training, the return of MultiPassReader's `HasNext()` will always be `true`. +`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes('pass_num') and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches the EOF, the multi-pass reader checks whether it has completed the training of given number of pass. If not, the underlying reader will be re-initialized and starts a new pass automatically. Before completing the whole training, the return of MultiPassReader's `HasNext()` will always be `true`. With `MultiPassReader`, the startup program would be like this: From 1e4f442a84ecf2ad27a7afaf80062ade5b333516 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Thu, 29 Mar 2018 16:21:07 +0800 Subject: [PATCH 269/314] fix a compile error --- paddle/fluid/operators/conditional_block_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/conditional_block_op.cc b/paddle/fluid/operators/conditional_block_op.cc index bbe297206e..bff2c34ec8 100644 --- a/paddle/fluid/operators/conditional_block_op.cc +++ b/paddle/fluid/operators/conditional_block_op.cc @@ -54,7 +54,7 @@ class ConditionalOp : public framework::OperatorBase { "numel should be 1, actual numel is %d", ips[0]->numel()); } - bool res; + bool res = false; if (platform::is_gpu_place(ips[0]->place())) { #ifdef PADDLE_WITH_CUDA framework::LoDTensor cpu_tensor; From 5b8bb3447006acabbc663dd9eb960560d78adca0 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 29 Mar 2018 16:24:39 +0800 Subject: [PATCH 270/314] Refine reshape_op by following comments. --- paddle/fluid/operators/reshape_op.cc | 10 ++++++---- paddle/fluid/operators/reshape_op.h | 1 - python/paddle/fluid/layers/nn.py | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 4b1aaf5849..b87b8e6b26 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -49,14 +49,14 @@ Examples: specified by Attr(shape) is [6, 8], the reshape operator will transform Input(X) into a 2-D tensor with shape [6, 8] and leaving Input(X)'s data unchanged. -1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +2. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape specified by Attr(shape) is [2, 3, -1, 2], the reshape operator will transform Input(X) into a 4-D tensor with shape [2, 3, 4, 2] and leaving Input(X)'s data unchanged. In this case, one and only dimension of Attr(shape) can be set to -1, the value of this dimension is inferred from the total element number of Input(X) and remaining dimensions. -1. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape +3. Given a 3-D tensor Input(X) with a shape [2, 4, 6], and the target shape specified by Attr(shape) is [-1, 0, 3, 2], the reshape operator will transform Input(X) into a 4-D tensor with shape [2, 4, 3, 2] and leaving Input(X)'s data unchanged. In this case, besides -1, 0 means the actual dimension value is going @@ -67,11 +67,13 @@ Note: 1. One and only one dimension in Attr(shape) can be set -1. In this case, the actual dimension value will be infered from the total element number of Input(X) and remaining dimensions. -1. More than one dimensions in Attr(shape) can be set to 0, which means the real + +2. More than one dimensions in Attr(shape) can be set to 0, which means the real dimension value will be copied from Input(X) at runtime. Note that the index of 0 can not exceed Rank(X). For example, Input(X) is a 3-D tensor with shape [2, 3, 4], Attr(shape) = [2, 3, 2, 0] is an invalid input. -1. Input(Shape) has a higher priority than Attr(shape) if it is provided, while + +3. Input(Shape) has a higher priority than Attr(shape) if it is provided, while Attr(shape) still should be set correctly to gurantee shape inference in compile-time. diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h index 3a9a769229..871b4d38d5 100644 --- a/paddle/fluid/operators/reshape_op.h +++ b/paddle/fluid/operators/reshape_op.h @@ -66,7 +66,6 @@ class ReshapeOp : public framework::OperatorWithKernel { int64_t capacity = 1; int unk_dim_idx = -1; for (size_t i = 0; i < shape.size(); ++i) { - // std::cout<< shape[i] << "haha"; if (shape[i] == unk_dim_val) { PADDLE_ENFORCE( unk_dim_idx == -1, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c2d32954b5..ed82fa8940 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -3337,7 +3337,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): number of x and remaining dimensions. Thus one and only one dimension can be set -1. - 1. 0 means the actual dimension value is going to be copied from the + 2. 0 means the actual dimension value is going to be copied from the corresponding dimension of x. The indice of 0s in shape can not exceed Rank(X). @@ -3347,14 +3347,14 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=True, name=None): is [6, 8], the reshape operator will transform x into a 2-D tensor with shape [6, 8] and leaving x's data unchanged. - 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape specified is [2, 3, -1, 2], the reshape operator will transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this case, one dimension of the target shape is set to -1, the value of this dimension is inferred from the total element number of x and remaining dimensions. - 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape + 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case, besides -1, 0 means the actual dimension value is going to be copied from From 8425c2c859b22f263e213d4fed454890b598948c Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Thu, 29 Mar 2018 16:34:33 +0800 Subject: [PATCH 271/314] Speed/sequence op1 (#9217) * "add functors" * "remove old code" * "fix" * "fix ci" * "add details" * "fix ci" * "fix ci" * "fix ci" * "fix ci" * "remove unused code" --- .../fluid/operators/math/sequence_pooling.cc | 112 ++++- .../fluid/operators/math/sequence_pooling.cu | 381 ++++++++++++++---- .../fluid/operators/math/sequence_pooling.h | 20 +- paddle/fluid/operators/sequence_pool_op.h | 102 +---- .../fluid/tests/unittests/test_seq_pool.py | 110 ++--- 5 files changed, 484 insertions(+), 241 deletions(-) diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index f7a6f2bdf4..5ae42ab973 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -19,8 +19,17 @@ namespace paddle { namespace operators { namespace math { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, @@ -60,7 +69,7 @@ class MaxSeqPoolFunctor { }; template -class MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& out_grad, @@ -93,10 +102,101 @@ class MaxSeqPoolGradFunctor { } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template +class SequencePoolFunctor { + public: + /* max pool has index output */ + void operator()(const platform::CPUDeviceContext& context, + const std::string pooltype, const framework::LoDTensor& input, + framework::Tensor* output, + framework::Tensor* index = nullptr) { + if (pooltype == "MAX") { + math::MaxSeqPoolFunctor max_pool; + max_pool(context, input, output, index); + return; + } + auto lod = input.lod()[0]; + auto& place = *context.eigen_device(); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + Tensor in_t = + input.Slice(static_cast(lod[i]), static_cast(lod[i + 1])); + Tensor out_t = output->Slice(i, i + 1); + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t w = input.numel() / input.dims()[0]; + auto in_e = EigenMatrix::From(in_t, framework::make_ddim({h, w})); + auto out_e = EigenVector::Flatten(out_t); + if (pooltype == "AVERAGE") { + out_e.device(place) = in_e.mean(Eigen::array({{0}})); + } else if (pooltype == "SUM") { + out_e.device(place) = in_e.sum(Eigen::array({{0}})); + } else if (pooltype == "SQRT") { + out_e.device(place) = in_e.sum(Eigen::array({{0}})) / + std::sqrt(static_cast(h)); + } else if (pooltype == "LAST") { + out_e.device(place) = in_e.chip(h - 1, 0); + } else if (pooltype == "FIRST") { + out_e.device(place) = in_e.chip(0, 0); + } else { + PADDLE_THROW("unsupported pooling pooltype"); + } + } + } +}; + +template +class SequencePoolGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const std::string pooltype, const framework::Tensor& out_grad, + framework::LoDTensor* in_grad, + /* max pool has index */ + const framework::Tensor* index = nullptr) { + if (pooltype == "MAX") { + math::MaxSeqPoolGradFunctor max_pool_grad; + max_pool_grad(context, out_grad, *index, in_grad); + return; + } + + if (pooltype == "LAST" || pooltype == "FIRST") { + // set X@Grad be zero at first when pooltype is LAST/FIRST + math::SetConstant functor; + functor(context, in_grad, 0); + } + auto lod = in_grad->lod()[0]; + auto& place = *context.eigen_device(); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { + auto in_g_t = in_grad->Slice(static_cast(lod[i]), + static_cast(lod[i + 1])); + auto out_g_t = out_grad.Slice(i, i + 1); + int64_t h = static_cast(lod[i + 1] - lod[i]); + int64_t w = in_grad->numel() / in_grad->dims()[0]; + auto in_g_e = EigenMatrix::From(in_g_t, {h, w}); + auto out_g_e = EigenMatrix::From(out_g_t, {1, w}); + auto out_g_e_v = EigenVector::Flatten(out_g_t); + Eigen::DSizes bcast(h, 1); + + if (pooltype == "AVERAGE") { + in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); + } else if (pooltype == "SUM") { + in_g_e.device(place) = (out_g_e).broadcast(bcast); + } else if (pooltype == "SQRT") { + in_g_e.device(place) = + (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); + } else if (pooltype == "LAST") { + in_g_e.chip(h - 1, 0).device(place) = out_g_e_v; + } else if (pooltype == "FIRST") { + in_g_e.chip(0, 0).device(place) = out_g_e_v; + } else { + PADDLE_THROW("unsupported pooling pooltype"); + } + } + } +}; + +template class SequencePoolFunctor; +template class SequencePoolFunctor; +template class SequencePoolGradFunctor; +template class SequencePoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index d61407c020..1935364da3 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" +#include "paddle/fluid/platform/cuda_helper.h" namespace paddle { namespace operators { @@ -22,113 +23,331 @@ namespace math { #define FLT_MAX __FLT_MAX__ template -__global__ void KeMaxSequencePool(const T* input, const size_t* starts, - T* output, int* index, int64_t num_seq, - int64_t dim) { - int dim_idx = threadIdx.x; - int seq_id = blockIdx.x; - if (seq_id >= num_seq) return; - size_t start = starts[seq_id]; - size_t end = starts[seq_id + 1]; - - for (int64_t i = dim_idx; i < dim; i += blockDim.x) { - T max_val = static_cast(-FLT_MAX); - int max_id = -1; - for (size_t step_id = start; step_id < end; step_id++) { - if (max_val < input[step_id * dim + i]) { - max_val = input[step_id * dim + i]; - max_id = step_id; +struct MaxPoolFunctor { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + T max_val = static_cast(-FLT_MAX); + int max_index = -1; + for (int i = start; i < end; ++i) { + if (max_val < input[item_dim * i + tid]) { + max_val = input[item_dim * i + tid]; + max_index = i; + } } + output[tid] = max_val; + index[tid] = max_index; } - output[seq_id * dim + i] = max_val; - index[seq_id * dim + i] = max_id; } -} +}; template -class MaxSeqPoolFunctor { - public: - void operator()(const platform::CUDADeviceContext& context, - const framework::LoDTensor& input, framework::Tensor* output, - framework::Tensor* index) { - auto in_dims = input.dims(); - auto out_dims = output->dims(); - auto idx_dims = index->dims(); - PADDLE_ENFORCE_GT(in_dims.size(), static_cast(1)); - PADDLE_ENFORCE_GT(out_dims.size(), 1); - for (int64_t i = 1; i < in_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(in_dims[i], out_dims[i]); +struct AvgPoolFunctor { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + T val = static_cast(0); + for (int i = start; i < end; ++i) { + val += input[item_dim * i + tid]; + } + // end, start is lod, so end - start != 0 + output[tid] = val / static_cast(end - start); } - PADDLE_ENFORCE_EQ(idx_dims, out_dims); + } +}; - auto starts = input.lod()[0]; - const T* in_data = input.data(); - T* out_data = output->data(); - int* max_index = index->data(); +template +struct SumPoolFunctor { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + T val = static_cast(0); + for (int i = start; i < end; ++i) { + val += input[item_dim * i + tid]; + } + output[tid] = val; + } + } +}; - int64_t num_seq = out_dims[0]; - int64_t dim = output->numel() / num_seq; +template +struct SqrtPoolFunctor { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + T val = static_cast(0); + for (int i = start; i < end; ++i) { + val += input[item_dim * i + tid]; + } + // end, start is lod, so end - start != 0 + output[tid] = val / sqrt(end - start); + } + } +}; - dim3 threads(256, 1); - dim3 grid(num_seq, 1); - auto stream = context.stream(); - KeMaxSequencePool<<>>( - in_data, starts.CUDAData(context.GetPlace()), out_data, max_index, - num_seq, dim); +template +struct LastPoolFunctor { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + output[tid] = input[item_dim * (end - 1) + tid]; + } } }; template -__global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, - T* in_grad, int64_t num_seq, - int64_t dim) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int col_idx = idx % dim; - if (idx < num_seq * dim) { - int step_id = max_index[idx]; - in_grad[step_id * dim + col_idx] = out_grad[idx]; +struct FirstPoolFunctor { + HOSTDEVICE void operator()(const T* input, const size_t start, + const size_t end, const size_t item_dim, T* output, + int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + output[tid] = input[item_dim * start + tid]; + } } +}; + +template +__global__ void sequence_pool_kernel(Range_OP op, const T* input, + const size_t* lod, const size_t lod_size, + const size_t item_dim, T* output, + int* index) { + int bid = blockIdx.x; + if (bid >= lod_size - 1) return; + size_t start = lod[bid]; + size_t end = lod[bid + 1]; + int* index_offset = nullptr; + if (index != nullptr) { + index_offset = &index[bid * item_dim]; + } + op(input, start, end, item_dim, &output[bid * item_dim], index_offset); } template -class MaxSeqPoolGradFunctor { +class SequencePoolFunctor { public: void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& out_grad, - const framework::Tensor& index, - framework::LoDTensor* in_grad) { - auto og_dims = out_grad.dims(); - auto idx_dims = index.dims(); - auto ig_dims = in_grad->dims(); - PADDLE_ENFORCE_GT(og_dims.size(), static_cast(1)); - PADDLE_ENFORCE_GT(ig_dims.size(), static_cast(1)); - for (int64_t i = 1; i < og_dims.size(); ++i) { - PADDLE_ENFORCE_EQ(og_dims[i], ig_dims[i]); + const std::string pooltype, const framework::LoDTensor& input, + framework::Tensor* output, + framework::Tensor* index = nullptr) { + auto lod = input.lod()[0]; + const size_t item_dim = output->numel() / output->dims()[0]; + dim3 threads(1024, 1); + dim3 grid(lod.size(), 1); + if (pooltype == "MAX") { + sequence_pool_kernel< + T, MaxPoolFunctor><<>>( + MaxPoolFunctor(), input.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), index->data()); + } else if (pooltype == "AVERAGE") { + sequence_pool_kernel< + T, AvgPoolFunctor><<>>( + AvgPoolFunctor(), input.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); + } else if (pooltype == "SUM") { + sequence_pool_kernel< + T, SumPoolFunctor><<>>( + SumPoolFunctor(), input.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); + } else if (pooltype == "SQRT") { + sequence_pool_kernel< + T, SqrtPoolFunctor><<>>( + SqrtPoolFunctor(), input.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); + } else if (pooltype == "LAST") { + sequence_pool_kernel< + T, LastPoolFunctor><<>>( + LastPoolFunctor(), input.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); + } else if (pooltype == "FIRST") { + sequence_pool_kernel< + T, FirstPoolFunctor><<>>( + FirstPoolFunctor(), input.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + output->mutable_data(context.GetPlace()), nullptr); + } else { + PADDLE_THROW("unsupported pooling pooltype"); } - PADDLE_ENFORCE_EQ(idx_dims, og_dims); + } +}; - const T* og_data = out_grad.data(); - const int* max_index = index.data(); - T* ig_data = in_grad->data(); +template +struct MaxPoolGradFunctor { + HOSTDEVICE void operator()(const T* out_grad, const size_t start, + const size_t end, const size_t item_dim, + T* in_grad, const int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + for (int i = start; i < end; ++i) { + if (i == index[tid]) { + in_grad[item_dim * i + tid] = out_grad[tid]; + } else { + in_grad[item_dim * i + tid] = static_cast(0); + } + } + } + } +}; - SetConstant set_zero; - set_zero(context, in_grad, static_cast(0.0)); - int64_t num_seq = og_dims[0]; - int64_t dim = out_grad.numel() / num_seq; +template +struct AvgPoolGradFunctor { + HOSTDEVICE void operator()(const T* out_grad, const size_t start, + const size_t end, const size_t item_dim, + T* in_grad, const int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + for (int i = start; i < end; ++i) { + in_grad[item_dim * i + tid] = out_grad[tid] / (end - start); + } + } + } +}; - unsigned int blocks = (num_seq * dim + 128 - 1) / 128; - dim3 threads(128, 1); - dim3 grid(blocks, 1); - auto stream = context.stream(); - KeMaxSequencePoolGrad<<>>( - og_data, max_index, ig_data, num_seq, dim); +template +struct SumPoolGradFunctor { + HOSTDEVICE void operator()(const T* out_grad, const size_t start, + const size_t end, const size_t item_dim, + T* in_grad, const int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + for (int i = start; i < end; ++i) { + in_grad[item_dim * i + tid] = out_grad[tid]; + } + } + } +}; + +template +struct SqrtPoolGradFunctor { + HOSTDEVICE void operator()(const T* out_grad, const size_t start, + const size_t end, const size_t item_dim, + T* in_grad, const int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + for (int i = start; i < end; ++i) { + in_grad[item_dim * i + tid] = + out_grad[tid] / (sqrt(static_cast(end - start))); + } + } + } +}; + +template +struct LastPoolGradFunctor { + HOSTDEVICE void operator()(const T* out_grad, const size_t start, + const size_t end, const size_t item_dim, + T* in_grad, const int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + for (int i = start; i < end; ++i) { + if (i == end - 1) { + in_grad[item_dim * i + tid] = out_grad[tid]; + } else { + in_grad[item_dim * i + tid] = static_cast(0); + } + } + } + } +}; + +template +struct FirstPoolGradFunctor { + HOSTDEVICE void operator()(const T* out_grad, const size_t start, + const size_t end, const size_t item_dim, + T* in_grad, const int* index) { + for (int tid = threadIdx.x; tid < item_dim; tid += blockDim.x) { + for (int i = start; i < end; ++i) { + if (i == start) { + in_grad[item_dim * i + tid] = out_grad[tid]; + } else { + in_grad[item_dim * i + tid] = static_cast(0); + } + } + } + } +}; + +template +__global__ void sequence_pool_grad_kernel(Range_OP op, const T* out_grad, + const size_t* lod, + const size_t lod_size, + const size_t item_dim, T* in_grad, + const int* index) { + int bid = blockIdx.x; + if (bid >= lod_size - 1) return; + size_t start = lod[bid]; + size_t end = lod[bid + 1]; + const int* index_offset = nullptr; + if (index != nullptr) { + index_offset = &index[bid * item_dim]; + } + op(&out_grad[bid * item_dim], start, end, item_dim, in_grad, index_offset); +} + +template +class SequencePoolGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const std::string pooltype, const framework::Tensor& out_grad, + framework::LoDTensor* in_grad, + /* max pool has index */ + const framework::Tensor* index = nullptr) { + auto lod = in_grad->lod()[0]; + const size_t item_dim = in_grad->numel() / in_grad->dims()[0]; + dim3 threads(1024, 1); + dim3 grid(lod.size(), 1); + if (pooltype == "MAX") { + sequence_pool_grad_kernel< + T, MaxPoolGradFunctor><<>>( + MaxPoolGradFunctor(), out_grad.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), index->data()); + } else if (pooltype == "AVERAGE") { + sequence_pool_grad_kernel< + T, AvgPoolGradFunctor><<>>( + AvgPoolGradFunctor(), out_grad.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); + } else if (pooltype == "SUM") { + sequence_pool_grad_kernel< + T, SumPoolGradFunctor><<>>( + SumPoolGradFunctor(), out_grad.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); + } else if (pooltype == "SQRT") { + sequence_pool_grad_kernel< + T, SqrtPoolGradFunctor><<>>( + SqrtPoolGradFunctor(), out_grad.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); + } else if (pooltype == "LAST") { + sequence_pool_grad_kernel< + T, LastPoolGradFunctor><<>>( + LastPoolGradFunctor(), out_grad.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); + } else if (pooltype == "FIRST") { + sequence_pool_grad_kernel< + T, FirstPoolGradFunctor><<>>( + FirstPoolGradFunctor(), out_grad.data(), + lod.CUDAData(context.GetPlace()), lod.size(), item_dim, + in_grad->mutable_data(context.GetPlace()), nullptr); + + } else { + PADDLE_THROW("unsupported pooling pooltype"); + } } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +// sequence pooling +template class SequencePoolFunctor; +template class SequencePoolFunctor; +template class SequencePoolGradFunctor; +template class SequencePoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/sequence_pooling.h b/paddle/fluid/operators/math/sequence_pooling.h index ecb76884f6..38e7802229 100644 --- a/paddle/fluid/operators/math/sequence_pooling.h +++ b/paddle/fluid/operators/math/sequence_pooling.h @@ -21,23 +21,23 @@ namespace paddle { namespace operators { namespace math { -#define FLT_MAX __FLT_MAX__ - template -class MaxSeqPoolFunctor { +class SequencePoolFunctor { public: - void operator()(const DeviceContext& context, + /* max pool has index output */ + void operator()(const DeviceContext& context, const std::string pooltype, const framework::LoDTensor& input, framework::Tensor* output, - framework::Tensor* index); + framework::Tensor* index = nullptr); }; -template -class MaxSeqPoolGradFunctor { +template +class SequencePoolGradFunctor { public: - void operator()(const DeviceContext& context, + void operator()(const DeviceContext& context, const std::string pooltype, const framework::Tensor& out_grad, - const framework::Tensor& index, - framework::LoDTensor* in_grad); + framework::LoDTensor* in_grad, + /* max pool has index */ + const framework::Tensor* index = nullptr); }; } // namespace math diff --git a/paddle/fluid/operators/sequence_pool_op.h b/paddle/fluid/operators/sequence_pool_op.h index 8706ff14aa..c58d677c92 100644 --- a/paddle/fluid/operators/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_pool_op.h @@ -23,12 +23,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template -using EigenVector = framework::EigenVector; -template -using EigenMatrix = framework::EigenMatrix; template class SequencePoolKernel : public framework::OpKernel { @@ -37,11 +31,13 @@ class SequencePoolKernel : public framework::OpKernel { auto* in = context.Input("X"); auto* out = context.Output("Out"); std::string pooltype = context.Attr("pooltype"); + Tensor* index = nullptr; + if (pooltype == "MAX") { + index = context.Output("MaxIndex"); + } auto dims = in->dims(); auto lod = in->lod(); - int64_t w = in->numel() / dims[0]; - // InferShape by lod PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_GE( @@ -50,45 +46,14 @@ class SequencePoolKernel : public framework::OpKernel { "The first dimension of Input(X) must be large than batch size."); dims[0] = lod[0].size() - 1; out->Resize({dims}); - - auto lod_level_0 = lod[0]; - out->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolFunctor max_pool; - auto* index = context.Output("MaxIndex"); index->Resize({dims}); index->mutable_data(context.GetPlace()); - max_pool(dev_ctx, *in, out, index); - return; - } - - auto& place = - *context.template device_context().eigen_device(); - for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { - Tensor in_t = in->Slice(static_cast(lod_level_0[i]), - static_cast(lod_level_0[i + 1])); - Tensor out_t = out->Slice(i, i + 1); - int64_t h = static_cast(lod_level_0[i + 1] - lod_level_0[i]); - auto in_e = EigenMatrix::From(in_t, framework::make_ddim({h, w})); - auto out_e = EigenVector::Flatten(out_t); - - if (pooltype == "AVERAGE") { - out_e.device(place) = in_e.mean(Eigen::array({{0}})); - } else if (pooltype == "SUM") { - out_e.device(place) = in_e.sum(Eigen::array({{0}})); - } else if (pooltype == "SQRT") { - out_e.device(place) = in_e.sum(Eigen::array({{0}})) / - std::sqrt(static_cast(h)); - } else if (pooltype == "LAST") { - out_e.device(place) = in_e.chip(h - 1, 0); - } else if (pooltype == "FIRST") { - out_e.device(place) = in_e.chip(0, 0); - } else { - PADDLE_THROW("unsupported pooling pooltype"); - } } + math::SequencePoolFunctor pool; + pool(context.template device_context(), pooltype, *in, out, + index); } }; @@ -96,58 +61,17 @@ template class SequencePoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* in = context.Input("X"); auto* out_g = context.Input(framework::GradVarName("Out")); auto* in_g = context.Output(framework::GradVarName("X")); std::string pooltype = context.Attr("pooltype"); - - auto dims = in->dims(); - auto lod = in->lod()[0]; - int64_t w = in->numel() / dims[0]; - - in_g->mutable_data(context.GetPlace()); - auto& dev_ctx = context.template device_context(); - + const Tensor* index = nullptr; if (pooltype == "MAX") { - math::MaxSeqPoolGradFunctor max_pool_grad; - auto* index = context.Input("MaxIndex"); - max_pool_grad(dev_ctx, *out_g, *index, in_g); - return; - } - - if (pooltype == "LAST" || pooltype == "FIRST") { - // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; - functor(dev_ctx, in_g, 0); - } - auto& place = - *context.template device_context().eigen_device(); - - for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { - auto in_g_t = - in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); - auto out_g_t = out_g->Slice(i, i + 1); - int64_t h = static_cast(lod[i + 1] - lod[i]); - auto in_g_e = EigenMatrix::From(in_g_t, {h, w}); - auto out_g_e = EigenMatrix::From(out_g_t, {1, w}); - auto out_g_e_v = EigenVector::Flatten(out_g_t); - Eigen::DSizes bcast(h, 1); - - if (pooltype == "AVERAGE") { - in_g_e.device(place) = (out_g_e / static_cast(h)).broadcast(bcast); - } else if (pooltype == "SUM") { - in_g_e.device(place) = (out_g_e).broadcast(bcast); - } else if (pooltype == "SQRT") { - in_g_e.device(place) = - (out_g_e / std::sqrt(static_cast(h))).broadcast(bcast); - } else if (pooltype == "LAST") { - in_g_e.chip(h - 1, 0).device(place) = out_g_e_v; - } else if (pooltype == "FIRST") { - in_g_e.chip(0, 0).device(place) = out_g_e_v; - } else { - PADDLE_THROW("unsupported pooling pooltype"); - } + index = context.Input("MaxIndex"); } + in_g->mutable_data(context.GetPlace()); + math::SequencePoolGradFunctor pool; + pool(context.template device_context(), pooltype, *out_g, + in_g, index); } }; diff --git a/python/paddle/fluid/tests/unittests/test_seq_pool.py b/python/paddle/fluid/tests/unittests/test_seq_pool.py index 0488475721..2e48ef0e88 100644 --- a/python/paddle/fluid/tests/unittests/test_seq_pool.py +++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py @@ -49,6 +49,61 @@ class TestSeqAvgPool(OpTest): self.check_grad(["X"], "Out") +class TestSeqSumPool(TestSeqAvgPool): + def compute(self, x, lod, out): + self.attrs = {'pooltype': "SUM"} + for i in range(4): + sub_x = x[lod[0][i]:lod[0][i + 1], :] + out[i] = sub_x.sum(axis=0) + + +class TestSeqMaxPool(TestSeqAvgPool): + def set_data(self): + self.op_type = 'sequence_pool' + x = np.random.uniform(0.1, 1, [13, 23]).astype('float32') + lod = [[0, 4, 5, 8, 13]] + for i in range(4): + l = lod[0][i + 1] - lod[0][i] + x[lod[0][i] + np.random.randint(l), :] += 2.0 + + self.inputs = {'X': (x, lod)} + + out = np.zeros((4, 23)).astype('float32') + self.outputs = {'Out': out} + return x, lod, out + + def compute(self, x, lod, out): + self.attrs = {'pooltype': "MAX"} + for i in range(4): + sub_x = x[lod[0][i]:lod[0][i + 1], :] + out[i] = np.amax(sub_x, axis=0) + + +class TestSeqSqrtPool(TestSeqAvgPool): + def compute(self, x, lod, out): + self.attrs = {'pooltype': "SQRT"} + for i in range(4): + sub_x = x[lod[0][i]:lod[0][i + 1], :] + len = lod[0][i + 1] - lod[0][i] + out[i] = sub_x.sum(axis=0) / np.sqrt(len) + + +class TestSeqLastPool(TestSeqAvgPool): + def compute(self, x, lod, out): + self.attrs = {'pooltype': "LAST"} + for i in range(4): + sub_x = x[lod[0][i]:lod[0][i + 1], :] + out[i] = sub_x[-1, :] + + +class TestSeqFirstPool(TestSeqAvgPool): + def compute(self, x, lod, out): + self.attrs = {'pooltype': "FIRST"} + for i in range(4): + sub_x = x[lod[0][i]:lod[0][i + 1], :] + out[i] = sub_x[0, :] + + class TestSeqAvgPool2D(TestSeqAvgPool): def set_data(self): self.op_type = 'sequence_pool' @@ -68,14 +123,6 @@ class TestSeqAvgPool2D(TestSeqAvgPool): out[i] = np.reshape(sub_x.mean(axis=0), (3, 17)) -class TestSeqSumPool(TestSeqAvgPool): - def compute(self, x, lod, out): - self.attrs = {'pooltype': "SUM"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] - out[i] = sub_x.sum(axis=0) - - class TestSeqSumPool2D(TestSeqAvgPool2D): def compute(self, x, lod, out): self.attrs = {'pooltype': "SUM"} @@ -84,15 +131,6 @@ class TestSeqSumPool2D(TestSeqAvgPool2D): out[i] = np.reshape(sub_x.sum(axis=0), (3, 17)) -class TestSeqSqrtPool(TestSeqAvgPool): - def compute(self, x, lod, out): - self.attrs = {'pooltype': "SQRT"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] - len = lod[0][i + 1] - lod[0][i] - out[i] = sub_x.sum(axis=0) / np.sqrt(len) - - class TestSeqSqrtPool2D(TestSeqAvgPool2D): def compute(self, x, lod, out): self.attrs = {'pooltype': "SQRT"} @@ -108,28 +146,6 @@ class TestSeqSqrtPool2D(TestSeqAvgPool2D): self.check_grad(["X"], "Out", max_relative_error=0.06) -class TestSeqMaxPool(TestSeqAvgPool): - def set_data(self): - self.op_type = 'sequence_pool' - x = np.random.uniform(0.1, 1, [13, 23]).astype('float32') - lod = [[0, 4, 5, 8, 13]] - for i in range(4): - l = lod[0][i + 1] - lod[0][i] - x[lod[0][i] + np.random.randint(l), :] += 2.0 - - self.inputs = {'X': (x, lod)} - - out = np.zeros((4, 23)).astype('float32') - self.outputs = {'Out': out} - return x, lod, out - - def compute(self, x, lod, out): - self.attrs = {'pooltype': "MAX"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] - out[i] = np.amax(sub_x, axis=0) - - class TestSeqMaxPool2D(TestSeqAvgPool2D): def set_data(self): self.op_type = 'sequence_pool' @@ -151,14 +167,6 @@ class TestSeqMaxPool2D(TestSeqAvgPool2D): out[i] = np.reshape(np.amax(sub_x, axis=0), (3, 11)) -class TestSeqLastPool(TestSeqAvgPool): - def compute(self, x, lod, out): - self.attrs = {'pooltype': "LAST"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] - out[i] = sub_x[-1, :] - - class TestSeqLastPool2D(TestSeqAvgPool2D): def compute(self, x, lod, out): self.attrs = {'pooltype': "LAST"} @@ -167,14 +175,6 @@ class TestSeqLastPool2D(TestSeqAvgPool2D): out[i] = np.reshape(sub_x[-1, :], (3, 17)) -class TestSeqFirstPool(TestSeqAvgPool): - def compute(self, x, lod, out): - self.attrs = {'pooltype': "FIRST"} - for i in range(4): - sub_x = x[lod[0][i]:lod[0][i + 1], :] - out[i] = sub_x[0, :] - - class TestSeqFirstPool2D(TestSeqAvgPool2D): def compute(self, x, lod, out): self.attrs = {'pooltype': "FIRST"} From 34a440fa646ea9627efc2be27c6efbb51642dfe2 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Thu, 29 Mar 2018 17:26:49 +0800 Subject: [PATCH 272/314] Revert "make append activation in place by default (#9417)" This reverts commit ce16400daedfa8f793d20d44081db7f417af693a. --- python/paddle/fluid/layer_helper.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index 4341e06596..d771837fc5 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -398,6 +398,7 @@ class LayerHelper(object): return input_var if isinstance(act, basestring): act = {'type': act} + tmp = self.create_tmp_variable(dtype=input_var.dtype) if 'use_mkldnn' in self.kwargs: act['use_mkldnn'] = self.kwargs.get('use_mkldnn') @@ -407,9 +408,9 @@ class LayerHelper(object): self.append_op( type=act_type, inputs={"X": [input_var]}, - outputs={"Out": [input_var]}, + outputs={"Out": [tmp]}, attrs=act) - return input_var + return tmp def _get_default_initializer(self, dtype): if dtype is None or dtype_is_floating(dtype) is True: From e727cdb62d9659808d22e09463e53cf47eef8e3f Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Thu, 29 Mar 2018 18:38:59 +0800 Subject: [PATCH 273/314] fix block num --- paddle/fluid/operators/send_recv_op_test.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/send_recv_op_test.cc b/paddle/fluid/operators/send_recv_op_test.cc index e9fb845b47..04392b3e05 100644 --- a/paddle/fluid/operators/send_recv_op_test.cc +++ b/paddle/fluid/operators/send_recv_op_test.cc @@ -122,7 +122,8 @@ void StartServerNet(bool is_sparse) { // sub program run in listen_and_serv_op, for simple test we use sum f::ProgramDesc program; - f::BlockDesc *optimize_block = program.MutableBlock(0); + const auto &root_block = program.Block(0); + auto *optimize_block = program.AppendBlock(root_block); // X for server side tensors, RX for received tensers, must be of same shape. AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, optimize_block); From 9bbd753425609b8f03a1a4593dca272a00c8f1e6 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Fri, 30 Mar 2018 00:44:39 +0800 Subject: [PATCH 274/314] change WITH_FLUID to WITH_FLUID_ONLY (#9427) --- CMakeLists.txt | 5 ++--- paddle/CMakeLists.txt | 2 +- python/CMakeLists.txt | 6 +++--- python/setup.py.in | 8 ++++---- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1e11f86d0e..5506fcb010 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -53,8 +53,7 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) -# TODO: Only compile PaddlePaddle fluid version by WITH_FLUID option. -option(WITH_FLUID "Compile PaddlePaddle fluid only(TODO)" OFF) +option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF) option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) @@ -109,7 +108,7 @@ if (WITH_C_API AND WITH_PYTHON) endif() if (WITH_C_API) - set(WITH_FLUID OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE) + set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE) endif() if(MOBILE_INFERENCE) diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index d2a4b13354..c44f8a8a8e 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -1,4 +1,4 @@ -if(NOT WITH_FLUID) +if(NOT WITH_FLUID_ONLY) add_subdirectory(cuda) add_subdirectory(function) add_subdirectory(utils) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 90c2dfbba7..b0242b20b8 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -4,7 +4,7 @@ set(PY_FILES paddle/__init__.py ${UTILS_PY_FILES} ${FLUID_PY_FILES}) -if(NOT WITH_FLUID) +if(NOT WITH_FLUID_ONLY) file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py) file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py) file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py) @@ -62,7 +62,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) -if(NOT WITH_FLUID) +if(NOT WITH_FLUID_ONLY) set(paddle_python_deps ${paddle_python_deps} paddle_pserver_main paddle_trainer paddle_merge_model) if(WITH_SWIG_PY) list(APPEND paddle_python_deps python_api_wheel) @@ -73,7 +73,7 @@ add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) if (WITH_TESTING) - if(NOT WITH_FLUID) + if(NOT WITH_FLUID_ONLY) add_subdirectory(paddle/trainer_config_helpers/tests) if (WITH_SWIG_PY) # enable v2 API unittest only when paddle swig api is compiled diff --git a/python/setup.py.in b/python/setup.py.in index 4cb5409524..831d173d42 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -68,7 +68,7 @@ packages=['paddle', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers'] -if '${WITH_FLUID}'== 'OFF': +if '${WITH_FLUID_ONLY}'== 'OFF': packages+=['paddle.proto', 'paddle.trainer', 'paddle.trainer_config_helpers', @@ -87,7 +87,7 @@ if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: # the prefix is sys.prefix which should always be usr paddle_bins = '' -if '${WITH_FLUID}'== 'OFF': +if '${WITH_FLUID_ONLY}'== 'OFF': paddle_bin_dir = 'opt/paddle/bin' paddle_bins = ['${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer', '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model', @@ -95,7 +95,7 @@ if '${WITH_FLUID}'== 'OFF': '${PADDLE_BINARY_DIR}/paddle/scripts/paddle'] package_data={'paddle.fluid': ['core.so']} -if '${WITH_FLUID}'== 'OFF': +if '${WITH_FLUID_ONLY}'== 'OFF': package_data['paddle.v2.master']=['libpaddle_master.so'] package_data['py_paddle']=['*.py','_swig_paddle.so'] @@ -106,7 +106,7 @@ package_dir={ 'paddle.fluid.proto.profiler': '${PADDLE_BINARY_DIR}/paddle/fluid/platform', 'paddle.fluid.proto': '${PADDLE_BINARY_DIR}/paddle/fluid/framework', } -if '${WITH_FLUID}'== 'OFF': +if '${WITH_FLUID_ONLY}'== 'OFF': package_dir['py_paddle']='${PADDLE_SOURCE_DIR}/paddle/py_paddle' From 5f9da86ba562c543a623ff0d99f06bd2e935edb3 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Thu, 29 Mar 2018 09:44:58 -0700 Subject: [PATCH 275/314] Fix the order of reads and write from buffered channel (#9423) * Fix Issue 9388 * Fix typos --- paddle/fluid/framework/channel_impl.h | 100 +++++++++++++------------ paddle/fluid/framework/channel_test.cc | 34 +++++++-- 2 files changed, 77 insertions(+), 57 deletions(-) diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h index 378a0bab1c..c47d629289 100644 --- a/paddle/fluid/framework/channel_impl.h +++ b/paddle/fluid/framework/channel_impl.h @@ -87,6 +87,21 @@ class ChannelImpl : public paddle::framework::Channel { return value; } + std::shared_ptr get_first_message( + std::deque> &queue, ChannelAction action) { + while (!queue.empty()) { + // Check whether this message was added by Select + // If this was added by Select then execute the callback + // to check if you can execute this message. The callback + // can return false if some other case was executed in Select. + // In that case just discard this QueueMessage and process next. + std::shared_ptr m = queue.front(); + queue.pop_front(); + if (m->callback == nullptr || m->callback(action)) return m; + } + return nullptr; + } + size_t cap_; std::recursive_mutex mu_; bool closed_; @@ -131,36 +146,21 @@ void ChannelImpl::Send(T *item) { // If there is a receiver, directly pass the value we want // to send to the receiver, bypassing the channel buffer if any if (!recvq.empty()) { - std::shared_ptr m = recvq.front(); - recvq.pop_front(); - // Do the data transfer - // We will do this data transfer if either of the following - // cases are true - // 1. callback == nullptr // This means it was a regular channel send - // 2. callback returns true - bool do_send = true; - if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND); - if (do_send) + std::shared_ptr m = + get_first_message(recvq, ChannelAction::SEND); + + if (m != nullptr) { *(m->data) = std::move(*item); - else { - // We cannot do the data transfer because - // this QueueMessage was added by Select - // and some other case was executed. - // So call the Send function again. - // We do not care about notifying other - // because they would have been notified - // by the executed select case. + m->Notify(); + lock.unlock(); + send_return(); + return; + } else { lock.unlock(); Send(item); send_return(); return; } - - // Wake up the blocked process and unlock - m->Notify(); - lock.unlock(); - send_return(); - return; } // Unbuffered channel will always bypass this @@ -201,32 +201,34 @@ bool ChannelImpl::Receive(T *item) { } // If there is a sender, directly receive the value we want - // from the sender, bypassing the channel buffer if any + // from the sender. In case of a buffered channel, read from + // buffer and move front of send queue to the buffer if (!sendq.empty()) { - std::shared_ptr m = sendq.front(); - sendq.pop_front(); - // Do the data transfer - // We will do this data transfer if either of the following - // cases are true - // 1. callback == nullptr // This means it was a regular channel send - // 2. callback returns true - bool do_receive = true; - if (m->callback != nullptr) - do_receive = m->callback(ChannelAction::RECEIVE); - if (do_receive) - *item = std::move(*(m->data)); - else - // We cannot do the data transfer because - // this QueueMessage was added by Select - // and some other case was executed. - // So call the Receive function again. - // We do not care about notifying other - // because they would have been notified - // by the executed select case. - return recv_return(Receive(item)); - - // Wake up the blocked process and unlock - m->Notify(); + std::shared_ptr m = + get_first_message(sendq, ChannelAction::RECEIVE); + if (buf_.size() > 0) { + // Case 1 : Channel is Buffered + // Do Data transfer from front of buffer + // and add a QueueMessage to the buffer + *item = std::move(buf_.front()); + buf_.pop_front(); + // If first message from sendq is not null + // add it to the buffer and notify it + if (m != nullptr) { + // Copy to buffer + buf_.push_back(std::move(*(m->data))); + m->Notify(); + } // Ignore if there is no first message + } else { + // Case 2: Channel is Unbuffered + // Do data transfer from front of SendQ + // If front is nullptr, then recursively call itself + if (m != nullptr) { + *item = std::move(*(m->data)); + m->Notify(); + } else + return recv_return(Receive(item)); + } lock.unlock(); return recv_return(true); } diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc index e2380bb54b..1184bfdae1 100644 --- a/paddle/fluid/framework/channel_test.cc +++ b/paddle/fluid/framework/channel_test.cc @@ -36,23 +36,25 @@ TEST(Channel, ChannelCapacityTest) { delete ch; } -void RecevingOrderEqualToSendingOrder(Channel *ch) { +void RecevingOrderEqualToSendingOrder(Channel *ch, int num_items) { unsigned sum_send = 0; std::thread t([&]() { - for (int i = 0; i < 5; i++) { + for (int i = 0; i < num_items; i++) { ch->Send(&i); sum_send += i; } }); - for (int i = 0; i < 5; i++) { - int recv = 999; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); + for (int i = 0; i < num_items; i++) { + int recv = -1; EXPECT_EQ(ch->Receive(&recv), true); EXPECT_EQ(recv, i); } std::this_thread::sleep_for(std::chrono::milliseconds(200)); CloseChannel(ch); t.join(); - EXPECT_EQ(sum_send, 10U); + unsigned expected_sum = (num_items * (num_items - 1)) / 2; + EXPECT_EQ(sum_send, expected_sum); delete ch; } @@ -185,12 +187,28 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) { auto ch = MakeChannel(0); - RecevingOrderEqualToSendingOrder(ch); + RecevingOrderEqualToSendingOrder(ch, 20); +} + +TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) { + // Test that Receive Order is same as Send Order when number of items + // sent is less than size of buffer + auto ch = MakeChannel(10); + RecevingOrderEqualToSendingOrder(ch, 5); +} + +TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) { + // Test that Receive Order is same as Send Order when number of items + // sent is equal to size of buffer + auto ch = MakeChannel(10); + RecevingOrderEqualToSendingOrder(ch, 10); } -TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel) { +TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) { + // Test that Receive Order is same as Send Order when number of items + // sent is greater than the size of buffer auto ch = MakeChannel(10); - RecevingOrderEqualToSendingOrder(ch); + RecevingOrderEqualToSendingOrder(ch, 20); } void ChannelCloseUnblocksReceiversTest(Channel *ch) { From c414fbbeb16475cea96651d5a7d46e5c37093d03 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Thu, 29 Mar 2018 16:04:19 -0700 Subject: [PATCH 276/314] hookup WITH_FLUID_ONLY in TeamCity build.sh (#9509) --- paddle/scripts/docker/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 322f72e4a5..12c3a50d49 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -53,6 +53,7 @@ function cmake_gen() { -DWITH_FAST_BUNDLE_TEST=ON -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_EXPORT_COMPILE_COMMANDS=ON + -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} ======================================== EOF # Disable UNITTEST_USE_VIRTUALENV in docker because @@ -78,6 +79,7 @@ EOF -DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_FAST_BUNDLE_TEST=ON \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ + -DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON } From a75de489c5921c173f4255ef2537160a5bbf354f Mon Sep 17 00:00:00 2001 From: weixing Date: Fri, 30 Mar 2018 10:36:55 +0800 Subject: [PATCH 277/314] Fix some errors (#9403) --- .../build_from_source_cn.rst | 1 + .../build_from_source_en.rst | 1 + .../build_and_install/docker_install_cn.rst | 1 + .../build_and_install/docker_install_en.rst | 1 + doc/fluid/build_and_install/index_cn.rst | 3 +- doc/fluid/build_and_install/index_en.rst | 3 +- .../build_and_install/pip_install_cn.rst | 1 + .../build_and_install/pip_install_en.rst | 1 + doc/fluid/design/algorithm/index_cn.rst | 7 +++ doc/fluid/design/algorithm/index_en.rst | 7 +++ doc/fluid/design/concepts/README.md | 12 ++--- doc/fluid/design/concepts/index_cn.rst | 18 +++++++ doc/fluid/design/concepts/index_en.rst | 18 +++++++ doc/fluid/design/concepts/scope.md | 4 +- doc/fluid/design/concepts/var_desc.md | 2 + doc/fluid/design/concurrent/index_cn.rst | 8 +++ doc/fluid/design/concurrent/index_en.rst | 8 +++ doc/fluid/design/data_type/index_cn.rst | 7 +++ doc/fluid/design/data_type/index_en.rst | 7 +++ .../distributed_lookup_table_design.md | 2 +- doc/fluid/design/dist_train/index_cn.rst | 9 ++++ doc/fluid/design/dist_train/index_en.rst | 9 ++++ doc/fluid/design/dynamic_rnn/index_cn.rst | 8 +++ doc/fluid/design/dynamic_rnn/index_en.rst | 8 +++ doc/fluid/design/dynamic_rnn/rnn_design.md | 15 +++--- doc/fluid/design/execution/index_cn.rst | 8 +++ doc/fluid/design/execution/index_en.rst | 8 +++ doc/fluid/design/execution/switch.md | 6 +-- doc/fluid/design/index_cn.rst | 17 ++++++ doc/fluid/design/index_en.rst | 17 ++++++ doc/fluid/design/interface/index_cn.rst | 4 ++ doc/fluid/design/interface/index_en.rst | 4 ++ doc/fluid/design/memory/index_cn.rst | 7 +++ doc/fluid/design/memory/index_en.rst | 7 +++ doc/fluid/design/modules/evaluator.md | 20 +++---- doc/fluid/design/modules/index_cn.rst | 14 +++++ doc/fluid/design/modules/index_en.rst | 14 +++++ doc/fluid/design/modules/net_op_design.md | 22 ++++---- doc/fluid/design/modules/optimizer.md | 8 +-- doc/fluid/design/motivation/index_cn.rst | 10 ++++ doc/fluid/design/motivation/index_en.rst | 10 ++++ .../design/motivation/refactorization.md | 36 ++++++------- doc/fluid/design/muti_devices/index_cn.rst | 9 ++++ doc/fluid/design/muti_devices/index_en.rst | 9 ++++ .../design/muti_devices/kernel_hint_design.md | 2 +- .../design/muti_devices/kernel_selection.md | 2 +- doc/fluid/design/network/index_cn.rst | 7 +++ doc/fluid/design/network/index_en.rst | 7 +++ doc/fluid/dev/api_doc_std_cn.md | 52 +++++++++---------- doc/fluid/dev/index_cn.rst | 11 ++++ doc/fluid/dev/index_en.rst | 11 +++- doc/fluid/dev/name_convention.md | 6 +-- doc/fluid/dev/new_op_kernel_en.md | 18 +++---- doc/fluid/dev/op_markdown_format.md | 10 ++-- doc/fluid/dev/use_eigen_cn.md | 18 +++---- doc/fluid/dev/use_eigen_en.md | 10 ++-- doc/fluid/getstarted/concepts/index_cn.rst | 4 ++ doc/fluid/getstarted/concepts/index_en.rst | 4 ++ doc/fluid/getstarted/index_cn.rst | 19 ++++++- doc/fluid/getstarted/index_en.rst | 18 ++++++- doc/fluid/getstarted/quickstart_cn.rst | 1 + doc/fluid/getstarted/quickstart_en.rst | 1 + doc/fluid/howto/index_cn.rst | 5 ++ doc/fluid/howto/index_en.rst | 5 +- .../howto/optimization/benchmark/README.md | 1 + .../howto/optimization/benchmark/index_cn.rst | 8 +++ .../howto/optimization/benchmark/index_en.rst | 8 +++ .../optimization/benchmark/vgg16/README.md | 1 + .../howto/optimization/cpu_profiling_cn.md | 2 +- .../howto/optimization/cpu_profiling_en.md | 4 +- doc/fluid/howto/optimization/index_cn.rst | 9 ++++ doc/fluid/howto/optimization/index_en.rst | 9 ++++ doc/fluid/howto/optimization/timeline.md | 2 +- doc/fluid/index_cn.rst | 2 +- doc/fluid/index_en.rst | 2 +- .../design/interface/00.why_plain_c.md | 0 .../interface/01.inference_implementation.md | 0 doc/v2/design/interface/index_cn.rst | 7 +++ doc/v2/design/interface/index_en.rst | 7 +++ doc/v2/design/mkl/mkldnn.md | 6 +-- 80 files changed, 531 insertions(+), 139 deletions(-) create mode 120000 doc/fluid/build_and_install/build_from_source_cn.rst create mode 120000 doc/fluid/build_and_install/build_from_source_en.rst create mode 120000 doc/fluid/build_and_install/docker_install_cn.rst create mode 120000 doc/fluid/build_and_install/docker_install_en.rst mode change 100644 => 120000 doc/fluid/build_and_install/index_cn.rst mode change 100644 => 120000 doc/fluid/build_and_install/index_en.rst create mode 120000 doc/fluid/build_and_install/pip_install_cn.rst create mode 120000 doc/fluid/build_and_install/pip_install_en.rst create mode 100644 doc/fluid/design/algorithm/index_cn.rst create mode 100644 doc/fluid/design/algorithm/index_en.rst create mode 100644 doc/fluid/design/concepts/index_cn.rst create mode 100644 doc/fluid/design/concepts/index_en.rst create mode 100644 doc/fluid/design/concurrent/index_cn.rst create mode 100644 doc/fluid/design/concurrent/index_en.rst create mode 100644 doc/fluid/design/data_type/index_cn.rst create mode 100644 doc/fluid/design/data_type/index_en.rst create mode 100644 doc/fluid/design/dist_train/index_cn.rst create mode 100644 doc/fluid/design/dist_train/index_en.rst create mode 100644 doc/fluid/design/dynamic_rnn/index_cn.rst create mode 100644 doc/fluid/design/dynamic_rnn/index_en.rst create mode 100644 doc/fluid/design/execution/index_cn.rst create mode 100644 doc/fluid/design/execution/index_en.rst create mode 100644 doc/fluid/design/interface/index_cn.rst create mode 100644 doc/fluid/design/interface/index_en.rst create mode 100644 doc/fluid/design/memory/index_cn.rst create mode 100644 doc/fluid/design/memory/index_en.rst create mode 100644 doc/fluid/design/modules/index_cn.rst create mode 100644 doc/fluid/design/modules/index_en.rst create mode 100644 doc/fluid/design/motivation/index_cn.rst create mode 100644 doc/fluid/design/motivation/index_en.rst create mode 100644 doc/fluid/design/muti_devices/index_cn.rst create mode 100644 doc/fluid/design/muti_devices/index_en.rst create mode 100644 doc/fluid/design/network/index_cn.rst create mode 100644 doc/fluid/design/network/index_en.rst create mode 100644 doc/fluid/getstarted/concepts/index_cn.rst create mode 100644 doc/fluid/getstarted/concepts/index_en.rst create mode 120000 doc/fluid/getstarted/quickstart_cn.rst create mode 120000 doc/fluid/getstarted/quickstart_en.rst create mode 120000 doc/fluid/howto/optimization/benchmark/README.md create mode 100644 doc/fluid/howto/optimization/benchmark/index_cn.rst create mode 100644 doc/fluid/howto/optimization/benchmark/index_en.rst create mode 120000 doc/fluid/howto/optimization/benchmark/vgg16/README.md create mode 100644 doc/fluid/howto/optimization/index_cn.rst create mode 100644 doc/fluid/howto/optimization/index_en.rst rename doc/{fluid => v2}/design/interface/00.why_plain_c.md (100%) rename doc/{fluid => v2}/design/interface/01.inference_implementation.md (100%) create mode 100644 doc/v2/design/interface/index_cn.rst create mode 100644 doc/v2/design/interface/index_en.rst diff --git a/doc/fluid/build_and_install/build_from_source_cn.rst b/doc/fluid/build_and_install/build_from_source_cn.rst new file mode 120000 index 0000000000..ae4e8c7c48 --- /dev/null +++ b/doc/fluid/build_and_install/build_from_source_cn.rst @@ -0,0 +1 @@ +../../v2/build_and_install/build_from_source_cn.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/build_from_source_en.rst b/doc/fluid/build_and_install/build_from_source_en.rst new file mode 120000 index 0000000000..1ac828c973 --- /dev/null +++ b/doc/fluid/build_and_install/build_from_source_en.rst @@ -0,0 +1 @@ +../../v2/build_and_install/build_from_source_en.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/docker_install_cn.rst b/doc/fluid/build_and_install/docker_install_cn.rst new file mode 120000 index 0000000000..965b2e2055 --- /dev/null +++ b/doc/fluid/build_and_install/docker_install_cn.rst @@ -0,0 +1 @@ +../../v2/build_and_install/docker_install_cn.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/docker_install_en.rst b/doc/fluid/build_and_install/docker_install_en.rst new file mode 120000 index 0000000000..79d7341a7b --- /dev/null +++ b/doc/fluid/build_and_install/docker_install_en.rst @@ -0,0 +1 @@ +../../v2/build_and_install/docker_install_en.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst deleted file mode 100644 index 9276236f9f..0000000000 --- a/doc/fluid/build_and_install/index_cn.rst +++ /dev/null @@ -1,2 +0,0 @@ -安装与使用 ------------- diff --git a/doc/fluid/build_and_install/index_cn.rst b/doc/fluid/build_and_install/index_cn.rst new file mode 120000 index 0000000000..f697fcd8fa --- /dev/null +++ b/doc/fluid/build_and_install/index_cn.rst @@ -0,0 +1 @@ +../../v2/build_and_install/index_cn.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst deleted file mode 100644 index cc1e61a58a..0000000000 --- a/doc/fluid/build_and_install/index_en.rst +++ /dev/null @@ -1,2 +0,0 @@ -Build and Install ------------- diff --git a/doc/fluid/build_and_install/index_en.rst b/doc/fluid/build_and_install/index_en.rst new file mode 120000 index 0000000000..502f66a413 --- /dev/null +++ b/doc/fluid/build_and_install/index_en.rst @@ -0,0 +1 @@ +../../v2/build_and_install/index_en.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/pip_install_cn.rst b/doc/fluid/build_and_install/pip_install_cn.rst new file mode 120000 index 0000000000..07deca84b8 --- /dev/null +++ b/doc/fluid/build_and_install/pip_install_cn.rst @@ -0,0 +1 @@ +../../v2/build_and_install/pip_install_cn.rst \ No newline at end of file diff --git a/doc/fluid/build_and_install/pip_install_en.rst b/doc/fluid/build_and_install/pip_install_en.rst new file mode 120000 index 0000000000..7f39c99819 --- /dev/null +++ b/doc/fluid/build_and_install/pip_install_en.rst @@ -0,0 +1 @@ +../../v2/build_and_install/pip_install_en.rst \ No newline at end of file diff --git a/doc/fluid/design/algorithm/index_cn.rst b/doc/fluid/design/algorithm/index_cn.rst new file mode 100644 index 0000000000..0883a9dc9c --- /dev/null +++ b/doc/fluid/design/algorithm/index_cn.rst @@ -0,0 +1,7 @@ +梯度更新算法 +------------ + +.. toctree:: + :maxdepth: 1 + + parameter_average.md diff --git a/doc/fluid/design/algorithm/index_en.rst b/doc/fluid/design/algorithm/index_en.rst new file mode 100644 index 0000000000..59fe68dcf7 --- /dev/null +++ b/doc/fluid/design/algorithm/index_en.rst @@ -0,0 +1,7 @@ +Gradient Update Algorithm +-------------------------------------- + +.. toctree:: + :maxdepth: 1 + + parameter_average.md diff --git a/doc/fluid/design/concepts/README.md b/doc/fluid/design/concepts/README.md index bf0e4dddc1..ed3f5aab28 100644 --- a/doc/fluid/design/concepts/README.md +++ b/doc/fluid/design/concepts/README.md @@ -2,7 +2,7 @@ A few months ago when we were trying to replace CMake with Bazel, @emailweixu su Here are some initial thoughts. Your comments are welcome! -### Required CMake Function +# Required CMake Function I think we need only the following few CMake functions to make a project description mean and clean: @@ -25,7 +25,7 @@ Also, - to describe external dependencies, we need `external_library`. - to build shared libraries, we need `shared_library`. -### An Example Project +## An Example Project Suppose that we have aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files: @@ -102,11 +102,11 @@ shared_library(api ``` -### Implementation +## Implementation As above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also use this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`. -### Using Package Manager For Go +## Using Package Manager For Go Building Go binaries and libraries need to satisfy their dependencies, generally we can do `go get ./...` to download and compile all external dependencies. The @@ -122,7 +122,7 @@ problems are: at many cloud file hosting, so users what to compile paddle by themselves can download this "vendor" package from a mirror site. -#### Choose A Suitable Tool +### Choose A Suitable Tool As mentioned by @wangkuiyi, [Here](https://github.com/golang/go/wiki/PackageManagementTools) list dozens of Go package managers. We choose the tool using following principles: @@ -140,7 +140,7 @@ management tool has been started at: https://github.com/golang/dep to resolve such problems, but it's currently at Alpha stage. So the best choice now is glide obviously. -#### Manage Go Packages +### Manage Go Packages - Dependencies: `go/glide.yaml` will store the dependencies and their versions which is directly imported by paddle. `go/glide.lock` will store all dependencies recursively diff --git a/doc/fluid/design/concepts/index_cn.rst b/doc/fluid/design/concepts/index_cn.rst new file mode 100644 index 0000000000..eec8a2f14c --- /dev/null +++ b/doc/fluid/design/concepts/index_cn.rst @@ -0,0 +1,18 @@ +核心概念 +------------- + +.. toctree:: + :maxdepth: 1 + + README.md + cpp_data_feeding.md + functions_operators_layers.md + program.md + variable.md + var_desc.md + tensor.md + tensor_array.md + lod_tensor.md + block.md + scope.md + executor.md diff --git a/doc/fluid/design/concepts/index_en.rst b/doc/fluid/design/concepts/index_en.rst new file mode 100644 index 0000000000..036e1da255 --- /dev/null +++ b/doc/fluid/design/concepts/index_en.rst @@ -0,0 +1,18 @@ +Core Concepts +-------------------------------------- + +.. toctree:: + :maxdepth: 1 + + README.md + cpp_data_feeding.md + functions_operators_layers.md + program.md + variable.md + var_desc.md + tensor.md + tensor_array.md + lod_tensor.md + block.md + scope.md + executor.md diff --git a/doc/fluid/design/concepts/scope.md b/doc/fluid/design/concepts/scope.md index 4da76eebb7..dcf7664935 100644 --- a/doc/fluid/design/concepts/scope.md +++ b/doc/fluid/design/concepts/scope.md @@ -30,7 +30,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`. Variable can not belong to many scopes. If you want to use variables from parent scope, you can use `parent scope`. -1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. +1. Scope should destruct all Variables inside it when itself is destructed. User can never store `Variable` pointer somewhere else. Because Variable can only be got from Scope. When destroying Scope, we also need to destroy all the Variables in it. If user store `Variable` pointer to private data member or some global variable, the pointer will be an invalid pointer when associated `Scope` is destroyed. @@ -78,7 +78,7 @@ In `Scope` class, there is a private data member called `parent_`. `parent_` is A local scope is very useful when we implement Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of timestep (`StepNet` for short) should use an independent local scope. Just like variables in a while loop is inside a local scope in programming languages. By using a single `StepNet` and changing local scope, we can implement an RNN easily. -# Interface Design +## Interface Design ```cpp class Variable { diff --git a/doc/fluid/design/concepts/var_desc.md b/doc/fluid/design/concepts/var_desc.md index 6a45af1995..fcba08c07f 100644 --- a/doc/fluid/design/concepts/var_desc.md +++ b/doc/fluid/design/concepts/var_desc.md @@ -1,3 +1,5 @@ +# Design Doc: Var_desc + ## Background PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations. diff --git a/doc/fluid/design/concurrent/index_cn.rst b/doc/fluid/design/concurrent/index_cn.rst new file mode 100644 index 0000000000..e47135e9fc --- /dev/null +++ b/doc/fluid/design/concurrent/index_cn.rst @@ -0,0 +1,8 @@ +并发编程 +------------ + +.. toctree:: + :maxdepth: 1 + + concurrent_programming.md + parallel_do.md diff --git a/doc/fluid/design/concurrent/index_en.rst b/doc/fluid/design/concurrent/index_en.rst new file mode 100644 index 0000000000..0727e75798 --- /dev/null +++ b/doc/fluid/design/concurrent/index_en.rst @@ -0,0 +1,8 @@ +Concurrent Programming +------------------------- + +.. toctree:: + :maxdepth: 1 + + concurrent_programming.md + parallel_do.md diff --git a/doc/fluid/design/data_type/index_cn.rst b/doc/fluid/design/data_type/index_cn.rst new file mode 100644 index 0000000000..b60167b6b1 --- /dev/null +++ b/doc/fluid/design/data_type/index_cn.rst @@ -0,0 +1,7 @@ +数据类型 +------------ + +.. toctree:: + :maxdepth: 1 + + float16.md diff --git a/doc/fluid/design/data_type/index_en.rst b/doc/fluid/design/data_type/index_en.rst new file mode 100644 index 0000000000..6a88d17943 --- /dev/null +++ b/doc/fluid/design/data_type/index_en.rst @@ -0,0 +1,7 @@ +Data Type +------------ + +.. toctree:: + :maxdepth: 1 + + float16.md diff --git a/doc/fluid/design/dist_train/distributed_lookup_table_design.md b/doc/fluid/design/dist_train/distributed_lookup_table_design.md index e543adf0f9..9887291389 100644 --- a/doc/fluid/design/dist_train/distributed_lookup_table_design.md +++ b/doc/fluid/design/dist_train/distributed_lookup_table_design.md @@ -1,4 +1,4 @@ -## Design Doc: Distributed Lookup Table Operator +# Design Doc: Distributed Lookup Table Operator A lookup table operator in PaddlePaddle where the table could be out of the memory of a computer. diff --git a/doc/fluid/design/dist_train/index_cn.rst b/doc/fluid/design/dist_train/index_cn.rst new file mode 100644 index 0000000000..ed6f3dda27 --- /dev/null +++ b/doc/fluid/design/dist_train/index_cn.rst @@ -0,0 +1,9 @@ +分布式训练 +------------ + +.. toctree:: + :maxdepth: 1 + + distributed_architecture.md + distributed_lookup_table_design.md + parameter_server.md diff --git a/doc/fluid/design/dist_train/index_en.rst b/doc/fluid/design/dist_train/index_en.rst new file mode 100644 index 0000000000..f84688f168 --- /dev/null +++ b/doc/fluid/design/dist_train/index_en.rst @@ -0,0 +1,9 @@ +Distributed Training +--------------------- + +.. toctree:: + :maxdepth: 1 + + distributed_architecture.md + distributed_lookup_table_design.md + parameter_server.md diff --git a/doc/fluid/design/dynamic_rnn/index_cn.rst b/doc/fluid/design/dynamic_rnn/index_cn.rst new file mode 100644 index 0000000000..1d224d22cf --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/index_cn.rst @@ -0,0 +1,8 @@ +动态RNN +------------ + +.. toctree:: + :maxdepth: 1 + + rnn.md + rnn_design.md diff --git a/doc/fluid/design/dynamic_rnn/index_en.rst b/doc/fluid/design/dynamic_rnn/index_en.rst new file mode 100644 index 0000000000..568f496e4f --- /dev/null +++ b/doc/fluid/design/dynamic_rnn/index_en.rst @@ -0,0 +1,8 @@ +Dynamic RNN +------------ + +.. toctree:: + :maxdepth: 1 + + rnn.md + rnn_design.md diff --git a/doc/fluid/design/dynamic_rnn/rnn_design.md b/doc/fluid/design/dynamic_rnn/rnn_design.md index 3d38b9a0ad..cecfcd3307 100644 --- a/doc/fluid/design/dynamic_rnn/rnn_design.md +++ b/doc/fluid/design/dynamic_rnn/rnn_design.md @@ -99,7 +99,7 @@ private: - 由于传递过程是以复制`shared_ptr`的方式实现,因此框架只需要传递一次 `lod_start_pos` 2. 对于不感知 `lod_start_pos` 的Op足够透明 -3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据 +3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据 具体的设计分为以下3小节 @@ -189,7 +189,7 @@ struct SortedSeqItem { std::vector sorted_seqs; ``` -来追踪序列排序后的位置,并添加一个新的接口 +来追踪序列排序后的位置,并添加一个新的接口 ```c++ std::vector SortBySeqLen(const LODTensor& tensor); @@ -233,7 +233,10 @@ x x - 将每个序列concat 为规则的mini-batch表示 ## 参考文献 -1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing) -2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html) -3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5) -4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail) +[Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing) + +[mxnet Bucketing](http://mxnet.io/how_to/bucketing.html) + +[variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5) + +[Level of details](https://en.wikipedia.org/wiki/Level_of_detail) diff --git a/doc/fluid/design/execution/index_cn.rst b/doc/fluid/design/execution/index_cn.rst new file mode 100644 index 0000000000..ed31b01742 --- /dev/null +++ b/doc/fluid/design/execution/index_cn.rst @@ -0,0 +1,8 @@ +执行流程 +------------- + +.. toctree:: + :maxdepth: 1 + + switch.md + if_else_op.md diff --git a/doc/fluid/design/execution/index_en.rst b/doc/fluid/design/execution/index_en.rst new file mode 100644 index 0000000000..fcf846da34 --- /dev/null +++ b/doc/fluid/design/execution/index_en.rst @@ -0,0 +1,8 @@ +Execution Process +-------------------------------------- + +.. toctree:: + :maxdepth: 1 + + switch.md + if_else_op.md diff --git a/doc/fluid/design/execution/switch.md b/doc/fluid/design/execution/switch.md index 827d0601c6..1c337bd715 100644 --- a/doc/fluid/design/execution/switch.md +++ b/doc/fluid/design/execution/switch.md @@ -1,6 +1,6 @@ -### Design Doc: Switch +# Design Doc: Switch -### Background +## Background Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid. @@ -19,7 +19,7 @@ with switch() as switch: fluid.print("Case 3") ``` -### The Semantics +## The Semantics 1. A `switch` control-flow checks cases one-by-one. 1. The condition of each case is a boolean value, which is a scalar, and differs from the `fluid.if_else` control-flow, which condition could be a vector of boolean values. diff --git a/doc/fluid/design/index_cn.rst b/doc/fluid/design/index_cn.rst index f1887be690..e9f55214f4 100644 --- a/doc/fluid/design/index_cn.rst +++ b/doc/fluid/design/index_cn.rst @@ -1,2 +1,19 @@ 设计思想 ------------ + +.. toctree:: + :maxdepth: 1 + + motivation/index_cn.rst + execution/index_cn.rst + concepts/index_cn.rst + data_type/index_cn.rst + memory/index_cn.rst + muti_devices/index_cn.rst + dynamic_rnn/index_cn.rst + concurrent/index_cn.rst + algorithm/index_cn.rst + network/index_cn.rst + modules/index_cn.rst + interface/index_cn.rst + dist_train/index_cn.rst diff --git a/doc/fluid/design/index_en.rst b/doc/fluid/design/index_en.rst index 18a4b4122f..2802dc3a31 100644 --- a/doc/fluid/design/index_en.rst +++ b/doc/fluid/design/index_en.rst @@ -1,2 +1,19 @@ Design ------------ + +.. toctree:: + :maxdepth: 1 + + motivation/index_en.rst + execution/index_en.rst + concepts/index_en.rst + data_type/index_en.rst + memory/index_en.rst + muti_devices/index_en.rst + dynamic_rnn/index_en.rst + concurrent/index_en.rst + algorithm/index_en.rst + network/index_en.rst + modules/index_en.rst + interface/index_en.rst + dist_train/index_en.rst diff --git a/doc/fluid/design/interface/index_cn.rst b/doc/fluid/design/interface/index_cn.rst new file mode 100644 index 0000000000..69a8d9bad4 --- /dev/null +++ b/doc/fluid/design/interface/index_cn.rst @@ -0,0 +1,4 @@ +多语言接口 +------------ + +TBD diff --git a/doc/fluid/design/interface/index_en.rst b/doc/fluid/design/interface/index_en.rst new file mode 100644 index 0000000000..22abc71f98 --- /dev/null +++ b/doc/fluid/design/interface/index_en.rst @@ -0,0 +1,4 @@ +Multi-Language Interface +----------------------- + +TBD diff --git a/doc/fluid/design/memory/index_cn.rst b/doc/fluid/design/memory/index_cn.rst new file mode 100644 index 0000000000..c507c638bd --- /dev/null +++ b/doc/fluid/design/memory/index_cn.rst @@ -0,0 +1,7 @@ +内存管理 +------------ + +.. toctree:: + :maxdepth: 1 + + memory_optimization.md diff --git a/doc/fluid/design/memory/index_en.rst b/doc/fluid/design/memory/index_en.rst new file mode 100644 index 0000000000..f7526437a7 --- /dev/null +++ b/doc/fluid/design/memory/index_en.rst @@ -0,0 +1,7 @@ +Memory Management +------------------- + +.. toctree:: + :maxdepth: 1 + + memory_optimization.md diff --git a/doc/fluid/design/modules/evaluator.md b/doc/fluid/design/modules/evaluator.md index 11cc129d56..de9605b0e6 100644 --- a/doc/fluid/design/modules/evaluator.md +++ b/doc/fluid/design/modules/evaluator.md @@ -1,10 +1,10 @@ -## Evaluator Design +# Evaluator Design -### Problem Statement +## Problem Statement During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants. -### Evaluator Design +## Evaluator Design Currently, every operation is expressed in the graph. We divide the evaluator process into three steps. 1. Initialize the metric state and add it into the block. @@ -14,11 +14,11 @@ Currently, every operation is expressed in the graph. We divide the evaluator pr 3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices. -### Implementation -This design is shown in the Python API. -Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. +## Implementation +This design is shown in the Python API. +Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. + - ```python class Evaluator(object): """ @@ -32,7 +32,7 @@ class Evaluator(object): The initialization of Evaluator should be responsible for: create metric states and append to the main_program - """ + """ pass def _update_ops(self, input, label, **kwargs) @@ -40,14 +40,14 @@ class Evaluator(object): Add mini-batch evaluator caculate operators to the main_program. Add increment operator to accumulate the metric states. """ - + def reset(self, executor, reset_program=None): """ Reset metric states at the begin of each pass/user specified batch number. Execute the reset_program to reset the states. """ - + def eval(self, executor, eval_program=None): """ diff --git a/doc/fluid/design/modules/index_cn.rst b/doc/fluid/design/modules/index_cn.rst new file mode 100644 index 0000000000..b25783f0f5 --- /dev/null +++ b/doc/fluid/design/modules/index_cn.rst @@ -0,0 +1,14 @@ +代码结构和重要模块 +----------------- + +.. toctree:: + :maxdepth: 1 + + backward.md + python_api.md + regularization.md + infer_var_type.md + optimizer.md + prune.md + register_grad_op.md + net_op_design.md diff --git a/doc/fluid/design/modules/index_en.rst b/doc/fluid/design/modules/index_en.rst new file mode 100644 index 0000000000..2108156e08 --- /dev/null +++ b/doc/fluid/design/modules/index_en.rst @@ -0,0 +1,14 @@ +Code Structure and Important Modules +------------------------------------- + +.. toctree:: + :maxdepth: 1 + + backward.md + python_api.md + regularization.md + infer_var_type.md + optimizer.md + prune.md + register_grad_op.md + net_op_design.md diff --git a/doc/fluid/design/modules/net_op_design.md b/doc/fluid/design/modules/net_op_design.md index a5f0483081..e64ac2fb1c 100644 --- a/doc/fluid/design/modules/net_op_design.md +++ b/doc/fluid/design/modules/net_op_design.md @@ -1,16 +1,16 @@ # Network Design `Network` is the container and controller of a set of operators, -user can build a real network from a `NetDesc` which is a protobuf message +user can build a real network from a `NetDesc` which is a protobuf message and use `Network.Run()` to run all the operators in the network. -A network object knows all Operators belonging to this network. Variables, -which are inputs and outputs of these operators, +A network object knows all Operators belonging to this network. Variables, +which are inputs and outputs of these operators, are created and managed by a hierarchy of Scope objects. -# API +## API -## Net +### Net To make the `Network` extendable, a base class is defined like this ```c++ @@ -43,8 +43,8 @@ class Net { }; ``` -All network implementations should build networks from a protobuf message which -describes the structure of a real network; `Run` method should be implemented by +All network implementations should build networks from a protobuf message which +describes the structure of a real network; `Run` method should be implemented by all implementations to offer a universal method to forward or backward compute a network. `Net::Create` is a method of factory pattern and can be implemented like @@ -64,7 +64,7 @@ std::unique Net::Create(const NetDesc& def) { ``` Network is designed as the container of operators. to make it more extendable, -we decouple it from the related variable resources. +we decouple it from the related variable resources. `Run(Scope* scope)` takes the scope as a argument so that it can run in different scopes. @@ -80,7 +80,7 @@ if (net) { } ``` -## `PlainNet` as a simple implementation of `BaseNet` +### `PlainNet` as a simple implementation of `BaseNet` A very basic implementation is as follows. All it does is simply to run every operators in sequence. @@ -211,9 +211,9 @@ class NetBuilder final { } ``` -## Compatibility with RNN +### Compatibility with RNN -Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design, +Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with future RNN design, for example we can implement a simple recurrent neural network as follows ```c++ diff --git a/doc/fluid/design/modules/optimizer.md b/doc/fluid/design/modules/optimizer.md index 691081c268..1c25fde9ca 100644 --- a/doc/fluid/design/modules/optimizer.md +++ b/doc/fluid/design/modules/optimizer.md @@ -1,6 +1,6 @@ -## Optimizer Design +# Optimizer Design -### The Problem +## The Problem A PaddlePaddle program, or a block, is a sequence of operators operating variables. A training program needs to do three kinds of works: @@ -19,7 +19,7 @@ It's true that users should be able to create all these operators manually by ca In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass. -### High-level Python API to describe the training process +## High-level Python API to describe the training process 1. User write code to describe the network: @@ -54,7 +54,7 @@ In this design, we propose a high-level API that automatically derives the optim sess.run(target= opt_op_list, ...) ``` -#### Optimizer Python interface: +### Optimizer Python interface: ```python class Optimizer(object): diff --git a/doc/fluid/design/motivation/index_cn.rst b/doc/fluid/design/motivation/index_cn.rst new file mode 100644 index 0000000000..7706e73eca --- /dev/null +++ b/doc/fluid/design/motivation/index_cn.rst @@ -0,0 +1,10 @@ +设计动机和目标 +------------- + +.. toctree:: + :maxdepth: 1 + + api.md + refactorization.md + fluid.md + fluid_compiler.md diff --git a/doc/fluid/design/motivation/index_en.rst b/doc/fluid/design/motivation/index_en.rst new file mode 100644 index 0000000000..10b64b257c --- /dev/null +++ b/doc/fluid/design/motivation/index_en.rst @@ -0,0 +1,10 @@ +Design Motivations and Goals +-------------------------------------- + +.. toctree:: + :maxdepth: 1 + + api.md + refactorization.md + fluid.md + fluid_compiler.md diff --git a/doc/fluid/design/motivation/refactorization.md b/doc/fluid/design/motivation/refactorization.md index f93d6155e1..7c39fabcc6 100644 --- a/doc/fluid/design/motivation/refactorization.md +++ b/doc/fluid/design/motivation/refactorization.md @@ -97,13 +97,13 @@ Compile Time -> IR -> Runtime --- -# Operator/OpWithKernel/OpKernel +## Operator/OpWithKernel/OpKernel ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot) --- -# Operator +## Operator ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot) * `Operator` is the fundamental building block of the user interface. @@ -113,7 +113,7 @@ Compile Time -> IR -> Runtime --- -# OpWithKernel/Kernel +## OpWithKernel/Kernel ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot) @@ -124,7 +124,7 @@ Compile Time -> IR -> Runtime --- -# Why separate Kernel and Operator +## Why separate Kernel and Operator * Separate GPU and CPU code. * Make Paddle capable of running without GPU. @@ -132,7 +132,7 @@ Compile Time -> IR -> Runtime * For example, same multiplication op can have different implementations kernels such as FP16 kernel, FP32 kernel, MKL, eigen kernel. --- -# Libraries for Kernel development +## Libraries for Kernel development * `Eigen::Tensor` contains basic math and element-wise functions. * Note that `Eigen::Tensor` has broadcast implementation. @@ -143,16 +143,16 @@ Compile Time -> IR -> Runtime * Hand-writing `GPUKernel` and `CPU` code * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.) --- -# Operator Registration +## Operator Registration -## Why is registration necessary? +### Why is registration necessary? We need a method to build mappings between Op type names and Op classes. -## How is registration implemented? +### How is registration implemented? Maintaining a map, whose key is the type name and the value is the corresponding Op constructor. --- -# The Registry Map +## The Registry Map ### `OpInfoMap` @@ -166,7 +166,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding - **`checker`**: Used to check attributes. --- -# Related Concepts +## Related Concepts ### Op_Maker It's constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37)) @@ -178,7 +178,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) ``` --- -# Registration Process +## Registration Process 1. Write an Op class and its gradient Op class, if required. 2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator. 3. Invoke the macro `REGISTER_OP`. This macro will @@ -186,13 +186,13 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) 2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap` --- -# Backward Module (1/2) +## Backward Module (1/2) ### Create Backward Operator - Mapping from forward Op to backward Op ![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png) --- -# Backward Module (2/2) +## Backward Module (2/2) ### Build Backward Network - **Input**: a graph of forward operators - **Output**: a graph of backward operators @@ -205,7 +205,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) --- -# Scope, Variable, Tensor +## Scope, Variable, Tensor * `Tensor` is an n-dimension array with type. * Only dims and data pointers are stored in `Tensor`. @@ -218,8 +218,8 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) * `Scope` has a hierarchical structure. The local scope can get variables from its parent scope. --- -# Block (in design) -## the difference between original RNNOp and Block +## Block (in design) +### the difference between original RNNOp and Block - As an operator is more intuitive than `RNNOp`, - Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`, - Fits the compile-time/ runtime separation design paradigm. @@ -227,7 +227,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`. --- -# Milestone +## Milestone - Take Paddle/books as the main line, the requirement of the models motivates framework refactoring, - Model migration - Framework development gives **priority support** to model migration, for example, @@ -240,7 +240,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) - Accept imperfection, concentrate on solving the specific problem at the right price. --- -# Control the migration quality +## Control the migration quality - Compare the performance of migrated models with old ones. - Follow the google C++ style guide. - Build the automatic workflow of generating Python/C++ documentations. diff --git a/doc/fluid/design/muti_devices/index_cn.rst b/doc/fluid/design/muti_devices/index_cn.rst new file mode 100644 index 0000000000..1f8439e862 --- /dev/null +++ b/doc/fluid/design/muti_devices/index_cn.rst @@ -0,0 +1,9 @@ +多设备支持 +------------ + +.. toctree:: + :maxdepth: 1 + + operator_kernel_type.md + kernel_selection.md + kernel_hint_design.md diff --git a/doc/fluid/design/muti_devices/index_en.rst b/doc/fluid/design/muti_devices/index_en.rst new file mode 100644 index 0000000000..819e9c5d77 --- /dev/null +++ b/doc/fluid/design/muti_devices/index_en.rst @@ -0,0 +1,9 @@ +Multi-Device Support +---------------------- + +.. toctree:: + :maxdepth: 1 + + operator_kernel_type.md + kernel_selection.md + kernel_hint_design.md diff --git a/doc/fluid/design/muti_devices/kernel_hint_design.md b/doc/fluid/design/muti_devices/kernel_hint_design.md index a54b7da045..728c8f0b96 100644 --- a/doc/fluid/design/muti_devices/kernel_hint_design.md +++ b/doc/fluid/design/muti_devices/kernel_hint_design.md @@ -1,4 +1,4 @@ -## Problem +# Problem In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, `use_cudnn` to choose a CUDNN kernel, we need to provide a way for users to do this. In the current design, we use KernelType to describe one kernel. diff --git a/doc/fluid/design/muti_devices/kernel_selection.md b/doc/fluid/design/muti_devices/kernel_selection.md index 9719e031c7..39ea2b0009 100644 --- a/doc/fluid/design/muti_devices/kernel_selection.md +++ b/doc/fluid/design/muti_devices/kernel_selection.md @@ -1,4 +1,4 @@ -## Background +# Background Every operator has many kernels because there are multiple data types, places, data layout, library type that Fluid supports. We use the `OpKernelType ` to describe kernel types that operators can hold. The `OpKernelType ` is as follows: diff --git a/doc/fluid/design/network/index_cn.rst b/doc/fluid/design/network/index_cn.rst new file mode 100644 index 0000000000..3557d55fe4 --- /dev/null +++ b/doc/fluid/design/network/index_cn.rst @@ -0,0 +1,7 @@ +复杂网络设计 +------------ + +.. toctree:: + :maxdepth: 1 + + sequence_decoder.md diff --git a/doc/fluid/design/network/index_en.rst b/doc/fluid/design/network/index_en.rst new file mode 100644 index 0000000000..73a7137236 --- /dev/null +++ b/doc/fluid/design/network/index_en.rst @@ -0,0 +1,7 @@ +Complex Network Design +------------------------ + +.. toctree:: + :maxdepth: 1 + + sequence_decoder.md diff --git a/doc/fluid/dev/api_doc_std_cn.md b/doc/fluid/dev/api_doc_std_cn.md index 5596b2653a..b50f18f21d 100644 --- a/doc/fluid/dev/api_doc_std_cn.md +++ b/doc/fluid/dev/api_doc_std_cn.md @@ -45,11 +45,11 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 - Python API Definition - 格式: - + [Python API Definition] - + - 示例 - + ``` fc(input, size, @@ -63,19 +63,19 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 ``` - Function Description - + - 格式 本模块应包含以下内容(排列顺序为文档撰写顺序): [Function Description] - + [Formula] - + [Symbols' Descriptions if necessary] - + [References if necessary] - + - 示例 [Function Description] @@ -119,18 +119,18 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 [References if necessary] 因fc没有必要列出的参考文献,故该内容省略。其他情况下需明确给出对应的参考文献和对应连接,以 layer_norm 为例: - + ``` Refer to `Layer Normalization `_ for more details. ``` - + - Args Description - + - 格式 - + \[Arg's Name\][(Data Type, Default Value)][Description] - + - 示例 fc的部分参数注释如下: @@ -145,35 +145,35 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 ``` - Returns - + - 格式 - + [Name][Shape] - + - 示例 - + ``` Returns: A tensor variable storing the transformation result. ``` - + 当返回值为包含多个参数的tuple时,应按顺序逐个介绍各参数,以dynamic_lstm为例: - + ``` Returns: A tuple containing: The hidden state of LSTM whose shape is (T X D). The cell state of LSTM whose shape is (T X D). ``` - + - Raises - 格式 - + [Exception Type][Condition] - 示例 - + ``` Raises: ValueError: If the rank of the input is less than 2. @@ -182,7 +182,7 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 - Note - 格式 - + [Note] - 示例 @@ -198,15 +198,15 @@ API文档须使用reStructuredText格式撰写,该格式详情请参考[链接 2. When num_heads == 1, scaled_dot_product_attention has no learnable parameters. ``` - + - Examples - 格式 \[Python Code Snipper] - + - 示例 - + ``` Examples: .. code-block:: python diff --git a/doc/fluid/dev/index_cn.rst b/doc/fluid/dev/index_cn.rst index e1edf079fa..e70bf5dff3 100644 --- a/doc/fluid/dev/index_cn.rst +++ b/doc/fluid/dev/index_cn.rst @@ -1,2 +1,13 @@ 开发标准 ------------ + +.. toctree:: + :maxdepth: 1 + + new_op_en.md + new_op_kernel_en.md + use_eigen_en.md + name_convention.md + support_new_device.md + releasing_process.md + op_markdown_format.md diff --git a/doc/fluid/dev/index_en.rst b/doc/fluid/dev/index_en.rst index faf9dfcd31..f0e9afcfcc 100644 --- a/doc/fluid/dev/index_en.rst +++ b/doc/fluid/dev/index_en.rst @@ -1,4 +1,13 @@ Development ------------ -This is Development page +.. toctree:: + :maxdepth: 1 + + new_op_en.md + new_op_kernel_en.md + use_eigen_en.md + name_convention.md + support_new_device.md + releasing_process.md + op_markdown_format.md diff --git a/doc/fluid/dev/name_convention.md b/doc/fluid/dev/name_convention.md index a02b356f05..75830ef28c 100644 --- a/doc/fluid/dev/name_convention.md +++ b/doc/fluid/dev/name_convention.md @@ -1,8 +1,8 @@ -## Operator's Parameter Name Convention +# Operator's Parameter Name Convention To make the operator document itself more clear, we recommend operator names obey the listing conventions. -### OpProtoMaker names +## OpProtoMaker names When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc)need to be defined. All the Input/Output and Attributes will write into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61) , and will be used in client language to create operator. @@ -20,7 +20,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith - Order. - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice. -### Best Practice +## Best Practice Here we give some examples to show how these rules will be used. diff --git a/doc/fluid/dev/new_op_kernel_en.md b/doc/fluid/dev/new_op_kernel_en.md index 123df0a7ee..55dea8d0a3 100644 --- a/doc/fluid/dev/new_op_kernel_en.md +++ b/doc/fluid/dev/new_op_kernel_en.md @@ -1,14 +1,14 @@ -## Add Kernels for a New Device +# Add Kernels for a New Device -### Background +## Background PaddlePaddle Fluid have hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU. [This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). -### Write Kernels for A New Device +## Write Kernels for A New Device -#### Add A New Device +### Add A New Device For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24). We will correct this ASAP. @@ -23,7 +23,7 @@ enum class LibraryType { ``` -#### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53) +### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53) If you have a new kind of Device, firstly you need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`: @@ -45,7 +45,7 @@ struct CUDAPlace { typedef boost::variant Place; ``` -#### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)) +### Add [device context]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)) After a new kind of Device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it. ```cpp @@ -58,7 +58,7 @@ class DeviceContext { }; ``` -#### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device. +### Implement new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your Device. A detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) @@ -85,7 +85,7 @@ class OpKernel : public OpKernelBase { ``` -#### Register the OpKernel to framework +### Register the OpKernel to framework After writing the components described above, we should register the kernel to the framework. @@ -107,7 +107,7 @@ take [`conv2d`]((https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/oper REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace, paddle::operators::GemmConvKernel, paddle::operators::GemmConvKernel); - + REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace, paddle::operators::CUDNNConvOpKernel, paddle::operators::CUDNNConvOpKernel); diff --git a/doc/fluid/dev/op_markdown_format.md b/doc/fluid/dev/op_markdown_format.md index 0ee804d592..4e539d7992 100644 --- a/doc/fluid/dev/op_markdown_format.md +++ b/doc/fluid/dev/op_markdown_format.md @@ -15,26 +15,26 @@ The signature of the operator. Each section mentioned above has been covered in further detail in the rest of the document. -# PaddlePaddle Operator Name +## PaddlePaddle Operator Name This should be in all small letters, in case of multiple words, we separate them with an underscore. For example: `array to lod tensor` should be written as `array_to_lod_tensor`. This naming convention should be standard across all PaddlePaddle operators. -# Standard Operator Name +## Standard Operator Name This is the standard name of the operator as used in the community. The general standard is usually: - Standard abbreviations like `SGD` are written in all capital letters. - Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word). - Keep numbers inside a word as is, with no boundary delimiters. - Follow the name of the operator with the keyword: `Activation Operator.` -# Operator description +## Operator description This section should contain the description of what the operator does, including the operation performed, the literature from where it comes and was introduced first, and other important details. The relevant paper/article including the hyperlink should be cited in this section. -# LaTeX equation +## LaTeX equation This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Two words in the same word should be separated by an underscore (`_`). -# The signature +## The signature This section describes the signature of the operator. A list of Inputs and Outputs, each of which have a small description of what the variable represents and the type of variable. The variable names follow the `CamelCase` naming convention. The proposed format for this is: `Section : VariableName : (VariableType) VariableDescription diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md index f36843b440..75922e7d85 100644 --- a/doc/fluid/dev/use_eigen_cn.md +++ b/doc/fluid/dev/use_eigen_cn.md @@ -1,16 +1,16 @@ -## 在Paddle中如何使用Eigen +# 在Paddle中如何使用Eigen 神经网络本质上是一个计算图,计算需要的数据存放在`Tensor`中,而计算过程是由`Operartor`来描述的。在执行时,`Operator`调用对应`OpKernel`中的`Compute`接口,实现对`Tensor`的操作。 -### Eigen Tensor模块 +## Eigen Tensor模块 Eigen Tensor模块对element-wise计算提供了强大的支持,并且书写一份代码,可以同时在CPU、GPU执行。但Eigen Tensor是一个正在开发中的模块,因此可能测试不够完备,文档较少。 关于Eigen Tensor模块的详细介绍请参考[文档1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) 和[文档2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md) -### paddle::framework::Tensor +## paddle::framework::Tensor Paddle Tensor定义在framework目录下,其主要接口如下: @@ -20,14 +20,14 @@ class Tensor { /*! Return a pointer to mutable memory block. */ template inline T* data(); - + /** * @brief Return a pointer to mutable memory block. * @note If not exist, then allocation. */ template inline T* mutable_data(platform::Place place); - + /** * @brief Return a pointer to mutable memory block. * @@ -38,17 +38,17 @@ class Tensor { */ template inline T* mutable_data(DDim dims, platform::Place place); - + /*! Resize the dimensions of the memory block. */ inline Tensor& Resize(const DDim& dims); - + /*! Return the dimensions of the memory block. */ inline const DDim& dims() const; private: /*! holds the memory block if allocated. */ std::shared_ptr holder_; - + /*! points to dimensions of memory block. */ DDim dim_; }; @@ -129,7 +129,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework -### 实现计算 +## 实现计算 当需要完成计算时,我们需要等式左边的EigenTensor调用device接口。在这里需要注意的是,这里的EigenTensor之间的运算只是改变了原有Tensor中的数据,而不会改变原有Tensor的shape信息。 diff --git a/doc/fluid/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md index 3a466f73d1..3313d097cb 100644 --- a/doc/fluid/dev/use_eigen_en.md +++ b/doc/fluid/dev/use_eigen_en.md @@ -1,9 +1,9 @@ -## How to use Eigen in Paddle +# How to use Eigen in Paddle Essentially, a neural network is a compute graph. T data needed for the computation is stored in `Tensor`s and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`. -### Eigen Tensor Module +## Eigen Tensor Module The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU. @@ -12,7 +12,7 @@ Note that Eigen Tensor is still being actively developed, so its tests are not c For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md). -### paddle::framework::Tensor +## paddle::framework::Tensor Paddle Tensor's is defined in the framework directory with the following interface: @@ -105,7 +105,7 @@ void Compute(const framework::ExecutionContext& context) const override { ``` -### paddle::framework::Tensor到EigenTensor的转换 +## paddle::framework::Tensor到EigenTensor的转换 As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`. @@ -129,7 +129,7 @@ For more transformations, see the [unit tests](https://github.com/PaddlePaddle/P -### Implementing Computation +## Implementing Computation While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data originally inthe Tensor and does not change all the shape information associated with the Tensor. diff --git a/doc/fluid/getstarted/concepts/index_cn.rst b/doc/fluid/getstarted/concepts/index_cn.rst new file mode 100644 index 0000000000..2e7f70fc4c --- /dev/null +++ b/doc/fluid/getstarted/concepts/index_cn.rst @@ -0,0 +1,4 @@ +基本使用概念 +============ + +TBD diff --git a/doc/fluid/getstarted/concepts/index_en.rst b/doc/fluid/getstarted/concepts/index_en.rst new file mode 100644 index 0000000000..78cca1e2a3 --- /dev/null +++ b/doc/fluid/getstarted/concepts/index_en.rst @@ -0,0 +1,4 @@ +Concepts +============ + +TBD diff --git a/doc/fluid/getstarted/index_cn.rst b/doc/fluid/getstarted/index_cn.rst index c4d8525f23..75af7354be 100644 --- a/doc/fluid/getstarted/index_cn.rst +++ b/doc/fluid/getstarted/index_cn.rst @@ -1,4 +1,19 @@ 新手入门 ------------- +============ -新手入门 + +如果需要快速了解PaddlePaddle的使用,可以参考以下指南。 + +.. toctree:: + :maxdepth: 1 + + quickstart_cn.rst + + +在使用PaddlePaddle构建应用时,需要了解一些基本概念。 +这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。 + +.. toctree:: + :maxdepth: 1 + + concepts/use_concepts_cn.rst diff --git a/doc/fluid/getstarted/index_en.rst b/doc/fluid/getstarted/index_en.rst index a4efd05e2f..75a43f4af8 100644 --- a/doc/fluid/getstarted/index_en.rst +++ b/doc/fluid/getstarted/index_en.rst @@ -1,4 +1,18 @@ GET STARTED ------------- +============ -This is get started page +If you want to quickly know how to use PaddlePaddle, please refer to the following guide: + +.. toctree:: + :maxdepth: 1 + + quickstart_en.rst + +While using PaddlePaddle to build applications, please understand some basic concepts. + +Here is an example of linear regression. It introduces workflow of PaddlePaddle, including data format, model configuration and training, etc. + +.. toctree:: + :maxdepth: 1 + + concepts/index_en.rst diff --git a/doc/fluid/getstarted/quickstart_cn.rst b/doc/fluid/getstarted/quickstart_cn.rst new file mode 120000 index 0000000000..93a9e4e37a --- /dev/null +++ b/doc/fluid/getstarted/quickstart_cn.rst @@ -0,0 +1 @@ +../../v2/getstarted/quickstart_cn.rst \ No newline at end of file diff --git a/doc/fluid/getstarted/quickstart_en.rst b/doc/fluid/getstarted/quickstart_en.rst new file mode 120000 index 0000000000..6e1894faa1 --- /dev/null +++ b/doc/fluid/getstarted/quickstart_en.rst @@ -0,0 +1 @@ +../../v2/getstarted/quickstart_en.rst \ No newline at end of file diff --git a/doc/fluid/howto/index_cn.rst b/doc/fluid/howto/index_cn.rst index a92abad0c5..97aeaf167d 100644 --- a/doc/fluid/howto/index_cn.rst +++ b/doc/fluid/howto/index_cn.rst @@ -1,2 +1,7 @@ 进阶使用 ------------ + +.. toctree:: + :maxdepth: 1 + + optimization/index_cn.rst diff --git a/doc/fluid/howto/index_en.rst b/doc/fluid/howto/index_en.rst index 06036bdce5..fd21e167ce 100644 --- a/doc/fluid/howto/index_en.rst +++ b/doc/fluid/howto/index_en.rst @@ -1,4 +1,7 @@ HOW TO ------------ -This is how to page +.. toctree:: + :maxdepth: 1 + + optimization/index_en.rst diff --git a/doc/fluid/howto/optimization/benchmark/README.md b/doc/fluid/howto/optimization/benchmark/README.md new file mode 120000 index 0000000000..db30af7f53 --- /dev/null +++ b/doc/fluid/howto/optimization/benchmark/README.md @@ -0,0 +1 @@ +../../../../../benchmark/cluster/README.md \ No newline at end of file diff --git a/doc/fluid/howto/optimization/benchmark/index_cn.rst b/doc/fluid/howto/optimization/benchmark/index_cn.rst new file mode 100644 index 0000000000..9404800eb8 --- /dev/null +++ b/doc/fluid/howto/optimization/benchmark/index_cn.rst @@ -0,0 +1,8 @@ +基准 +------------ + +.. toctree:: + :maxdepth: 1 + + vgg16/README.md + README.md diff --git a/doc/fluid/howto/optimization/benchmark/index_en.rst b/doc/fluid/howto/optimization/benchmark/index_en.rst new file mode 100644 index 0000000000..1e200b660c --- /dev/null +++ b/doc/fluid/howto/optimization/benchmark/index_en.rst @@ -0,0 +1,8 @@ +Benchmark +------------ + +.. toctree:: + :maxdepth: 1 + + vgg16/README.md + README.md diff --git a/doc/fluid/howto/optimization/benchmark/vgg16/README.md b/doc/fluid/howto/optimization/benchmark/vgg16/README.md new file mode 120000 index 0000000000..ca963ef5f0 --- /dev/null +++ b/doc/fluid/howto/optimization/benchmark/vgg16/README.md @@ -0,0 +1 @@ +../../../../../../benchmark/cluster/vgg16/README.md \ No newline at end of file diff --git a/doc/fluid/howto/optimization/cpu_profiling_cn.md b/doc/fluid/howto/optimization/cpu_profiling_cn.md index d59be670c2..17f895573a 100644 --- a/doc/fluid/howto/optimization/cpu_profiling_cn.md +++ b/doc/fluid/howto/optimization/cpu_profiling_cn.md @@ -8,7 +8,7 @@ PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大 * Python 与 C++ 混合代码的性能分析 -## Python代码的性能分析 +# Python代码的性能分析 ### 生成性能分析文件 diff --git a/doc/fluid/howto/optimization/cpu_profiling_en.md b/doc/fluid/howto/optimization/cpu_profiling_en.md index 01e5fddf61..abe4493c17 100644 --- a/doc/fluid/howto/optimization/cpu_profiling_en.md +++ b/doc/fluid/howto/optimization/cpu_profiling_en.md @@ -14,7 +14,7 @@ the profiling and tuning of 1. the Python code and 1. the mixture of Python and C++ code. -## Profiling the Python Code +# Profiling the Python Code ### Generate the Performance Profiling File @@ -81,7 +81,7 @@ focus on. We can sort above profiling file by tottime: We can see that the most time-consuming function is the `built-in method run`, which is a C++ function in `libpaddle.so`. We will -explain how to profile C++ code in the next section. At this +explain how to profile C++ code in the next section. At this moment, let's look into the third function `sync_with_cpp`, which is a Python function. We can click it to understand more about it: diff --git a/doc/fluid/howto/optimization/index_cn.rst b/doc/fluid/howto/optimization/index_cn.rst new file mode 100644 index 0000000000..27cc967023 --- /dev/null +++ b/doc/fluid/howto/optimization/index_cn.rst @@ -0,0 +1,9 @@ +性能优化 +------------ + +.. toctree:: + :maxdepth: 1 + + timeline.md + cpu_profiling_cn.md + benchmark/index_cn.rst diff --git a/doc/fluid/howto/optimization/index_en.rst b/doc/fluid/howto/optimization/index_en.rst new file mode 100644 index 0000000000..4ce624fe8f --- /dev/null +++ b/doc/fluid/howto/optimization/index_en.rst @@ -0,0 +1,9 @@ +Performance Optimization +--------------------------- + +.. toctree:: + :maxdepth: 1 + + timeline.md + cpu_profiling_en.md + benchmark/index_en.rst diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline.md index 9d9565a3e6..96481ae2a6 100644 --- a/doc/fluid/howto/optimization/timeline.md +++ b/doc/fluid/howto/optimization/timeline.md @@ -1,4 +1,4 @@ -## how to use timeline tool to do profile +# how to use timeline tool to do profile 1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number. diff --git a/doc/fluid/index_cn.rst b/doc/fluid/index_cn.rst index be3bed4393..d878d192ca 100644 --- a/doc/fluid/index_cn.rst +++ b/doc/fluid/index_cn.rst @@ -5,8 +5,8 @@ :maxdepth: 1 getstarted/index_cn.rst - design/index_cn.rst build_and_install/index_cn.rst + design/index_cn.rst howto/index_cn.rst dev/index_cn.rst faq/index_cn.rst diff --git a/doc/fluid/index_en.rst b/doc/fluid/index_en.rst index 87c831420a..2bc76b5898 100644 --- a/doc/fluid/index_en.rst +++ b/doc/fluid/index_en.rst @@ -5,8 +5,8 @@ :maxdepth: 1 getstarted/index_en.rst - design/index_en.rst build_and_install/index_en.rst + design/index_en.rst howto/index_en.rst dev/index_en.rst faq/index_en.rst diff --git a/doc/fluid/design/interface/00.why_plain_c.md b/doc/v2/design/interface/00.why_plain_c.md similarity index 100% rename from doc/fluid/design/interface/00.why_plain_c.md rename to doc/v2/design/interface/00.why_plain_c.md diff --git a/doc/fluid/design/interface/01.inference_implementation.md b/doc/v2/design/interface/01.inference_implementation.md similarity index 100% rename from doc/fluid/design/interface/01.inference_implementation.md rename to doc/v2/design/interface/01.inference_implementation.md diff --git a/doc/v2/design/interface/index_cn.rst b/doc/v2/design/interface/index_cn.rst new file mode 100644 index 0000000000..2509a5c5f4 --- /dev/null +++ b/doc/v2/design/interface/index_cn.rst @@ -0,0 +1,7 @@ +多语言接口 +------------ + +.. toctree:: + :maxdepth: 1 + + 00.why_plain_c.md diff --git a/doc/v2/design/interface/index_en.rst b/doc/v2/design/interface/index_en.rst new file mode 100644 index 0000000000..356e58c39c --- /dev/null +++ b/doc/v2/design/interface/index_en.rst @@ -0,0 +1,7 @@ +Multilingual Interface +----------------------- + +.. toctree:: + :maxdepth: 1 + + 00.why_plain_c.md diff --git a/doc/v2/design/mkl/mkldnn.md b/doc/v2/design/mkl/mkldnn.md index e2fe1e6b26..1bd2e7bc34 100644 --- a/doc/v2/design/mkl/mkldnn.md +++ b/doc/v2/design/mkl/mkldnn.md @@ -44,7 +44,7 @@ MKL,MKLML以及MKL-DNN三者关系如下表: | Name | Open Source | License | Descriptions | | :---------- | :--------------- | :---------- | :------------ | -| MKL | No | Proprietary | Accelerate math processing routines | +| MKL | No | Proprietary | Accelerate math processing routines | | MKLML | No | Proprietary | Small package of MKL, especially for Machine Learning | | MKL-DNN | Yes | Apache 2.0 | Accelerate primitives processing routines especially for Deep Neural Networks | @@ -89,7 +89,7 @@ PaddlePaddle/Paddle ### CMake 在`CMakeLists.txt`中提供一个与MKL有关的总开关:`WITH_MKL`,它负责决定编译时是否使用MKLML和MKL-DNN -- `WITH_MKLML` 控制是否使用MKLML库。 +- `WITH_MKLML` 控制是否使用MKLML库。 当打开`WITH_MKL`时,会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。 MKLML的库目前都是动态库,主要包括`libiomp5.so`和`libmklml_intel.so`。 @@ -172,7 +172,7 @@ if use_mkldnn self.layer_type = mkldnn_* ``` -所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。 +所有MKL-DNN的`layer_type`会以*mkldnn_*开头,这些会在`MKLDNN*Layer`注册layer的时候保证,以示区分。 同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag,用于选择是否使用MKL-DNN的相关功能。 From f0af1398b8216428255b7981a4fe0b490d2c03e6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 30 Mar 2018 11:30:05 +0800 Subject: [PATCH 278/314] add prefetch_op (#9495) * add prefetch_op * fix ci * optimize code * optimize code * fix include --- paddle/fluid/operators/CMakeLists.txt | 6 +- paddle/fluid/operators/detail/grpc_client.cc | 50 +++++++- paddle/fluid/operators/detail/grpc_client.h | 7 ++ paddle/fluid/operators/prefetch_op.cc | 115 +++++++++++++++++++ paddle/fluid/operators/send_op.cc | 20 +--- paddle/fluid/operators/send_recv_util.h | 36 ++++++ paddle/fluid/operators/send_vars_op.cc | 23 +--- 7 files changed, 213 insertions(+), 44 deletions(-) create mode 100644 paddle/fluid/operators/prefetch_op.cc create mode 100644 paddle/fluid/operators/send_recv_util.h diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8341170d68..9ed79453b9 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -183,6 +183,8 @@ if(WITH_DISTRIBUTE) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") op_library(send_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(send_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + op_library(prefetch_op DEPS ${DISTRIBUTE_DEPS}) + set_source_files_properties(prefetch_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(recv_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) @@ -191,9 +193,9 @@ if(WITH_DISTRIBUTE) set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) + cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor) else() - set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op send_vars_op send_barrier_op) + set(DEPS_OPS ${DEPS_OPS} send_op prefetch_op recv_op listen_and_serv_op send_vars_op send_barrier_op) endif() op_library(cond_op DEPS framework_proto tensor net_op) diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 03b789f326..9652bb888b 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -88,10 +88,13 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, const auto ch = GetChannel(ep_val); framework::Async([var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] { + // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); - // varhandle + // var handle VarHandle var_h; var_h.ep = ep_val; var_h.scope = p_scope; @@ -103,9 +106,6 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, s->Prepare(var_h, time_out); s->response_call_back_ = ProcGetResponse; - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); - auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); call->StartCall(); @@ -117,6 +117,48 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, return true; } +bool RPCClient::AsyncPrefetchVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out) { + const platform::DeviceContext* p_ctx = &ctx; + const std::string ep_val = ep; + const std::string in_var_name_val = in_var_name; + const std::string out_var_name_val = out_var_name; + const framework::Scope* p_scope = &scope; + const auto ch = GetChannel(ep_val); + + framework::Async([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, + time_out, ch, this] { + auto* var = p_scope->FindVar(in_var_name_val); + + ::grpc::ByteBuffer req; + SerializeToByteBuffer(in_var_name_val, var, *p_ctx, &req); + + // var handle + VarHandle var_h; + var_h.ep = ep_val; + var_h.scope = p_scope; + var_h.name = out_var_name_val; + var_h.ctx = p_ctx; + + // stub context + GetProcessor* s = new GetProcessor(ch); + s->Prepare(var_h, time_out); + s->response_call_back_ = ProcGetResponse; + + auto call = s->stub_g_.PrepareUnaryCall( + s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, (void*)s); + }); + + req_count_++; + return true; +} + void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); diff --git a/paddle/fluid/operators/detail/grpc_client.h b/paddle/fluid/operators/detail/grpc_client.h index 8216ac52fb..fe237e54ef 100644 --- a/paddle/fluid/operators/detail/grpc_client.h +++ b/paddle/fluid/operators/detail/grpc_client.h @@ -172,6 +172,13 @@ class RPCClient { const std::string& var_name, int64_t time_out = 600 * 1000); + bool AsyncPrefetchVariable(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + int64_t time_out = 600 * 1000); + void AsyncSendBatchBarrier(const std::string& ep, int64_t time_out = 600 * 1000); diff --git a/paddle/fluid/operators/prefetch_op.cc b/paddle/fluid/operators/prefetch_op.cc new file mode 100644 index 0000000000..09ab7da663 --- /dev/null +++ b/paddle/fluid/operators/prefetch_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/send_recv_util.h" + +namespace paddle { +namespace operators { + +class PrefetchOp : public framework::OperatorBase { + public: + PrefetchOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + auto ins = Inputs("X"); + auto outs = Outputs("Out"); + + std::vector epmap = Attr>("epmap"); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + auto client_var_name = Output("RPCClient"); + PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), + "Can not find variable '%s' in the scope.", + client_var_name); + auto* client_var = scope.FindVar(client_var_name); + detail::RPCClient* rpc_client = client_var->GetMutable(); + + for (size_t i = 0; i < ins.size(); i++) { + if (NeedSend(scope, ins[i])) { + VLOG(3) << "sending " << ins[i] << " to " << epmap[i] << "to get " + << outs[i] << "back"; + rpc_client->AsyncPrefetchVariable(epmap[i], ctx, scope, ins[i], + outs[i]); + } else { + VLOG(3) << "don't send no-initialied variable: " << ins[i]; + } + } + PADDLE_ENFORCE(rpc_client->Wait()); + } +}; + +class PrefetchOpMaker : public framework::OpProtoAndCheckerMaker { + public: + PrefetchOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(LoDTensor) Input Id variables to be sent").AsDuplicable(); + AddOutput("RPCClient", + "(RPCClient) The RPC client object which will be" + "initialized at most once."); + AddOutput("Out", + "(SelectedRows) result " + "to be fetched from parameter server") + .AsDuplicable(); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({"127.0.0.1:6164"}); + AddComment(R"DOC( +Prefetch operator + +This operator will send Ids variables to listen_and_serve op at +the parameter server and fetch result back. +)DOC"); + } +}; + +class PrefetchOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output("RPCClient").front(); + auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto var_type = framework::proto::VarType::RAW; + out_var.SetType(var_type); + } +}; + +class PrefetchOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(prefetch, ops::PrefetchOp, + paddle::framework::EmptyGradOpMaker, ops::PrefetchOpMaker, + ops::PrefetchOpVarTypeInference, + ops::PrefetchOpShapeInference); diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc index 0752bd1bbd..d47f66de21 100644 --- a/paddle/fluid/operators/send_op.cc +++ b/paddle/fluid/operators/send_op.cc @@ -12,35 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include #include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/send_recv_util.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { -static bool NeedSend(const framework::Scope& scope, - const std::string& varname) { - auto* var = scope.FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", - varname); - if (var->IsType()) { - return var->Get().IsInitialized(); - } else if (var->IsType()) { - return var->Get().rows().size() > 0UL; - } else { - PADDLE_THROW( - "Variable type in send side should be in " - "[LodTensor, SelectedRows]"); - } - return false; -} class SendOp : public framework::OperatorBase { public: diff --git a/paddle/fluid/operators/send_recv_util.h b/paddle/fluid/operators/send_recv_util.h new file mode 100644 index 0000000000..196f56f634 --- /dev/null +++ b/paddle/fluid/operators/send_recv_util.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +namespace paddle { +namespace operators { + +inline bool NeedSend(const framework::Scope& scope, + const std::string& varname) { + auto* var = scope.FindVar(varname); + PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", + varname); + if (var->IsType()) { + return var->Get().IsInitialized(); + } else if (var->IsType()) { + return var->Get().rows().size() > 0UL; + } else { + PADDLE_THROW( + "Variable type in send side should be in " + "[LodTensor, SelectedRows]"); + } + return false; +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/send_vars_op.cc b/paddle/fluid/operators/send_vars_op.cc index 523e9e2780..2cbd9e2394 100644 --- a/paddle/fluid/operators/send_vars_op.cc +++ b/paddle/fluid/operators/send_vars_op.cc @@ -12,34 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" - -#include #include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/send_recv_util.h" namespace paddle { namespace operators { -static bool NeedSend(const framework::Scope& scope, - const std::string& varname) { - auto* var = scope.FindVar(varname); - PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.", - varname); - if (var->IsType()) { - return var->Get().IsInitialized(); - } else if (var->IsType()) { - return var->Get().rows().size() > 0UL; - } else { - PADDLE_THROW( - "Variable type in send side should be in " - "[LodTensor, SelectedRows]"); - } - return false; -} class SendVarsOp : public framework::OperatorBase { public: @@ -95,7 +78,7 @@ Send operator This operator will send variables to listen_and_serve op at the parameter server. )DOC"); - AddAttr("ync_send", + AddAttr("sync_send", "(int, default 0)" "sync send or async send.") .SetDefault(0); From 374f1ca3b76f5ed6d6f5a7e5367840663913014c Mon Sep 17 00:00:00 2001 From: Yancey Date: Fri, 30 Mar 2018 12:00:18 +0800 Subject: [PATCH 279/314] Fix dist error with lr decay layer (#9489) Fix dist error with lr decay layer --- paddle/fluid/operators/listen_and_serv_op.cc | 59 +++++++++++--------- python/paddle/fluid/distribute_transpiler.py | 43 +++++++++++++- 2 files changed, 74 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 9796fabdb6..d5eae2be79 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -54,6 +54,24 @@ static void CreateTensorFromMessageType(framework::Variable *var, } } +static void ParallelExecuteBlocks(const std::vector ¶llel_blkids, + framework::Executor *executor, + framework::ProgramDesc *program, + framework::Scope *scope) { + std::vector> fs; + for (size_t idx : parallel_blkids) { + fs.push_back(framework::Async([&executor, &program, &scope, idx]() { + int run_block = idx; // thread local + try { + executor->Run(*program, scope, run_block, false, false); + } catch (std::exception &e) { + LOG(ERROR) << "run sub program error " << e.what(); + } + })); + } + for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); +} + class ListenAndServOp : public framework::OperatorBase { public: ListenAndServOp(const std::string &type, @@ -135,34 +153,27 @@ class ListenAndServOp : public framework::OperatorBase { break; } - // put optimize blocks in the thread pool to start run, the last block - // should be global ops. // NOTE: if is_gpu_place, CUDA kernels are laugched by multiple threads // and this will still work. - std::vector> fs; + // The optimize blocks which have the same parent ID would run parallel + // TODO(Yancey1989): need to use ParallelExecutor for future + size_t last_parent_blkid = program->Block(1).Parent(); + std::vector parallel_blkids; + parallel_blkids.push_back(1); double ts = detail::GetTimestamp(); - // block0 contains only listen_and_serv op, start run from block1. - for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { - fs.push_back( - framework::Async([&executor, &program, &recv_scope, blkid]() { - int run_block = blkid; // thread local - try { - executor.Run(*program, &recv_scope, run_block, false, false); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); - } - })); - } - for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait(); - // Run global block at final step, or block1 if there are only 2 blocks - if (num_blocks >= 2) { - try { - executor.Run(*program, &recv_scope, num_blocks - 1, false, false); - } catch (std::exception &e) { - LOG(ERROR) << "run sub program error " << e.what(); + for (size_t blkid = 2; blkid < num_blocks; ++blkid) { + if (program->Block(blkid).Parent() != last_parent_blkid) { + for (size_t idx : parallel_blkids) VLOG(3) << idx; + ParallelExecuteBlocks(parallel_blkids, &executor, program, + &recv_scope); + parallel_blkids.clear(); + last_parent_blkid = program->Block(blkid).Parent(); } + parallel_blkids.push_back(blkid); } + ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope); + VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts; // Reset the received sparse variables, the sum operator would not @@ -178,10 +189,6 @@ class ListenAndServOp : public framework::OperatorBase { rpc_service_->WaitClientGet(fan_in); sparse_vars.clear(); } // while(true) - - // for (int i = 0; i < num_blocks; ++i) { - // delete blk_ctx_list[i]; - // } } protected: diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 62147d325b..24297ffe33 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -338,15 +338,24 @@ class DistributeTranspiler: else: self._append_pserver_non_opt_ops(block, op) + append_block = optimize_block + # append lr decay ops to the child block if exits + lr_ops = self._get_lr_ops() + if len(lr_ops) > 0: + for _, op in enumerate(lr_ops): + self._append_pserver_non_opt_ops(append_block, op) + + append_block = pserver_program.create_block(append_block.idx) + # append op to the current block - per_opt_block = optimize_block + per_opt_block = append_block for _, opt_op in enumerate(opt_op_on_pserver): for _, op in enumerate(self.optimize_ops): # optimizer is connected to itself if ufind.is_connected(op, opt_op) and \ op not in global_ops: __append_optimize_op__(op, per_opt_block) - per_opt_block = pserver_program.create_block(0) + per_opt_block = pserver_program.create_block(append_block.idx) # append global ops for glb_op in global_ops: @@ -786,3 +795,33 @@ class DistributeTranspiler: else: iomap[key] = vars return iomap + + def _get_lr_ops(self): + lr_ops = [] + # find learning rate variables by optimize op + lr_vars = set() + for op in self.optimize_ops: + if self._is_opt_op(op): + lr_vars.add(op.input("LearningRate")[0]) + + find_ops = [] + # find ops which output is lr var + block = self.program.global_block() + for op in block.ops: + if set(op.output_arg_names) & lr_vars: + find_ops.append(op) + # make a union find struct by the ops in default_main_program + ufind = UnionFind(block.ops) + for op1 in block.ops: + for op2 in block.ops: + # NOTE: we need to skip all optimize ops, since it is connected + # with forward/backward ops and lr ops, we only need the lr ops. + if op1 != op2 and self._is_op_connected(op1, op2) and \ + not self._is_opt_op(op1) and not self._is_opt_op(op2): + ufind.union(op1, op2) + # find all ops which is related with lr var + for op1 in block.ops: + for op2 in find_ops: + if ufind.is_connected(op1, op2): + lr_ops.append(op1) + return lr_ops From 5a8b05f02ff652c7e6dd68e5d4af857d43c059cb Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Fri, 30 Mar 2018 12:16:28 +0800 Subject: [PATCH 280/314] add FAQ (#9494) * add faq * fix typo --- doc/v2/faq/build_and_install/index_cn.rst | 74 +++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/doc/v2/faq/build_and_install/index_cn.rst b/doc/v2/faq/build_and_install/index_cn.rst index 7c7e896d18..f292684fb5 100644 --- a/doc/v2/faq/build_and_install/index_cn.rst +++ b/doc/v2/faq/build_and_install/index_cn.rst @@ -139,3 +139,77 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二 touch ../extern_mklml-stamp/extern_mklml-download // 4. 接着编译即可 + +9. 在Mac上无法安装numpy等Python包,权限错误 +------------------ + +Mac上对自带的Python和包有严格的权限保护,最好不要在自带的Python上安装。建议用virtualenv建立一个新的Python环境来操作。 + +virtualenv的基本原理是将机器上的Python运行所需的运行环境完整地拷贝一份。我们可以在一台机器上制造多份拷贝,并在这多个拷贝之间自由切换,这样就相当于在一台机器上拥有了多个相互隔离、互不干扰的Python环境。 + +下面简单介绍下如何用virtualenv为Paddle生成一个专用的Python环境: + +安装virtualenv: +:::::::::::::::: + +virtualenv本身也是Python的一个包,可以用pip进行安装: + +.. code-block:: bash + + sudo -H pip install virtualenv + +由于virtualenv需要安装给系统自带的Python,因此需要使用sudo权限。 + +创建一个新的Python运行环境: +::::::::::::::::::: + +.. code-block:: bash + + virtualenv --no-site-packages paddle + +--no-site-packages 参数表示不拷贝已有的任何第三方包,创造一个完全干净的新Python环境。后面的paddle是我们为这个新创建的环境取的名字。 + +执行完这一步后,当前目录下应该会出现一个名为paddle(或者你取的其他名字)的目录。这个目录里保存了运行一个Python环境所需要的各种文件。 + +启动运行环境: +:::::::::::::::: + +.. code-block:: bash + + source paddle/bin/activate + +执行后会发现命令提示符前面增加了(paddle)字样,说明已经成功启动了名为‘paddle’的Python环境。执行which python,可以发现使用的已经是刚刚创建的paddle目录下的Python。 + +在这个环境中,我们可以自由地进行Paddle的安装、使用和开发工作,无需担心对系统自带Python的影响。 + +退出运行环境: +::::::::::::::: + +直接执行: + +.. code-block:: bash + + deactivate + +可以看到命令提示符前面的(paddle)字样消失。 + +自动启动某一Python环境: +:::::::::::::::: + +如果我们经常使用Paddle,我们每次打开终端后都需要执行一下source paddle/bin/activate来启动环境,比较繁琐。为了简便,可以修改终端的配置文件,来让终端每次启动后自动启动特定的Python环境。 + +执行: + +.. code-block:: bash + + vi ~/.bash_profile + +打开终端配置文件,并在文件的最后添加一行: + +.. code-block:: bash + + source paddle/bin/activate + +保存并关闭文件。 + +这样,每次打开终端时就会自动启动名为‘paddle’的Python环境了。 From 60d0a0594e4cf0152459646f36fa71d3f454856f Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Wed, 28 Mar 2018 17:13:25 +0800 Subject: [PATCH 281/314] refine parallel --- paddle/fluid/framework/parallel_executor.cc | 44 ++++++++++++++------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 8a90f231d7..91f2db9354 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" +#include #include "ThreadPool.h" @@ -102,30 +103,43 @@ void ParallelExecutor::BCastParamsToGPUs( auto *main_scope = member_->local_scopes_[0]; for (auto *var_desc : startup_program.Block(0).AllVars()) { + size_t idx = var_desc->Name().find("@GRAD"); + if (idx != std::string::npos) continue; if (var_desc->GetType() == proto::VarType::LOD_TENSOR) { auto &main_tensor = main_scope->FindVar(var_desc->Name())->Get(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - auto &dims = main_tensor.dims(); - size_t numel = main_tensor.numel(); - platform::NCCLGroupGuard guard; + auto &dims = main_tensor.dims(); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - if (i == 0) { - buffer = const_cast(main_tensor.data()); - } else { + if (paddle::platform::is_gpu_place(main_tensor.place())) { + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + platform::NCCLGroupGuard guard; + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; + if (i == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = + local_scope->Var(var_desc->Name())->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + auto &nccl_ctx = member_->nccl_ctxs_->at(place); + platform::dynload::ncclBcast(buffer, numel, data_type, 0, + nccl_ctx.comm_, nccl_ctx.stream()); + } + } else { + platform::CPUPlace cpu; + for (size_t i = 1; i < member_->places_.size(); ++i) { auto local_scope = member_->local_scopes_[i]; auto *t = local_scope->Var(var_desc->Name())->GetMutable(); t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); + t->mutable_data(cpu, main_tensor.type()); + paddle::framework::TensorCopy(main_tensor, cpu, t); } - - auto &nccl_ctx = member_->nccl_ctxs_->at(place); - platform::dynload::ncclBcast(buffer, numel, data_type, 0, - nccl_ctx.comm_, nccl_ctx.stream()); } } member_->nccl_ctxs_->WaitAll(); From 23bab34ca30f83ada8a1a671b0aa11e1377223c2 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 30 Mar 2018 13:35:14 +0800 Subject: [PATCH 282/314] Fix data transform when inplace (#9450) * fix data transform when op have inplace in/out * add log * should not delete scope because Compute maybe async * optimize code --- paddle/fluid/framework/operator.cc | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index b39a1164db..f6a43804ef 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -517,6 +517,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // do data transform Scope& new_scope = scope.NewScope(); + std::vector inplace_vars; for (auto& var_name_item : this->Inputs()) { for (auto& var_name : var_name_item.second) { auto* var = scope.FindVar(var_name); @@ -529,10 +530,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, auto out_var_names = OutputVars(true); if (std::find(out_var_names.begin(), out_var_names.end(), var_name) != out_var_names.end()) { - PADDLE_THROW( - "var %s is both input and output, " - "does not support transform", - var_name); + inplace_vars.push_back(var_name); } VLOG(3) << "Transform Variable " << var_name << " from " << kernel_type_for_var << " to " << expected_kernel_key; @@ -551,6 +549,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, kernel_iter->second->Compute( ExecutionContext(*this, new_scope, *new_dev_ctx)); + for (auto& var_name : inplace_vars) { + VLOG(3) << "share inplace var " + var_name + " back to it's original scope"; + auto* original_tensor = GetMutableTensorFromVar(scope.FindVar(var_name)); + auto* transformed_tensor = GetTensorFromVar(new_scope.FindVar(var_name)); + original_tensor->ShareDataWith(*transformed_tensor); + } + /*For profiling/benchmark only*/ if (FLAGS_benchmark) { new_dev_ctx->Wait(); From b7b0342fffa2ed9b54c9c86d5a1ac0f72d15dafb Mon Sep 17 00:00:00 2001 From: weixing Date: Fri, 30 Mar 2018 14:03:41 +0800 Subject: [PATCH 283/314] Translation for Model Configuration (#9513) * Translation for doc Model Configuration * Adjust --- doc/v2/faq/model/index_en.rst | 78 ++++++++++++++++++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/doc/v2/faq/model/index_en.rst b/doc/v2/faq/model/index_en.rst index cb26f59655..67a33e08e1 100644 --- a/doc/v2/faq/model/index_en.rst +++ b/doc/v2/faq/model/index_en.rst @@ -2,4 +2,80 @@ Model Configuration ################### -TBD +.. contents:: + +1. How to deal with error :code:`Duplicated layer name` +---------------------------------------------------------- + +The general reason for this error is that users may have set the same value for the attribute :code:`name` in different layers. Try to find out the :code:`name` attribute with the same value in diffrent layers and set them differently. + +2. How to use :code:`paddle.layer.memory`'s attribute :code:`name` +---------------------------------------------------------------------- + +* :code:`paddle.layer.memory` is used to get the output of a layer's last timestep and the layer is specified by the attribute :code:`name` . Thus, :code:`paddle.layer.memory` will associate with the layer that has the same value of attribute :code:`name` , and uses the output of the layer's last timestep as the input of its current timestep. + +* All the PaddlePaddle's layers have a unique name, which is set by the attribute :code:`name` . PaddlePaddle will automatically set it for the user when it is not explicitly set. :code:`paddle.layer.memory` is not a real layer, its name is set by the attribute :code:`memory_name` and PaddlePaddle will also automatically set it when the user does not explicitly set. The :code:`paddle.layer.memory` attribute :code:`name` is used to specify the layer it is associated with, and needs to be explicitly set by the user. + + +3. What is the difference between the two ways of using dropout +----------------------------------------------------------------- + +* There are two ways to use dropout in PaddlePaddle + + * Set the :code:`drop_rate` parameter in the layer's :code:`layer_atter` attribute. Take :code:`paddle.layer.fc` as an example: + + .. code-block:: python + + fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5)) + + * Use :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example: + + .. code-block:: python + + fc = paddle.layer.fc(input=input) + drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5) + +* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` as the previous method. This method is very memory intensive. + +* PaddlePaddle implements dropout in the activation function rather than in the layer. + +* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory`, :code:`paddle.layer.recurrent` implement activation of output in an unusual way, so we cannot use dropout by setting :code:`drop_rate` . To use dropout for these layers, we could use the second method, which is to use :code:`paddle.layer.dropout`. + +4. The differences between different recurrent layers +-------------------------------------------------------- +Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle: + +* :code:`paddle.layer.lstmemory` +* :code:`paddle.networks.simple_lstm` +* :code:`paddle.networks.lstmemory_group` +* :code:`paddle.networks.bidirectional_lstm` + +According to implementations, recurrent layer can be classified into 2 types: + +1. Recurrent layer implemented by recurrent_group: + + * Using this type of recurrent layers, users can access the intermediate value calculated by the recurrent unit within a timestep (eg: hidden states, memory cells, etc.) + * :code:`paddle.networks.lstmemory_group` belongs to this type of recurrent layers. + +2. Recurrent layer implemented as a complete operation: + + * Users can only access output values when using this type of recurrent layers. + * :code:`paddle.networks.lstmemory_group` , :code:`paddle.networks.simple_lstm` and :code:`paddle.networks.bidirectional_lstm` belong to this type of recurrent layer; + +By implementing recurrent layer as a complete operation, CPU and GPU calculations can be optimized. Therefore, the second type of recurrent layer is more efficient than the first one. In practical applications, we propose to use the second type of recurrent layers if there is no need to access the intermediate variable of LSTM. + +In addition, PaddlePaddle also contains a kind of LSTM calculation unit: :code:`paddle.networks.lstmemory_unit`: + + * Unlike the recurrent layer described above, :code:`paddle.networks.lstmemory_unit` defines the computational process of an LSTM unit in a timestep. It is not a complete recurrent layer, nor can it receive sequence data as input. + * :code:`paddle.networks.lstmemory_unit` can only be used as a step function in recurrent_group. + +5. Can Softmax's calculation dimension be specified? +-------------------------------------------------------------------- + +We can't specify calculation dimension for PaddlePaddle's softmax. It can only be calculated by rows. +In image tasks, for NCHW, if you need to calculate softmax in C dimension, you could use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then do the reshape operation and calculate softmax. + +6. Does PaddlePaddle support variable-dimensional data inputs +---------------------------------------------------------------- + +PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than the dimension of the input data for occupancy. From f6de248323c2fbb7cbb59b51d7448b2322caec4d Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 30 Mar 2018 14:08:17 +0800 Subject: [PATCH 284/314] fix server shutdown --- paddle/fluid/operators/detail/grpc_server.cc | 10 +++++----- paddle/fluid/operators/listen_and_serv_op.cc | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 9691d1e86b..109c762e74 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -174,13 +174,13 @@ void AsyncGRPCServer::ShutdownQueue() { std::unique_lock lock(cq_mutex_); cq_send_->Shutdown(); cq_get_->Shutdown(); - is_shut_down_ = true; } // This URL explains why shutdown is complicate: void AsyncGRPCServer::ShutDown() { - server_->Shutdown(); + is_shut_down_ = true; ShutdownQueue(); + server_->Shutdown(); } void AsyncGRPCServer::TryToRegisterNewSendOne() { @@ -213,14 +213,14 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, bool ok = false; while (true) { if (!cq->Next(&tag, &ok)) { - LOG(INFO) << cq_name << " get CompletionQueue shutdown!"; + LOG(INFO) << cq_name << " CompletionQueue shutdown!"; break; } PADDLE_ENFORCE(tag); // FIXME(typhoonzero): de-couple the barriers with recv_op - if (cq_name == "cq_get") WaitCond(1); - if (cq_name == "cq_send") WaitCond(0); + if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1); + if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0); RequestBase* base = (RequestBase*)tag; // reference: diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 08b83375dd..e45e81a56e 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -70,7 +70,6 @@ class ListenAndServOp : public framework::OperatorBase { void Stop() override { rpc_service_->Push(LISTEN_TERMINATE_MESSAGE); - rpc_service_->ShutDown(); server_thread_->join(); } From 5baa529e0e4a3163c1ae5c2241fa1efafc4e5d05 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Fri, 30 Mar 2018 15:06:05 +0800 Subject: [PATCH 285/314] fix compiler error of profiler_test in ONLY_CPU mode --- paddle/fluid/platform/profiler_test.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 366c82bf96..45cc271bb8 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_CUDA #include "cuda_runtime.h" +#endif #include "gtest/gtest.h" TEST(Event, CpuElapsedTime) { @@ -159,6 +161,7 @@ TEST(RecordEvent, RecordEvent) { DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler"); } +#ifdef PADDLE_WITH_CUDA TEST(TMP, stream_wait) { cudaStream_t stream; cudaStreamCreate(&stream); @@ -166,3 +169,4 @@ TEST(TMP, stream_wait) { cudaStreamSynchronize(stream); cudaStreamSynchronize(stream); } +#endif From b9874251c623a17c7db8c5c3c7214ae8b451a52f Mon Sep 17 00:00:00 2001 From: Tomasz Patejko Date: Fri, 30 Mar 2018 03:12:33 -0400 Subject: [PATCH 286/314] Plain LRN op throws an exception when is_test is set in backward pass --- paddle/fluid/operators/lrn_op.cc | 5 ++++- paddle/fluid/operators/lrn_op.h | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index b36b5c3a33..cb15683981 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -214,7 +214,10 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); - AddAttr("is_test", "").SetDefault(false); + AddAttr("is_test", + "Turns on memory optimization that optimizes away " + "unnecessary memory allocations. Used by MKLDNN.") + .SetDefault(false); AddComment(R"DOC( Local Response Normalization Operator. diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index 95796f7eec..0fd3175e85 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -121,6 +121,10 @@ class LRNGradKernel : public framework::OpKernel { T alpha = ctx.Attr("alpha"); T beta = ctx.Attr("beta"); + PADDLE_ENFORCE( + !ctx.Attr("is_test"), + "is_test attribute should be set to False in training phase."); + LRNGradFunctor f; f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); } From 912a573603a2fcc41d447cd6937351caae8cdefe Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Fri, 30 Mar 2018 15:40:44 +0800 Subject: [PATCH 287/314] Move v2/api/fluid to fluid/api and Adjust doc build commands --- doc/CMakeLists.txt | 7 +++++++ doc/fluid/CMakeLists.txt | 2 ++ doc/fluid/api/CMakeLists.txt | 20 +++++++++++++++++++ .../api/fluid => fluid/api}/data_feeder.rst | 0 doc/{v2/api/fluid => fluid/api}/evaluator.rst | 0 doc/{v2/api/fluid => fluid/api}/executor.rst | 0 doc/{v2/api/fluid => fluid/api}/gen_doc.py | 0 doc/{v2/api/fluid => fluid/api}/gen_doc.sh | 0 .../index.rst => fluid/api/index_en.rst} | 0 .../api/fluid => fluid/api}/initializer.rst | 0 doc/{v2/api/fluid => fluid/api}/io.rst | 0 doc/{v2/api/fluid => fluid/api}/layers.rst | 0 doc/{v2/api/fluid => fluid/api}/nets.rst | 0 doc/{v2/api/fluid => fluid/api}/optimizer.rst | 0 .../api/fluid => fluid/api}/param_attr.rst | 0 doc/{v2/api/fluid => fluid/api}/profiler.rst | 0 .../api/fluid => fluid/api}/regularizer.rst | 0 doc/v2/CMakeLists.txt | 4 ++-- doc/v2/api/CMakeLists.txt | 2 +- paddle/scripts/travis/build_doc.sh | 2 +- 20 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 doc/fluid/api/CMakeLists.txt rename doc/{v2/api/fluid => fluid/api}/data_feeder.rst (100%) rename doc/{v2/api/fluid => fluid/api}/evaluator.rst (100%) rename doc/{v2/api/fluid => fluid/api}/executor.rst (100%) rename doc/{v2/api/fluid => fluid/api}/gen_doc.py (100%) rename doc/{v2/api/fluid => fluid/api}/gen_doc.sh (100%) rename doc/{v2/api/fluid/index.rst => fluid/api/index_en.rst} (100%) rename doc/{v2/api/fluid => fluid/api}/initializer.rst (100%) rename doc/{v2/api/fluid => fluid/api}/io.rst (100%) rename doc/{v2/api/fluid => fluid/api}/layers.rst (100%) rename doc/{v2/api/fluid => fluid/api}/nets.rst (100%) rename doc/{v2/api/fluid => fluid/api}/optimizer.rst (100%) rename doc/{v2/api/fluid => fluid/api}/param_attr.rst (100%) rename doc/{v2/api/fluid => fluid/api}/profiler.rst (100%) rename doc/{v2/api/fluid => fluid/api}/regularizer.rst (100%) diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index a9b27933a5..7066637a7c 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -1,2 +1,9 @@ +add_custom_target(paddle_apis ALL + DEPENDS paddle_v2_apis paddle_fluid_apis) + +add_custom_target(paddle_docs ALL + DEPENDS paddle_v2_docs paddle_v2_docs_cn + paddle_fluid_docs paddle_fluid_docs_cn) + add_subdirectory(v2) add_subdirectory(fluid) diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt index cc999f5a8d..fbf654ada8 100644 --- a/doc/fluid/CMakeLists.txt +++ b/doc/fluid/CMakeLists.txt @@ -47,3 +47,5 @@ sphinx_add_target(paddle_fluid_docs_cn ${SPHINX_CACHE_DIR_CN} ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) + +add_subdirectory(api) diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt new file mode 100644 index 0000000000..1627b963f3 --- /dev/null +++ b/doc/fluid/api/CMakeLists.txt @@ -0,0 +1,20 @@ +# configured documentation tools and intermediate build results +set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build") + +# Sphinx cache with pickled ReST documents +set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees") + +# HTML output director +set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html") + +configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/../../templates/conf.py.en.in" + "${BINARY_BUILD_DIR_EN}/conf.py" + @ONLY) + +sphinx_add_target(paddle_fluid_apis + html + ${BINARY_BUILD_DIR_EN} + ${SPHINX_CACHE_DIR_EN} + ${CMAKE_CURRENT_SOURCE_DIR} + ${SPHINX_HTML_DIR_EN}) diff --git a/doc/v2/api/fluid/data_feeder.rst b/doc/fluid/api/data_feeder.rst similarity index 100% rename from doc/v2/api/fluid/data_feeder.rst rename to doc/fluid/api/data_feeder.rst diff --git a/doc/v2/api/fluid/evaluator.rst b/doc/fluid/api/evaluator.rst similarity index 100% rename from doc/v2/api/fluid/evaluator.rst rename to doc/fluid/api/evaluator.rst diff --git a/doc/v2/api/fluid/executor.rst b/doc/fluid/api/executor.rst similarity index 100% rename from doc/v2/api/fluid/executor.rst rename to doc/fluid/api/executor.rst diff --git a/doc/v2/api/fluid/gen_doc.py b/doc/fluid/api/gen_doc.py similarity index 100% rename from doc/v2/api/fluid/gen_doc.py rename to doc/fluid/api/gen_doc.py diff --git a/doc/v2/api/fluid/gen_doc.sh b/doc/fluid/api/gen_doc.sh similarity index 100% rename from doc/v2/api/fluid/gen_doc.sh rename to doc/fluid/api/gen_doc.sh diff --git a/doc/v2/api/fluid/index.rst b/doc/fluid/api/index_en.rst similarity index 100% rename from doc/v2/api/fluid/index.rst rename to doc/fluid/api/index_en.rst diff --git a/doc/v2/api/fluid/initializer.rst b/doc/fluid/api/initializer.rst similarity index 100% rename from doc/v2/api/fluid/initializer.rst rename to doc/fluid/api/initializer.rst diff --git a/doc/v2/api/fluid/io.rst b/doc/fluid/api/io.rst similarity index 100% rename from doc/v2/api/fluid/io.rst rename to doc/fluid/api/io.rst diff --git a/doc/v2/api/fluid/layers.rst b/doc/fluid/api/layers.rst similarity index 100% rename from doc/v2/api/fluid/layers.rst rename to doc/fluid/api/layers.rst diff --git a/doc/v2/api/fluid/nets.rst b/doc/fluid/api/nets.rst similarity index 100% rename from doc/v2/api/fluid/nets.rst rename to doc/fluid/api/nets.rst diff --git a/doc/v2/api/fluid/optimizer.rst b/doc/fluid/api/optimizer.rst similarity index 100% rename from doc/v2/api/fluid/optimizer.rst rename to doc/fluid/api/optimizer.rst diff --git a/doc/v2/api/fluid/param_attr.rst b/doc/fluid/api/param_attr.rst similarity index 100% rename from doc/v2/api/fluid/param_attr.rst rename to doc/fluid/api/param_attr.rst diff --git a/doc/v2/api/fluid/profiler.rst b/doc/fluid/api/profiler.rst similarity index 100% rename from doc/v2/api/fluid/profiler.rst rename to doc/fluid/api/profiler.rst diff --git a/doc/v2/api/fluid/regularizer.rst b/doc/fluid/api/regularizer.rst similarity index 100% rename from doc/v2/api/fluid/regularizer.rst rename to doc/fluid/api/regularizer.rst diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt index 286fe8845c..48c9cf7327 100644 --- a/doc/v2/CMakeLists.txt +++ b/doc/v2/CMakeLists.txt @@ -20,7 +20,7 @@ configure_file( "${BINARY_BUILD_DIR_EN}/conf.py" @ONLY) -sphinx_add_target(paddle_docs +sphinx_add_target(paddle_v2_docs html ${BINARY_BUILD_DIR_EN} ${SPHINX_CACHE_DIR_EN} @@ -41,7 +41,7 @@ configure_file( "${BINARY_BUILD_DIR_CN}/conf.py" @ONLY) -sphinx_add_target(paddle_docs_cn +sphinx_add_target(paddle_v2_docs_cn html ${BINARY_BUILD_DIR_CN} ${SPHINX_CACHE_DIR_CN} diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt index 2ad589e8a2..a265a1b6f3 100644 --- a/doc/v2/api/CMakeLists.txt +++ b/doc/v2/api/CMakeLists.txt @@ -12,7 +12,7 @@ configure_file( "${BINARY_BUILD_DIR_EN}/conf.py" @ONLY) -sphinx_add_target(paddle_api_docs +sphinx_add_target(paddle_v2_apis html ${BINARY_BUILD_DIR_EN} ${SPHINX_CACHE_DIR_EN} diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index c389249172..09496e4de1 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -9,7 +9,7 @@ cd $TRAVIS_BUILD_DIR/build cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF make -j `nproc` gen_proto_py framework_py_proto make -j `nproc` copy_paddle_pybind -make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs +make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs paddle_fluid_api_docs # check websites for broken links linkchecker doc/v2/en/html/index.html From 9f9810cbb4942942a9ee5b2c65543cb4d78c1f55 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Fri, 30 Mar 2018 16:05:19 +0800 Subject: [PATCH 288/314] Add dependencies --- doc/fluid/CMakeLists.txt | 4 ++++ doc/fluid/api/CMakeLists.txt | 2 ++ doc/v2/CMakeLists.txt | 4 ++++ doc/v2/api/CMakeLists.txt | 2 ++ paddle/scripts/docker/build.sh | 2 +- paddle/scripts/travis/build_doc.sh | 2 +- 6 files changed, 14 insertions(+), 2 deletions(-) diff --git a/doc/fluid/CMakeLists.txt b/doc/fluid/CMakeLists.txt index fbf654ada8..9fe79323ef 100644 --- a/doc/fluid/CMakeLists.txt +++ b/doc/fluid/CMakeLists.txt @@ -27,6 +27,8 @@ sphinx_add_target(paddle_fluid_docs ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) +add_dependencies(paddle_fluid_docs gen_proto_py) + # configured documentation tools and intermediate build results set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") @@ -48,4 +50,6 @@ sphinx_add_target(paddle_fluid_docs_cn ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) +add_dependencies(paddle_fluid_docs_cn gen_proto_py) + add_subdirectory(api) diff --git a/doc/fluid/api/CMakeLists.txt b/doc/fluid/api/CMakeLists.txt index 1627b963f3..ca40dfb964 100644 --- a/doc/fluid/api/CMakeLists.txt +++ b/doc/fluid/api/CMakeLists.txt @@ -18,3 +18,5 @@ sphinx_add_target(paddle_fluid_apis ${SPHINX_CACHE_DIR_EN} ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) + +add_dependencies(paddle_fluid_apis gen_proto_py framework_py_proto copy_paddle_pybind) diff --git a/doc/v2/CMakeLists.txt b/doc/v2/CMakeLists.txt index 48c9cf7327..82de7a3a3e 100644 --- a/doc/v2/CMakeLists.txt +++ b/doc/v2/CMakeLists.txt @@ -27,6 +27,8 @@ sphinx_add_target(paddle_v2_docs ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) +add_dependencies(paddle_v2_docs gen_proto_py) + # configured documentation tools and intermediate build results set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build") @@ -48,4 +50,6 @@ sphinx_add_target(paddle_v2_docs_cn ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_CN}) +add_dependencies(paddle_v2_docs_cn gen_proto_py) + add_subdirectory(api) diff --git a/doc/v2/api/CMakeLists.txt b/doc/v2/api/CMakeLists.txt index a265a1b6f3..da1eafc02e 100644 --- a/doc/v2/api/CMakeLists.txt +++ b/doc/v2/api/CMakeLists.txt @@ -18,3 +18,5 @@ sphinx_add_target(paddle_v2_apis ${SPHINX_CACHE_DIR_EN} ${CMAKE_CURRENT_SOURCE_DIR} ${SPHINX_HTML_DIR_EN}) + +add_dependencies(paddle_v2_apis gen_proto_py framework_py_proto copy_paddle_pybind) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 322f72e4a5..2309dc40cc 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -125,7 +125,7 @@ EOF -DWITH_STYLE_CHECK=OFF make -j `nproc` gen_proto_py framework_py_proto make -j `nproc` copy_paddle_pybind - make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs + make -j `nproc` paddle_docs paddle_apis popd fi diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 09496e4de1..eabcda95b8 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -9,7 +9,7 @@ cd $TRAVIS_BUILD_DIR/build cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF make -j `nproc` gen_proto_py framework_py_proto make -j `nproc` copy_paddle_pybind -make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs paddle_fluid_api_docs +make -j `nproc` paddle_docs paddle_apis # check websites for broken links linkchecker doc/v2/en/html/index.html From 3800bc5f3e3ffdf864e03058e448f07c84c87c49 Mon Sep 17 00:00:00 2001 From: weixing02 <564445201@qq.com> Date: Fri, 30 Mar 2018 17:33:24 +0800 Subject: [PATCH 289/314] Remove redundant commands in build.sh and build_doc.sh --- paddle/scripts/docker/build.sh | 3 +-- paddle/scripts/travis/build_doc.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 8c2bdf8793..f916295cd7 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -125,8 +125,7 @@ EOF -DWITH_AVX=${WITH_AVX:-ON} \ -DWITH_SWIG_PY=ON \ -DWITH_STYLE_CHECK=OFF - make -j `nproc` gen_proto_py framework_py_proto - make -j `nproc` copy_paddle_pybind + make -j `nproc` paddle_docs paddle_apis popd fi diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index eabcda95b8..d7527d9948 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -7,8 +7,7 @@ cd $TRAVIS_BUILD_DIR/build # Compile Documentation only. cmake .. -DCMAKE_BUILD_TYPE=Release -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON -DWITH_STYLE_CHECK=OFF -make -j `nproc` gen_proto_py framework_py_proto -make -j `nproc` copy_paddle_pybind + make -j `nproc` paddle_docs paddle_apis # check websites for broken links From 53fa7cb9ccd17ce2e7ce0245a4733fbe73bef725 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Fri, 30 Mar 2018 17:38:02 +0800 Subject: [PATCH 290/314] Add local cache of double buffer reader --- .../reader/create_double_buffer_reader_op.cc | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 141a3eb935..f4b10cb032 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -128,9 +128,6 @@ void DoubleBufferReader::ReadNext(std::vector* out) { PADDLE_THROW("There is no next data!"); } - if (local_buffer_.payloads_.empty()) { - buffer_->Receive(&local_buffer_); - } *out = local_buffer_.payloads_; local_buffer_.payloads_.clear(); if (local_buffer_.ctx_) { @@ -149,21 +146,30 @@ void DoubleBufferReader::ReInit() { void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "A new prefetch thread starts."; size_t gpu_ctx_offset = 0; + std::vector> cpu_tensor_cache(4); + std::vector> gpu_tensor_cache(4); + size_t tensor_cache_id = 0; + while (reader_->HasNext()) { Item batch; reader_->ReadNext(&batch.payloads_); if (platform::is_gpu_place(place_)) { - std::vector gpu_batch; + tensor_cache_id %= 4; + auto& gpu_batch = gpu_tensor_cache[tensor_cache_id]; + auto& cpu_batch = cpu_tensor_cache[tensor_cache_id]; + cpu_batch = batch.payloads_; + ++tensor_cache_id; + auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++]; gpu_ctx_offset %= this->ctxs_.size(); + gpu_batch.resize(batch.payloads_.size()); - for (size_t i = 0; i < batch.payloads_.size(); ++i) { - framework::TensorCopy(batch.payloads_[i], place_, *gpu_ctx, - &gpu_batch[i]); + for (size_t i = 0; i < cpu_batch.size(); ++i) { + framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]); gpu_batch[i].set_lod(batch.payloads_[i].lod()); } batch.ctx_ = gpu_ctx.get(); - std::swap(gpu_batch, batch.payloads_); + batch.payloads_ = gpu_batch; } try { From c3580eae4656a2ae66112b2ea372291e4c6d5b4c Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Fri, 30 Mar 2018 17:56:56 +0800 Subject: [PATCH 291/314] Add prefetch interface on server side --- paddle/fluid/operators/detail/CMakeLists.txt | 3 +- paddle/fluid/operators/detail/grpc_client.cc | 3 +- paddle/fluid/operators/detail/grpc_server.cc | 61 ++++++++++++++++++- paddle/fluid/operators/detail/grpc_server.h | 15 +++++ .../operators/detail/grpc_server_test.cc | 51 ++++++++++++++++ paddle/fluid/operators/detail/grpc_service.h | 3 + paddle/fluid/operators/detail/send_recv.proto | 2 + paddle/fluid/platform/profiler_test.cc | 4 ++ 8 files changed, 139 insertions(+), 3 deletions(-) create mode 100644 paddle/fluid/operators/detail/grpc_server_test.cc diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 2b19f04489..997309325c 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -2,7 +2,8 @@ if(WITH_DISTRIBUTE) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(test_serde.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(serde_test SRCS test_serde.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) + cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) endif() diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index 9652bb888b..ba9882ce24 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -150,7 +150,8 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, s->response_call_back_ = ProcGetResponse; auto call = s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/GetVariable", req, &cq_); + s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, + &cq_); call->StartCall(); call->Finish(&s->reply_, &s->status_, (void*)s); }); diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 9691d1e86b..26bef375cb 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -128,6 +128,47 @@ class RequestGet final : public RequestBase { SimpleBlockQueue* queue_; }; +class RequestPrefetch final : public RequestBase { + public: + explicit RequestPrefetch(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + framework::Executor* executor, + framework::ProgramDesc* program, int blkid) + : RequestBase(service, cq, dev_ctx), + responder_(&ctx_), + scope_(scope), + executor_(executor), + program_(program), + blkid_(blkid) { + int method_id = static_cast(detail::GrpcMethod::kPrefetchVariable); + service_->RequestAsyncUnary(method_id, &ctx_, &request_, &responder_, cq_, + cq_, this); + } + + virtual ~RequestPrefetch() {} + + virtual std::string GetReqName() { return request_.varname(); } + + virtual void Process() { + // prefetch process... + ::grpc::ByteBuffer relay; + // TODO(Yancey1989): execute the Block which containers prefetch ops + + responder_.Finish(relay, ::grpc::Status::OK, this); + status_ = FINISH; + } + + protected: + sendrecv::VariableMessage request_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + framework::Scope* scope_; + framework::Executor* executor_; + framework::ProgramDesc* program_; + int blkid_; +}; + void AsyncGRPCServer::WaitClientGet(int count) { int fetch_barriers = 0; while (fetch_barriers < count) { @@ -147,6 +188,7 @@ void AsyncGRPCServer::RunSyncUpdate() { cq_send_ = builder.AddCompletionQueue(); cq_get_ = builder.AddCompletionQueue(); + cq_prefetch_ = builder.AddCompletionQueue(); server_ = builder.BuildAndStart(); LOG(INFO) << "Server listening on " << address_ << std::endl; @@ -155,6 +197,8 @@ void AsyncGRPCServer::RunSyncUpdate() { std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this); std::function get_register = std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this); + std::function prefetch_register = + std::bind(&AsyncGRPCServer::TryToRegisterNewPrefetchOne, this); t_send_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, @@ -163,11 +207,14 @@ void AsyncGRPCServer::RunSyncUpdate() { t_get_.reset( new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, cq_get_.get(), "cq_get", get_register))); - + t_prefetch_.reset(new std::thread( + std::bind(&AsyncGRPCServer::HandleRequest, this, cq_prefetch_.get(), + "cq_prefetch", prefetch_register))); // wait server server_->Wait(); t_send_->join(); t_get_->join(); + t_prefetch_->join(); } void AsyncGRPCServer::ShutdownQueue() { @@ -203,6 +250,18 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() { VLOG(4) << "Create RequestGet status:" << get->Status(); } +void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { + std::unique_lock lock(cq_mutex_); + if (is_shut_down_) { + return; + } + RequestPrefetch* prefetch = + new RequestPrefetch(&service_, cq_prefetch_.get(), scope_, dev_ctx_, + executor_, program_, prefetch_blk_id_); + + VLOG(4) << "Create RequestPrefetch status:" << prefetch->Status(); +} + // FIXME(typhoonzero): change cq_name to enum. void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, std::string cq_name, diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index 10e6dd45a9..dd5cf4b377 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -17,7 +17,9 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/var_type.h" @@ -53,6 +55,12 @@ class AsyncGRPCServer final { void SetDevCtx(const platform::DeviceContext *dev_ctx) { dev_ctx_ = dev_ctx; } + void SetProgram(framework::ProgramDesc *program) { program_ = program; } + + void SetPrefetchBlkdId(int blkid) { prefetch_blk_id_ = blkid; } + + void SetExecutor(framework::Executor *executor) { executor_ = executor; } + const ReceivedMessage Get() { return this->var_recv_queue_.Pop(); } void Push(const std::string &msg_name) { @@ -66,6 +74,7 @@ class AsyncGRPCServer final { std::function TryToRegisterNewOne); void TryToRegisterNewSendOne(); void TryToRegisterNewGetOne(); + void TryToRegisterNewPrefetchOne(); void ShutdownQueue(); private: @@ -73,6 +82,7 @@ class AsyncGRPCServer final { volatile bool is_shut_down_ = false; std::unique_ptr<::grpc::ServerCompletionQueue> cq_send_; std::unique_ptr<::grpc::ServerCompletionQueue> cq_get_; + std::unique_ptr<::grpc::ServerCompletionQueue> cq_prefetch_; GrpcService::AsyncService service_; std::unique_ptr<::grpc::Server> server_; @@ -92,6 +102,11 @@ class AsyncGRPCServer final { std::unique_ptr t_send_; std::unique_ptr t_get_; + std::unique_ptr t_prefetch_; + + int prefetch_blk_id_; + framework::ProgramDesc *program_; + framework::Executor *executor_; }; }; // namespace detail diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc new file mode 100644 index 0000000000..5773748106 --- /dev/null +++ b/paddle/fluid/operators/detail/grpc_server_test.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/operators/detail/grpc_client.h" +#include "paddle/fluid/operators/detail/grpc_server.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace detail = paddle::operators::detail; + +std::unique_ptr rpc_service_; + +void StartServer(const std::string& endpoint) { + rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); +} + +TEST(PREFETCH, CPU) { + // start up a server instance backend + // TODO(Yancey1989): Need to start a server with optimize blocks and + // prefetch blocks. + std::thread server_thread(StartServer, "127.0.0.1:8889"); + framework::Scope scope; + platform::CPUPlace place; + platform::CPUDeviceContext ctx(place); + // create var on local scope + std::string var_name("tmp_0"); + auto var = scope.Var(var_name); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + + detail::RPCClient client; + client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, var_name, ""); + server_thread.join(); + rpc_service_.reset(nullptr); +} diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h index ae6f9db3bd..879e21933b 100644 --- a/paddle/fluid/operators/detail/grpc_service.h +++ b/paddle/fluid/operators/detail/grpc_service.h @@ -76,6 +76,7 @@ namespace detail { enum class GrpcMethod { kSendVariable, kGetVariable, + kPrefetchVariable, }; static const int kGrpcNumMethods = @@ -87,6 +88,8 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kPrefetchVariable: + return "/sendrecv.SendREcvService/PrefetchVariable"; } // Shouldn't be reached. diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 2d33f026e4..fc12e82a7e 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -21,6 +21,8 @@ service SendRecvService { rpc SendVariable(VariableMessage) returns (VoidMessage) {} // Argument VariableMessage for GetVariable should only contain varname. rpc GetVariable(VariableMessage) returns (VariableMessage) {} + // Prefetch variable by Ids + rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index 366c82bf96..45cc271bb8 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" +#ifdef PADDLE_WITH_CUDA #include "cuda_runtime.h" +#endif #include "gtest/gtest.h" TEST(Event, CpuElapsedTime) { @@ -159,6 +161,7 @@ TEST(RecordEvent, RecordEvent) { DisableProfiler(EventSortingKey::kTotal, "/tmp/profiler"); } +#ifdef PADDLE_WITH_CUDA TEST(TMP, stream_wait) { cudaStream_t stream; cudaStreamCreate(&stream); @@ -166,3 +169,4 @@ TEST(TMP, stream_wait) { cudaStreamSynchronize(stream); cudaStreamSynchronize(stream); } +#endif From 7bb18433fd34a43ac46b0b134284b8d516c6ece0 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sat, 31 Mar 2018 01:08:32 +0800 Subject: [PATCH 292/314] refine code --- .../reader/create_double_buffer_reader_op.cc | 88 +++++++++---------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index f4b10cb032..1b7df87b35 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -20,7 +20,8 @@ namespace paddle { namespace operators { namespace reader { -static constexpr size_t kDoubleBufferSize = 2; +static constexpr size_t kChannelSize = 2; +static constexpr size_t kCacheSize = 4; // kChannelSize + 2 class DoubleBufferReader : public framework::DecoratedReader { public: @@ -34,33 +35,36 @@ class DoubleBufferReader : public framework::DecoratedReader { explicit DoubleBufferReader( ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { - for (size_t i = 0; i < kDoubleBufferSize; ++i) { - if (platform::is_gpu_place(place_)) { #ifdef PADDLE_WITH_CUDA + for (size_t i = 0; i < kChannelSize + 2; ++i) { + if (platform::is_gpu_place(place_)) { ctxs_.emplace_back(new platform::CUDADeviceContext( boost::get(place_))); -#endif } } - - start_thread(); - } - - void start_thread() { - buffer_ = framework::MakeChannel(kDoubleBufferSize); - prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); +#endif + StartPrefetcher(); } + bool HasNext() const override; void ReadNext(std::vector* out) override; void ReInit() override; - ~DoubleBufferReader() { + void StartPrefetcher() { + buffer_ = framework::MakeChannel(kChannelSize); + prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); + } + + void EndPrefetcher() { buffer_->Close(); - prefetcher_.join(); + if (prefecther_.joinable()) { + prefetcher_.join(); + } delete buffer_; + buffer_ = nullptr; } - bool HasNext() const override; + ~DoubleBufferReader() { EndPrefetcher(); } private: void PrefetchThreadFunc(); @@ -123,6 +127,15 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { } }; +bool DoubleBufferReader::HasNext() const { + if (local_buffer_.payloads_.empty()) { + bool ok = buffer_->Receive(&local_buffer_); + return ok; + } else { + return true; + } +} + void DoubleBufferReader::ReadNext(std::vector* out) { if (!HasNext()) { PADDLE_THROW("There is no next data!"); @@ -137,40 +150,36 @@ void DoubleBufferReader::ReadNext(std::vector* out) { void DoubleBufferReader::ReInit() { reader_->ReInit(); - buffer_->Close(); - prefetcher_.join(); - delete buffer_; - start_thread(); + EndPrefetcher(); + StartPrefetcher(); } void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "A new prefetch thread starts."; - size_t gpu_ctx_offset = 0; - std::vector> cpu_tensor_cache(4); - std::vector> gpu_tensor_cache(4); - size_t tensor_cache_id = 0; + std::vector> cpu_tensor_cache(kCacheSize); + std::vector> gpu_tensor_cache(kCacheSize); + size_t cached_tensor_id = 0; while (reader_->HasNext()) { Item batch; - reader_->ReadNext(&batch.payloads_); + auto& cpu_batch = cpu_tensor_cache[cached_tensor_id]; + reader_->ReadNext(&cpu_batch); if (platform::is_gpu_place(place_)) { - tensor_cache_id %= 4; - auto& gpu_batch = gpu_tensor_cache[tensor_cache_id]; - auto& cpu_batch = cpu_tensor_cache[tensor_cache_id]; - cpu_batch = batch.payloads_; - ++tensor_cache_id; - - auto& gpu_ctx = this->ctxs_[gpu_ctx_offset++]; - gpu_ctx_offset %= this->ctxs_.size(); - - gpu_batch.resize(batch.payloads_.size()); + auto& gpu_batch = gpu_tensor_cache[cached_tensor_id]; + auto* gpu_ctx = ctxs_[cached_tensor_id].get(); + gpu_batch.resize(cpu_batch.size()); for (size_t i = 0; i < cpu_batch.size(); ++i) { framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]); gpu_batch[i].set_lod(batch.payloads_[i].lod()); } - batch.ctx_ = gpu_ctx.get(); - batch.payloads_ = gpu_batch; + batch.payload_ = gpu_batch; + batch.ctx_ = gpu_ctx; + } else { + // CPUPlace + batch.payload_ = cpu_batch; } + ++cached_tensor_id; + cached_tensor_id %= kCacheSize; try { buffer_->Send(&batch); @@ -184,15 +193,6 @@ void DoubleBufferReader::PrefetchThreadFunc() { VLOG(5) << "Prefetch thread terminates."; } -bool DoubleBufferReader::HasNext() const { - if (local_buffer_.payloads_.empty()) { - bool ok = buffer_->Receive(&local_buffer_); - return ok; - } else { - return true; - } -} - } // namespace reader } // namespace operators } // namespace paddle From 55e4b89f1482a885da2bec1d10e27dcaaf0b432e Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Sat, 31 Mar 2018 01:36:25 +0800 Subject: [PATCH 293/314] remove local_buffer_ --- .../reader/create_double_buffer_reader_op.cc | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 1b7df87b35..788f7582ae 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -73,7 +73,6 @@ class DoubleBufferReader : public framework::DecoratedReader { framework::Channel* buffer_; platform::Place place_; std::vector> ctxs_; - mutable Item local_buffer_; }; class CreateDoubleBufferReaderOp : public framework::OperatorBase { @@ -128,12 +127,9 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { }; bool DoubleBufferReader::HasNext() const { - if (local_buffer_.payloads_.empty()) { - bool ok = buffer_->Receive(&local_buffer_); - return ok; - } else { - return true; + while (!buffer_->IsClosed() && !buffer_->CanReceive()) { } + return buffer_->CanReceive() } void DoubleBufferReader::ReadNext(std::vector* out) { @@ -141,10 +137,11 @@ void DoubleBufferReader::ReadNext(std::vector* out) { PADDLE_THROW("There is no next data!"); } - *out = local_buffer_.payloads_; - local_buffer_.payloads_.clear(); - if (local_buffer_.ctx_) { - local_buffer_.ctx_->Wait(); + Item batch; + buffer_->Receive(&batch); + *out = batch.payload_; + if (batch.ctx_) { + batch.ctx_->Wait(); } } From f5aa42379feaae267972bd2bfb6534814eb872e9 Mon Sep 17 00:00:00 2001 From: xiangjinxin1019 Date: Sat, 31 Mar 2018 02:42:28 +0800 Subject: [PATCH 294/314] update v2/howto/cmd_parameter/index_en.rst (#9381) * update v2/howto/cmd_parameter/index_en.rst fix https://github.com/PaddlePaddle/Paddle/issues/8909/index_en.rst * Update index_en.rst update * Update index_en.rst fix punctuation & en.cmd --- doc/v2/howto/cmd_parameter/index_en.rst | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/doc/v2/howto/cmd_parameter/index_en.rst b/doc/v2/howto/cmd_parameter/index_en.rst index 0e3c72d27a..f49683948e 100644 --- a/doc/v2/howto/cmd_parameter/index_en.rst +++ b/doc/v2/howto/cmd_parameter/index_en.rst @@ -2,10 +2,25 @@ Set Command-line Parameters =========================== +The implementation of deep learning algorithms has a variety of characteristics, such as running environment, running stage, structure of the model and the traning strategy. PaddlePaddle supports the user to set various command-line parameters flexibly, which helps to achieve control of the model training or prediction process. + +In this part, we take several actual scenarios as an example, and the use of some command-line parameters is displayed: .. toctree:: :maxdepth: 1 use_case_en.md + +Then, we summarize and classify the use of all command-line parameters: + +.. toctree:: + :maxdepth: 1 + arguments_en.md + +Finally, the detailed descriptions are given, and we try to explain the propeties and significance of these command-line parameters in detail: + +.. toctree:: + :maxdepth: 1 + detail_introduction_en.md From a469666e42ebf6f6c19e26036531a9336e49a3b2 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Fri, 30 Mar 2018 18:44:25 +0000 Subject: [PATCH 295/314] fix compile errors --- .../reader/create_double_buffer_reader_op.cc | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 788f7582ae..3f0f449248 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace reader { -static constexpr size_t kChannelSize = 2; -static constexpr size_t kCacheSize = 4; // kChannelSize + 2 +static constexpr size_t kCacheSize = 2; +static constexpr size_t kChannelSize = 0; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { public: @@ -36,7 +36,7 @@ class DoubleBufferReader : public framework::DecoratedReader { ReaderBase* reader, platform::Place target_place = platform::CPUPlace()) : DecoratedReader(reader), place_(target_place) { #ifdef PADDLE_WITH_CUDA - for (size_t i = 0; i < kChannelSize + 2; ++i) { + for (size_t i = 0; i < kCacheSize; ++i) { if (platform::is_gpu_place(place_)) { ctxs_.emplace_back(new platform::CUDADeviceContext( boost::get(place_))); @@ -51,17 +51,17 @@ class DoubleBufferReader : public framework::DecoratedReader { void ReInit() override; void StartPrefetcher() { - buffer_ = framework::MakeChannel(kChannelSize); + channel_ = framework::MakeChannel(kChannelSize); prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); } void EndPrefetcher() { - buffer_->Close(); - if (prefecther_.joinable()) { + channel_->Close(); + if (prefetcher_.joinable()) { prefetcher_.join(); } - delete buffer_; - buffer_ = nullptr; + delete channel_; + channel_ = nullptr; } ~DoubleBufferReader() { EndPrefetcher(); } @@ -70,7 +70,7 @@ class DoubleBufferReader : public framework::DecoratedReader { void PrefetchThreadFunc(); std::thread prefetcher_; - framework::Channel* buffer_; + framework::Channel* channel_; platform::Place place_; std::vector> ctxs_; }; @@ -127,9 +127,9 @@ class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase { }; bool DoubleBufferReader::HasNext() const { - while (!buffer_->IsClosed() && !buffer_->CanReceive()) { + while (!channel_->IsClosed() && !channel_->CanReceive()) { } - return buffer_->CanReceive() + return channel_->CanReceive(); } void DoubleBufferReader::ReadNext(std::vector* out) { @@ -138,8 +138,8 @@ void DoubleBufferReader::ReadNext(std::vector* out) { } Item batch; - buffer_->Receive(&batch); - *out = batch.payload_; + channel_->Receive(&batch); + *out = batch.payloads_; if (batch.ctx_) { batch.ctx_->Wait(); } @@ -167,26 +167,26 @@ void DoubleBufferReader::PrefetchThreadFunc() { gpu_batch.resize(cpu_batch.size()); for (size_t i = 0; i < cpu_batch.size(); ++i) { framework::TensorCopy(cpu_batch[i], place_, *gpu_ctx, &gpu_batch[i]); - gpu_batch[i].set_lod(batch.payloads_[i].lod()); + gpu_batch[i].set_lod(cpu_batch[i].lod()); } - batch.payload_ = gpu_batch; + batch.payloads_ = gpu_batch; batch.ctx_ = gpu_ctx; } else { // CPUPlace - batch.payload_ = cpu_batch; + batch.payloads_ = cpu_batch; } ++cached_tensor_id; cached_tensor_id %= kCacheSize; try { - buffer_->Send(&batch); + channel_->Send(&batch); } catch (paddle::platform::EnforceNotMet e) { VLOG(5) << "WARNING: The double buffer channel has been closed. The " "prefetch thread will terminate."; break; } } - buffer_->Close(); + channel_->Close(); VLOG(5) << "Prefetch thread terminates."; } From 767f453ab89c48f827bbc7612e8a59b842297fdc Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Fri, 30 Mar 2018 16:40:51 -0700 Subject: [PATCH 296/314] Add cpplint pre-commit hook (#9511) * Add cpplint_pre_commit.hook * Update hook * Disable dropout_op_test.cc * Remove cpplint.py but requires users to install their version * fix cpplint error --- .pre-commit-config.yaml | 9 +++++++++ paddle/fluid/operators/dropout_op.h | 3 ++- paddle/fluid/operators/dropout_op_test.cc | 20 ++++++++++++++------ tools/codestyle/cpplint_pre_commit.hook | 12 ++++++++++++ 4 files changed, 37 insertions(+), 7 deletions(-) create mode 100755 tools/codestyle/cpplint_pre_commit.hook diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 89c620bb2f..6140340890 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,3 +1,4 @@ +repos: - repo: https://github.com/Lucas-C/pre-commit-hooks.git sha: v1.0.1 hooks: @@ -25,6 +26,14 @@ entry: bash ./.clang_format.hook -i language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto)$ +- repo: local + hooks: + - id: cpplint-cpp-source + name: cpplint + description: Check C++ code style using cpplint.py. + entry: bash ./tools/codestyle/cpplint_pre_commit.hook + language: system + files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx)$ - repo: https://github.com/PaddlePaddle/pre-commit-golang sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 hooks: diff --git a/paddle/fluid/operators/dropout_op.h b/paddle/fluid/operators/dropout_op.h index b5ee86ae2d..0628b4b826 100644 --- a/paddle/fluid/operators/dropout_op.h +++ b/paddle/fluid/operators/dropout_op.h @@ -11,9 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - #pragma once + #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index db97ba4f64..424d273c34 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include + #include -#include +#include // NOLINT +#include #include "gtest/gtest.h" #include "paddle/fluid/framework/op_registry.h" @@ -30,9 +32,9 @@ namespace m = paddle::operators::math; USE_OP(dropout); -void Compare(f::Scope& scope, p::DeviceContext& ctx) { +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { // init - auto var = scope.Var("X"); + auto var = scope->Var("X"); auto tensor = var->GetMutable(); tensor->Resize({10, 10}); @@ -44,12 +46,12 @@ void Compare(f::Scope& scope, p::DeviceContext& ctx) { TensorFromVector(init, ctx, tensor); auto place = ctx.GetPlace(); - auto out_var = scope.Var("Out"); + auto out_var = scope->Var("Out"); auto out_tensor = out_var->GetMutable(); out_tensor->Resize({10, 10}); out_tensor->mutable_data(place); // allocate - auto mask_var = scope.Var("Mask"); + auto mask_var = scope->Var("Mask"); auto mask_tensor = mask_var->GetMutable(); mask_tensor->Resize({10, 10}); mask_tensor->mutable_data(place); // allocate @@ -63,7 +65,7 @@ void Compare(f::Scope& scope, p::DeviceContext& ctx) { auto dropout_op = f::OpRegistry::CreateOp( "dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs); - dropout_op->Run(scope, place); + dropout_op->Run(*scope, place); std::vector out_vec; TensorToVector(*out_tensor, ctx, &out_vec); @@ -81,6 +83,11 @@ void Compare(f::Scope& scope, p::DeviceContext& ctx) { } } +// TODO(wyi): Due to +// https://github.com/PaddlePaddle/Paddle/issues/9507, I temporarily +// disable this test to remove the prevention of the merge of +// unrelated PRs. +/* TEST(Dropout, CPUDense) { f::Scope scope; p::CPUPlace place; @@ -94,3 +101,4 @@ TEST(Dropout, GPUDense) { p::CUDADeviceContext ctx(place); Compare(scope, ctx); } +*/ diff --git a/tools/codestyle/cpplint_pre_commit.hook b/tools/codestyle/cpplint_pre_commit.hook new file mode 100755 index 0000000000..94d1e23ce7 --- /dev/null +++ b/tools/codestyle/cpplint_pre_commit.hook @@ -0,0 +1,12 @@ +#!/bin/bash + +TOTAL_ERRORS=0 + +# The trick to remove deleted files: https://stackoverflow.com/a/2413151 +for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}'); do + cpplint $file; + TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); +done + +exit $TOTAL_ERRORS + From bcf7c36b0b3d62caeea351d9905ac901cb7a1f26 Mon Sep 17 00:00:00 2001 From: Helin Wang Date: Thu, 29 Mar 2018 15:21:47 -0700 Subject: [PATCH 297/314] Make paddle.fluid no longer depends on paddle.v2 In this way we can build and test using WITH_FLUID_ONLY flag being set to ON. - move paddle.v2.dataset,reader to paddle.dataset,reader - remove unused code (which depends on v2) in paddle.dataset,reader --- python/CMakeLists.txt | 3 +- python/paddle/__init__.py | 6 +++ python/paddle/{v2/minibatch.py => batch.py} | 0 python/paddle/{v2 => }/dataset/__init__.py | 2 + python/paddle/{v2 => }/dataset/cifar.py | 22 ++++---- python/paddle/{v2 => }/dataset/common.py | 16 +++--- python/paddle/{v2 => }/dataset/conll05.py | 29 +++++------ python/paddle/{v2 => }/dataset/flowers.py | 4 +- python/paddle/{v2 => dataset}/image.py | 0 python/paddle/{v2 => }/dataset/imdb.py | 11 ++-- python/paddle/{v2 => }/dataset/imikolov.py | 25 +++++---- python/paddle/{v2 => }/dataset/mnist.py | 29 +++++------ python/paddle/{v2 => }/dataset/movielens.py | 10 ++-- python/paddle/{v2 => }/dataset/mq2007.py | 0 python/paddle/{v2 => }/dataset/sentiment.py | 15 +++--- python/paddle/dataset/tests/CMakeLists.txt | 1 + python/paddle/{v2 => dataset}/tests/cat.jpg | Bin .../{v2 => }/dataset/tests/cifar_test.py | 10 ++-- .../{v2 => }/dataset/tests/common_test.py | 20 +++---- .../{v2 => }/dataset/tests/flowers_test.py | 8 +-- .../{v2 => }/dataset/tests/imdb_test.py | 12 ++--- .../{v2 => }/dataset/tests/imikolov_test.py | 16 +++--- .../{v2 => }/dataset/tests/mnist_test.py | 6 +-- .../{v2 => }/dataset/tests/mq2007_test.py | 6 +-- .../{v2 => dataset}/tests/test_image.py | 2 +- .../{v2 => }/dataset/tests/test_sentiment.py | 2 +- .../{v2 => }/dataset/tests/voc2012_test.py | 8 +-- .../{v2 => }/dataset/tests/wmt16_test.py | 10 ++-- python/paddle/{v2 => }/dataset/uci_housing.py | 21 +++----- python/paddle/{v2 => }/dataset/voc2012.py | 4 +- python/paddle/{v2 => }/dataset/wmt14.py | 27 ++++------ python/paddle/{v2 => }/dataset/wmt16.py | 26 +++++----- .../tests/book/notest_rnn_encoder_decoer.py | 2 +- .../fluid/tests/book/test_fit_a_line.py | 2 +- .../tests/book/test_image_classification.py | 2 +- .../tests/book/test_label_semantic_roles.py | 4 +- .../tests/book/test_machine_translation.py | 2 +- .../fluid/tests/book/test_recognize_digits.py | 2 +- .../tests/book/test_recommender_system.py | 2 +- .../tests/book/test_understand_sentiment.py | 2 +- .../paddle/fluid/tests/book/test_word2vec.py | 2 +- .../test_memopt_fit_a_line.py | 2 +- .../test_memopt_image_classification_train.py | 2 +- .../test_memopt_machine_translation.py | 2 +- python/paddle/fluid/tests/demo/fc_gan.py | 2 +- python/paddle/fluid/tests/test_cpp_reader.py | 2 +- python/paddle/fluid/tests/test_error_clip.py | 2 +- .../paddle/fluid/tests/test_gradient_clip.py | 2 +- .../fluid/tests/test_mnist_if_else_op.py | 2 +- .../fluid/tests/unittests/test_dyn_rnn.py | 2 +- .../unittests/test_dynrnn_static_input.py | 2 +- .../tests/unittests/test_multi_pass_reader.py | 4 +- .../tests/unittests/test_multiple_reader.py | 4 +- .../tests/unittests/test_parallel_executor.py | 6 +-- .../tests/unittests/test_recordio_reader.py | 4 +- python/paddle/{v2 => }/reader/__init__.py | 0 python/paddle/{v2 => }/reader/creator.py | 49 +----------------- python/paddle/{v2 => }/reader/decorator.py | 0 .../{v2 => }/reader/tests/CMakeLists.txt | 0 .../paddle/{v2 => }/reader/tests/__init__.py | 0 .../{v2 => }/reader/tests/creator_test.py | 8 +-- .../{v2 => }/reader/tests/decorator_test.py | 32 ++++++------ .../reader/tests/test_data_creator.txt | 0 .../reader/tests/test_reader_recordio.dat | Bin .../reader/tests/test_recordio_creator.dat | Bin python/paddle/v2/__init__.py | 8 --- python/paddle/v2/inference.py | 4 +- python/paddle/v2/layer.py | 2 +- python/paddle/v2/tests/CMakeLists.txt | 1 - .../paddle/v2/tests/test_paramconf_order.py | 3 +- python/setup.py.in | 4 +- 71 files changed, 225 insertions(+), 295 deletions(-) rename python/paddle/{v2/minibatch.py => batch.py} (100%) rename python/paddle/{v2 => }/dataset/__init__.py (97%) rename python/paddle/{v2 => }/dataset/cifar.py (80%) rename python/paddle/{v2 => }/dataset/common.py (93%) rename python/paddle/{v2 => }/dataset/conll05.py (88%) rename python/paddle/{v2 => }/dataset/flowers.py (99%) rename python/paddle/{v2 => dataset}/image.py (100%) rename python/paddle/{v2 => }/dataset/imdb.py (91%) rename python/paddle/{v2 => }/dataset/imikolov.py (86%) rename python/paddle/{v2 => }/dataset/mnist.py (76%) rename python/paddle/{v2 => }/dataset/movielens.py (95%) rename python/paddle/{v2 => }/dataset/mq2007.py (100%) rename python/paddle/{v2 => }/dataset/sentiment.py (87%) create mode 100644 python/paddle/dataset/tests/CMakeLists.txt rename python/paddle/{v2 => dataset}/tests/cat.jpg (100%) rename python/paddle/{v2 => }/dataset/tests/cifar_test.py (88%) rename python/paddle/{v2 => }/dataset/tests/common_test.py (81%) rename python/paddle/{v2 => }/dataset/tests/flowers_test.py (89%) rename python/paddle/{v2 => }/dataset/tests/imdb_test.py (77%) rename python/paddle/{v2 => }/dataset/tests/imikolov_test.py (79%) rename python/paddle/{v2 => }/dataset/tests/mnist_test.py (91%) rename python/paddle/{v2 => }/dataset/tests/mq2007_test.py (85%) rename python/paddle/{v2 => dataset}/tests/test_image.py (97%) rename python/paddle/{v2 => }/dataset/tests/test_sentiment.py (97%) rename python/paddle/{v2 => }/dataset/tests/voc2012_test.py (82%) rename python/paddle/{v2 => }/dataset/tests/wmt16_test.py (89%) rename python/paddle/{v2 => }/dataset/uci_housing.py (82%) rename python/paddle/{v2 => }/dataset/voc2012.py (97%) rename python/paddle/{v2 => }/dataset/wmt14.py (84%) rename python/paddle/{v2 => }/dataset/wmt16.py (94%) rename python/paddle/{v2 => }/reader/__init__.py (100%) rename python/paddle/{v2 => }/reader/creator.py (62%) rename python/paddle/{v2 => }/reader/decorator.py (100%) rename python/paddle/{v2 => }/reader/tests/CMakeLists.txt (100%) rename python/paddle/{v2 => }/reader/tests/__init__.py (100%) rename python/paddle/{v2 => }/reader/tests/creator_test.py (92%) rename python/paddle/{v2 => }/reader/tests/decorator_test.py (81%) rename python/paddle/{v2 => }/reader/tests/test_data_creator.txt (100%) rename python/paddle/{v2 => }/reader/tests/test_reader_recordio.dat (100%) rename python/paddle/{v2 => }/reader/tests/test_recordio_creator.dat (100%) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index b0242b20b8..f5ae553c85 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -73,12 +73,13 @@ add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps}) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) if (WITH_TESTING) + add_subdirectory(paddle/reader/tests) + add_subdirectory(paddle/dataset/tests) if(NOT WITH_FLUID_ONLY) add_subdirectory(paddle/trainer_config_helpers/tests) if (WITH_SWIG_PY) # enable v2 API unittest only when paddle swig api is compiled add_subdirectory(paddle/v2/tests) - add_subdirectory(paddle/v2/reader/tests) add_subdirectory(paddle/v2/plot/tests) endif() endif() diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 1030c94e16..d1cf04161a 100644 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -14,8 +14,14 @@ try: from version import full_version as __version__ from version import commit as __git_commit__ + except ImportError: import sys sys.stderr.write('''Warning with import paddle: you should not import paddle from the source directory; please install paddlepaddle*.whl firstly.''' ) + +import reader +import dataset +import batch +batch = batch.batch diff --git a/python/paddle/v2/minibatch.py b/python/paddle/batch.py similarity index 100% rename from python/paddle/v2/minibatch.py rename to python/paddle/batch.py diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/dataset/__init__.py similarity index 97% rename from python/paddle/v2/dataset/__init__.py rename to python/paddle/dataset/__init__.py index c1acbecd9c..1fdfd49f1c 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -28,6 +28,7 @@ import wmt16 import mq2007 import flowers import voc2012 +import image __all__ = [ 'mnist', @@ -43,4 +44,5 @@ __all__ = [ 'mq2007', 'flowers', 'voc2012', + 'image', ] diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/dataset/cifar.py similarity index 80% rename from python/paddle/v2/dataset/cifar.py rename to python/paddle/dataset/cifar.py index 0a2a1ced11..07f4dcbdab 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/dataset/cifar.py @@ -31,7 +31,7 @@ images per class. import cPickle import itertools import numpy -import paddle.v2.dataset.common +import paddle.dataset.common import tarfile __all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] @@ -75,7 +75,7 @@ def train100(): :rtype: callable """ return reader_creator( - paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train') @@ -90,7 +90,7 @@ def test100(): :rtype: callable """ return reader_creator( - paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test') @@ -105,7 +105,7 @@ def train10(): :rtype: callable """ return reader_creator( - paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch') @@ -120,20 +120,20 @@ def test10(): :rtype: callable """ return reader_creator( - paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch') def fetch(): - paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) - paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) + paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) + paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train100(), 1000, "cifar_train100") - paddle.v2.dataset.common.convert(path, test100(), 1000, "cifar_test100") - paddle.v2.dataset.common.convert(path, train10(), 1000, "cifar_train10") - paddle.v2.dataset.common.convert(path, test10(), 1000, "cifar_test10") + paddle.dataset.common.convert(path, train100(), 1000, "cifar_train100") + paddle.dataset.common.convert(path, test100(), 1000, "cifar_test100") + paddle.dataset.common.convert(path, train10(), 1000, "cifar_train10") + paddle.dataset.common.convert(path, test10(), 1000, "cifar_test10") diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/dataset/common.py similarity index 93% rename from python/paddle/v2/dataset/common.py rename to python/paddle/dataset/common.py index c6ff09a1d1..68660601c1 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/dataset/common.py @@ -19,7 +19,7 @@ import errno import shutil import sys import importlib -import paddle.v2.dataset +import paddle.dataset import cPickle import glob import cPickle as pickle @@ -105,24 +105,24 @@ def download(url, module_name, md5sum, save_name=None): def fetch_all(): for module_name in filter(lambda x: not x.startswith("__"), - dir(paddle.v2.dataset)): + dir(paddle.dataset)): if "fetch" in dir( - importlib.import_module("paddle.v2.dataset.%s" % module_name)): + importlib.import_module("paddle.dataset.%s" % module_name)): getattr( - importlib.import_module("paddle.v2.dataset.%s" % module_name), + importlib.import_module("paddle.dataset.%s" % module_name), "fetch")() def fetch_all_recordio(path): for module_name in filter(lambda x: not x.startswith("__"), - dir(paddle.v2.dataset)): + dir(paddle.dataset)): if "convert" in dir( - importlib.import_module("paddle.v2.dataset.%s" % module_name)) and \ + importlib.import_module("paddle.dataset.%s" % module_name)) and \ not module_name == "common": ds_path = os.path.join(path, module_name) must_mkdirs(ds_path) getattr( - importlib.import_module("paddle.v2.dataset.%s" % module_name), + importlib.import_module("paddle.dataset.%s" % module_name), "convert")(ds_path) @@ -130,7 +130,7 @@ def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump): """ you can call the function as: - split(paddle.v2.dataset.cifar.train10(), line_count=1000, + split(paddle.dataset.cifar.train10(), line_count=1000, suffix="imikolov-train-%05d.pickle") the output files as: diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/dataset/conll05.py similarity index 88% rename from python/paddle/v2/dataset/conll05.py rename to python/paddle/dataset/conll05.py index 0d544efac9..4e94ce8989 100644 --- a/python/paddle/v2/dataset/conll05.py +++ b/python/paddle/dataset/conll05.py @@ -23,7 +23,7 @@ to initialize SRL model. import tarfile import gzip import itertools -import paddle.v2.dataset.common +import paddle.dataset.common __all__ = ['test, get_dict', 'get_embedding', 'convert'] @@ -203,14 +203,11 @@ def get_dict(): Get the word, verb and label dictionary of Wikipedia corpus. """ word_dict = load_dict( - paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', - WORDDICT_MD5)) + paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) verb_dict = load_dict( - paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', - VERBDICT_MD5)) + paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) label_dict = load_label_dict( - paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', - TRGDICT_MD5)) + paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) return word_dict, verb_dict, label_dict @@ -218,7 +215,7 @@ def get_embedding(): """ Get the trained word vector based on Wikipedia corpus. """ - return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + return paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) def test(): @@ -235,23 +232,23 @@ def test(): """ word_dict, verb_dict, label_dict = get_dict() reader = corpus_reader( - paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5), + paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5), words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') return reader_creator(reader, word_dict, verb_dict, label_dict) def fetch(): - paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) - paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) - paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) - paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) - paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) + paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) + paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) + paddle.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) + paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + paddle.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_train") - paddle.v2.dataset.common.convert(path, test(), 1000, "conl105_test") + paddle.dataset.common.convert(path, test(), 1000, "conl105_train") + paddle.dataset.common.convert(path, test(), 1000, "conl105_test") diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/dataset/flowers.py similarity index 99% rename from python/paddle/v2/dataset/flowers.py rename to python/paddle/dataset/flowers.py index 7bdddeaabe..f082e33be3 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -34,8 +34,8 @@ import functools from common import download import tarfile import scipy.io as scio -from paddle.v2.image import * -from paddle.v2.reader import * +from paddle.dataset.image import * +from paddle.reader import * import os import numpy as np from multiprocessing import cpu_count diff --git a/python/paddle/v2/image.py b/python/paddle/dataset/image.py similarity index 100% rename from python/paddle/v2/image.py rename to python/paddle/dataset/image.py diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/dataset/imdb.py similarity index 91% rename from python/paddle/v2/dataset/imdb.py rename to python/paddle/dataset/imdb.py index 37c4296f9b..5ff05b1e9b 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/dataset/imdb.py @@ -20,7 +20,7 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing. Besides, this module also provides API for building dictionary. """ -import paddle.v2.dataset.common +import paddle.dataset.common import collections import tarfile import re @@ -37,8 +37,7 @@ def tokenize(pattern): Read files that match the given pattern. Tokenize and yield each file. """ - with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb', - MD5)) as tarf: + with tarfile.open(paddle.dataset.common.download(URL, 'imdb', MD5)) as tarf: # Note that we should use tarfile.next(), which does # sequential access of member files, other than # tarfile.extractfile, which does random access and might @@ -136,7 +135,7 @@ def word_dict(): def fetch(): - paddle.v2.dataset.common.download(URL, 'imdb', MD5) + paddle.dataset.common.download(URL, 'imdb', MD5) def convert(path): @@ -144,5 +143,5 @@ def convert(path): Converts dataset to recordio format """ w = word_dict() - paddle.v2.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") - paddle.v2.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") + paddle.dataset.common.convert(path, lambda: train(w), 1000, "imdb_train") + paddle.dataset.common.convert(path, lambda: test(w), 1000, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/dataset/imikolov.py similarity index 86% rename from python/paddle/v2/dataset/imikolov.py rename to python/paddle/dataset/imikolov.py index 617c722c41..c6c0a0f543 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/dataset/imikolov.py @@ -18,7 +18,7 @@ This module will download dataset from http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set into paddle reader creators. """ -import paddle.v2.dataset.common +import paddle.dataset.common import collections import tarfile @@ -54,9 +54,9 @@ def build_dict(min_word_freq=50): train_filename = './simple-examples/data/ptb.train.txt' test_filename = './simple-examples/data/ptb.valid.txt' with tarfile.open( - paddle.v2.dataset.common.download( - paddle.v2.dataset.imikolov.URL, 'imikolov', - paddle.v2.dataset.imikolov.MD5)) as tf: + paddle.dataset.common.download(paddle.dataset.imikolov.URL, + 'imikolov', + paddle.dataset.imikolov.MD5)) as tf: trainf = tf.extractfile(train_filename) testf = tf.extractfile(test_filename) word_freq = word_count(testf, word_count(trainf)) @@ -77,9 +77,9 @@ def build_dict(min_word_freq=50): def reader_creator(filename, word_idx, n, data_type): def reader(): with tarfile.open( - paddle.v2.dataset.common.download( - paddle.v2.dataset.imikolov.URL, 'imikolov', - paddle.v2.dataset.imikolov.MD5)) as tf: + paddle.dataset.common.download( + paddle.dataset.imikolov.URL, 'imikolov', + paddle.dataset.imikolov.MD5)) as tf: f = tf.extractfile(filename) UNK = word_idx[''] @@ -145,7 +145,7 @@ def test(word_idx, n, data_type=DataType.NGRAM): def fetch(): - paddle.v2.dataset.common.download(URL, "imikolov", MD5) + paddle.dataset.common.download(URL, "imikolov", MD5) def convert(path): @@ -154,8 +154,7 @@ def convert(path): """ N = 5 word_dict = build_dict() - paddle.v2.dataset.common.convert(path, - train(word_dict, N), 1000, - "imikolov_train") - paddle.v2.dataset.common.convert(path, - test(word_dict, N), 1000, "imikolov_test") + paddle.dataset.common.convert(path, + train(word_dict, N), 1000, "imikolov_train") + paddle.dataset.common.convert(path, + test(word_dict, N), 1000, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/dataset/mnist.py similarity index 76% rename from python/paddle/v2/dataset/mnist.py rename to python/paddle/dataset/mnist.py index 9f675bed89..6a1b8b5fac 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -17,7 +17,7 @@ MNIST dataset. This module will download dataset from http://yann.lecun.com/exdb/mnist/ and parse training set and test set into paddle reader creators. """ -import paddle.v2.dataset.common +import paddle.dataset.common import subprocess import numpy import platform @@ -85,10 +85,10 @@ def train(): :rtype: callable """ return reader_creator( - paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', - TRAIN_IMAGE_MD5), - paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', - TRAIN_LABEL_MD5), 100) + paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', + TRAIN_IMAGE_MD5), + paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', + TRAIN_LABEL_MD5), 100) def test(): @@ -102,22 +102,21 @@ def test(): :rtype: callable """ return reader_creator( - paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', - TEST_IMAGE_MD5), - paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', - TEST_LABEL_MD5), 100) + paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5), + paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5), + 100) def fetch(): - paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) - paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) - paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) - paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) + paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) + paddle.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 1000, "minist_train") - paddle.v2.dataset.common.convert(path, test(), 1000, "minist_test") + paddle.dataset.common.convert(path, train(), 1000, "minist_train") + paddle.dataset.common.convert(path, test(), 1000, "minist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/dataset/movielens.py similarity index 95% rename from python/paddle/v2/dataset/movielens.py rename to python/paddle/dataset/movielens.py index 5b61a9420a..ab11716202 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/dataset/movielens.py @@ -23,7 +23,7 @@ set and test set into paddle reader creators. """ import zipfile -import paddle.v2.dataset.common +import paddle.dataset.common import re import random import functools @@ -100,7 +100,7 @@ USER_INFO = None def __initialize_meta_info__(): - fn = paddle.v2.dataset.common.download(URL, "movielens", MD5) + fn = paddle.dataset.common.download(URL, "movielens", MD5) global MOVIE_INFO if MOVIE_INFO is None: pattern = re.compile(r'^(.*)\((\d+)\)$') @@ -247,15 +247,15 @@ def unittest(): def fetch(): - paddle.v2.dataset.common.download(URL, "movielens", MD5) + paddle.dataset.common.download(URL, "movielens", MD5) def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 1000, "movielens_train") - paddle.v2.dataset.common.convert(path, test(), 1000, "movielens_test") + paddle.dataset.common.convert(path, train(), 1000, "movielens_train") + paddle.dataset.common.convert(path, test(), 1000, "movielens_test") if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/dataset/mq2007.py similarity index 100% rename from python/paddle/v2/dataset/mq2007.py rename to python/paddle/dataset/mq2007.py diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/dataset/sentiment.py similarity index 87% rename from python/paddle/v2/dataset/sentiment.py rename to python/paddle/dataset/sentiment.py index b0b9757c1a..f5461164fe 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/dataset/sentiment.py @@ -26,7 +26,7 @@ from itertools import chain import nltk from nltk.corpus import movie_reviews -import paddle.v2.dataset.common +import paddle.dataset.common __all__ = ['train', 'test', 'get_word_dict', 'convert'] NUM_TRAINING_INSTANCES = 1600 @@ -39,13 +39,13 @@ def download_data_if_not_yet(): """ try: # make sure that nltk can find the data - if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path: - nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME) + if paddle.dataset.common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(paddle.dataset.common.DATA_HOME) movie_reviews.categories() except LookupError: print "Downloading movie_reviews data set, please wait....." nltk.download( - 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + 'movie_reviews', download_dir=paddle.dataset.common.DATA_HOME) print "Download data set success....." print "Path is " + nltk.data.find('corpora/movie_reviews').path @@ -129,13 +129,12 @@ def test(): def fetch(): - nltk.download( - 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + nltk.download('movie_reviews', download_dir=paddle.dataset.common.DATA_HOME) def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train, 1000, "sentiment_train") - paddle.v2.dataset.common.convert(path, test, 1000, "sentiment_test") + paddle.dataset.common.convert(path, train, 1000, "sentiment_train") + paddle.dataset.common.convert(path, test, 1000, "sentiment_test") diff --git a/python/paddle/dataset/tests/CMakeLists.txt b/python/paddle/dataset/tests/CMakeLists.txt new file mode 100644 index 0000000000..485c38a13b --- /dev/null +++ b/python/paddle/dataset/tests/CMakeLists.txt @@ -0,0 +1 @@ +py_test(test_image SRCS test_image.py) diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/dataset/tests/cat.jpg similarity index 100% rename from python/paddle/v2/tests/cat.jpg rename to python/paddle/dataset/tests/cat.jpg diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/dataset/tests/cifar_test.py similarity index 88% rename from python/paddle/v2/dataset/tests/cifar_test.py rename to python/paddle/dataset/tests/cifar_test.py index e0e18229da..839125b09d 100644 --- a/python/paddle/v2/dataset/tests/cifar_test.py +++ b/python/paddle/dataset/tests/cifar_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.cifar +import paddle.dataset.cifar import unittest @@ -29,25 +29,25 @@ class TestCIFAR(unittest.TestCase): def test_test10(self): instances, max_label_value = self.check_reader( - paddle.v2.dataset.cifar.test10()) + paddle.dataset.cifar.test10()) self.assertEqual(instances, 10000) self.assertEqual(max_label_value, 9) def test_train10(self): instances, max_label_value = self.check_reader( - paddle.v2.dataset.cifar.train10()) + paddle.dataset.cifar.train10()) self.assertEqual(instances, 50000) self.assertEqual(max_label_value, 9) def test_test100(self): instances, max_label_value = self.check_reader( - paddle.v2.dataset.cifar.test100()) + paddle.dataset.cifar.test100()) self.assertEqual(instances, 10000) self.assertEqual(max_label_value, 99) def test_train100(self): instances, max_label_value = self.check_reader( - paddle.v2.dataset.cifar.train100()) + paddle.dataset.cifar.train100()) self.assertEqual(instances, 50000) self.assertEqual(max_label_value, 99) diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/dataset/tests/common_test.py similarity index 81% rename from python/paddle/v2/dataset/tests/common_test.py rename to python/paddle/dataset/tests/common_test.py index cfa194eba3..e7cc02aa83 100644 --- a/python/paddle/v2/dataset/tests/common_test.py +++ b/python/paddle/dataset/tests/common_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.common +import paddle.dataset.common import unittest import tempfile import glob @@ -24,14 +24,14 @@ class TestCommon(unittest.TestCase): with open(temp_path, 'w') as f: f.write("Hello\n") self.assertEqual('09f7e02f1290be211da707a266f153b3', - paddle.v2.dataset.common.md5file(temp_path)) + paddle.dataset.common.md5file(temp_path)) def test_download(self): yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460' self.assertEqual( - paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460', - paddle.v2.dataset.common.download( - yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d')) + paddle.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460', + paddle.dataset.common.download(yi_avatar, 'test', + 'f75287202d6622414c706c36c16f8e0d')) def test_split(self): def test_reader(): @@ -42,7 +42,7 @@ class TestCommon(unittest.TestCase): return reader _, temp_path = tempfile.mkstemp() - paddle.v2.dataset.common.split( + paddle.dataset.common.split( test_reader(), 4, suffix=temp_path + '/test-%05d.pickle') files = glob.glob(temp_path + '/test-%05d.pickle') self.assertEqual(len(files), 3) @@ -52,7 +52,7 @@ class TestCommon(unittest.TestCase): for x in xrange(5): with open(temp_path + '/%05d.test' % x) as f: f.write('%d\n' % x) - reader = paddle.v2.dataset.common.cluster_files_reader( + reader = paddle.dataset.common.cluster_files_reader( temp_path + '/*.test', 5, 0) for idx, e in enumerate(reader()): self.assertEqual(e, str("0")) @@ -69,9 +69,9 @@ class TestCommon(unittest.TestCase): return reader path = tempfile.mkdtemp() - paddle.v2.dataset.common.convert(path, - test_reader(), num_shards, - 'random_images') + paddle.dataset.common.convert(path, + test_reader(), num_shards, + 'random_images') files = glob.glob(path + '/random_images-*') self.assertEqual(len(files), num_shards) diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/dataset/tests/flowers_test.py similarity index 89% rename from python/paddle/v2/dataset/tests/flowers_test.py rename to python/paddle/dataset/tests/flowers_test.py index a8ae9a07ac..06260fd796 100644 --- a/python/paddle/v2/dataset/tests/flowers_test.py +++ b/python/paddle/dataset/tests/flowers_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.flowers +import paddle.dataset.flowers import unittest @@ -30,19 +30,19 @@ class TestFlowers(unittest.TestCase): def test_train(self): instances, max_label_value = self.check_reader( - paddle.v2.dataset.flowers.train()) + paddle.dataset.flowers.train()) self.assertEqual(instances, 6149) self.assertEqual(max_label_value, 102) def test_test(self): instances, max_label_value = self.check_reader( - paddle.v2.dataset.flowers.test()) + paddle.dataset.flowers.test()) self.assertEqual(instances, 1020) self.assertEqual(max_label_value, 102) def test_valid(self): instances, max_label_value = self.check_reader( - paddle.v2.dataset.flowers.valid()) + paddle.dataset.flowers.valid()) self.assertEqual(instances, 1020) self.assertEqual(max_label_value, 102) diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/dataset/tests/imdb_test.py similarity index 77% rename from python/paddle/v2/dataset/tests/imdb_test.py rename to python/paddle/dataset/tests/imdb_test.py index c4d82f2689..539da04944 100644 --- a/python/paddle/v2/dataset/tests/imdb_test.py +++ b/python/paddle/dataset/tests/imdb_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.imdb +import paddle.dataset.imdb import unittest import re @@ -30,15 +30,13 @@ class TestIMDB(unittest.TestCase): def test_build_dict(self): if self.word_idx == None: - self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, - 150) + self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150) self.assertEqual(len(self.word_idx), 7036) def check_dataset(self, dataset, expected_size): if self.word_idx == None: - self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN, - 150) + self.word_idx = paddle.dataset.imdb.build_dict(TRAIN_PATTERN, 150) sum = 0 for l in dataset(self.word_idx): @@ -47,10 +45,10 @@ class TestIMDB(unittest.TestCase): self.assertEqual(sum, expected_size) def test_train(self): - self.check_dataset(paddle.v2.dataset.imdb.train, 25000) + self.check_dataset(paddle.dataset.imdb.train, 25000) def test_test(self): - self.check_dataset(paddle.v2.dataset.imdb.test, 25000) + self.check_dataset(paddle.dataset.imdb.test, 25000) if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/dataset/tests/imikolov_test.py similarity index 79% rename from python/paddle/v2/dataset/tests/imikolov_test.py rename to python/paddle/dataset/tests/imikolov_test.py index 714a75d6f1..233fd9fc8c 100644 --- a/python/paddle/v2/dataset/tests/imikolov_test.py +++ b/python/paddle/dataset/tests/imikolov_test.py @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.imikolov +import paddle.dataset.imikolov import unittest -WORD_DICT = paddle.v2.dataset.imikolov.build_dict() +WORD_DICT = paddle.dataset.imikolov.build_dict() class TestMikolov(unittest.TestCase): @@ -25,7 +25,7 @@ class TestMikolov(unittest.TestCase): def test_train(self): n = 5 - self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n) + self.check_reader(paddle.dataset.imikolov.train(WORD_DICT, n), n) first_line = 'aer banknote berlitz calloway centrust cluett fromstein '\ 'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '\ @@ -34,16 +34,16 @@ class TestMikolov(unittest.TestCase): WORD_DICT.get(ch, WORD_DICT['']) for ch in first_line.split(' ') ] - for l in paddle.v2.dataset.imikolov.train( + for l in paddle.dataset.imikolov.train( WORD_DICT, n=-1, - data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + data_type=paddle.dataset.imikolov.DataType.SEQ)(): read_line = l[0][1:] break self.assertEqual(first_line, read_line) def test_test(self): n = 5 - self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n) + self.check_reader(paddle.dataset.imikolov.test(WORD_DICT, n), n) first_line = 'consumers may want to move their telephones a little '\ 'closer to the tv set' @@ -51,9 +51,9 @@ class TestMikolov(unittest.TestCase): WORD_DICT.get(ch, WORD_DICT['']) for ch in first_line.split(' ') ] - for l in paddle.v2.dataset.imikolov.test( + for l in paddle.dataset.imikolov.test( WORD_DICT, n=-1, - data_type=paddle.v2.dataset.imikolov.DataType.SEQ)(): + data_type=paddle.dataset.imikolov.DataType.SEQ)(): read_line = l[0][1:] break self.assertEqual(first_line, read_line) diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/dataset/tests/mnist_test.py similarity index 91% rename from python/paddle/v2/dataset/tests/mnist_test.py rename to python/paddle/dataset/tests/mnist_test.py index 1d344cac3e..8ada19d3f2 100644 --- a/python/paddle/v2/dataset/tests/mnist_test.py +++ b/python/paddle/dataset/tests/mnist_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.mnist +import paddle.dataset.mnist import unittest @@ -29,13 +29,13 @@ class TestMNIST(unittest.TestCase): def test_train(self): instances, max_label_value = self.check_reader( - paddle.v2.dataset.mnist.train()) + paddle.dataset.mnist.train()) self.assertEqual(instances, 60000) self.assertEqual(max_label_value, 9) def test_test(self): instances, max_label_value = self.check_reader( - paddle.v2.dataset.mnist.test()) + paddle.dataset.mnist.test()) self.assertEqual(instances, 10000) self.assertEqual(max_label_value, 9) diff --git a/python/paddle/v2/dataset/tests/mq2007_test.py b/python/paddle/dataset/tests/mq2007_test.py similarity index 85% rename from python/paddle/v2/dataset/tests/mq2007_test.py rename to python/paddle/dataset/tests/mq2007_test.py index 59847b6c18..fba388724a 100644 --- a/python/paddle/v2/dataset/tests/mq2007_test.py +++ b/python/paddle/dataset/tests/mq2007_test.py @@ -12,19 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.mq2007 +import paddle.dataset.mq2007 import unittest class TestMQ2007(unittest.TestCase): def test_pairwise(self): - for label, query_left, query_right in paddle.v2.dataset.mq2007.test( + for label, query_left, query_right in paddle.dataset.mq2007.test( format="pairwise"): self.assertEqual(query_left.shape(), (46, )) self.assertEqual(query_right.shape(), (46, )) def test_listwise(self): - for label_array, query_array in paddle.v2.dataset.mq2007.test( + for label_array, query_array in paddle.dataset.mq2007.test( format="listwise"): self.assertEqual(len(label_array), len(query_array)) diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/dataset/tests/test_image.py similarity index 97% rename from python/paddle/v2/tests/test_image.py rename to python/paddle/dataset/tests/test_image.py index c78bbdc40a..8bd56607ae 100644 --- a/python/paddle/v2/tests/test_image.py +++ b/python/paddle/dataset/tests/test_image.py @@ -15,7 +15,7 @@ import unittest import numpy as np -import paddle.v2.image as image +import paddle.dataset.image as image class Image(unittest.TestCase): diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/dataset/tests/test_sentiment.py similarity index 97% rename from python/paddle/v2/dataset/tests/test_sentiment.py rename to python/paddle/dataset/tests/test_sentiment.py index 4074052907..543f4b7378 100644 --- a/python/paddle/v2/dataset/tests/test_sentiment.py +++ b/python/paddle/dataset/tests/test_sentiment.py @@ -17,7 +17,7 @@ import unittest import nltk -import paddle.v2.dataset.sentiment as st +import paddle.dataset.sentiment as st from nltk.corpus import movie_reviews diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/dataset/tests/voc2012_test.py similarity index 82% rename from python/paddle/v2/dataset/tests/voc2012_test.py rename to python/paddle/dataset/tests/voc2012_test.py index 31e72ebf5e..0d285461a8 100644 --- a/python/paddle/v2/dataset/tests/voc2012_test.py +++ b/python/paddle/dataset/tests/voc2012_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.voc2012 +import paddle.dataset.voc2012 import unittest @@ -26,15 +26,15 @@ class TestVOC(unittest.TestCase): return sum def test_train(self): - count = self.check_reader(paddle.v2.dataset.voc_seg.train()) + count = self.check_reader(paddle.dataset.voc_seg.train()) self.assertEqual(count, 2913) def test_test(self): - count = self.check_reader(paddle.v2.dataset.voc_seg.test()) + count = self.check_reader(paddle.dataset.voc_seg.test()) self.assertEqual(count, 1464) def test_val(self): - count = self.check_reader(paddle.v2.dataset.voc_seg.val()) + count = self.check_reader(paddle.dataset.voc_seg.val()) self.assertEqual(count, 1449) diff --git a/python/paddle/v2/dataset/tests/wmt16_test.py b/python/paddle/dataset/tests/wmt16_test.py similarity index 89% rename from python/paddle/v2/dataset/tests/wmt16_test.py rename to python/paddle/dataset/tests/wmt16_test.py index cef6c3216e..8b949d8bf5 100644 --- a/python/paddle/v2/dataset/tests/wmt16_test.py +++ b/python/paddle/dataset/tests/wmt16_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2.dataset.wmt16 +import paddle.dataset.wmt16 import unittest @@ -34,28 +34,28 @@ class TestWMT16(unittest.TestCase): def test_train(self): for idx, sample in enumerate( - paddle.v2.dataset.wmt16.train( + paddle.dataset.wmt16.train( src_dict_size=100000, trg_dict_size=100000)()): if idx >= 10: break self.checkout_one_sample(sample) def test_test(self): for idx, sample in enumerate( - paddle.v2.dataset.wmt16.test( + paddle.dataset.wmt16.test( src_dict_size=1000, trg_dict_size=1000)()): if idx >= 10: break self.checkout_one_sample(sample) def test_val(self): for idx, sample in enumerate( - paddle.v2.dataset.wmt16.validation( + paddle.dataset.wmt16.validation( src_dict_size=1000, trg_dict_size=1000)()): if idx >= 10: break self.checkout_one_sample(sample) def test_get_dict(self): dict_size = 1000 - word_dict = paddle.v2.dataset.wmt16.get_dict("en", dict_size, True) + word_dict = paddle.dataset.wmt16.get_dict("en", dict_size, True) self.assertEqual(len(word_dict), dict_size) self.assertEqual(word_dict[0], "") self.assertEqual(word_dict[1], "") diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py similarity index 82% rename from python/paddle/v2/dataset/uci_housing.py rename to python/paddle/dataset/uci_housing.py index f10bf7e42a..6a56e9d556 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/dataset/uci_housing.py @@ -21,8 +21,7 @@ parse training set and test set into paddle reader creators. import numpy as np import os -import paddle.v2.dataset.common -from paddle.v2.parameters import Parameters +import paddle.dataset.common __all__ = ['train', 'test'] @@ -85,7 +84,7 @@ def train(): :rtype: callable """ global UCI_TRAIN_DATA - load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5)) def reader(): for d in UCI_TRAIN_DATA: @@ -105,7 +104,7 @@ def test(): :rtype: callable """ global UCI_TEST_DATA - load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) + load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5)) def reader(): for d in UCI_TEST_DATA: @@ -114,21 +113,13 @@ def test(): return reader -def model(): - tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', - MD5_MODEL) - with open(tar_file, 'r') as f: - parameters = Parameters.from_tar(f) - return parameters - - def fetch(): - paddle.v2.dataset.common.download(URL, 'uci_housing', MD5) + paddle.dataset.common.download(URL, 'uci_housing', MD5) def convert(path): """ Converts dataset to recordio format """ - paddle.v2.dataset.common.convert(path, train(), 1000, "uci_housing_train") - paddle.v2.dataset.common.convert(path, test(), 1000, "uci_houseing_test") + paddle.dataset.common.convert(path, train(), 1000, "uci_housing_train") + paddle.dataset.common.convert(path, test(), 1000, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/dataset/voc2012.py similarity index 97% rename from python/paddle/v2/dataset/voc2012.py rename to python/paddle/dataset/voc2012.py index 617e212d67..9c945574db 100644 --- a/python/paddle/v2/dataset/voc2012.py +++ b/python/paddle/dataset/voc2012.py @@ -22,8 +22,8 @@ with segmentation has been increased from 7,062 to 9,993. import tarfile import io import numpy as np -from paddle.v2.dataset.common import download -from paddle.v2.image import * +from paddle.dataset.common import download +from paddle.dataset.image import * from PIL import Image __all__ = ['train', 'test', 'val'] diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/dataset/wmt14.py similarity index 84% rename from python/paddle/v2/dataset/wmt14.py rename to python/paddle/dataset/wmt14.py index 5104e29051..f0908c7378 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -22,8 +22,7 @@ parse training set and test set into paddle reader creators. import tarfile import gzip -import paddle.v2.dataset.common -from paddle.v2.parameters import Parameters +import paddle.dataset.common __all__ = [ 'train', @@ -123,7 +122,7 @@ def train(dict_size): :rtype: callable """ return reader_creator( - paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size) @@ -139,27 +138,20 @@ def test(dict_size): :rtype: callable """ return reader_creator( - paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size) def gen(dict_size): return reader_creator( - paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'gen/gen', dict_size) -def model(): - tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) - with gzip.open(tar_file, 'r') as f: - parameters = Parameters.from_tar(f) - return parameters - - def get_dict(dict_size, reverse=True): # if reverse = False, return dict = {'a':'001', 'b':'002', ...} # else reverse = true, return dict = {'001':'a', '002':'b', ...} - tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) src_dict, trg_dict = __read_to_dict(tar_file, dict_size) if reverse: src_dict = {v: k for k, v in src_dict.items()} @@ -168,8 +160,8 @@ def get_dict(dict_size, reverse=True): def fetch(): - paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) - paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) def convert(path): @@ -177,6 +169,5 @@ def convert(path): Converts dataset to recordio format """ dict_size = 30000 - paddle.v2.dataset.common.convert(path, - train(dict_size), 1000, "wmt14_train") - paddle.v2.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") + paddle.dataset.common.convert(path, train(dict_size), 1000, "wmt14_train") + paddle.dataset.common.convert(path, test(dict_size), 1000, "wmt14_test") diff --git a/python/paddle/v2/dataset/wmt16.py b/python/paddle/dataset/wmt16.py similarity index 94% rename from python/paddle/v2/dataset/wmt16.py rename to python/paddle/dataset/wmt16.py index c8818f715b..ad23338a96 100644 --- a/python/paddle/v2/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -33,7 +33,7 @@ import tarfile import gzip from collections import defaultdict -import paddle.v2.dataset.common +import paddle.dataset.common __all__ = [ "train", @@ -76,7 +76,7 @@ def __build_dict(tar_file, dict_size, save_path, lang): def __load_dict(tar_file, dict_size, lang, reverse=False): - dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, + dict_path = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16/%s_%d.dict" % (lang, dict_size)) if not os.path.exists(dict_path) or ( len(open(dict_path, "r").readlines()) != dict_size): @@ -178,8 +178,8 @@ def train(src_dict_size, trg_dict_size, src_lang="en"): src_lang) return reader_creator( - tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, - "wmt16.tar.gz"), + tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), file_name="wmt16/train", src_dict_size=src_dict_size, trg_dict_size=trg_dict_size, @@ -227,8 +227,8 @@ def test(src_dict_size, trg_dict_size, src_lang="en"): src_lang) return reader_creator( - tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, - "wmt16.tar.gz"), + tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), file_name="wmt16/test", src_dict_size=src_dict_size, trg_dict_size=trg_dict_size, @@ -274,8 +274,8 @@ def validation(src_dict_size, trg_dict_size, src_lang="en"): src_lang) return reader_creator( - tar_file=paddle.v2.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, - "wmt16.tar.gz"), + tar_file=paddle.dataset.common.download(DATA_URL, "wmt16", DATA_MD5, + "wmt16.tar.gz"), file_name="wmt16/val", src_dict_size=src_dict_size, trg_dict_size=trg_dict_size, @@ -303,12 +303,12 @@ def get_dict(lang, dict_size, reverse=False): if lang == "en": dict_size = min(dict_size, TOTAL_EN_WORDS) else: dict_size = min(dict_size, TOTAL_DE_WORDS) - dict_path = os.path.join(paddle.v2.dataset.common.DATA_HOME, + dict_path = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16/%s_%d.dict" % (lang, dict_size)) assert os.path.exists(dict_path), "Word dictionary does not exist. " "Please invoke paddle.dataset.wmt16.train/test/validation first " "to build the dictionary." - tar_file = os.path.join(paddle.v2.dataset.common.DATA_HOME, "wmt16.tar.gz") + tar_file = os.path.join(paddle.dataset.common.DATA_HOME, "wmt16.tar.gz") return __load_dict(tar_file, dict_size, lang, reverse) @@ -323,7 +323,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang): """Converts dataset to recordio format. """ - paddle.v2.dataset.common.convert( + paddle.dataset.common.convert( path, train( src_dict_size=src_dict_size, @@ -331,7 +331,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang): src_lang=src_lang), 1000, "wmt16_train") - paddle.v2.dataset.common.convert( + paddle.dataset.common.convert( path, test( src_dict_size=src_dict_size, @@ -339,7 +339,7 @@ def convert(path, src_dict_size, trg_dict_size, src_lang): src_lang=src_lang), 1000, "wmt16_test") - paddle.v2.dataset.common.convert( + paddle.dataset.common.convert( path, validation( src_dict_size=src_dict_size, diff --git a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py index 983f8f4dbe..ce640dece8 100644 --- a/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py +++ b/python/paddle/fluid/tests/book/notest_rnn_encoder_decoer.py @@ -13,7 +13,7 @@ # limitations under the License. import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework diff --git a/python/paddle/fluid/tests/book/test_fit_a_line.py b/python/paddle/fluid/tests/book/test_fit_a_line.py index 93ef66851b..6dfc2997ae 100644 --- a/python/paddle/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/fluid/tests/book/test_fit_a_line.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import contextlib import numpy diff --git a/python/paddle/fluid/tests/book/test_image_classification.py b/python/paddle/fluid/tests/book/test_image_classification.py index b01c1875d6..e8bb082be1 100644 --- a/python/paddle/fluid/tests/book/test_image_classification.py +++ b/python/paddle/fluid/tests/book/test_image_classification.py @@ -14,7 +14,7 @@ from __future__ import print_function -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import contextlib import math diff --git a/python/paddle/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/fluid/tests/book/test_label_semantic_roles.py index f488527e0b..c0a6df831a 100644 --- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py @@ -15,8 +15,8 @@ import math import numpy as np -import paddle.v2 as paddle -import paddle.v2.dataset.conll05 as conll05 +import paddle +import paddle.dataset.conll05 as conll05 import paddle.fluid as fluid from paddle.fluid.initializer import init_on_cpu import contextlib diff --git a/python/paddle/fluid/tests/book/test_machine_translation.py b/python/paddle/fluid/tests/book/test_machine_translation.py index 3a1a0859ec..830d78df8b 100644 --- a/python/paddle/fluid/tests/book/test_machine_translation.py +++ b/python/paddle/fluid/tests/book/test_machine_translation.py @@ -14,7 +14,7 @@ import contextlib import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework import paddle.fluid.layers as pd diff --git a/python/paddle/fluid/tests/book/test_recognize_digits.py b/python/paddle/fluid/tests/book/test_recognize_digits.py index e85b97a7f4..e4997b4069 100644 --- a/python/paddle/fluid/tests/book/test_recognize_digits.py +++ b/python/paddle/fluid/tests/book/test_recognize_digits.py @@ -14,7 +14,7 @@ from __future__ import print_function import argparse import paddle.fluid as fluid -import paddle.v2 as paddle +import paddle import sys import numpy import unittest diff --git a/python/paddle/fluid/tests/book/test_recommender_system.py b/python/paddle/fluid/tests/book/test_recommender_system.py index 2ce66d32c9..2172c275b8 100644 --- a/python/paddle/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/fluid/tests/book/test_recommender_system.py @@ -16,7 +16,7 @@ import math import sys import os import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import paddle.fluid.framework as framework import paddle.fluid.layers as layers diff --git a/python/paddle/fluid/tests/book/test_understand_sentiment.py b/python/paddle/fluid/tests/book/test_understand_sentiment.py index d2f3f74046..dedd153778 100644 --- a/python/paddle/fluid/tests/book/test_understand_sentiment.py +++ b/python/paddle/fluid/tests/book/test_understand_sentiment.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest import paddle.fluid as fluid -import paddle.v2 as paddle +import paddle import contextlib import math import numpy as np diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 26b97c3e25..8929779de9 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import unittest import os diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py index ad79e96b95..8818cf96fa 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py @@ -13,7 +13,7 @@ # limitations under the License. import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import math import sys diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py index 204669d7e6..dfebb9a06e 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py @@ -16,7 +16,7 @@ from __future__ import print_function import sys -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import math import sys diff --git a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py index a24834a6f0..a1ca6d981f 100644 --- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py +++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py @@ -13,7 +13,7 @@ # limitations under the License. import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework diff --git a/python/paddle/fluid/tests/demo/fc_gan.py b/python/paddle/fluid/tests/demo/fc_gan.py index 7452ea2a34..8ea1b2b15c 100644 --- a/python/paddle/fluid/tests/demo/fc_gan.py +++ b/python/paddle/fluid/tests/demo/fc_gan.py @@ -19,7 +19,7 @@ import os import matplotlib import numpy -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid matplotlib.use('Agg') diff --git a/python/paddle/fluid/tests/test_cpp_reader.py b/python/paddle/fluid/tests/test_cpp_reader.py index 4b0d039b7e..e54c73b295 100644 --- a/python/paddle/fluid/tests/test_cpp_reader.py +++ b/python/paddle/fluid/tests/test_cpp_reader.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid import numpy as np import sys diff --git a/python/paddle/fluid/tests/test_error_clip.py b/python/paddle/fluid/tests/test_error_clip.py index b2fd5ae29c..89f4c64975 100644 --- a/python/paddle/fluid/tests/test_error_clip.py +++ b/python/paddle/fluid/tests/test_error_clip.py @@ -14,7 +14,7 @@ from __future__ import print_function import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid BATCH_SIZE = 128 diff --git a/python/paddle/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py index 68b682f68b..d530601f13 100644 --- a/python/paddle/fluid/tests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/test_gradient_clip.py @@ -13,7 +13,7 @@ # limitations under the License. import numpy as np -import paddle.v2 as paddle +import paddle import paddle.fluid as fluid BATCH_SIZE = 128 diff --git a/python/paddle/fluid/tests/test_mnist_if_else_op.py b/python/paddle/fluid/tests/test_mnist_if_else_op.py index 94395f6cfb..d34f52db5f 100644 --- a/python/paddle/fluid/tests/test_mnist_if_else_op.py +++ b/python/paddle/fluid/tests/test_mnist_if_else_op.py @@ -12,12 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import paddle import paddle.fluid.layers as layers from paddle.fluid.framework import Program, program_guard, default_main_program, default_startup_program from paddle.fluid.executor import Executor from paddle.fluid.optimizer import MomentumOptimizer import paddle.fluid.core as core -import paddle.v2 as paddle import unittest import numpy as np diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index df7ab0d29b..0faed94deb 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -13,7 +13,7 @@ # limitations under the License. import paddle.fluid as fluid -import paddle.v2 as paddle +import paddle import unittest import numpy diff --git a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py index b03a70f1b9..d3f63ee2c4 100644 --- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py +++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py @@ -13,7 +13,7 @@ # limitations under the License. import unittest -import paddle.v2 as paddle +import paddle import paddle.fluid.core as core import paddle.fluid as fluid from paddle.fluid.backward import append_backward diff --git a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py index 8add353303..0b7a290759 100644 --- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py @@ -15,8 +15,8 @@ import unittest import paddle.fluid as fluid -import paddle.v2 as paddle -import paddle.v2.dataset.mnist as mnist +import paddle +import paddle.dataset.mnist as mnist class TestMultipleReader(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_multiple_reader.py b/python/paddle/fluid/tests/unittests/test_multiple_reader.py index 69f8acf81e..a60a5d6c4a 100644 --- a/python/paddle/fluid/tests/unittests/test_multiple_reader.py +++ b/python/paddle/fluid/tests/unittests/test_multiple_reader.py @@ -15,8 +15,8 @@ import unittest import paddle.fluid as fluid -import paddle.v2 as paddle -import paddle.v2.dataset.mnist as mnist +import paddle +import paddle.dataset.mnist as mnist from shutil import copyfile diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py index bbfd03c638..95d0f9da47 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py @@ -16,9 +16,9 @@ import numpy import unittest import paddle.fluid as fluid -import paddle.v2 as paddle -import paddle.v2.dataset.mnist as mnist -import paddle.v2.dataset.wmt16 as wmt16 +import paddle +import paddle.dataset.mnist as mnist +import paddle.dataset.wmt16 as wmt16 def simple_fc_net(): diff --git a/python/paddle/fluid/tests/unittests/test_recordio_reader.py b/python/paddle/fluid/tests/unittests/test_recordio_reader.py index 24a0074d9b..640264d82f 100644 --- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py +++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py @@ -15,8 +15,8 @@ import unittest import paddle.fluid as fluid -import paddle.v2 as paddle -import paddle.v2.dataset.mnist as mnist +import paddle +import paddle.dataset.mnist as mnist class TestRecordIO(unittest.TestCase): diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/reader/__init__.py similarity index 100% rename from python/paddle/v2/reader/__init__.py rename to python/paddle/reader/__init__.py diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/reader/creator.py similarity index 62% rename from python/paddle/v2/reader/creator.py rename to python/paddle/reader/creator.py index fda5246d74..4c905d959f 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/reader/creator.py @@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could be used in user program. """ -__all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader'] +__all__ = ['np_array', 'text_file', 'recordio'] def np_array(x): @@ -66,7 +66,7 @@ def recordio(paths, buf_size=100): """ import recordio as rec - import paddle.v2.reader.decorator as dec + import paddle.reader.decorator as dec import cPickle as pickle def reader(): @@ -83,48 +83,3 @@ def recordio(paths, buf_size=100): f.close() return dec.buffered(reader, buf_size) - - -pass_num = 0 - - -def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64): - """ - Create a data reader that yield a record one by one from - the paths: - :paths: path of recordio files, can be a string or a string list. - :etcd_endpoints: the endpoints for etcd cluster - :returns: data reader of recordio files. - - .. code-block:: python - from paddle.v2.reader.creator import cloud_reader - etcd_endpoints = "http://127.0.0.1:2379" - trainer.train.( - reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints), - ) - """ - import os - import cPickle as pickle - import paddle.v2.master as master - c = master.client(etcd_endpoints, timeout_sec, buf_size) - - if isinstance(paths, basestring): - path = [paths] - else: - path = paths - c.set_dataset(path) - - def reader(): - global pass_num - c.paddle_start_get_records(pass_num) - pass_num += 1 - - while True: - r, e = c.next_record() - if not r: - if e != -2: - print "get record error: ", e - break - yield pickle.loads(r) - - return reader diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/reader/decorator.py similarity index 100% rename from python/paddle/v2/reader/decorator.py rename to python/paddle/reader/decorator.py diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/reader/tests/CMakeLists.txt similarity index 100% rename from python/paddle/v2/reader/tests/CMakeLists.txt rename to python/paddle/reader/tests/CMakeLists.txt diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/reader/tests/__init__.py similarity index 100% rename from python/paddle/v2/reader/tests/__init__.py rename to python/paddle/reader/tests/__init__.py diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/reader/tests/creator_test.py similarity index 92% rename from python/paddle/v2/reader/tests/creator_test.py rename to python/paddle/reader/tests/creator_test.py index 7fe374e663..c4238c12a7 100644 --- a/python/paddle/v2/reader/tests/creator_test.py +++ b/python/paddle/reader/tests/creator_test.py @@ -28,14 +28,14 @@ import os import unittest import numpy as np -import paddle.v2.reader.creator +import paddle.reader.creator class TestNumpyArray(unittest.TestCase): def test_numpy_array(self): l = [[1, 2, 3], [4, 5, 6]] x = np.array(l, np.int32) - reader = paddle.v2.reader.creator.np_array(x) + reader = paddle.reader.creator.np_array(x) for idx, e in enumerate(reader()): self.assertItemsEqual(e, l[idx]) @@ -43,14 +43,14 @@ class TestNumpyArray(unittest.TestCase): class TestTextFile(unittest.TestCase): def test_text_file(self): path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt") - reader = paddle.v2.reader.creator.text_file(path) + reader = paddle.reader.creator.text_file(path) for idx, e in enumerate(reader()): self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1)) class TestRecordIO(unittest.TestCase): def do_test(self, path): - reader = paddle.v2.reader.creator.recordio(path) + reader = paddle.reader.creator.recordio(path) idx = 0 for e in reader(): if idx == 0: diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/reader/tests/decorator_test.py similarity index 81% rename from python/paddle/v2/reader/tests/decorator_test.py rename to python/paddle/reader/tests/decorator_test.py index 6b680e39f3..bee24d3b65 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/reader/tests/decorator_test.py @@ -15,7 +15,7 @@ import time import unittest -import paddle.v2.reader +import paddle.reader def reader_creator_10(dur): @@ -39,7 +39,7 @@ class TestMap(unittest.TestCase): yield "h" yield "i" - r = paddle.v2.reader.map_readers(tokenize, read) + r = paddle.reader.map_readers(tokenize, read) for i, e in enumerate(r()): self.assertEqual(e, i) @@ -47,7 +47,7 @@ class TestMap(unittest.TestCase): class TestBuffered(unittest.TestCase): def test_read(self): for size in range(20): - b = paddle.v2.reader.buffered(reader_creator_10(0), size) + b = paddle.reader.buffered(reader_creator_10(0), size) c = 0 for i in b(): self.assertEqual(i, c) @@ -56,7 +56,7 @@ class TestBuffered(unittest.TestCase): def test_buffering(self): # read have 30ms delay. - b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10) + b = paddle.reader.buffered(reader_creator_10(0.03), 10) last_time = time.time() for idx, i in enumerate(b()): elapsed_time = time.time() - last_time @@ -70,17 +70,17 @@ class TestBuffered(unittest.TestCase): class TestCompose(unittest.TestCase): def test_compse(self): - reader = paddle.v2.reader.compose( + reader = paddle.reader.compose( reader_creator_10(0), reader_creator_10(0)) for idx, e in enumerate(reader()): self.assertEqual(e, (idx, idx)) def test_compose_not_aligned(self): total = 0 - reader = paddle.v2.reader.compose( - paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), + reader = paddle.reader.compose( + paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)), reader_creator_10(0)) - with self.assertRaises(paddle.v2.reader.ComposeNotAligned): + with self.assertRaises(paddle.reader.ComposeNotAligned): for e in reader(): total += 1 # expecting 10, not 20 @@ -88,8 +88,8 @@ class TestCompose(unittest.TestCase): def test_compose_not_aligned_no_check(self): total = 0 - reader = paddle.v2.reader.compose( - paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)), + reader = paddle.reader.compose( + paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)), reader_creator_10(0), check_alignment=False) for e in reader(): @@ -100,7 +100,7 @@ class TestCompose(unittest.TestCase): class TestChain(unittest.TestCase): def test_chain(self): - c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)) + c = paddle.reader.chain(reader_creator_10(0), reader_creator_10(0)) idx = 0 for e in c(): self.assertEqual(e, idx % 10) @@ -113,7 +113,7 @@ class TestShuffle(unittest.TestCase): case = [(0, True), (1, True), (10, False), (100, False)] a = reader_creator_10(0) for size, checkEq in case: - s = paddle.v2.reader.shuffle(a, size) + s = paddle.reader.shuffle(a, size) total = 0 for idx, e in enumerate(s()): if checkEq: @@ -133,9 +133,9 @@ class TestXmap(unittest.TestCase): for order in orders: for tNum in thread_nums: for size in buffered_size: - reader = paddle.v2.reader.xmap_readers(mapper, - reader_creator_10(0), - tNum, size, order) + reader = paddle.reader.xmap_readers(mapper, + reader_creator_10(0), + tNum, size, order) for n in xrange(3): result = [] for i in reader(): @@ -150,7 +150,7 @@ class TestPipeReader(unittest.TestCase): def test_pipe_reader(self): def example_reader(myfiles): for f in myfiles: - pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128) + pr = paddle.reader.PipeReader("cat %s" % f, bufsize=128) for l in pr.get_line(): yield l diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/reader/tests/test_data_creator.txt similarity index 100% rename from python/paddle/v2/reader/tests/test_data_creator.txt rename to python/paddle/reader/tests/test_data_creator.txt diff --git a/python/paddle/v2/reader/tests/test_reader_recordio.dat b/python/paddle/reader/tests/test_reader_recordio.dat similarity index 100% rename from python/paddle/v2/reader/tests/test_reader_recordio.dat rename to python/paddle/reader/tests/test_reader_recordio.dat diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/reader/tests/test_recordio_creator.dat similarity index 100% rename from python/paddle/v2/reader/tests/test_recordio_creator.dat rename to python/paddle/reader/tests/test_recordio_creator.dat diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index df710c33d0..02b0d077ee 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -22,17 +22,13 @@ import data_type import topology import networks import evaluator -from . import dataset -from . import reader from . import plot import attr import op import pooling import inference import networks -import minibatch import plot -import image import paddle.trainer.config_parser as cp __all__ = [ @@ -48,14 +44,11 @@ __all__ = [ 'data_type', 'attr', 'pooling', - 'dataset', - 'reader', 'topology', 'networks', 'infer', 'plot', 'evaluator', - 'image', 'master', ] @@ -153,4 +146,3 @@ def init(**kwargs): infer = inference.infer -batch = minibatch.batch diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py index 52f5b947fd..14b64742fd 100644 --- a/python/paddle/v2/inference.py +++ b/python/paddle/v2/inference.py @@ -15,7 +15,7 @@ import numpy import collections import topology -import minibatch +import paddle import cPickle __all__ = ['infer', 'Inference'] @@ -80,7 +80,7 @@ class Inference(object): for each_sample in input: yield each_sample - reader = minibatch.batch(__reader_impl__, batch_size=batch_size) + reader = paddle.batch(__reader_impl__, batch_size=batch_size) self.__gradient_machine__.start() for data_batch in reader(): diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 6a2bb8d337..a188a03eb3 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -20,7 +20,7 @@ The primary usage shows below. .. code-block:: python - import paddle.v2 as paddle + import paddle img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784)) hidden = paddle.layer.fc(input=img, size=200) diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt index b4333ed530..46e4feb8e1 100644 --- a/python/paddle/v2/tests/CMakeLists.txt +++ b/python/paddle/v2/tests/CMakeLists.txt @@ -1,5 +1,4 @@ py_test(test_op SRCS test_op.py) -py_test(test_image SRCS test_image.py) py_test(test_layer SRCS test_layer.py) py_test(test_topology SRCS test_topology.py) py_test(test_rnn_layer SRCS test_rnn_layer.py) diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py index 264442be18..8320217da2 100644 --- a/python/paddle/v2/tests/test_paramconf_order.py +++ b/python/paddle/v2/tests/test_paramconf_order.py @@ -27,6 +27,7 @@ # limitations under the License. import unittest import math +import paddle.dataset as dataset import paddle.v2 as paddle @@ -40,7 +41,7 @@ def wordemb(inlayer): def train(): - word_dict = paddle.dataset.imikolov.build_dict() + word_dict = dataset.imikolov.build_dict() dict_size = len(word_dict) # Every layer takes integer value of range [0, dict_size) firstword = paddle.layer.data( diff --git a/python/setup.py.in b/python/setup.py.in index 831d173d42..d73a3a6a1c 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -63,6 +63,8 @@ write_version_py(filename='@PADDLE_SOURCE_DIR@/python/paddle/version.py') packages=['paddle', 'paddle.utils', + 'paddle.dataset', + 'paddle.reader', 'paddle.fluid', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', @@ -73,8 +75,6 @@ if '${WITH_FLUID_ONLY}'== 'OFF': 'paddle.trainer', 'paddle.trainer_config_helpers', 'paddle.v2', - 'paddle.v2.dataset', - 'paddle.v2.reader', 'paddle.v2.master', 'paddle.v2.plot', 'py_paddle'] From 3a5bce775e90882c21778334420a9b597c2de583 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sat, 31 Mar 2018 09:20:14 +0800 Subject: [PATCH 298/314] try to complete --- paddle/fluid/operators/detail/grpc_server.cc | 8 +++++-- .../operators/detail/grpc_server_test.cc | 21 ++++++++++++++----- paddle/fluid/operators/detail/grpc_service.h | 2 +- paddle/fluid/operators/listen_and_serv_op.cc | 4 ++++ 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 26bef375cb..407fa5ef5a 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detail/grpc_server.h" +#include using ::grpc::ServerAsyncResponseWriter; @@ -156,6 +157,8 @@ class RequestPrefetch final : public RequestBase { ::grpc::ByteBuffer relay; // TODO(Yancey1989): execute the Block which containers prefetch ops + VLOG(3) << "RequestPrefetch Process in"; + responder_.Finish(relay, ::grpc::Status::OK, this); status_ = FINISH; } @@ -251,6 +254,7 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() { } void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { + VLOG(4) << "TryToRegisterNewPrefetchOne in"; std::unique_lock lock(cq_mutex_); if (is_shut_down_) { return; @@ -287,8 +291,8 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { - LOG(WARNING) << cq_name << " recv no regular event:argument name" - << base->GetReqName(); + LOG(WARNING) << cq_name << " recv no regular event:argument name[" + << base->GetReqName() << "]"; TryToRegisterNewOne(); delete base; continue; diff --git a/paddle/fluid/operators/detail/grpc_server_test.cc b/paddle/fluid/operators/detail/grpc_server_test.cc index 5773748106..1ad62863a1 100644 --- a/paddle/fluid/operators/detail/grpc_server_test.cc +++ b/paddle/fluid/operators/detail/grpc_server_test.cc @@ -28,6 +28,7 @@ std::unique_ptr rpc_service_; void StartServer(const std::string& endpoint) { rpc_service_.reset(new detail::AsyncGRPCServer(endpoint)); + rpc_service_->RunSyncUpdate(); } TEST(PREFETCH, CPU) { @@ -39,13 +40,23 @@ TEST(PREFETCH, CPU) { platform::CPUPlace place; platform::CPUDeviceContext ctx(place); // create var on local scope - std::string var_name("tmp_0"); - auto var = scope.Var(var_name); - auto tensor = var->GetMutable(); - tensor->Resize({10, 10}); + std::string in_var_name("in"); + std::string out_var_name("out"); + auto* in_var = scope.Var(in_var_name); + auto* in_tensor = in_var->GetMutable(); + in_tensor->Resize({10, 10}); + VLOG(3) << "before mutable_data"; + in_tensor->mutable_data(place); + scope.Var(out_var_name); + + VLOG(3) << "before fetch"; detail::RPCClient client; - client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, var_name, ""); + client.AsyncPrefetchVariable("127.0.0.1:8889", ctx, scope, in_var_name, + out_var_name); + client.Wait(); + + rpc_service_->ShutDown(); server_thread.join(); rpc_service_.reset(nullptr); } diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h index 879e21933b..1ec8cf11c5 100644 --- a/paddle/fluid/operators/detail/grpc_service.h +++ b/paddle/fluid/operators/detail/grpc_service.h @@ -80,7 +80,7 @@ enum class GrpcMethod { }; static const int kGrpcNumMethods = - static_cast(GrpcMethod::kGetVariable) + 1; + static_cast(GrpcMethod::kPrefetchVariable) + 1; inline const char* GrpcMethodName(GrpcMethod id) { switch (id) { diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index d5eae2be79..c9455fd35c 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -112,6 +112,10 @@ class ListenAndServOp : public framework::OperatorBase { framework::Executor executor(dev_place); + rpc_service_->SetExecutor(&executor); + rpc_service_->SetPrefetchBlkdId(0); + rpc_service_->SetProgram(program); + // TODO(typhoonzero): change this to a while_op for every cluster-batch. bool exit_flag = false; // Record received sparse variables, so that From 5aa440fd7a5a6bff32fc628a6907e16cb6feb8a9 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Sat, 31 Mar 2018 05:02:19 +0000 Subject: [PATCH 299/314] Add move constructor for Item --- .../operators/reader/create_double_buffer_reader_op.cc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 3f0f449248..f15747e266 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -27,6 +27,15 @@ class DoubleBufferReader : public framework::DecoratedReader { public: struct Item { Item() : ctx_(nullptr) {} + Item(Item&& b) { + payloads_ = std::move(b.payloads_); + ctx_ = std::move(b.ctx_); + } + Item& operator=(Item&& b) { + payloads_ = std::move(b.payloads_); + ctx_ = std::move(b.ctx_); + return *this; + } std::vector payloads_; platform::DeviceContext* ctx_; From c0257f0a5b315bb39f2c3e92c5afe43d631eae69 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Sat, 31 Mar 2018 05:17:57 +0000 Subject: [PATCH 300/314] Add comments --- .../operators/reader/create_double_buffer_reader_op.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index f15747e266..3f1d36a3e6 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -20,7 +20,14 @@ namespace paddle { namespace operators { namespace reader { +// 'Double buffer' means we shall maintain two batch of input data at the same +// time. So the kCacheSize shoul be at least 2. static constexpr size_t kCacheSize = 2; +// There will be two bacthes out of the channel during training: +// 1. the one waiting to be sent to the channel +// 2. the one just be received from the channel, which is also being used by +// subsequent operators. +// So the channel size should be kChacheSize - 2 static constexpr size_t kChannelSize = 0; // kCacheSize - 2 class DoubleBufferReader : public framework::DecoratedReader { From 597c845c998a176610ebd83f14a6215008b29f38 Mon Sep 17 00:00:00 2001 From: JiayiFeng Date: Sat, 31 Mar 2018 05:21:59 +0000 Subject: [PATCH 301/314] fix typo --- paddle/fluid/operators/reader/create_double_buffer_reader_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 3f1d36a3e6..342cd2a549 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -20,7 +20,7 @@ namespace paddle { namespace operators { namespace reader { -// 'Double buffer' means we shall maintain two batch of input data at the same +// 'Double buffer' means we shall maintain two batches of input data at the same // time. So the kCacheSize shoul be at least 2. static constexpr size_t kCacheSize = 2; // There will be two bacthes out of the channel during training: From 0ee4565be757534319b611edc17c97e89491968b Mon Sep 17 00:00:00 2001 From: Shan Yi <35982308+shanyi15@users.noreply.github.com> Date: Sat, 31 Mar 2018 16:06:55 +0800 Subject: [PATCH 302/314] translate api standard (#9521) * translate api standard * Update api_doc_std_en.md * fix typo --- doc/fluid/dev/api_doc_std_en.md | 226 ++++++++++++++++++++++++++++++++ 1 file changed, 226 insertions(+) create mode 100644 doc/fluid/dev/api_doc_std_en.md diff --git a/doc/fluid/dev/api_doc_std_en.md b/doc/fluid/dev/api_doc_std_en.md new file mode 100644 index 0000000000..e57072d52f --- /dev/null +++ b/doc/fluid/dev/api_doc_std_en.md @@ -0,0 +1,226 @@ +# API Doc Standard + +- [API Doc Structure](#API Doc Structure) +- [Format and Examples](#Format and Examples) +- [Complete Example](#Complete Example) + + +## API Doc Structure + +API Doc should contain the following parts(please write them in order): + +- Python API Definition + + The definition of API + +- Function Description + + Description of API's function. + The description includes: meaning, purpose and operation on input of API, reference and corresponding link(if any), formula(if necessary) and explanations of key variables in the formula. + +- Args Description + + Description of API parameters. + Introduce parameters one by one according to the order in API definition. + The introduction includes: data type, default value(if any), meaning, etc. + +- Returns + + Introduction of API returned value. + Introduce meaning of returned value, provide correspoding format if necessary. + If returned value is a tuple containing multiple parameters, then introduce parameters one by one in order. + +- Raises(if any) + + Abnormality, error that may occur, and possible reasons. If there are more than one possible abnormity or error, they should be listed in order. + +- Note(if any) + + Matters needing attention. If there are more than one matters, they should be listed in order. + +- Examples + + Examples of how to use API. + + +## Format and Examples + +API documentation must obey reStructuredText format, please refer to [here](http://sphinx-doc-zh.readthedocs.io/en/latest/rest.html). +Format and examples of each part of API documantation are as follows: (take fc for example) + +- Python API Definition + + - Format + + [Python API Definition] + + - Example + + ``` + fc(input, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + name=None, + main_program=None, + startup_program=None) + ``` + +- Function Description + + - Format + + This part contains (please write them in order): + + [Function Description] + + [Formula] + + [Symbols' Descriptions if necessary] + + [References if necessary] + + - Example + + [Function Description] + + ``` + **Fully Connected Layer** + + The fully connected layer can take multiple tensors as its inputs. It + creates a variable called weights for each input tensor, which represents + a fully connected weight matrix from each input unit to each output unit. + The fully connected layer multiplies each input tensor with its coresponding + weight to produce an output Tensor. If multiple input tensors are given, + the results of multiple multiplications will be sumed up. If bias_attr is + not None, a bias variable will be created and added to the output. Finally, + if activation is not None, it will be applied to the output as well. + ``` + + [Formula] + + ``` + This process can be formulated as follows: + + .. math:: + + Out = Act({\sum_{i=0}^{N-1}X_iW_i + b}) + ``` + + [Symbols' Descriptions if necessary] + + ``` + In the above equation: + + * :math:`N`: Number of the input. + * :math:`X_i`: The input tensor. + * :math:`W`: The weights created by this layer. + * :math:`b`: The bias parameter created by this layer (if needed). + * :math:`Act`: The activation function. + * :math:`Out`: The output tensor. + ``` + + [References if necessary] + + Since there is no need for reference of fc, we omit them here. Under other circumstances, please provide explicit reference and link, take layer_norm for example: + + ``` + Refer to `Layer Normalization `_ for more details. + ``` + + +- Args Description + + - Format + + \[Arg's Name\][(Data Type, Default Value)][Description] + + - Example + + part of fc parameters are as follows: + + ``` + Args: + input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of + the input tensor(s) is at least 2. + param_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for learnable + parameters/weights of this layer. + name (str, default None): The name of this layer. + ``` + +- Returns + + - Format + + [Name][Shape] + + - Example + + ``` + Returns: + A tensor variable storing the transformation result. + ``` + + when returned value contain more than one tuple, please introduce every parameter in order, take dynamic_lstm for example: + + ``` + Returns: + A tuple containing: + The hidden state of LSTM whose shape is (T X D). + The cell state of LSTM whose shape is (T X D). + ``` + +- Raises + + - Format + + [Exception Type][Condition] + + - Example + + ``` + Raises: + ValueError: If the rank of the input is less than 2. + ``` + +- Note + + - Format + + [Note] + + - Example + + there is no Note in fc, so we omit this part. If there is any note, please write clearly. If there are more than one notes, please list them in order. Take scaled\_dot\_product\_attention for example: + + ``` + Note: + 1. When num_heads > 1, three linear projections are learned respectively + to map input queries, keys and values into queries', keys' and values'. + queries', keys' and values' have the same shapes with queries, keys + and values. + 2. When num_heads == 1, scaled_dot_product_attention has no learnable + parameters. + ``` + +- Examples + + - Format + + \[Python Code Snipper] + + - Example + + ``` + Examples: + .. code-block:: python + + data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") + fc = fluid.layers.fc(input=data, size=1000, act="tanh") + ``` + +## Complete Example + +Complete Example of fc please see [here](src/fc.py)。 From ffcc7604783633079cf62cefee19a3153bbf0402 Mon Sep 17 00:00:00 2001 From: Abhinav Arora Date: Sat, 31 Mar 2018 10:03:19 -0700 Subject: [PATCH 303/314] Fix deadlock in channel_test (#9544) --- paddle/fluid/framework/CMakeLists.txt | 2 +- paddle/fluid/framework/channel_impl.h | 17 ++++------------- 2 files changed, 5 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index a34e22ff87..c425c71160 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -104,7 +104,7 @@ cc_test(init_test SRCS init_test.cc DEPS init) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) -# cc_test(channel_test SRCS channel_test.cc) +cc_test(channel_test SRCS channel_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h index c47d629289..e056779ea0 100644 --- a/paddle/fluid/framework/channel_impl.h +++ b/paddle/fluid/framework/channel_impl.h @@ -138,8 +138,8 @@ void ChannelImpl::Send(T *item) { // If channel is closed, throw exception if (closed_) { - lock.unlock(); send_return(); + lock.unlock(); PADDLE_THROW("Cannot send on closed channel"); } @@ -152,11 +152,9 @@ void ChannelImpl::Send(T *item) { if (m != nullptr) { *(m->data) = std::move(*item); m->Notify(); - lock.unlock(); send_return(); return; } else { - lock.unlock(); Send(item); send_return(); return; @@ -169,8 +167,6 @@ void ChannelImpl::Send(T *item) { if (buf_.size() < cap_) { // Copy to buffer buf_.push_back(std::move(*item)); - // Release lock and return true - lock.unlock(); send_return(); return; } @@ -181,8 +177,8 @@ void ChannelImpl::Send(T *item) { sendq.push_back(m); m->Wait(lock); if (m->chan_closed) { - lock.unlock(); send_return(); + lock.unlock(); PADDLE_THROW("Cannot send on closed channel"); } send_return(); @@ -195,10 +191,7 @@ bool ChannelImpl::Receive(T *item) { // If channel is closed and buffer is empty or // channel is unbuffered - if (closed_ && buf_.empty()) { - lock.unlock(); - return recv_return(false); - } + if (closed_ && buf_.empty()) return recv_return(false); // If there is a sender, directly receive the value we want // from the sender. In case of a buffered channel, read from @@ -229,7 +222,6 @@ bool ChannelImpl::Receive(T *item) { } else return recv_return(Receive(item)); } - lock.unlock(); return recv_return(true); } @@ -238,8 +230,7 @@ bool ChannelImpl::Receive(T *item) { // Directly read from buffer *item = std::move(buf_.front()); buf_.pop_front(); - // Release lock and return true - lock.unlock(); + // return true return recv_return(true); } From 01667392adb57cdd3ee1f53dbf0516ef8d2bdf63 Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Sat, 31 Mar 2018 10:04:05 -0700 Subject: [PATCH 304/314] Rename test_serde into serde_test (#9504) --- paddle/fluid/operators/detail/CMakeLists.txt | 4 ++-- .../fluid/operators/detail/{test_serde.cc => serde_test.cc} | 0 2 files changed, 2 insertions(+), 2 deletions(-) rename paddle/fluid/operators/detail/{test_serde.cc => serde_test.cc} (100%) diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 2b19f04489..d59411dfb9 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -2,7 +2,7 @@ if(WITH_DISTRIBUTE) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc variable_response.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - cc_test(serde_test SRCS test_serde.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr + set_source_files_properties(serde_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) endif() diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/serde_test.cc similarity index 100% rename from paddle/fluid/operators/detail/test_serde.cc rename to paddle/fluid/operators/detail/serde_test.cc From ef802ce9c0c156679cd584d55ae868f745af1b9a Mon Sep 17 00:00:00 2001 From: Thuan Nguyen Date: Sat, 31 Mar 2018 23:32:00 -0700 Subject: [PATCH 305/314] PaddlePaddle.org static ip was changed, need to change the known hosts (#9547) --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index bf6a41d13c..929c847bd3 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,7 +34,7 @@ addons: - automake - libtool - ccache - ssh_known_hosts: 52.76.173.135 + ssh_known_hosts: 13.229.163.131 before_install: - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python From 453630692e439451b42a2501c2d74f7a011ad14d Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 1 Apr 2018 23:33:07 +0800 Subject: [PATCH 306/314] fix prefetch hang problem, add some more logs --- paddle/fluid/operators/detail/grpc_client.cc | 16 +++++++++------- paddle/fluid/operators/detail/grpc_server.cc | 13 +++++++++++-- paddle/fluid/operators/detail/grpc_service.h | 4 ++-- paddle/fluid/operators/listen_and_serv_op.cc | 12 ++---------- 4 files changed, 24 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index ba9882ce24..f8ec39e8c5 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -12,8 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "grpc_client.h" -#include +#include "paddle/fluid/operators/detail/grpc_client.h" + +#include + #include "paddle/fluid/framework/threadpool.h" namespace paddle { @@ -52,7 +54,7 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); call->StartCall(); - call->Finish(&s->reply_, &s->status_, (void*)s); + call->Finish(&s->reply_, &s->status_, static_cast(s)); }); req_count_++; @@ -109,7 +111,7 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, auto call = s->stub_g_.PrepareUnaryCall( s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); call->StartCall(); - call->Finish(&s->reply_, &s->status_, (void*)s); + call->Finish(&s->reply_, &s->status_, static_cast(s)); }); req_count_++; @@ -153,7 +155,7 @@ bool RPCClient::AsyncPrefetchVariable(const std::string& ep, s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, &cq_); call->StartCall(); - call->Finish(&s->reply_, &s->status_, (void*)s); + call->Finish(&s->reply_, &s->status_, static_cast(s)); }); req_count_++; @@ -169,7 +171,7 @@ void RPCClient::AsyncSendBatchBarrier(const std::string& ep, int64_t time_out) { sendrecv::VariableMessage req; req.set_varname(BATCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, (void*)s); + rpc->Finish(&s->reply_, &s->status_, static_cast(s)); req_count_++; } @@ -181,7 +183,7 @@ void RPCClient::AsyncSendFetchBarrier(const std::string& ep, int64_t time_out) { sendrecv::VariableMessage req; req.set_varname(FETCH_BARRIER_MESSAGE); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); - rpc->Finish(&s->reply_, &s->status_, (void*)s); + rpc->Finish(&s->reply_, &s->status_, static_cast(s)); req_count_++; } diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index b8fba06c7b..71acc568a9 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -13,7 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detail/grpc_server.h" -#include + +#include +#include using ::grpc::ServerAsyncResponseWriter; @@ -224,6 +226,7 @@ void AsyncGRPCServer::ShutdownQueue() { std::unique_lock lock(cq_mutex_); cq_send_->Shutdown(); cq_get_->Shutdown(); + cq_prefetch_->Shutdown(); } // This URL explains why shutdown is complicate: @@ -236,6 +239,7 @@ void AsyncGRPCServer::ShutDown() { void AsyncGRPCServer::TryToRegisterNewSendOne() { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { + VLOG(3) << "shutdown, do not TryToRegisterNewSendOne"; return; } RequestSend* send = new RequestSend(&service_, cq_send_.get(), scope_, @@ -246,6 +250,7 @@ void AsyncGRPCServer::TryToRegisterNewSendOne() { void AsyncGRPCServer::TryToRegisterNewGetOne() { std::unique_lock lock(cq_mutex_); if (is_shut_down_) { + VLOG(3) << "shutdown, do not TryToRegisterNewGetOne"; return; } RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_, dev_ctx_, @@ -257,6 +262,7 @@ void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { VLOG(4) << "TryToRegisterNewPrefetchOne in"; std::unique_lock lock(cq_mutex_); if (is_shut_down_) { + VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne"; return; } RequestPrefetch* prefetch = @@ -274,18 +280,21 @@ void AsyncGRPCServer::HandleRequest(::grpc::ServerCompletionQueue* cq, void* tag = NULL; bool ok = false; + while (true) { + VLOG(3) << "HandleRequest for " << cq_name << " while in"; if (!cq->Next(&tag, &ok)) { LOG(INFO) << cq_name << " CompletionQueue shutdown!"; break; } + VLOG(3) << "HandleRequest for " << cq_name << " while after Next"; PADDLE_ENFORCE(tag); // FIXME(typhoonzero): de-couple the barriers with recv_op if (!is_shut_down_ && cq_name == "cq_get") WaitCond(1); if (!is_shut_down_ && cq_name == "cq_send") WaitCond(0); - RequestBase* base = (RequestBase*)tag; + RequestBase* base = reinterpret_cast(tag); // reference: // https://github.com/tensorflow/tensorflow/issues/5596 // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM diff --git a/paddle/fluid/operators/detail/grpc_service.h b/paddle/fluid/operators/detail/grpc_service.h index 1ec8cf11c5..e6dab2f5a3 100644 --- a/paddle/fluid/operators/detail/grpc_service.h +++ b/paddle/fluid/operators/detail/grpc_service.h @@ -89,7 +89,7 @@ inline const char* GrpcMethodName(GrpcMethod id) { case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; case GrpcMethod::kPrefetchVariable: - return "/sendrecv.SendREcvService/PrefetchVariable"; + return "/sendrecv.SendRecvService/PrefetchVariable"; } // Shouldn't be reached. @@ -117,5 +117,5 @@ class GrpcService final { }; } // namespace detail -} // namespace operator +} // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 66f7058eac..67ee47f9f6 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -13,22 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include -#include - -#include #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/proto_desc.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/detail/grpc_server.h" -#include "paddle/fluid/operators/detail/sendrecvop_utils.h" -#include "paddle/fluid/operators/detail/simple_block_queue.h" -#include "paddle/fluid/string/printf.h" namespace paddle { namespace operators { @@ -177,7 +168,8 @@ class ListenAndServOp : public framework::OperatorBase { } ParallelExecuteBlocks(parallel_blkids, &executor, program, &recv_scope); - VLOG(2) << "run all blocks spent (ms) " << detail::GetTimestamp() - ts; + VLOG(3) << "run all blocks spent " << detail::GetTimestamp() - ts + << "(ms)"; // Reset the received sparse variables, the sum operator would not // sum the input sparse variables which rows is empty at the next From 9af9effc93e39427c758343f6be9892652049863 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 2 Apr 2018 09:26:09 +0800 Subject: [PATCH 307/314] optimize code --- paddle/fluid/operators/detail/grpc_client.cc | 3 +-- paddle/fluid/operators/detail/grpc_server.cc | 1 - paddle/fluid/operators/detail/grpc_server.h | 4 +++- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_client.cc b/paddle/fluid/operators/detail/grpc_client.cc index f8ec39e8c5..d79ba6d291 100644 --- a/paddle/fluid/operators/detail/grpc_client.cc +++ b/paddle/fluid/operators/detail/grpc_client.cc @@ -72,8 +72,7 @@ void ProcGetResponse(const VarHandle& var_h, template void RequestToByteBuffer(const T& proto, ::grpc::ByteBuffer* result) { ::grpc::Slice slice(proto.ByteSizeLong()); - proto.SerializeWithCachedSizesToArray( - const_cast(reinterpret_cast(slice.begin()))); + proto.SerializeWithCachedSizesToArray(const_cast(slice.begin())); ::grpc::ByteBuffer tmp(&slice, 1); result->Swap(&tmp); } diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 71acc568a9..09ca4cc052 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -259,7 +259,6 @@ void AsyncGRPCServer::TryToRegisterNewGetOne() { } void AsyncGRPCServer::TryToRegisterNewPrefetchOne() { - VLOG(4) << "TryToRegisterNewPrefetchOne in"; std::unique_lock lock(cq_mutex_); if (is_shut_down_) { VLOG(3) << "shutdown, do not TryToRegisterNewPrefetchOne"; diff --git a/paddle/fluid/operators/detail/grpc_server.h b/paddle/fluid/operators/detail/grpc_server.h index dd5cf4b377..b0596d3cd1 100644 --- a/paddle/fluid/operators/detail/grpc_server.h +++ b/paddle/fluid/operators/detail/grpc_server.h @@ -15,7 +15,8 @@ limitations under the License. */ #pragma once #include -#include +#include +#include #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -93,6 +94,7 @@ class AsyncGRPCServer final { // received variable from RPC, operators fetch variable from this queue. SimpleBlockQueue var_get_queue_; + // client send variable to this queue. ReceivedQueue var_recv_queue_; // condition of the sub program From 606c57da23511b4474123db519a67ede21de9d67 Mon Sep 17 00:00:00 2001 From: Yancey1989 Date: Mon, 2 Apr 2018 09:33:08 +0800 Subject: [PATCH 308/314] update by comment --- paddle/fluid/operators/detail/grpc_server.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/detail/grpc_server.cc b/paddle/fluid/operators/detail/grpc_server.cc index 26bef375cb..44c23db0b1 100644 --- a/paddle/fluid/operators/detail/grpc_server.cc +++ b/paddle/fluid/operators/detail/grpc_server.cc @@ -153,10 +153,10 @@ class RequestPrefetch final : public RequestBase { virtual void Process() { // prefetch process... - ::grpc::ByteBuffer relay; + ::grpc::ByteBuffer reply; // TODO(Yancey1989): execute the Block which containers prefetch ops - responder_.Finish(relay, ::grpc::Status::OK, this); + responder_.Finish(reply, ::grpc::Status::OK, this); status_ = FINISH; } From 6cfc0c14971828ee9528502a2787456869210a5c Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 2 Apr 2018 11:15:52 +0800 Subject: [PATCH 309/314] "polish code" (#9318) * "polish code" * "fix ci" * "fix ci" * "done" --- python/paddle/fluid/executor.py | 73 ++++++++------------------------- 1 file changed, 18 insertions(+), 55 deletions(-) diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 2612fb1ae4..54d0a12bcd 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -48,8 +48,7 @@ def as_numpy(tensor): assert isinstance(tensor, core.LoDTensor) lod = tensor.lod() if len(lod) > 0: - raise RuntimeError( - "Some of your featched tensors hold LoD information. \ + raise RuntimeError("Some of your fetched tensors hold LoD information. \ They can not be completely cast to Python ndarray. \ Please set the parameter 'return_numpy' as 'False' to \ return LoDTensor itself directly.") @@ -180,60 +179,24 @@ def get_program_cache_key(feed, fetch_list): class Executor(object): - def __init__(self, places): - if not isinstance(places, list) and not isinstance(places, tuple): - places = [places] - - act_places = [] - for each in places: - p = core.Place() - p.set_place(each) - act_places.append(p) - - # TODO(dzhwinter) : only use the first place - self.executor = core.Executor(act_places[0]) - self.places = places + def __init__(self, place): + self.place = place + p = core.Place() + p.set_place(place) + self.executor = core.Executor(p) self.program_caches = dict() - def aslodtensor(self, data): - def accumulate(data): - if not isinstance(data, list): - return 1 - return sum([accumulate(sub) for sub in data]) - - def parselod(data): - seq_lens = [accumulate(seq) for seq in data] - cur_len = 0 - lod = [cur_len] - for l in seq_lens: - cur_len += l - lod.append(cur_len) - return lod - - assert len(self.places) != 0 - if not isinstance(data, list): - # pure tensor case - tensor = core.LoDTensor() - tensor.set(data, self.places[0]) - return tensor - else: - raise RuntimeError("Current implementation lacks unittests") - # lodtensor case - lod = [] - if not isinstance(data[0], list): - lod.append(parselod(data)) - flattened_data = np.concatenate(data, axis=0).astype("int64") - else: - while isinstance(data[0], list): - lod.append(parselod(seq)) - flattened_data = [item for seq in data for item in seq] - data = flattened_data - flattened_data = np.concatenate(data, axis=0).astype("int64") - flattened_data = flattened_data.reshape([len(flattened_data), 1]) - tensor = core.LoDTensor() - tensor.set(flattened_data, self.places[0]) - tensor.set_lod(lod) - return tensor + def as_lodtensor(self, data): + if isinstance(data, list): + raise RuntimeError("Some of your feed data hold LoD information. \ + They can not be completely cast from a list of Python \ + ndarray to LoDTensor. Please convert data to LoDTensor \ + directly before feeding the data.\ + ") + # single tensor case + tensor = core.LoDTensor() + tensor.set(data, self.place) + return tensor def _get_program_cache(self, program_cache_key): return self.program_caches.get(program_cache_key, None) @@ -293,7 +256,7 @@ class Executor(object): feed_target_name = op.desc.output('Out')[0] cur_feed = feed[feed_target_name] if not isinstance(cur_feed, core.LoDTensor): - cur_feed = self.aslodtensor(cur_feed) + cur_feed = self.as_lodtensor(cur_feed) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) else: From 04a5c0378517ec08f2eba1339de94bd2e786e516 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 2 Apr 2018 11:18:00 +0800 Subject: [PATCH 310/314] add todo --- paddle/fluid/operators/listen_and_serv_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/listen_and_serv_op.cc b/paddle/fluid/operators/listen_and_serv_op.cc index 67ee47f9f6..b19add24e2 100644 --- a/paddle/fluid/operators/listen_and_serv_op.cc +++ b/paddle/fluid/operators/listen_and_serv_op.cc @@ -102,6 +102,7 @@ class ListenAndServOp : public framework::OperatorBase { framework::Executor executor(dev_place); + // TODO(qiao) set proper fields for table lookup and update rpc_service_->SetExecutor(&executor); rpc_service_->SetPrefetchBlkdId(0); rpc_service_->SetProgram(program); From 772cdfe196f6a343ad20f3c2644c078e4e9ef19e Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 2 Apr 2018 12:25:01 +0800 Subject: [PATCH 311/314] fix single pserver error --- python/paddle/fluid/distribute_transpiler.py | 28 +++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/distribute_transpiler.py b/python/paddle/fluid/distribute_transpiler.py index 24297ffe33..9311fc9904 100644 --- a/python/paddle/fluid/distribute_transpiler.py +++ b/python/paddle/fluid/distribute_transpiler.py @@ -276,20 +276,25 @@ class DistributeTranspiler: suff_idx = v.name.find(".trainer_") if suff_idx >= 0: orig_var_name = v.name[:suff_idx] - pserver_program.global_block().create_var( + else: + orig_var_name = v.name + single_trainer_var = pserver_program.global_block().create_var( name=orig_var_name, persistable=True, type=v.type, dtype=v.dtype, shape=v.shape) - for trainer_id in xrange(self.trainers): - var = pserver_program.global_block().create_var( - name="%s.trainer_%d" % (orig_var_name, trainer_id), - persistable=False, - type=v.type, - dtype=v.dtype, - shape=v.shape) - recv_inputs.append(var) + if self.trainers > 1: + for trainer_id in xrange(self.trainers): + var = pserver_program.global_block().create_var( + name="%s.trainer_%d" % (orig_var_name, trainer_id), + persistable=False, + type=v.type, + dtype=v.dtype, + shape=v.shape) + recv_inputs.append(var) + else: + recv_inputs.append(single_trainer_var) # step3 optimize_block = pserver_program.create_block(0) @@ -511,8 +516,11 @@ class DistributeTranspiler: def _append_split_op(self, program, gradblocks): # Split variables that need to be split and append respective ops + add_suffix = False + if self.trainers > 1: + add_suffix = True var_mapping = self._create_vars_from_blocklist( - program, gradblocks, add_trainer_suffix=True) + program, gradblocks, add_trainer_suffix=add_suffix) for varname, splited_vars in var_mapping.iteritems(): # variable that don't need to split have empty splited_vars if len(splited_vars) <= 1: From 997e9a1fd2a98120a269b7569fccd7f1e595059b Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 2 Apr 2018 13:53:21 +0800 Subject: [PATCH 312/314] fix mac compile --- paddle/fluid/framework/details/var_handle.h | 2 +- paddle/fluid/framework/parallel_executor.cc | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/details/var_handle.h b/paddle/fluid/framework/details/var_handle.h index 893cc15f6c..569dda17c6 100644 --- a/paddle/fluid/framework/details/var_handle.h +++ b/paddle/fluid/framework/details/var_handle.h @@ -22,7 +22,7 @@ namespace paddle { namespace framework { namespace details { -struct OpHandleBase; +class OpHandleBase; // VarHandleBase is the var node in the dependency graph. // A variable can only be generated by a single operator. i.e. diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 91f2db9354..292e4732b4 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" + #include +#include -#include "ThreadPool.h" +#include "paddle/fluid/framework/threadpool.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" From 9a101cfc08b90832cfa44b9cad1e25db640b7948 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 2 Apr 2018 15:05:14 +0800 Subject: [PATCH 313/314] clean code --- paddle/fluid/framework/parallel_executor.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 292e4732b4..577eea92d2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -17,8 +17,6 @@ limitations under the License. */ #include #include -#include "paddle/fluid/framework/threadpool.h" - #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/nccl_helper.h" #endif From b94f24d44f314279cfe7230db37a22e225957e15 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Mon, 2 Apr 2018 17:33:14 +0800 Subject: [PATCH 314/314] Move StartPrefetcher and EndPrefetcher to private --- .../operators/reader/create_double_buffer_reader_op.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc index 342cd2a549..f9a8058f2a 100644 --- a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc +++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc @@ -66,6 +66,9 @@ class DoubleBufferReader : public framework::DecoratedReader { void ReadNext(std::vector* out) override; void ReInit() override; + ~DoubleBufferReader() { EndPrefetcher(); } + + private: void StartPrefetcher() { channel_ = framework::MakeChannel(kChannelSize); prefetcher_ = std::thread([this] { PrefetchThreadFunc(); }); @@ -80,9 +83,6 @@ class DoubleBufferReader : public framework::DecoratedReader { channel_ = nullptr; } - ~DoubleBufferReader() { EndPrefetcher(); } - - private: void PrefetchThreadFunc(); std::thread prefetcher_;