diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp index 1b25172ca5..9e9dd20e6f 100644 --- a/paddle/function/ContextProjectionOpTest.cpp +++ b/paddle/function/ContextProjectionOpTest.cpp @@ -28,7 +28,7 @@ void testMatrixProjectionForward(int context_start, std::max(0, (int)(context_start + context_length - 1)); if (pad == 0) is_padding = false; - FunctionCompare test( + CpuGpuFuncCompare test( "ContextProjectionForward", FuncConfig() .set("context_length", context_length) @@ -60,7 +60,7 @@ void testMatrixProjectionBackward(int context_start, std::max(0, (int)(context_start + context_length - 1)); if (pad == 0) is_padding = false; - FunctionCompare test( + CpuGpuFuncCompare test( "ContextProjectionBackward", FuncConfig() .set("context_length", context_length) diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp index 48c815f027..f6c0041101 100644 --- a/paddle/function/CosSimOpTest.cpp +++ b/paddle/function/CosSimOpTest.cpp @@ -22,7 +22,7 @@ void testCosSimForward(size_t height_x, size_t height_y, size_t width, real scale) { - FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale)); + CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale)); // prepare input arguments test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width})); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width})); @@ -36,7 +36,7 @@ void testCosSimBackward(size_t height_x, size_t height_y, size_t width, real scale) { - FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale)); + CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale)); // prepare input arguments test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1})); diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp index 51f5da81bf..ed17b17da6 100644 --- a/paddle/function/CrossMapNormalOpTest.cpp +++ b/paddle/function/CrossMapNormalOpTest.cpp @@ -28,11 +28,11 @@ TEST(CrossMapNormal, real) { << " size=" << size; // init Test object - FunctionCompare test("CrossMapNormal", - FuncConfig() - .set("size", size) - .set("scale", (real)1.5) - .set("pow", (real)0.5)); + CpuGpuFuncCompare test("CrossMapNormal", + FuncConfig() + .set("size", size) + .set("scale", (real)1.5) + .set("pow", (real)0.5)); // prepare input arguments TensorShape shape{numSamples, channels, imgSizeH, imgSizeW}; test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape)); @@ -57,11 +57,11 @@ TEST(CrossMapNormalGrad, real) { << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW << " size=" << size; - FunctionCompare test("CrossMapNormalGrad", - FuncConfig() - .set("size", size) - .set("scale", (real)1.5) - .set("pow", (real)0.5)); + CpuGpuFuncCompare test("CrossMapNormalGrad", + FuncConfig() + .set("size", size) + .set("scale", (real)1.5) + .set("pow", (real)0.5)); TensorShape shape{numSamples, channels, imgSizeH, imgSizeW}; test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape)); test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape)); diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h index 0cfafdb27f..69ab33052d 100644 --- a/paddle/function/FunctionTest.h +++ b/paddle/function/FunctionTest.h @@ -22,14 +22,62 @@ namespace paddle { typedef std::shared_ptr BufferArgPtr; +namespace test { +template +struct Allocator; + +template <> +struct Allocator { + using type = CpuMemoryHandle; +}; + +template <> +struct Allocator { + using type = GpuMemoryHandle; +}; + +// Copy argument1 to argument2 +template +class CopyArgument { +public: + void operator()(const BufferArg& arg1, BufferArg& arg2) { + CHECK_EQ(arg1.valueType(), arg2.valueType()); + CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements()); + + if (arg1.valueType() == VALUE_TYPE_INT32) { + IVectorPtr vector1 = + IVector::create((int*)arg1.data(), + arg1.shape().getElements(), + DType1 == DEVICE_TYPE_CPU ? false : true); + IVectorPtr vector2 = + IVector::create((int*)arg2.data(), + arg2.shape().getElements(), + DType2 == DEVICE_TYPE_CPU ? false : true); + vector2->copyFrom(*vector1); + } else { + VectorPtr vector1 = + Vector::create((real*)arg1.data(), + arg1.shape().getElements(), + DType1 == DEVICE_TYPE_CPU ? false : true); + VectorPtr vector2 = + Vector::create((real*)arg2.data(), + arg2.shape().getElements(), + DType2 == DEVICE_TYPE_CPU ? false : true); + vector2->copyFrom(*vector1); + } + } +}; +} // namespace test + /** - * \brief A class for comparing CPU and GPU implementations of Function. - * + * \brief A class for comparing two Functions of different implementations. + * For example, can be used to compare the CPU and GPU implementation + * of the function is consistent. * * Use case: * // Initializes a test object, the corresponding cpu and gpu Function * // are constructed according to FunctionName and FuncConfig. - * FunctionCompare test(FunctionName, FuncConfig); + * CpuGpuFuncCompare test(FunctionName, FuncConfig); * // Prepare inputs and outputs arguments. * // Here the input and output can not contain real data, * // only contains the argument type and shape. @@ -45,28 +93,38 @@ typedef std::shared_ptr BufferArgPtr; * // Compares CPU and GPU calculation results for consistency. * test.run(); */ -class FunctionCompare { +template +class Compare2Function { public: - FunctionCompare(const std::string& name, const FuncConfig& config) - : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")), - gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) { - cpuFunc_->init(config); - gpuFunc_->init(config); + typedef typename test::Allocator::type Allocator1; + typedef typename test::Allocator::type Allocator2; + typedef typename Tensor::Vector Vector1; + typedef typename Tensor::Vector Vector2; + typedef typename Tensor::SparseMatrix SparseMatrix1; + typedef typename Tensor::SparseMatrix SparseMatrix2; + + Compare2Function(const std::string& name1, + const std::string& name2, + const FuncConfig& config) + : function1_(FunctionBase::funcRegistrar_.createByType(name1)), + function2_(FunctionBase::funcRegistrar_.createByType(name2)) { + function1_->init(config); + function2_->init(config); } - ~FunctionCompare() {} + ~Compare2Function() {} // input need only contains shape, do not contains data. void addInputs(const BufferArg& input) { size_t size = input.shape().getElements() * sizeOfValuType(input.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); - cpuInputs_.emplace_back(std::make_shared( - cpuMemory_.back()->getBuf(), input.valueType(), input.shape())); - gpuInputs_.emplace_back(std::make_shared( - gpuMemory_.back()->getBuf(), input.valueType(), input.shape())); + func1Inputs_.emplace_back(std::make_shared( + func1Memory_.back()->getBuf(), input.valueType(), input.shape())); + func2Inputs_.emplace_back(std::make_shared( + func2Memory_.back()->getBuf(), input.valueType(), input.shape())); } // assume one copy of sequence is shared by different SequenceArgs @@ -75,62 +133,57 @@ public: size_t batchSize = input.shape()[0]; size_t numSeqs = batchSize / 10 + 1; size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32); - cpuMemory_.emplace_back(std::make_shared(sizeId)); - gpuMemory_.emplace_back(std::make_shared(sizeId)); - cpuSeq_ = std::make_shared(cpuMemory_.back()->getBuf(), - TensorShape{numSeqs + 1}); - gpuSeq_ = std::make_shared(gpuMemory_.back()->getBuf(), - TensorShape{numSeqs + 1}); + func1Memory_.emplace_back(std::make_shared(sizeId)); + func2Memory_.emplace_back(std::make_shared(sizeId)); + seq1_ = std::make_shared(func1Memory_.back()->getBuf(), + TensorShape{numSeqs + 1}); + seq2_ = std::make_shared(func2Memory_.back()->getBuf(), + TensorShape{numSeqs + 1}); /// init sequence Id - initArg(*cpuSeq_, batchSize); + initArg(*seq1_, batchSize); - // todo(tianbing), delete it - CHECK_EQ(cpuSeq_->shape().getElements(), cpuSeq_->numSeqs() + 1); - - CpuIVector cpuSeq(cpuSeq_->shape().getElements(), (int*)cpuSeq_->data()); - GpuIVector gpuSeq(gpuSeq_->shape().getElements(), (int*)gpuSeq_->data()); - gpuSeq.copyFrom(cpuSeq); + copyArg_(*seq1_, *seq2_); } void addInputs(const SequenceArg& input) { CHECK_EQ(input.shape().ndims(), 2UL); size_t batchSize = input.shape()[0]; - if (!cpuSeq_ || !gpuSeq_) { // sequence not exist + if (!seq1_ || !seq2_) { // sequence not exist addSequence(SequenceIdArg(TensorShape{batchSize})); } size_t size = input.shape().getElements() * sizeOfValuType(input.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); /// SequenceArg - cpuInputs_.emplace_back( - std::make_shared(cpuMemory_.back()->getBuf(), + func1Inputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), input.valueType(), input.shape(), - *cpuSeq_)); - gpuInputs_.emplace_back( - std::make_shared(gpuMemory_.back()->getBuf(), + *seq1_)); + func2Inputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), input.valueType(), input.shape(), - *gpuSeq_)); + *seq2_)); } // output need only contains shape, do not contains data. void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) { size_t size = output.shape().getElements() * sizeOfValuType(output.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); - cpuOutputs_.emplace_back( - std::make_shared(cpuMemory_.back()->getBuf(), + func1Outputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), output.valueType(), output.shape(), argType)); - gpuOutputs_.emplace_back( - std::make_shared(gpuMemory_.back()->getBuf(), + func2Outputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), output.valueType(), output.shape(), argType)); @@ -138,14 +191,14 @@ public: /// add and init output sparse matrix void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) { - cpuSparse_ = std::make_shared( + sparse1_ = std::make_shared( output.shape()[0], output.shape()[1], output.nnz(), static_cast(output.dataType()), static_cast(output.dataFormat())); - gpuSparse_ = std::make_shared( + sparse2_ = std::make_shared( output.shape()[0], output.shape()[1], output.nnz(), @@ -154,52 +207,52 @@ public: /// init sparse matrix hl_stream_t stream(HPPL_STREAM_1); - cpuSparse_->randomizeUniform(); - gpuSparse_->copyFrom(*cpuSparse_, stream); + sparse1_->randomizeUniform(); + sparse2_->copyFrom(*sparse1_, stream); hl_stream_synchronize(stream); - cpuOutputs_.emplace_back( - std::make_shared(*cpuSparse_, argType)); - gpuOutputs_.emplace_back( - std::make_shared(*gpuSparse_, argType)); + func1Outputs_.emplace_back( + std::make_shared(*sparse1_, argType)); + func2Outputs_.emplace_back( + std::make_shared(*sparse2_, argType)); } void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) { CHECK_EQ(output.shape().ndims(), 2UL); size_t batchSize = output.shape()[0]; - if (!cpuSeq_ || !gpuSeq_) { // sequence not exist + if (!seq1_ || !seq2_) { // sequence not exist addSequence(SequenceIdArg(TensorShape{batchSize})); } size_t size = output.shape().getElements() * sizeOfValuType(output.valueType()); - cpuMemory_.emplace_back(std::make_shared(size)); - gpuMemory_.emplace_back(std::make_shared(size)); + func1Memory_.emplace_back(std::make_shared(size)); + func2Memory_.emplace_back(std::make_shared(size)); /// SequenceArg - cpuOutputs_.emplace_back( - std::make_shared(cpuMemory_.back()->getBuf(), + func1Outputs_.emplace_back( + std::make_shared(func1Memory_.back()->getBuf(), output.valueType(), output.shape(), - *cpuSeq_, + *seq1_, argType)); - gpuOutputs_.emplace_back( - std::make_shared(gpuMemory_.back()->getBuf(), + func2Outputs_.emplace_back( + std::make_shared(func2Memory_.back()->getBuf(), output.valueType(), output.shape(), - *gpuSeq_, + *seq2_, argType)); } void addInputs(const SparseMatrixArg& input) { - cpuSparse_ = std::make_shared( + sparse1_ = std::make_shared( input.shape()[0], input.shape()[1], input.nnz(), static_cast(input.dataType()), static_cast(input.dataFormat())); - gpuSparse_ = std::make_shared( + sparse2_ = std::make_shared( input.shape()[0], input.shape()[1], input.nnz(), @@ -208,12 +261,12 @@ public: /// init sparse matrix hl_stream_t stream(HPPL_STREAM_1); - cpuSparse_->randomizeUniform(); - gpuSparse_->copyFrom(*cpuSparse_, stream); + sparse1_->randomizeUniform(); + sparse2_->copyFrom(*sparse1_, stream); hl_stream_synchronize(stream); - cpuInputs_.emplace_back(std::make_shared(*cpuSparse_)); - gpuInputs_.emplace_back(std::make_shared(*gpuSparse_)); + func1Inputs_.emplace_back(std::make_shared(*sparse1_)); + func2Inputs_.emplace_back(std::make_shared(*sparse2_)); } void run() { @@ -236,27 +289,27 @@ public: function->calc(inArgs, outArgs); }; - callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_); - callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_); + callFunction(function1_.get(), func1Inputs_, func1Outputs_); + callFunction(function2_.get(), func2Inputs_, func2Outputs_); // check outputs compareOutputs(); } - std::shared_ptr getCpuFunction() const { return cpuFunc_; } + std::shared_ptr getCpuFunction() const { return function1_; } - std::shared_ptr getGpuFunction() const { return gpuFunc_; } + std::shared_ptr getGpuFunction() const { return function2_; } protected: // only init cpu argument, gpu argument copy from cpu argument. void initArg(BufferArg& arg) { - CpuVector vector(arg.shape().getElements(), (real*)arg.data()); + Vector1 vector(arg.shape().getElements(), (real*)arg.data()); vector.uniform(0.001, 1); } void initArg(SequenceArg& arg) { /// init only matrix - CpuVector vector(arg.shape().getElements(), (real*)arg.data()); + Vector1 vector(arg.shape().getElements(), (real*)arg.data()); vector.uniform(0.001, 1); } @@ -276,73 +329,72 @@ protected: } void initInputs() { - for (size_t i = 0; i < cpuInputs_.size(); i++) { - if (cpuInputs_[i]->isSparseArg()) { + for (size_t i = 0; i < func1Inputs_.size(); i++) { + if (func1Inputs_[i]->isSparseArg()) { continue; /// sparse matrix already init } - if (cpuInputs_[i]->isSequenceArg()) { - initArg(dynamic_cast(*cpuInputs_[i])); + if (func1Inputs_[i]->isSequenceArg()) { + initArg(dynamic_cast(*func1Inputs_[i])); } else { - initArg(*cpuInputs_[i]); + initArg(*func1Inputs_[i]); } - // TODO: Need a BufferCopy used to copy from one BufferArg to another. - CpuVector cpuVector(cpuInputs_[i]->shape().getElements(), - (real*)cpuInputs_[i]->data()); - GpuVector gpuVector(gpuInputs_[i]->shape().getElements(), - (real*)gpuInputs_[i]->data()); - gpuVector.copyFrom(cpuVector); + copyArg_(*func1Inputs_[i], *func2Inputs_[i]); } } void initOutputs() { - for (size_t i = 0; i < cpuOutputs_.size(); i++) { - if (cpuOutputs_[i]->isSparseArg()) { + for (size_t i = 0; i < func1Outputs_.size(); i++) { + if (func1Outputs_[i]->isSparseArg()) { continue; /// sparse matrix already init } - if (cpuOutputs_[i]->isSequenceArg()) { - initArg(dynamic_cast(*cpuOutputs_[i])); + if (func1Outputs_[i]->isSequenceArg()) { + initArg(dynamic_cast(*func1Outputs_[i])); } else { - initArg(*cpuOutputs_[i]); + initArg(*func1Outputs_[i]); } - // TODO: Need a BufferCopy used to copy from one BufferArg to another. - CpuVector cpuVector(cpuOutputs_[i]->shape().getElements(), - (real*)cpuOutputs_[i]->data()); - GpuVector gpuVector(gpuOutputs_[i]->shape().getElements(), - (real*)gpuOutputs_[i]->data()); - - gpuVector.copyFrom(cpuVector); + copyArg_(*func1Outputs_[i], *func2Outputs_[i]); } } void compareOutputs() { - for (size_t i = 0; i < cpuOutputs_.size(); i++) { + for (size_t i = 0; i < func1Outputs_.size(); i++) { // TODO, Need a BufferCheck used to compare the two buffers. - const auto cpu = cpuOutputs_[i]; - const auto gpu = gpuOutputs_[i]; + const auto cpu = func1Outputs_[i]; + const auto gpu = func2Outputs_[i]; CHECK_EQ(cpu->numElements(), gpu->numElements()); - CpuVector cpuVector(cpu->numElements(), (real*)cpu->data()); - GpuVector gpuVector(gpu->numElements(), (real*)gpu->data()); + Vector1 cpuVector(cpu->numElements(), (real*)cpu->data()); + Vector2 gpuVector(gpu->numElements(), (real*)gpu->data()); autotest::TensorCheckErr(cpuVector, gpuVector); } } protected: - std::shared_ptr cpuFunc_; - std::shared_ptr gpuFunc_; - std::vector cpuMemory_; - std::vector gpuMemory_; - std::vector cpuInputs_; - std::vector cpuOutputs_; - std::vector gpuInputs_; - std::vector gpuOutputs_; - std::shared_ptr cpuSparse_; - std::shared_ptr gpuSparse_; - std::shared_ptr cpuSeq_; - std::shared_ptr gpuSeq_; + std::shared_ptr function1_; + std::shared_ptr function2_; + std::vector> func1Memory_; + std::vector> func2Memory_; + std::vector func1Inputs_; + std::vector func1Outputs_; + std::vector func2Inputs_; + std::vector func2Outputs_; + std::shared_ptr sparse1_; + std::shared_ptr sparse2_; + std::shared_ptr seq1_; + std::shared_ptr seq2_; + test::CopyArgument copyArg_; +}; + +class CpuGpuFuncCompare + : public Compare2Function { +public: + CpuGpuFuncCompare(const std::string& name, const FuncConfig& config) + : Compare2Function(name + "-CPU", name + "-GPU", config) {} + + ~CpuGpuFuncCompare() {} }; } // namespace paddle diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp index 8753057ebf..d31eb0c74f 100644 --- a/paddle/function/MulOpTest.cpp +++ b/paddle/function/MulOpTest.cpp @@ -35,7 +35,7 @@ void testFuncDDDMatrix( size_t heightC = dimM; size_t widthC = dimN; // init Test object - FunctionCompare test( + CpuGpuFuncCompare test( "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb)); // prepare input arguments /// matrix A : HA * WA @@ -81,8 +81,8 @@ void testFuncDSparseDMatrix( size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real scaleT = 1.0; // init Test object - FunctionCompare test("MulOp", - FuncConfig().set("aTrans", false).set("bTrans", false)); + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); // prepare input arguments /// sparse matrix A : M * K test.addInputs(SparseMatrixArg( @@ -126,8 +126,8 @@ void testFuncDDSparseMatrix( size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real scaleT = 1.0; // init Test object - FunctionCompare test("MulOp", - FuncConfig().set("aTrans", false).set("bTrans", false)); + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); // prepare input arguments /// matrix A : M * K test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); @@ -172,8 +172,8 @@ void testFuncSparseDDMatrix( size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real scaleT = 1.0; // init Test object - FunctionCompare test("MulOp", - FuncConfig().set("aTrans", false).set("bTrans", false)); + CpuGpuFuncCompare test( + "MulOp", FuncConfig().set("aTrans", false).set("bTrans", false)); // prepare input arguments /// matrix A : M * K test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK})); diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp index f77ac2a8c4..e286f4e5b8 100644 --- a/paddle/function/PadOpTest.cpp +++ b/paddle/function/PadOpTest.cpp @@ -25,7 +25,7 @@ TEST(Pad, real) { VLOG(3) << " numSamples=" << numSamples << " channels=" << channels << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW; for (bool test_grad : {false, true}) { - FunctionCompare compare( + CpuGpuFuncCompare compare( test_grad ? "PadGrad" : "Pad", FuncConfig() .set>("channel", {2, 3})