From fac96456c241a1a34975453d1db0b8418446fe2c Mon Sep 17 00:00:00 2001 From: xzl Date: Fri, 1 Dec 2017 20:41:34 +0800 Subject: [PATCH 01/35] add prelu neon impl --- paddle/math/Matrix.cpp | 23 +++++++++++++++++++- paddle/math/NEONFunctions.cpp | 40 +++++++++++++++++++++++++++++++++++ paddle/math/NEONFunctions.h | 1 + 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 88e9180690..be87a4c296 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -28,6 +28,7 @@ limitations under the License. */ #include "hl_top_k.h" #include "paddle/utils/Logging.h" +#include "NEONFunctions.h" #include "paddle/function/GemmFunctor.h" #include "paddle/utils/ThreadLocal.h" @@ -4157,16 +4158,36 @@ void CpuMatrix::print(std::ostream& os) const { void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) { real* input = data.getData(); real* w = W.getData(); + real* output = data_; size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); size_t paraSize = W.getHeight() * W.getWidth(); CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; + if (paraSize == numElements) { + for (size_t n = 0; n < numSamples * numElements; ++n) { + output[n] = input[n] > 0 ? input[n] : input[n] * w[n % numElements]; + } + return; + } + +#if defined(__ARM_NEON__) || defined(__ARM_NEON) + for (size_t n = 0; n < numSamples; ++n) { + for (size_t i = 0; i < paraSize; i++) { + neon::prelu( + input + i * partial_sum, w[i], output + i * partial_sum, partial_sum); + } + input = input + numElements; + output = output + numElements; + } +#else for (size_t n = 0, k = 0; n < numSamples; ++n) { for (size_t i = 0; i < numElements; ++i, ++k) { - data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum]; + output[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum]; } } +#endif } void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { diff --git a/paddle/math/NEONFunctions.cpp b/paddle/math/NEONFunctions.cpp index 3bf47901f1..0f83149422 100644 --- a/paddle/math/NEONFunctions.cpp +++ b/paddle/math/NEONFunctions.cpp @@ -49,6 +49,46 @@ void relu(const float* a, float* b, int len) { } } +// b[i] = a[i] > 0.0f ? a[i] : a[i] * w +void prelu(const float* a, float w, float* b, int len) { + int offset = len % 16; + float32x4_t ma0, ma1, ma2, ma3; + + float32x4_t zero = vdupq_n_f32(0.f); + float32x4_t vw = vdupq_n_f32(w); + + for (int k = 0; k < len / 16; k++, a += 16, b += 16) { + ma0 = vld1q_f32(a); + ma1 = vld1q_f32(a + 4); + ma2 = vld1q_f32(a + 8); + ma3 = vld1q_f32(a + 12); + + uint32x4_t flag0 = vcgtq_f32(ma0, zero); + uint32x4_t flag1 = vcgtq_f32(ma1, zero); + uint32x4_t flag2 = vcgtq_f32(ma2, zero); + uint32x4_t flag3 = vcgtq_f32(ma3, zero); + + float32x4_t mul0 = vmulq_f32(ma0, vw); + float32x4_t mul1 = vmulq_f32(ma1, vw); + float32x4_t mul2 = vmulq_f32(ma2, vw); + float32x4_t mul3 = vmulq_f32(ma3, vw); + + ma0 = vbslq_f32(flag0, ma0, mul0); + ma1 = vbslq_f32(flag1, ma1, mul1); + ma2 = vbslq_f32(flag2, ma2, mul2); + ma3 = vbslq_f32(flag3, ma3, mul3); + + vst1q_f32(b, ma0); + vst1q_f32(b + 4, ma1); + vst1q_f32(b + 8, ma2); + vst1q_f32(b + 12, ma3); + } + + for (int i = 0; i < offset; i++) { + b[i] = a[i] > 0.0f ? a[i] : a[i] * w; + } +} + } // namespace neon } // namespace paddle diff --git a/paddle/math/NEONFunctions.h b/paddle/math/NEONFunctions.h index 69085e3335..d67b2f47a8 100644 --- a/paddle/math/NEONFunctions.h +++ b/paddle/math/NEONFunctions.h @@ -18,6 +18,7 @@ namespace paddle { namespace neon { void relu(const float* a, float* b, int len); +void prelu(const float* a, float w, float* b, int len); } // namespace neon } // namespace paddle From ee0d365a3a21ab8881ad88b252a5d117b87bc726 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 5 Dec 2017 11:36:08 +0800 Subject: [PATCH 02/35] add inference benchmark data --- benchmark/IntelOptimizedPaddle.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 16c2390fd3..c275aeb5cb 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -23,6 +23,8 @@ On each machine, we will test and compare the performance of training on single ## Benchmark Model ### Server + +#### Training Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz Input image size - 3 * 224 * 224, Time: images/second @@ -62,6 +64,34 @@ TBD chart on batch size 128 TBD +#### Inference +Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz +- VGG-19 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|-------|-------|-------|-------| +| OpenBLAS | 0.36 | 0.48 | 0.56 | 0.50 | 0.43 | +| MKLML | 5.41 | 9.52 | 14.71 | 20.46 | 29.35 | +| MKL-DNN | 65.52 | 89.94 | 83.92 | 94.77 | 95.78 | + +- ResNet-50 + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|-------|--------|--------|--------|--------| +| OpenBLAS | 0.29 | 0.43 | 0.71 | 0.85 | 0.71 | +| MKLML | 6.26 | 11.88 | 21.37 | 39.67 | 59.01 | +| MKL-DNN | 90.27 | 134.03 | 136.03 | 153.66 | 211.22 | + + +- GoogLeNet + +| BatchSize | 1 | 2 | 4 | 8 | 16 | +|-----------|--------|--------|--------|--------|--------| +| OpenBLAS | 12.47 | 12.36 | 12.25 | 12.13 | 12.08 | +| MKLML | 22.50 | 43.90 | 81.22 | 132.92 | 199.69 | +| MKL-DNN | 221.69 | 341.33 | 428.09 | 528.24 | 624.18 | + + ### Laptop TBD ### Desktop From e1247d8015c43ad9dd6254650d8238fc0cefde8f Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Fri, 8 Dec 2017 13:36:04 +0800 Subject: [PATCH 03/35] Fix compile error in android. --- paddle/math/tests/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt index 215bac1271..dcd2a34583 100644 --- a/paddle/math/tests/CMakeLists.txt +++ b/paddle/math/tests/CMakeLists.txt @@ -34,4 +34,4 @@ add_simple_unittest(test_FPException) add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_Matrix) -cc_test(test_float16 SRCS test_float16.cpp) +add_simple_unittest(test_float16) From 4f1381eac3708ce92b07a01f6cfc9d4131c996af Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 8 Dec 2017 16:20:09 +0800 Subject: [PATCH 04/35] recv_op use serialized program --- paddle/operators/recv_op.cc | 11 +++++++---- paddle/operators/send_recv_op_test.cc | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index c69e416e10..45222f6b76 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -72,8 +72,10 @@ class RecvOp : public framework::OperatorBase { // FIXME(typhoonzero): do not copy framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); - auto *block = Attr("OptimizeBlock"); - auto *program = block->Program(); + std::string program_str = Attr("OptimizeProgram"); + framework::Program program_desc; + program_desc.ParseFromString(program_str); + framework::ProgramDescBind program(program_desc); framework::Executor executor(dev_ctx); // Run sub graph to get optimized tensor executor.Run(*program, &recv_scope, block->ID(), @@ -108,8 +110,9 @@ This operator will recv tensor from send_op "IP address to listen on.") .SetDefault("127.0.0.1:6164") .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); - AddAttr("OptimizeBlock", "type BlockDescBind*", - "optimize network run in server"); + AddAttr( + "OptimizeProgram", "type string", + "Serialized ProgramDesc string for recv to run."); } }; diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index ac03eb3752..c35dc8fa50 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -85,7 +85,7 @@ void StartServerNet() { paddle::framework::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); - attrs.insert({"OptimizeBlock", block}); + attrs.insert({"OptimizeProgram", program.Proto()->SerializeToString()}); recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); From 986ca03ce24f6d84eb9cfaef64b59fda2298823b Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Fri, 8 Dec 2017 19:45:15 +0800 Subject: [PATCH 05/35] update --- paddle/operators/recv_op.cc | 9 ++++----- paddle/operators/send_recv_op_test.cc | 5 ++++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc index 45222f6b76..eed482c1b4 100644 --- a/paddle/operators/recv_op.cc +++ b/paddle/operators/recv_op.cc @@ -73,12 +73,12 @@ class RecvOp : public framework::OperatorBase { framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); std::string program_str = Attr("OptimizeProgram"); - framework::Program program_desc; + framework::ProgramDesc program_desc; program_desc.ParseFromString(program_str); framework::ProgramDescBind program(program_desc); framework::Executor executor(dev_ctx); // Run sub graph to get optimized tensor - executor.Run(*program, &recv_scope, block->ID(), + executor.Run(program, &recv_scope, 0, /*global_block*/ false /*create_local_scope*/); auto *out_var = recv_scope.FindVar("Out"); @@ -110,9 +110,8 @@ This operator will recv tensor from send_op "IP address to listen on.") .SetDefault("127.0.0.1:6164") .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); - AddAttr( - "OptimizeProgram", "type string", - "Serialized ProgramDesc string for recv to run."); + AddAttr("OptimizeProgram", "type string", + "Serialized ProgramDesc string for recv to run."); } }; diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc index c35dc8fa50..3e2e2051af 100644 --- a/paddle/operators/send_recv_op_test.cc +++ b/paddle/operators/send_recv_op_test.cc @@ -85,7 +85,10 @@ void StartServerNet() { paddle::framework::AttributeMap attrs; attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); - attrs.insert({"OptimizeProgram", program.Proto()->SerializeToString()}); + std::string program_proto; + PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto)); + + attrs.insert({"OptimizeProgram", program_proto}); recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, {{"Out", {"Out"}}}, attrs); paddle::platform::CPUDeviceContext ctx(place); From 3a222a4dcf4e3bf05c2c24bac76e9551144e4fcb Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sat, 9 Dec 2017 12:39:19 +0800 Subject: [PATCH 06/35] add release note --- RELEASE.cn.md | 36 ++++++++++++++++++++++++++ RELEASE.md | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) diff --git a/RELEASE.cn.md b/RELEASE.cn.md index 5deaf230a8..a6531061af 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,3 +1,39 @@ +# Release v0.11.0 + +## Fluid Python API + +- PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- 在v0.11.0版本中,我们提供了一个C++类`Executor`用于运行一个Fluid程序。Executor类似一个解释器。在未来的版本中,我们将提升和优化Executor成为一个调试器,就像GDB。并可能提供一些编译器,这个编译器会读取一个上文所描述的应用然后编译成一个等价的 +源代码,这个源代码可以被nvcc编译成可以使用CUDA的二进制,或者被icc编译成可以充分利用Intel CPU的二进制。 + + +## 新特点 + +* 发布 `Fluid` API。 +* 增加了用于模型预测的C-API。 +* 用Fluid API实现了一个简单的GAN的例子。 +* 增加了关于性能调优的文档。 +* 为`paddle.v2.dataset`下载数据集提供了重试机制. +* C++中使用protobuf-lite替换protobuf减少了二进制的大小。 +* 发布了新特性 [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment). +* 基于Bazel API利用cmake实现了一个的新的构建系统函数库。 +* 当使用编译选项`WITH_MKL=ON`时自动下载和编译Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) 函数库. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - 完成了 11个 MKL-DNN 层: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN。 + - 完成了 3个 MKL-DNN 网络: VGG-19, ResNet-50, GoogleNet + - 基于Intel Skylake 6148 CPU的[性能测试](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) : 相对于MKLML有2~3倍的训练加速。 +* 增加 [softsign activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign) +* 增加 [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod) +* 增加 [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance) +* 增加 [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq) +* 增加 [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score) +* 增加 [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice) +* 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* 增加移动端友好的网页 + # v0.10.0版本 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 diff --git a/RELEASE.md b/RELEASE.md index 146f7afa7d..d6aaa341a2 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,3 +1,75 @@ +# Release v0.11.0 + +## Fluid Python API + +- Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is + designed to allow users to program like PyTorch and TensorFlow Eager Execution. + In these systems, there is no longer the concept *model* and applications + do not include a symbolic description of a graph of operators nor a sequence + of layers. Instead, applications look exactly like a usual program that + describes a process of training or inference. The difference between + Fluid and PyTorch or Eager Execution is that Fluid doesn't rely on Python's + control-flow, `if-then-else` nor `for`. Instead, Fluid provides its + C++ implementations and their Python binding using the `with` statement. For an example + + https://github.com/PaddlePaddle/Paddle/blob/3df78ed2a98d37f7ae6725894cc7514effd5664b/python/paddle/v2/fluid/tests/test_while_op.py#L36-L44 + +- In 0.11.0, we provides a C++ class `Executor` to run a Fluid program. +Executor works like an interpreter. In future version, we will improve +`Executor` into a debugger like GDB, and we might provide some compilers, +which, for example, takes an application like the above one, and outputs +an equivalent C++ source program, which can be compiled using +[`nvcc`](http://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/index.html) +to generate binaries that use CUDA, or using +[`icc`](https://software.intel.com/en-us/c-compilers) to generate binaries +that make full use of Intel CPUs. + +## New Features + +* Release `Fluid` API. +* Add C-API for model inference +* Use fluid API to create a simple GAN demo. +* Add develop guide about performance tunning. +* Add retry when download `paddle.v2.dataset`. +* Linking protobuf-lite not protobuf in C++. Reduce the binary size. +* Feature [Elastic Deep Learning (EDL)](https://github.com/PaddlePaddle/cloud/tree/develop/doc/autoscale/experiment) released. +* A new style cmake functions for Paddle. It is based on Bazel API. +* Automatically download and compile with Intel® [MKLML](https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz) library as CBLAS when build `WITH_MKL=ON`. +* [Intel® MKL-DNN on PaddlePaddle](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/mkldnn): + - Complete 11 MKL-DNN layers: Convolution, Fully connectivity, Pooling, ReLU, Tanh, ELU, Softmax, BatchNorm, AddTo, Concat, LRN. + - Complete 3 MKL-DNN networks: VGG-19, ResNet-50, GoogleNet + - [Benchmark](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md) on Intel Skylake 6148 CPU: 2~3x training speedup compared with MKLML. +* Add the [`softsign` activation](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/activation.html#softsign). +* Add the [dot product layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#dot-prod). +* Add the [L2 distance layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#l2-distance). +* Add the [sub-nested sequence layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#sub-nested-seq). +* Add the [kmax sequence score layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#kmax-sequence-score). +* Add the [sequence slice layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#seq-slice). +* Add the [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) +* Add mobile friendly webpages. + +## Improvements + +* Build and install using a single `whl` package. +* [Custom evaluating in V2 API](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标). +* Change `PADDLE_ONLY_CPU` to `PADDLE_WITH_GPU`, since we will support many kinds of devices. +* Remove buggy BarrierStat. +* Clean and remove unused functions in paddle::Parameter. +* Remove ProtoDataProvider. +* Huber loss supports both regression and classification. +* Add the `stride` parameter for sequence pooling layers. +* Enable v2 API use cudnn batch normalization automatically. +* The BN layer's parameter can be shared by a fixed the parameter name. +* Support variable-dimension input feature for 2D convolution operation. +* Refine cmake about CUDA to automatically detect GPU architecture. +* Improved website navigation. + +## Bug Fixes + +* Fix bug in ROI pooling. cc9a761 +* Fix AUC is zero when label is dense vector. #5274 +* Fix bug in WarpCTC layer. + # Release v0.10.0 We are glad to release version 0.10.0. In this version, we are happy to release the new From 5d4f9fb32de4122194e3e343022a894f0aa2ad4e Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sat, 9 Dec 2017 12:57:12 +0800 Subject: [PATCH 07/35] add some content --- RELEASE.cn.md | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/RELEASE.cn.md b/RELEASE.cn.md index a6531061af..df273cf7b7 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,4 +1,4 @@ -# Release v0.11.0 +# v0.11.0版本 ## Fluid Python API @@ -34,6 +34,29 @@ * 增加 [row convolution layer](http://www.paddlepaddle.org/docs/develop/documentation/zh/api/v2/config/layer.html#row-conv) * 增加移动端友好的网页 +## 改进 + +* 使用一个Python`whl`包即可安装. +* [V2 API可以实现用户定制化评估](https://github.com/PaddlePaddle/models/tree/develop/ltr#训练过程中输出自定义评估指标)。 +* 将 `PADDLE_ONLY_CPU` 改为 `PADDLE_WITH_GPU`, 因为我们会支持多种设备。 +* 删除了有一些bug的BarrierStat。 +* 清理和删除了paddle::Parameter中未使用的函数。 +* 删除了ProtoDataProvider。 +* Huber loss同时支持回归和分类。 +* 为sequence pooling 层增加`stride`参数。 +* v2 API自动使用cudnn batch normalization。 +* 可以使用一个固定的参数名共享BN层的参数。 +* 2D convolution operation支持variable-dimension input特性。 +* 重构cmake中关于CUDA的部分并实现自动检测GPU架构的功能。 +* 优化网页导航。 + +## 错误修复 + +* 修复ROI pooling的Bug. cc9a761 +* 修复当label是dense vector是AUC变成0的问题. #5274 +* 修复WarpCTC 层的Bug. + + # v0.10.0版本 我们非常高兴发布了PaddlePaddle V0.10.0版,并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。 From acab5e656a181efd2f8fe92e8c01e193dc098e78 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 10 Dec 2017 12:41:10 +0800 Subject: [PATCH 08/35] update v0.11.0 release note --- RELEASE.cn.md | 4 ++-- RELEASE.md | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/RELEASE.cn.md b/RELEASE.cn.md index df273cf7b7..494c59730d 100644 --- a/RELEASE.cn.md +++ b/RELEASE.cn.md @@ -1,6 +1,6 @@ # v0.11.0版本 -## Fluid Python API +## PaddlePaddle Fluid - PaddlePaddle发布版本v0.11.0包含一个新的特性*PaddlePaddle Fluid*. Fluid 是设计用来让用户像Pytorch和Tensorflow Eager Execution一样执行程序。在这些系统中,不再有*模型*这个概念,应用也不再包含一个用于描述Operator图或者一系列层的符号描述,而是像通用程序那样描述训练或者预测的过程。而Fluid与PyTorch或Eager Execution的区别在于Fluid不依赖Python提供的控制流,例如 if-else-then或者for,而是提供了基于C++实现的控制流并暴露了对应的用with语法实现的Python接口。例如: @@ -12,7 +12,7 @@ ## 新特点 -* 发布 `Fluid` API。 +* 发布 `PaddlePaddle Fluid`。 * 增加了用于模型预测的C-API。 * 用Fluid API实现了一个简单的GAN的例子。 * 增加了关于性能调优的文档。 diff --git a/RELEASE.md b/RELEASE.md index d6aaa341a2..7c3c04ef9f 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,6 +1,6 @@ # Release v0.11.0 -## Fluid Python API +## PaddlePaddle Fluid - Release 0.11.0 includes a new feature *PaddlePaddle Fluid*. Fluid is designed to allow users to program like PyTorch and TensorFlow Eager Execution. From 2e65df17295750bc73f02cf421272918ee97cb98 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Sun, 10 Dec 2017 12:43:03 +0800 Subject: [PATCH 09/35] change Fluid description --- RELEASE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/RELEASE.md b/RELEASE.md index 7c3c04ef9f..5a62c95513 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -26,7 +26,7 @@ that make full use of Intel CPUs. ## New Features -* Release `Fluid` API. +* Release `PaddlePaddle Fluid`. * Add C-API for model inference * Use fluid API to create a simple GAN demo. * Add develop guide about performance tunning. From 578ad6d23251c0fc08cbf49aa1d6bf9daae55a88 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 11 Dec 2017 11:21:47 +0800 Subject: [PATCH 10/35] Use PADDLE_WITH_NATIVE_FP16 for float16_t. --- paddle/math/float16.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/math/float16.h b/paddle/math/float16.h index f805cad08b..76ad3a0123 100644 --- a/paddle/math/float16.h +++ b/paddle/math/float16.h @@ -101,7 +101,7 @@ public: half tmp = __float2half(val); x = *reinterpret_cast(&tmp); -#elif defined(PADDLE_NEON) +#elif defined(PADDLE_WITH_NATIVE_FP16) float32x4_t tmp = vld1q_dup_f32(&val); float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); x = *reinterpret_cast(&res); @@ -252,7 +252,7 @@ public: half tmp = *reinterpret_cast(this); return __half2float(tmp); -#elif defined(PADDLE_NEON) +#elif defined(PADDLE_WITH_NATIVE_FP16) float16x4_t res = vld1_dup_f16(reinterpret_cast(this)); return vgetq_lane_f32(vcvt_f32_f16(res), 0); From 3ba75c7c8225cfeb001b2d85b6dc062d08e9b630 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 11 Dec 2017 14:28:44 +0800 Subject: [PATCH 11/35] update inference benchmark data --- benchmark/IntelOptimizedPaddle.md | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index c275aeb5cb..9c884044e6 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -70,26 +70,26 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz | BatchSize | 1 | 2 | 4 | 8 | 16 | |-----------|-------|-------|-------|-------|-------| -| OpenBLAS | 0.36 | 0.48 | 0.56 | 0.50 | 0.43 | -| MKLML | 5.41 | 9.52 | 14.71 | 20.46 | 29.35 | -| MKL-DNN | 65.52 | 89.94 | 83.92 | 94.77 | 95.78 | +| OpenBLAS | 1.07 | 1.08 | 1.06 | 0.88 | 0.65 | +| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 | +| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 | - ResNet-50 | BatchSize | 1 | 2 | 4 | 8 | 16 | |-----------|-------|--------|--------|--------|--------| -| OpenBLAS | 0.29 | 0.43 | 0.71 | 0.85 | 0.71 | -| MKLML | 6.26 | 11.88 | 21.37 | 39.67 | 59.01 | -| MKL-DNN | 90.27 | 134.03 | 136.03 | 153.66 | 211.22 | +| OpenBLAS | 3.35 | 3.19 | 3.09 | 2.55 | 1.96 | +| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 | +| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 | -- GoogLeNet +- GoogLeNet | BatchSize | 1 | 2 | 4 | 8 | 16 | |-----------|--------|--------|--------|--------|--------| -| OpenBLAS | 12.47 | 12.36 | 12.25 | 12.13 | 12.08 | -| MKLML | 22.50 | 43.90 | 81.22 | 132.92 | 199.69 | -| MKL-DNN | 221.69 | 341.33 | 428.09 | 528.24 | 624.18 | +| OpenBLAS | 12.04 | 11.31 | 10.00 | 9.07 | 4.34 | +| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 | +| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 | ### Laptop From 95924686096556d959e67b294e146153ac3b0dfb Mon Sep 17 00:00:00 2001 From: Yiqun Liu Date: Mon, 11 Dec 2017 16:12:36 +0800 Subject: [PATCH 12/35] Fix gcc4.9 (#6442) * Fix compiling error of gcc4.9. * Refine the check of cxx compiler flags in api/CMakeLists.txt. --- paddle/api/CMakeLists.txt | 12 +++- paddle/framework/backward.cc | 18 +++--- paddle/framework/backward_test.cc | 70 ++++++++++++++---------- paddle/framework/op_desc.cc | 4 +- paddle/framework/operator_test.cc | 4 +- paddle/framework/prune_test.cc | 44 ++++++++++----- paddle/operators/conditional_block_op.cc | 6 +- paddle/operators/net_op.h | 5 +- paddle/operators/net_op_test.cc | 16 +++--- paddle/operators/recurrent_op.cc | 5 +- paddle/operators/while_op.cc | 5 +- 11 files changed, 118 insertions(+), 71 deletions(-) diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index d6b8464100..cf84568ecd 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -25,8 +25,18 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py) SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) +SET(SWIG_NEED_FLAGS + -ftls-model=global-dynamic + -Wno-parentheses-equality + -Wno-self-assign + -Wno-maybe-uninitialized + -Wno-missing-field-initializers) + FOREACH(flag ${SWIG_NEED_FLAGS}) + safe_set_cxxflag(SWIG_CXX_FLAGS ${flag}) +ENDFOREACH() + SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic") +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SWIG_CXX_FLAGS}") SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS paddle_parameter diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index 7294ba1a9c..a17036c652 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -190,8 +190,9 @@ static std::unique_ptr BackwardRecursive( // collect all the offset for each alias, // insert a sum operator to add all aliases to output insert_position.push_back( - {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, - {{"Out", {name}}}, {})}); + {dup_op.back(), + OpRegistry::CreateOp("sum", {{"X", dup_outputs}}, {{"Out", {name}}}, + AttributeMap{})}); } // make sure the inserted `sum` ops follow the BFS order. @@ -216,7 +217,8 @@ static std::unique_ptr BackwardRecursive( // If part of input gradient of that operator is not calculated, fill // zero variables to that input gradient. net->AppendOp(OpRegistry::CreateOp("fill_zeros_like", {{"X", {prefix}}}, - {{"Y", {grad_input}}}, {})); + {{"Y", {grad_input}}}, + AttributeMap{})); } return false; }); @@ -392,8 +394,9 @@ std::vector> MakeOpGrad( 0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1); std::string new_name = prefix + kZeroVarSuffix; desc->Rename(in_name, new_name); - std::unique_ptr fill_zeros_op(new OpDescBind( - "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {})); + std::unique_ptr fill_zeros_op( + new OpDescBind("fill_zeros_like", {{"X", {prefix}}}, + {{"Y", {new_name}}}, AttributeMap{})); pending_fill_zeros_ops.push_back(std::move(fill_zeros_op)); } } @@ -483,8 +486,9 @@ std::vector> MakeBlockBackward( sum_op_inputs.emplace_back(new_name); next_g_name = sum_op_inputs.back(); } - std::unique_ptr sum_op(new OpDescBind( - "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {})); + std::unique_ptr sum_op( + new OpDescBind("sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, + AttributeMap{})); pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)}); } } diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc index 2b858f5ea0..9fe49881d5 100644 --- a/paddle/framework/backward_test.cc +++ b/paddle/framework/backward_test.cc @@ -106,15 +106,15 @@ class FcOp : public operators::NetOp { FcOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs) : NetOp(type, inputs, outputs, attrs) { - AppendOp(OpRegistry::CreateOp("mul", - {{"X", {Input("X")}}, {"Y", {Input("W")}}}, - {{"Out", {Output("mul_result")}}}, {})); + AppendOp(OpRegistry::CreateOp( + "mul", {{"X", {Input("X")}}, {"Y", {Input("W")}}}, + {{"Out", {Output("mul_result")}}}, AttributeMap{})); auto input_b = Inputs("b"); std::string before_act = "mul_result"; if (input_b.size() != 0) { AppendOp(OpRegistry::CreateOp( "rowwise_add", {{"X", {Output("mul_result")}}, {"b", {input_b[0]}}}, - {{"Out", {Output("add_result")}}}, {})); + {{"Out", {Output("add_result")}}}, AttributeMap{})); before_act = "add_result"; } else { auto out_varname = Output("add_result"); @@ -124,7 +124,7 @@ class FcOp : public operators::NetOp { } AppendOp(OpRegistry::CreateOp("sigmoid", {{"X", {Output(before_act)}}}, - {{"Out", {Output("Out")}}}, {})); + {{"Out", {Output("Out")}}}, AttributeMap{})); CompleteAddOp(false); } }; @@ -278,8 +278,9 @@ REGISTER_OPERATOR(scale, f::NoneOp); REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel); TEST(Backward, simple_op_not_need_grad) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); ASSERT_NE(fwd, nullptr); auto gop = f::Backward(*fwd, {"x"}); ASSERT_EQ(gop->Output(f::GradVarName("X")), f::kEmptyVarName); @@ -296,9 +297,10 @@ TEST(Backward, net_fc_backward_normal) { {{"mul_result", {"mul_res"}}, {"add_result", {"add_re"}}, {"Out", {"out"}}}, - {}); + f::AttributeMap{}); ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = f::Backward(*fwd, {}); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); ASSERT_TRUE(gop->IsNetOp()); auto net = static_cast(gop.get()); @@ -322,9 +324,10 @@ TEST(Backward, net_fc_backward_not_have_b) { {{"mul_result", {"mul_res"}}, {"add_result", {"add_res"}}, {"Out", {"tmp"}}}, - {}); + f::AttributeMap{}); ASSERT_NE(fwd, nullptr); - std::shared_ptr gop = f::Backward(*fwd, {}); + std::shared_ptr gop = + f::Backward(*fwd, std::unordered_set{}); ASSERT_TRUE(gop->IsNetOp()); auto net = static_cast(gop.get()); @@ -346,13 +349,13 @@ TEST(Backward, net_input_of_network_not_need_grad) { {{"mul_result", {"mul_tmp_0"}}, {"add_result", {"add_tmp_0"}}, {"Out", {"hidden0"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"hidden0"}}, {"W", {"W2"}}, {"b", {"b2"}}}, {{"mul_result", {"mul_tmp_1"}}, {"add_result", {"add_tmp_1"}}, {"Out", {"hidden1"}}}, - {})); + f::AttributeMap{})); net.CompleteAddOp(); auto bwd = Backward(net, {"x"}); // x@GRAD is not need. ASSERT_TRUE(bwd->IsNetOp()); @@ -381,12 +384,13 @@ TEST(Backward, net_input_of_network_not_need_grad) { TEST(Backward, net_shared_weight) { ops::NetOp net; net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"x"}}, {"Y", {"w"}}}, - {{"Out", {"out"}}}, {})); + {{"Out", {"out"}}}, f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp("mul", {{"X", {"out"}}, {"Y", {"w"}}}, - {{"Out", {"FinalOut"}}}, {})); + {{"Out", {"FinalOut"}}}, + f::AttributeMap{})); net.CompleteAddOp(); - auto bwd = f::Backward(net, {}); + auto bwd = f::Backward(net, std::unordered_set{}); ASSERT_TRUE(bwd->IsNetOp()); auto bwd_net = static_cast(bwd.get()); ASSERT_EQ(3UL, bwd_net->ops_.size()); @@ -394,8 +398,9 @@ TEST(Backward, net_shared_weight) { } TEST(Backward, op_all_input_are_not_need) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"x", "b"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -403,8 +408,9 @@ TEST(Backward, op_all_input_are_not_need) { } TEST(Backward, op_all_output_are_not_need) { - auto fwd = f::OpRegistry::CreateOp( - "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"out"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -412,8 +418,9 @@ TEST(Backward, op_all_output_are_not_need) { } TEST(Backward, op_part_of_output_are_not_need) { - auto fwd = f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, - {{"y", {"Y"}}, {"z", {"Z"}}}, {}); + auto fwd = + f::OpRegistry::CreateOp("many_output_op", {{"x", {"X"}}}, + {{"y", {"Y"}}, {"z", {"Z"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"Z"}); ASSERT_TRUE(backward->IsNetOp()); auto net = static_cast(backward.get()); @@ -437,7 +444,7 @@ TEST(Backward, op_part_of_output_are_not_need) { TEST(Backward, op_part_of_input_are_not_need) { auto fwd = f::OpRegistry::CreateOp("mul", {{"X", {"a"}}, {"Y", {"b"}}}, - {{"Out", {"out"}}}, {}); + {{"Out", {"out"}}}, f::AttributeMap{}); auto backward = f::Backward(*fwd, {"a"}); auto &grad_mul = *backward; ASSERT_EQ(grad_mul.Type(), "mul_grad"); @@ -458,19 +465,19 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) { {{"mul_result", {"mul_out1"}}, {"add_result", {"add_out1"}}, {"Out", {"out1"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"out1"}}, {"W", {"w2"}}, {"b", {"b2"}}}, {{"mul_result", {"mul_out2"}}, {"add_result", {"tmp_out2"}}, {"Out", {"out2"}}}, - {})); + f::AttributeMap{})); net.AppendOp(f::OpRegistry::CreateOp( "fc", {{"X", {"out2"}}, {"W", {"w3"}}, {"b", {"b3"}}}, {{"mul_result", {"mul_out3"}}, {"add_result", {"tmp_out3"}}, {"Out", {"out3"}}}, - {})); + f::AttributeMap{})); net.CompleteAddOp(); auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"}); @@ -509,7 +516,8 @@ TEST(Backward, simple_single_op) { auto target = f::VarDescBind("out"); target.SetShape({1}); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); f::OpDescBind *fill_op = block->AllOps()[1]; @@ -546,7 +554,7 @@ TEST(Backward, default_attribute) { auto target = f::VarDescBind("out"); target.SetShape({1}); - AppendBackward(program, target, {}); + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 3UL); EXPECT_EQ(boost::get(op->GetAttr("x_num_col_dims")), 1); @@ -585,7 +593,8 @@ TEST(Backward, simple_mult_op) { auto target = f::VarDescBind("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 6UL + 1); f::OpDescBind *fill_op = block->AllOps()[forward_len]; @@ -817,7 +826,8 @@ TEST(Backward, shared_var) { auto target = f::VarDescBind("out3"); target.SetShape({1}); size_t forward_len = block->AllOps().size(); - auto var_to_grad = AppendBackward(program, target, {}); + auto var_to_grad = + AppendBackward(program, target, std::unordered_set{}); ASSERT_EQ(block->AllOps().size(), 8UL); f::OpDescBind *fill_op = block->AllOps()[forward_len]; diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index cde3f1ac2e..7ba1e3e4e3 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -316,8 +316,8 @@ static void InitInferShapeFuncs() { for (auto &kern_pair : OperatorWithKernel::AllOpKernels()) { auto op_type = kern_pair.first; auto &op_info = info_map.at(op_type); - auto op = - static_cast(op_info.Creator()("", {}, {}, {})); + auto op = static_cast(op_info.Creator()( + "", VariableNameMap{}, VariableNameMap{}, AttributeMap{})); if (op_info.infer_shape_) { // infer_shape has been registered. continue; } diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 1e19f82b34..59ddbc7791 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -261,7 +261,9 @@ class OperatorClone : public paddle::framework::OperatorBase { }; TEST(Operator, Clone) { - OperatorClone a("ABC", {}, {}, {}); + OperatorClone a("ABC", paddle::framework::VariableNameMap{}, + paddle::framework::VariableNameMap{}, + paddle::framework::AttributeMap{}); auto b = a.Clone(); ASSERT_EQ(a.Type(), b->Type()); } diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc index 5988874809..f21df37a29 100644 --- a/paddle/framework/prune_test.cc +++ b/paddle/framework/prune_test.cc @@ -54,7 +54,8 @@ TEST(Prune, one_operator) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block); + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); f::ProgramDesc pruned; @@ -71,10 +72,14 @@ TEST(Prune, forward) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, {}, block); - AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, {}, block); + AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); @@ -90,11 +95,14 @@ TEST(Prune, multi_input_op) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, {}, block); - AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, {}, block); - AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, {}, + AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, f::AttributeMap{}, + block); + AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, + f::AttributeMap{}, block); f::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true); @@ -108,9 +116,12 @@ TEST(Prune, multi_output_op) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block); + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true); @@ -124,9 +135,12 @@ TEST(Prune, multi_target) { f::ProgramDescBind program; f::BlockDescBind *block = program.MutableBlock(0); - AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block); - AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block); - AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block); + AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, + f::AttributeMap{}, block); + AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, f::AttributeMap{}, + block); + AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, f::AttributeMap{}, + block); f::ProgramDesc *pdesc = program.Proto(); pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true); diff --git a/paddle/operators/conditional_block_op.cc b/paddle/operators/conditional_block_op.cc index d5b124682d..03c58a7eab 100644 --- a/paddle/operators/conditional_block_op.cc +++ b/paddle/operators/conditional_block_op.cc @@ -142,9 +142,9 @@ class ConditionalBlockGradOp : public ConditionalOp { continue; } auto new_in_grad_name = cur_scope.Rename(in_grad_name); - auto assign = - framework::OpRegistry::CreateOp("assign", {{"X", {new_in_grad_name}}}, - {{"Out", {out_grad_name}}}, {}); + auto assign = framework::OpRegistry::CreateOp( + "assign", {{"X", {new_in_grad_name}}}, {{"Out", {out_grad_name}}}, + framework::AttributeMap{}); assign->Run(cur_scope, dev_ctx); cur_scope.Rename(new_in_grad_name, in_grad_name); } diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h index ebeb262d96..8935751f15 100644 --- a/paddle/operators/net_op.h +++ b/paddle/operators/net_op.h @@ -38,7 +38,10 @@ namespace operators { class NetOp : public framework::OperatorBase { public: static const char kAll[]; - NetOp() : framework::OperatorBase("plain_net", {}, {}, {}) {} + NetOp() + : framework::OperatorBase("plain_net", framework::VariableNameMap{}, + framework::VariableNameMap{}, + framework::AttributeMap{}) {} NetOp(const std::string& type, const framework::VariableNameMap& inputs, const framework::VariableNameMap& outputs, diff --git a/paddle/operators/net_op_test.cc b/paddle/operators/net_op_test.cc index 63bebd5b44..22fba9568d 100644 --- a/paddle/operators/net_op_test.cc +++ b/paddle/operators/net_op_test.cc @@ -38,10 +38,10 @@ TEST(OpKernel, all) { net->AppendOp(std::unique_ptr( new TestOp("test", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, {}))); + {{"Out", {"y"}}}, framework::AttributeMap{}))); net->AppendOp(std::unique_ptr( new TestOp("test", {{"X", {"y"}}, {"W", {"w2"}}, {"b", {"b2"}}}, - {{"Out", {"z"}}}, {}))); + {{"Out", {"z"}}}, framework::AttributeMap{}))); net->CompleteAddOp(); AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, @@ -58,7 +58,7 @@ TEST(NetOp, insert_op) { NetOp net; auto op1 = std::unique_ptr( new framework::NOP("empty", {{"X", {"x"}}, {"W", {"w1"}}, {"b", {"b1"}}}, - {{"Out", {"y"}}}, {})); + {{"Out", {"y"}}}, framework::AttributeMap{})); net.AppendOp(*op1); net.InsertOp(0, *op1); ASSERT_EQ(2UL, net.ops_.size()); @@ -68,10 +68,12 @@ TEST(NetOp, insert_op) { TEST(NetOp, Clone) { NetOp net; - net.AppendOp( - std::unique_ptr(new framework::NOP{"empty", {}, {}, {}})); - net.AppendOp(std::unique_ptr( - new framework::NOP{"empty2", {}, {}, {}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); + net.AppendOp(std::unique_ptr(new framework::NOP{ + "empty2", framework::VariableNameMap{}, framework::VariableNameMap{}, + framework::AttributeMap{}})); net.CompleteAddOp(true); auto new_net_op = net.Clone(); ASSERT_NE(new_net_op, nullptr); diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index 8b60b9c912..29f9163643 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -408,7 +408,8 @@ class RecurrentGradOp : public RecurrentBase { attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); + "fill_constant", framework::VariableNameMap{}, + {{"Out", {pg_names[param_id]}}}, attrs); zero_op->Run(scope, dev_ctx); } @@ -417,7 +418,7 @@ class RecurrentGradOp : public RecurrentBase { auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, {}); + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); sum_op->Run(cur_scope, dev_ctx); cur_scope.Rename(new_inside_name, inside_grad_name); diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 9b3f21cf94..b8e44bcc5a 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -187,7 +187,8 @@ class WhileGradOp : public framework::OperatorBase { attrs["value"] = 0.0f; auto zero_op = framework::OpRegistry::CreateOp( - "fill_constant", {}, {{"Out", {pg_names[param_id]}}}, attrs); + "fill_constant", framework::VariableNameMap{}, + {{"Out", {pg_names[param_id]}}}, attrs); zero_op->Run(scope, dev_ctx); } } @@ -195,7 +196,7 @@ class WhileGradOp : public framework::OperatorBase { auto new_inside_name = cur_scope.Rename(inside_grad_name); auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, - {{"Out", {pg_names[param_id]}}}, {}); + {{"Out", {pg_names[param_id]}}}, framework::AttributeMap{}); sum_op->Run(cur_scope, dev_ctx); cur_scope.Rename(new_inside_name, inside_grad_name); } From 8d428bd9b89ec59dfcf16eb9e319a9106415b766 Mon Sep 17 00:00:00 2001 From: ranqiu Date: Mon, 4 Dec 2017 19:28:12 +0800 Subject: [PATCH 13/35] Update annotations of layers.py --- .../paddle/trainer_config_helpers/layers.py | 121 ++++++++++-------- 1 file changed, 66 insertions(+), 55 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 4bd94861af..48858f4c34 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1516,34 +1516,33 @@ def lstmemory(input, NOTE: This is a low level user interface. You can use network.simple_lstm to config a simple plain lstm layer. - Please refer to **Generating Sequences With Recurrent Neural Networks** for - more details about LSTM. - - Link_ goes as below. - - .. _Link: http://arxiv.org/abs/1308.0850 + Reference: + `Generating Sequences With Recurrent Neural Networks + `_ - :param name: The lstmemory layer name. + :param name: The name of this layer. It is optional. :type name: basestring - :param size: DEPRECATED. size of the lstm cell + :param size: DEPRECATED. The dimension of the lstm cell. :type size: int :param input: The input of this layer. :type input: LayerOutput - :param reverse: is sequence process reversed or not. + :param reverse: Whether the input sequence is processed in a reverse order. :type reverse: bool :param act: Activation type. TanhActivation is the default activation. :type act: BaseActivation - :param gate_act: gate activation type, SigmoidActivation by default. + :param gate_act: Activation type of this layer's gates. SigmoidActivation is the + default activation. :type gate_act: BaseActivation - :param state_act: state activation type, TanhActivation by default. + :param state_act: Activation type of the state. TanhActivation is the default activation. :type state_act: BaseActivation :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: Parameter Attribute. - :type param_attr: ParameterAttribute | None | False - :param layer_attr: Extra Layer attribute + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1632,14 +1631,14 @@ def grumemory(input, h_t = (1 - z_t) h_{t-1} + z_t {\\tilde{h_t}} NOTE: In PaddlePaddle's implementation, the multiplication operations - :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not computed in - gate_recurrent layer. Consequently, an additional mixed_layer with + :math:`W_{r}x_{t}`, :math:`W_{z}x_{t}` and :math:`W x_t` are not performed + in gate_recurrent layer. Consequently, an additional mixed_layer with full_matrix_projection or a fc_layer must be included before grumemory is called. - More details can be found by referring to `Empirical Evaluation of Gated - Recurrent Neural Networks on Sequence Modeling. - `_ + Reference: + `Empirical Evaluation of Gated Recurrent Neural Networks on Sequence Modeling + `_ The simple usage is: @@ -1647,28 +1646,29 @@ def grumemory(input, gru = grumemory(input) - :param name: The gru layer name. - :type name: None | basestring + :param name: The name of this layer. It is optional. + :type name: basestring :param input: The input of this layer. :type input: LayerOutput. - :param size: DEPRECATED. size of the gru cell + :param size: DEPRECATED. The dimension of the gru cell. :type size: int - :param reverse: Whether sequence process is reversed or not. + :param reverse: Whether the input sequence is processed in a reverse order. :type reverse: bool :param act: Activation type, TanhActivation is the default. This activation affects the :math:`{\\tilde{h_t}}`. :type act: BaseActivation - :param gate_act: gate activation type, SigmoidActivation by default. - This activation affects the :math:`z_t` and :math:`r_t`. It is the - :math:`\\sigma` in the above formula. + :param gate_act: Activation type of this layer's two gates. SigmoidActivation is + the default activation. This activation affects the :math:`z_t` + and :math:`r_t`. It is the :math:`\\sigma` in the above formula. :type gate_act: BaseActivation :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param param_attr: Parameter Attribute. - :type param_attr: ParameterAttribute | None | False - :param layer_attr: Extra Layer attribute + :param param_attr: The parameter attribute. See ParameterAttribute for details. + :type param_attr: ParameterAttribute + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute | None :return: LayerOutput object. :rtype: LayerOutput @@ -1712,10 +1712,10 @@ def last_seq(input, """ Get Last Timestamp Activation of a sequence. - If stride > 0, this layer slides a window whose size is determined by stride, - and return the last value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value - of stride is -1. + If stride > 0, this layer will slide a window whose size is determined by stride, + and return the last value of the sequence in the window as the output. Thus, a + long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The simple usage is: @@ -1724,14 +1724,16 @@ def last_seq(input, seq = last_seq(input=layer) :param agg_level: Aggregated level + :type agg_level: AggregateLevel :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput :param stride: The step size between successive pooling regions. - :type stride: Int - :param layer_attr: extra layer attributes. - :type layer_attr: ExtraLayerAttribute. + :type stride: int + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput """ @@ -1768,10 +1770,10 @@ def first_seq(input, """ Get First Timestamp Activation of a sequence. - If stride > 0, this layer slides a window whose size is determined by stride, - and return the first value of the window as the output. Thus, a long sequence - will be shorten. Note that for sequence with sub-sequence, the default value - of stride is -1. + If stride > 0, this layer will slide a window whose size is determined by stride, + and return the first value of the sequence in the window as the output. Thus, a + long sequence will be shortened. Note that for sequence with sub-sequence, the + default value of stride is -1. The simple usage is: @@ -1780,13 +1782,15 @@ def first_seq(input, seq = first_seq(input=layer) :param agg_level: aggregation level + :type agg_level: AggregateLevel :param name: The name of this layer. It is optional. :type name: basestring :param input: The input of this layer. :type input: LayerOutput :param stride: The step size between successive pooling regions. - :type stride: Int - :param layer_attr: extra layer attributes. + :type stride: int + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -1844,8 +1848,8 @@ def expand_layer(input, expand_level=ExpandLevel.FROM_NO_SEQUENCE, layer_attr=None): """ - A layer for "Expand Dense data or (sequence data where the length of each - sequence is one) to sequence data." + A layer for expanding dense data or (sequence data where the length of each + sequence is one) to sequence data. The example usage is: @@ -1857,7 +1861,9 @@ def expand_layer(input, :param input: The input of this layer. :type input: LayerOutput - :param expand_as: Expand as this layer's sequence info. + :param expand_as: Expand the input according to this layer's sequence infomation. And + after the operation, the input expanded will have the same number of + elememts as this layer. :type expand_as: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring @@ -1865,9 +1871,10 @@ def expand_layer(input, whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any - :param expand_level: whether input layer is timestep(default) or sequence. + :param expand_level: Whether the input layer is a sequence or the element of a sequence. :type expand_level: ExpandLevel - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -3294,7 +3301,7 @@ def row_l2_norm_layer(input, name=None, layer_attr=None): A layer for L2-normalization in each row. .. math:: - out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}} + out[i] = \\frac{in[i]} {\\sqrt{\\sum_{k=1}^N in[k]^{2}}} where the size of :math:`in` is (batchSize x dataDim) , and the size of :math:`out` is a (batchSize x dataDim) . @@ -6161,9 +6168,11 @@ def huber_regression_cost(input, Given a prediction f(x), a label y and :math:`\delta`, the loss function is defined as: - .. math: - loss = 0.5*\left ( y-f(x) \right )^2, \left | y-f(x) \right |\leq \delta - loss = \delta \left | y-f(x) \right |-0.5\delta ^2, otherwise + .. math:: + + loss = 0.5*(y-f(x))^{2}, | y-f(x) | < \delta + + loss = \delta | y-f(x) | - 0.5 \delta ^2, otherwise The example usage is: @@ -6210,12 +6219,14 @@ def huber_classification_cost(input, """ For classification purposes, a variant of the Huber loss called modified Huber is sometimes used. Given a prediction f(x) (a real-valued classifier score) and - a true binary class label :math:`y\in \left \{-1, 1 \right \}`, the modified Huber + a true binary class label :math:`y\in \{-1, 1 \}`, the modified Huber loss is defined as: .. math: - loss = \max \left ( 0, 1-yf(x) \right )^2, yf(x)\geq 1 - loss = -4yf(x), \text{otherwise} + + loss = \max ( 0, 1-yf(x) )^2, yf(x) \geq -1 + + loss = -4yf(x), otherwise The example usage is: @@ -6959,7 +6970,7 @@ def clip_layer(input, min, max, name=None): .. math:: - out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right) + out[i] = \min (\max (in[i],p_{1} ),p_{2} ) .. code-block:: python From ddf20e589fad724f077b0613ebf3872d2311647a Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 11 Dec 2017 18:31:14 +0800 Subject: [PATCH 14/35] typo WITH_TEST to WITH_TESTING --- doc/howto/dev/contribute_to_paddle_cn.md | 2 +- paddle/scripts/docker/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md index 6993901452..3eb477eb65 100644 --- a/doc/howto/dev/contribute_to_paddle_cn.md +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -87,7 +87,7 @@ no changes added to commit (use "git add" and/or "git commit -a") 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: ```bash -➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev +➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:dev ``` 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`): diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index f3a6f1dba7..1e1fcc50dc 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -192,7 +192,7 @@ For developers who are interested in the C++ source code, please use -e "WOBOQ=O - The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host: ```bash -docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev +docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" -e "WOBOQ=ON" paddlepaddle/paddle:latest-dev ``` - You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run: From 9f44af9d7c9dd2f1dc775f59d059a87eb9e64fd6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 11 Dec 2017 18:37:18 +0800 Subject: [PATCH 15/35] Fix #6460 (#6461) --- python/paddle/v2/fluid/layers.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index b4426bad14..fd8a2ed18c 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -762,7 +762,7 @@ def sequence_conv(input, helper = LayerHelper('sequence_conv', **locals()) dtype = helper.input_dtype() filter_shape = [filter_size * input.shape[1], num_filters] - filter = helper.create_parameter( + filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype) pre_bias = helper.create_tmp_variable(dtype) @@ -770,7 +770,7 @@ def sequence_conv(input, type='sequence_conv', inputs={ 'X': [input], - 'Filter': [filter], + 'Filter': [filter_param], }, outputs={"Out": pre_bias}, attrs={ @@ -785,7 +785,7 @@ def sequence_conv(input, def conv2d(input, num_filters, filter_size, - stride=[1, 1], + stride=None, padding=None, groups=None, param_attr=None, @@ -802,6 +802,8 @@ def conv2d(input, conv-2d output, if mentioned in the input parameters. """ + if stride is None: + stride = [1, 1] helper = LayerHelper('conv2d', **locals()) dtype = helper.input_dtype() @@ -827,7 +829,7 @@ def conv2d(input, std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 return Normal(0.0, std, 0) - filter = helper.create_parameter( + filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, @@ -839,7 +841,7 @@ def conv2d(input, type='conv2d_cudnn', inputs={ 'Input': input, - 'Filter': filter, + 'Filter': filter_param, }, outputs={"Output": pre_bias}, attrs={'strides': stride, @@ -875,8 +877,8 @@ def sequence_pool(input, pool_type, **kwargs): def pool2d(input, pool_size, pool_type, - pool_stride=[1, 1], - pool_padding=[0, 0], + pool_stride=None, + pool_padding=None, global_pooling=False, main_program=None, startup_program=None): @@ -884,6 +886,10 @@ def pool2d(input, This function adds the operator for pooling in 2 dimensions, using the pooling configurations mentioned in input parameters. """ + if pool_padding is None: + pool_padding = [0, 0] + if pool_stride is None: + pool_stride = [1, 1] if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", From 69b44f2f198509e29b8ab50edab9ab34f56fd1af Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 12 Dec 2017 15:31:44 +0800 Subject: [PATCH 16/35] unify MKL macro definition --- cmake/cblas.cmake | 2 +- cmake/external/mkldnn.cmake | 2 +- paddle/gserver/activations/ActivationFunction.cpp | 4 ++-- paddle/gserver/gradientmachines/NeuralNetwork.cpp | 4 ++-- paddle/math/Allocator.h | 2 +- paddle/math/MathFunctions.cpp | 2 +- paddle/math/MathFunctions.h | 2 +- paddle/memory/detail/system_allocator.cc | 2 +- paddle/operators/math/math_function.cc | 2 +- paddle/operators/math/math_function.h | 2 +- paddle/parameter/FirstOrderOptimizer.h | 2 +- paddle/parameter/ParameterUpdateFunctions.cpp | 2 +- paddle/utils/Flags.cpp | 2 +- 13 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index b21fc43904..13294c0548 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -17,7 +17,7 @@ if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB) set(CBLAS_INC_DIR ${MKLML_INC_DIR}) set(CBLAS_LIBRARIES ${MKLML_LIB}) - add_definitions(-DPADDLE_USE_MKLML) + add_definitions(-DPADDLE_WITH_MKLML) add_definitions(-DLAPACK_FOUND) message(STATUS "Found cblas and lapack in MKLML " diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index fc52d339d7..5d24caebdc 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -67,5 +67,5 @@ ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") -add_definitions(-DPADDLE_USE_MKLDNN) +add_definitions(-DPADDLE_WITH_MKLDNN) LIST(APPEND external_project_dependencies mkldnn) diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index f5a41b66bf..57c890e488 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -24,7 +24,7 @@ limitations under the License. */ #include "paddle/utils/ClassRegistrar.h" #include "paddle/utils/Logging.h" -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN #include "MKLDNNActivation.h" #endif @@ -490,7 +490,7 @@ Error __must_check backward(Argument& act) { END_DEFINE_ACTIVATION(log) ActivationFunction* ActivationFunction::create(const std::string& type) { -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN if (!type.empty() && type.compare(0, 7, "mkldnn_") == 0) { return MKLDNNActivation::create(type); } diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index be112b4123..68bf37d59d 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN #include "paddle/gserver/layers/MKLDNNLayer.h" #endif @@ -307,7 +307,7 @@ void NeuralNetwork::backward(const UpdateCallback& callback) { } void NeuralNetwork::finish() { -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN FOR_EACH_R(layer, layers_) { MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast(*layer); if (dnnLayer) { diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h index 94ef561f06..17563bf5e1 100644 --- a/paddle/math/Allocator.h +++ b/paddle/math/Allocator.h @@ -48,7 +48,7 @@ public: */ virtual void* alloc(size_t size) { void* ptr; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment CHECK_EQ(posix_memalign(&ptr, 4096ul, size), 0); diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index ba86eacbb5..28ab54b450 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -206,7 +206,7 @@ double dotProduct(const int n, const double* x, const double* y) { } #endif -#if defined(PADDLE_USE_MKLML) +#if defined(PADDLE_WITH_MKLML) template <> void vExp(const int n, const float* a, float* r) { diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index f6e77029bd..29fe36e3a4 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -15,7 +15,7 @@ limitations under the License. */ #ifndef MATHFUNCTIONS_H_ #define MATHFUNCTIONS_H_ -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #include #include #include diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index b543b767e8..6a815a1b57 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -43,7 +43,7 @@ void* CPUAllocator::Alloc(size_t& index, size_t size) { void* p; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp // memory alignment PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0); diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index 2e333a8cde..e099a6a439 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -132,7 +132,7 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. template <> void batched_gemm( diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index 5a42854f22..f2b025b78b 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #include #include #include diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h index f157188a4f..5b0c52a30d 100644 --- a/paddle/parameter/FirstOrderOptimizer.h +++ b/paddle/parameter/FirstOrderOptimizer.h @@ -38,7 +38,7 @@ public: real torch_learningRate = optConfig_.learning_method() == "torch_momentum" ? 1.0 - paraConfig.momentum() : 1.0; -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN sgdUpdate(learningRate_ * paraConfig.learning_rate() * (firstTime_ ? 1.0 : torch_learningRate), paraConfig.momentum(), diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp index 1898598e49..d60cb36383 100644 --- a/paddle/parameter/ParameterUpdateFunctions.cpp +++ b/paddle/parameter/ParameterUpdateFunctions.cpp @@ -30,7 +30,7 @@ void sgdUpdateCpu(real learningRate, const real* grad, real* momentumVec) { decayRate *= learningRate; -#ifdef PADDLE_USE_MKLML +#ifdef PADDLE_WITH_MKLML #pragma omp parallel for #endif for (size_t i = 0; i < size; ++i) { diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index 8f100f02e9..9a7dc0e356 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -20,7 +20,7 @@ DEFINE_bool(use_gpu, false, "Only support CPU training"); DEFINE_bool(use_gpu, true, "Whether to use GPU for training"); #endif -#ifdef PADDLE_USE_MKLDNN +#ifdef PADDLE_WITH_MKLDNN // TODO(TJ): change to true when MKLDNN layers support multi-inputs DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training"); #else From 0ca6274451d3693f363f2b8b5d6b29ce722febaf Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 11 Dec 2017 23:15:35 +0800 Subject: [PATCH 17/35] "add global regularization" (#6443) * "add global regularization" * Polish `append_regularization_ops` --- python/paddle/v2/fluid/optimizer.py | 38 +++++++++++---------------- python/paddle/v2/fluid/regularizer.py | 15 ++++++++--- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index 719e3b2563..bbdfab2df9 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -18,8 +18,9 @@ class Optimizer(object): but need to use one of it's implementation. """ - def __init__(self, global_step=None): + def __init__(self, global_step=None, regularization=None): self._global_step = global_step + self.regularization = regularization # Dictionary of accumulators. Some optimizer subclasses need to # allocate and manage extra variables associated with the parameters # to train. These variables are called accumulators. @@ -199,7 +200,8 @@ class Optimizer(object): """ params_grads = append_backward_ops(loss, parameter_list, no_grad_set) # Add regularization if any - params_grads = append_regularization_ops(params_grads) + params_grads = append_regularization_ops(params_grads, + self.regularization) optimize_ops = self.create_optimization_pass(params_grads, loss, startup_program) return optimize_ops @@ -209,9 +211,9 @@ class SGDOptimizer(Optimizer): """ Simple SGD optimizer without any state. """ - def __init__(self, learning_rate, global_step=None): + def __init__(self, learning_rate, **kwargs): assert learning_rate is not None - super(SGDOptimizer, self).__init__(global_step) + super(SGDOptimizer, self).__init__(**kwargs) self.type = "sgd" self._learning_rate = learning_rate @@ -236,14 +238,10 @@ class MomentumOptimizer(Optimizer): """ _velocity_acc_str = "velocity" - def __init__(self, - learning_rate, - momentum, - use_nesterov=False, - global_step=None): + def __init__(self, learning_rate, momentum, use_nesterov=False, **kwargs): assert learning_rate is not None assert momentum is not None - super(MomentumOptimizer, self).__init__(global_step) + super(MomentumOptimizer, self).__init__(**kwargs) self.type = "momentum" self._learning_rate = learning_rate self._momentum = momentum @@ -284,10 +282,10 @@ class AdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, learning_rate, epsilon=1.0e-6, global_step=None): + def __init__(self, learning_rate, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert epsilon is not None - super(AdagradOptimizer, self).__init__(global_step) + super(AdagradOptimizer, self).__init__(**kwargs) self.type = "adagrad" self._learning_rate = learning_rate self._epsilon = epsilon @@ -331,12 +329,12 @@ class AdamOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamOptimizer, self).__init__(global_step) + super(AdamOptimizer, self).__init__(**kwargs) self.type = "adam" self._learning_rate = learning_rate self._beta1 = beta1 @@ -436,12 +434,12 @@ class AdamaxOptimizer(Optimizer): beta1=0.9, beta2=0.999, epsilon=1e-8, - global_step=None): + **kwargs): assert learning_rate is not None assert beta1 is not None assert beta2 is not None assert epsilon is not None - super(AdamaxOptimizer, self).__init__() + super(AdamaxOptimizer, self).__init__(**kwargs) self.type = "adamax" self._learning_rate = learning_rate self._beta1 = beta1 @@ -514,16 +512,12 @@ class DecayedAdagradOptimizer(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, - learning_rate, - decay=0.95, - epsilon=1.0e-6, - global_step=None): + def __init__(self, learning_rate, decay=0.95, epsilon=1.0e-6, **kwargs): assert learning_rate is not None assert decay is not None assert epsilon is not None - super(DecayedAdagradOptimizer, self).__init__(global_step) + super(DecayedAdagradOptimizer, self).__init__(**kwargs) self.type = "decayed_adagrad" self._learning_rate = learning_rate self._decay = decay diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py index bb1ac8911e..d1955b0047 100644 --- a/python/paddle/v2/fluid/regularizer.py +++ b/python/paddle/v2/fluid/regularizer.py @@ -3,7 +3,7 @@ import framework __all__ = ['append_regularization_ops', 'L1Decay', 'L2Decay'] -def append_regularization_ops(parameters_and_grads): +def append_regularization_ops(parameters_and_grads, regularization=None): """Create and add backward regularization Operators Creates and adds backward regularization operators in the BlockDesc. @@ -14,6 +14,8 @@ def append_regularization_ops(parameters_and_grads): Args: parameters_and_grads: A list of (parameters, gradients) pairs that need to be regularized. + regularization: A global regularizer. If the parameter is not + set. It will be applied with regularizer. Returns: list of (parameters, gradients) pair with the regularized gradient @@ -23,14 +25,19 @@ def append_regularization_ops(parameters_and_grads): """ params_and_grads = [] for param, grad in parameters_and_grads: + regularization_term = None + if param.regularizer is not None: + # Add variable for regularization term in grad block + regularization_term = param.regularizer(param, grad.block) + elif regularization is not None: + regularization_term = regularization(param, grad.block) + # If no gradient or no regularization specified, # then we don't need to do anything - if grad is None or param.regularizer is None: + if grad is None or regularization_term is None: params_and_grads.append((param, grad)) continue - # Add variable for regularization term in grad block - regularization_term = param.regularizer(param, grad.block) assert grad.shape == regularization_term.shape grad.block.append_op( From 4ff6bc175a18d03f1159e78aef0c305703adbe37 Mon Sep 17 00:00:00 2001 From: Siddharth Goyal Date: Mon, 11 Dec 2017 11:33:54 -0800 Subject: [PATCH 18/35] Add row conv operator (#6013) * Fix documentation * Address review comments --- paddle/operators/row_conv_op.cc | 257 +++++++++++ paddle/operators/row_conv_op.cu | 408 ++++++++++++++++++ paddle/operators/row_conv_op.h | 33 ++ .../paddle/v2/fluid/tests/test_row_conv_op.py | 95 ++++ 4 files changed, 793 insertions(+) create mode 100644 paddle/operators/row_conv_op.cc create mode 100644 paddle/operators/row_conv_op.cu create mode 100644 paddle/operators/row_conv_op.h create mode 100644 python/paddle/v2/fluid/tests/test_row_conv_op.py diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc new file mode 100644 index 0000000000..ea0bb99f8d --- /dev/null +++ b/paddle/operators/row_conv_op.cc @@ -0,0 +1,257 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/row_conv_op.h" +#include "paddle/framework/eigen.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +template +using EigenMatrix = framework::EigenMatrix; + +class RowConvOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) of RowConvOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of RowConvOp should not be null."); + + auto x_dims = ctx->GetInputDim("X"); + auto filter_dims = ctx->GetInputDim("Filter"); + PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); + PADDLE_ENFORCE_EQ(filter_dims.size(), 2, "Input(Y)'s rank should be 2."); + PADDLE_ENFORCE_EQ( + x_dims[1], filter_dims[1], + "The 2nd dimension of Input(X) and Input(Filter) should be same."); + ctx->SetOutputDim("Out", x_dims); + ctx->ShareLoD("X", "Out"); + } +}; + +class RowConvGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Filter"), + "Input(Filter) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Gradient of output(Out) should not be null."); + + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(x_grad_name, x_dims); + } + + auto filter_grad_name = framework::GradVarName("Filter"); + if (ctx->HasOutput(filter_grad_name)) { + auto filter_dims = ctx->GetInputDim("Filter"); + ctx->SetOutputDim(filter_grad_name, filter_dims); + } + } +}; + +class RowConvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RowConvOpMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", + "(LoDTensor), the input(X) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LoDTensor is a matrix with shape (T x N), where T " + "is the total time steps in this mini-batch and N is the input " + "data dimension."); + AddInput("Filter", + "(Tensor), the input(Filter) is a learnable parameter. It " + "is a 2-D tensor with shape (future_context x N), where, " + "future_context is the future context length and N is the data " + "dimension."); + AddOutput("Out", + "(LoDTensor), the output(Out) is a LodTensor, which supports " + "variable time-length input sequences. The underlying tensor " + "in this LodTensor is a matrix with shape T x N, i.e., the " + "same shape as X."); + AddComment(R"DOC( +Row-convolution Operator. + +The row convolution is called lookahead convolution. This operator was +introduced in the following paper for DeepSpeech2: +http://www.cs.cmu.edu/~dyogatam/papers/wang+etal.iclrworkshop2016.pdf + +The main motivation is that a bidirectional RNN, useful in DeepSpeech +like speech models, learns representation for a sequence by performing a +forward and a backward pass through the entire sequence. However, unlike +unidirectional RNNs, bidirectional RNNs are challenging to deploy in an online +and low-latency setting. The lookahead convolution incorporates information +from future subsequences in a computationally efficient manner to improve +unidirectional recurrent neural networks. The row convolution operator is +different from the 1D sequence convolution, and is computed as follows: + +Given an input sequence $in$ of length $t$ and input dimension $d$, +and a filter ($W$) of size $context \times d$, +the output sequence is convolved as: + +$$ +out_{i, :} = \sum_{j=i}^{i + context} in_{j,:} \dot W_{i-j, :} +$$ + +)DOC"); + } +}; + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *out = context.Output("Out"); + + out->mutable_data(context.GetPlace()); + + auto batch_indices = x->lod()[0]; + auto input_dim = x->dims()[1]; // 'in' is of size T x N + size_t num_sequence = batch_indices.size() - 1; + + auto future_context = filter->dims()[0]; + auto weights = EigenMatrix::From(*filter); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + Tensor cur_input_sequence = + x->Slice(start, end); // Current input sequence + Tensor cur_output_sequence = + out->Slice(start, end); // Current output sequence + auto cip_seq = EigenMatrix::From(cur_input_sequence); + auto cot_seq = EigenMatrix::From(cur_output_sequence); + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + for (int d = 0; d < input_dim; d++) { + if (w == 0) { + cot_seq(k, d) = weights(w, d) * cip_seq(k + w, d); + } else { + cot_seq(k, d) += weights(w, d) * cip_seq(k + w, d); + } + } + } + } + } + } +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *filter = context.Input("Filter"); + auto *d_out = context.Input(framework::GradVarName("Out")); + auto *dx = context.Output(framework::GradVarName("X")); + auto *d_filter = context.Output(framework::GradVarName("Filter")); + + auto input_dim = x->dims()[1]; // 'x' is of size T x N + auto batch_indices = x->lod()[0]; + size_t num_sequence = batch_indices.size() - 1; + auto future_context = filter->dims()[0]; + + if (d_filter) { + d_filter->mutable_data(context.GetPlace()); + auto dweights = + EigenMatrix::From(*d_filter); // Gradient of weight matrix + dweights.setZero(); + + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_input = x->Slice(start, end); // Current input sequence + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + + auto cur_ip = EigenMatrix::From(cur_input); + auto cur_dout = EigenMatrix::From(cur_doutput); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dweights (Updating the gradient of weight matrix) + for (int d = 0; d < input_dim; d++) { + dweights(w, d) += cur_ip(k + w, d) * cur_dout(k, d); + } + } + } + } + } + + if (dx) { + dx->mutable_data(context.GetPlace()); + auto weights = EigenMatrix::From(*filter); + for (size_t i = 0; i < num_sequence; i++) { // For different sequences + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + + Tensor cur_doutput = + d_out->Slice(start, end); // Current output grad sequence + Tensor cur_dinput = + dx->Slice(start, end); // Current input grad sequence + + auto cur_dout = EigenMatrix::From(cur_doutput); + auto cur_dip = EigenMatrix::From(cur_dinput); + cur_dip.setZero(); + int current_timesteps = end - start; + + for (int k = 0; k < current_timesteps; + k++) { // For different time steps in the same sequence + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + // For dinput (Updating the gradient wrt input) + for (int d = 0; d < input_dim; d++) { + cur_dip(k + w, d) += weights(w, d) * cur_dout(k, d); + } + } + } + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, + ops::RowConvGradOp); +REGISTER_OP_CPU_KERNEL(row_conv, + ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu new file mode 100644 index 0000000000..e0d7ebda7e --- /dev/null +++ b/paddle/operators/row_conv_op.cu @@ -0,0 +1,408 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/row_conv_op.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using framework::Tensor; + +namespace { + +inline int DivUp(int x, int y) { return (x + y - 1) / y; } + +// Forward prop (shared memory version, for small future_context) +template +__global__ void RowConvForwardSharedMemory(const T *in, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *out) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (d < input_dim) + ? sw[w * blx + thx] * in[(start + k + w) * input_dim + d] + : static_cast(0); + } + if (d < input_dim) { + out[(start + k) * input_dim + d] = sum; + } + } + } +} + +// Forward prop (naive version) +template +__global__ void RowConvForward(const T *in, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *out) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + + for (size_t i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k + w) < current_timesteps); + w++) { + sum += (wt[w * input_dim + d] * in[(start + k + w) * input_dim + d]); + } + out[(start + k) * input_dim + d] = sum; + } + } +} + +// Compute input gradient (shared memory version, for small future_context) +template +__global__ void RowConvGradInputSharedMemory(const T *dout, const T *wt, + int num_sequence, int input_dim, + int future_context, + const size_t *batch_indices, + T *din) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int d = blockIdx.x * blx + thx; // index along input dim + + extern __shared__ T mem[]; + T *sw = mem; + if (thy < future_context) { + sw[thy * blx + thx] = + (d < input_dim) ? wt[thy * input_dim + d] : static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (d < input_dim) + ? (sw[w * blx + thx] * dout[(k + start - w) * input_dim + d]) + : static_cast(0); + } + if (d < input_dim) { + din[(k + start) * input_dim + d] = sum; + } + } + } +} + +// Compute input gradient (Naive version) +template +__global__ void RowConvGradInput(const T *dout, const T *wt, int num_sequence, + int input_dim, int future_context, + const size_t *batch_indices, T *din) { + int d = blockIdx.x * blockDim.x + threadIdx.x; // index along input_dim + int bly = blockDim.y; + int thy = threadIdx.y; + + if (d >= input_dim) return; + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + for (int k = thy; k < current_timesteps; k += bly) { + T sum = 0; + for (int w = 0; (w < future_context) && ((k - w) >= 0); w++) { + sum += (wt[w * input_dim + d] * dout[(k + start - w) * input_dim + d]); + } + din[(k + start) * input_dim + d] = sum; + } + } +} + +// Compute W gradient (small future_context version) +template +__global__ void RowConvGradFilterImproved(const T *in, const T *dout, + int num_sequence, int input_dim, + int future_context, int block_x, + int block_y, + const size_t *batch_indices, + T *dfilter) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + + extern __shared__ T mem[]; + + int xdim_sh_in = block_y; + int xdim_sh_dout = block_y; + // int xdim_sh_dfilter = future_context; + int ydim_sh_in = block_x; + int ydim_sh_dout = block_x + future_context - 1; + int ydim_sh_dfilter = block_y; + + T *sh_in = mem; + T *sh_dout = &mem[xdim_sh_in * ydim_sh_in]; + T *sh_dfilter = &mem[xdim_sh_in * ydim_sh_in + xdim_sh_dout * ydim_sh_dout]; + + if (thy < future_context) { + sh_dfilter[thy * ydim_sh_dfilter + thx] = static_cast(0); + } + __syncthreads(); + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * ydim_sh_in + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : T(0); + sh_dout[thx * ydim_sh_dout + thy + future_context - 1] = + (d < input_dim && pos < end) ? dout[pos * input_dim + d] : T(0); + __syncthreads(); + + if (thy < future_context - 1) { + int pos_offset = pos - future_context + 1; + sh_dout[thx * ydim_sh_dout + thy] = + (d < input_dim && pos_offset >= start) + ? dout[pos_offset * input_dim + d] + : T(0); + } + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + T val = sh_in[thy * ydim_sh_in + thx] * + sh_dout[thy * ydim_sh_dout + thx + future_context - 1 - w]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. + val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0) { + sh_dfilter[w * ydim_sh_dfilter + thy] += val; + } + __syncthreads(); + } + } + } + for (int w = thy; (w < future_context) && (d < input_dim); w += bly) { + dfilter[w * input_dim + d] += sh_dfilter[w * ydim_sh_dfilter + thx]; + } +} + +// Compute weight(filter) gradient +template +__global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, + int input_dim, int future_context, + int block_x, int block_y, + const size_t *batch_indices, T *dfilter) { + int blx = blockDim.x; + int bly = blockDim.y; + int thx = threadIdx.x; + int thy = threadIdx.y; + int gx = blockIdx.x * blx; + int d = gx + thx; // index along input dim + extern __shared__ T mem[]; + T *sh_in = mem; + T *sh_dout = &mem[block_x * block_y]; + + for (int i = 0; i < num_sequence; i++) { + int start = static_cast(batch_indices[i]); + int end = static_cast(batch_indices[i + 1]); + int current_timesteps = end - start; + int scaled_cur_steps = + ((current_timesteps + block_x - 1) / block_x) * block_x; + + for (int k = thy; k < scaled_cur_steps; k += block_x) { + int pos = start + k; + sh_in[thx * block_y + thy] = + (d < input_dim && pos < end) ? in[pos * input_dim + d] : 0.0; + __syncthreads(); + + for (int w = 0; w < future_context; w++) { + sh_dout[thx * block_y + thy] = + (d < input_dim && (k - w) >= 0 && (k - w) < current_timesteps) + ? dout[(pos - w) * input_dim + d] + : 0.0; + __syncthreads(); + + T val = sh_in[thy * block_y + thx] * sh_dout[thy * block_y + thx]; + __syncthreads(); + + for (int offset = 16; offset > 0; + offset = offset / 2) { // blockDim.x is 32. + val += __shfl_down(val, offset); + } + __syncthreads(); + + if (thx == 0 && (gx + thy) < input_dim) { + dfilter[w * input_dim + gx + thy] += val; + } + } + } + } +} + +} // namespace + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *Out = context.Output("Out"); + + const T *in = X->data(); + const T *weight = Filter->data(); + T *out = Out->mutable_data(context.GetPlace()); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.data(); + auto stream = context.cuda_device_context().stream(); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvForwardSharedMemory< + T><<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvForward<<>>( + in, weight, num_sequence, input_dim, future_context, idx, out); + } + } +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *X = context.Input("X"); + auto *Filter = context.Input("Filter"); + auto *dOut = context.Input(framework::GradVarName("Out")); + const T *in = X->data(); + const T *weights = Filter->data(); + const T *dout = dOut->data(); + + Tensor *dX = context.Output(framework::GradVarName("X")); + Tensor *dFilter = context.Output(framework::GradVarName("Filter")); + + auto batch_indices = X->lod()[0]; + int input_dim = X->dims()[1]; + int num_sequence = batch_indices.size() - 1; + int future_context = Filter->dims()[0]; + size_t *idx = batch_indices.data(); + + auto &device_ctx = context.cuda_device_context(); + math::SetConstant zero; + + if (dFilter) { + T *dfilter = dFilter->mutable_data(context.GetPlace()); + zero(device_ctx, dFilter, static_cast(0.0)); + + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_y * block_x + block_y * (block_x + future_context - 1) + + future_context * block_y) * + sizeof(T); + RowConvGradFilterImproved< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int block_x = block_dim.x; + int block_y = block_dim.y; + int mem_per_block = + (block_x * block_y * 2) * sizeof(T); // For 2 arrays of size 32x32 + RowConvGradFilter< + T><<>>( + in, dout, num_sequence, input_dim, future_context, block_x, block_y, + idx, dfilter); + } + } + + if (dX) { + T *din = dX->mutable_data(context.GetPlace()); + if (future_context <= 32) { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + int mem_per_block = (future_context * block_dim.x) * sizeof(T); + RowConvGradInputSharedMemory< + T><<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } else { + dim3 block_dim = dim3(32, 32); + dim3 grid_dim = dim3(DivUp(input_dim, block_dim.x), 1); + RowConvGradInput<<>>( + dout, weights, num_sequence, input_dim, future_context, idx, din); + } + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(row_conv, + ops::RowConvKernel); +REGISTER_OP_GPU_KERNEL( + row_conv_grad, ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h new file mode 100644 index 0000000000..525e83908d --- /dev/null +++ b/paddle/operators/row_conv_op.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include "paddle/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RowConvKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; + +template +class RowConvGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override; +}; +} // namespace operators +} // namespace paddle diff --git a/python/paddle/v2/fluid/tests/test_row_conv_op.py b/python/paddle/v2/fluid/tests/test_row_conv_op.py new file mode 100644 index 0000000000..1ed86e23ac --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_row_conv_op.py @@ -0,0 +1,95 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def row_conv_forward(x, lod, wt): + out = np.zeros_like(x) + seq_info = lod[0] + num_sequences = len(seq_info) - 1 + context_length = wt.shape[0] + + for i in range(num_sequences): # loop over number of sequences + start = seq_info[i] + end = seq_info[i + 1] + curinput = x[start:end, :] + curoutput = out[start:end, :] + + cur_timesteps = end - start + for j in range(cur_timesteps): # loop over different timesteps + for k in range(context_length): + + if j + k >= cur_timesteps: + continue + curoutput[j, :] += curinput[j + k, :] * wt[k, :] + + return out + + +class TestRowConvOp1(OpTest): + def setUp(self): + + self.op_type = "row_conv" + lod = [[0, 2, 5, 7]] + T = lod[0][-1] + D = 16 + context_length = 2 + + x = np.random.random((T, D)).astype("float32") + wt = np.random.random((context_length, D)).astype("float32") + self.inputs = {'X': (x, lod), 'Filter': wt} + + out = row_conv_forward(x, lod, wt) + self.outputs = {'Out': (out, lod)} + + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.05) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Filter'], 'Out', max_relative_error=0.05, no_grad_set=set('X')) + + def test_check_grad_ignore_wt(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Filter')) + + +class TestRowConvOp2(OpTest): + def setUp(self): + + self.op_type = "row_conv" + lod = [[0, 20, 50, 100]] + T = lod[0][-1] + D = 35 + context_length = 35 + + x = np.random.random((T, D)).astype("float32") + wt = np.random.random((context_length, D)).astype("float32") + self.inputs = {'X': (x, lod), 'Filter': wt} + + out = row_conv_forward(x, lod, wt) + self.outputs = {'Out': (out, lod)} + + def test_check_output(self): + self.check_output() + + #max_relative_error is increased from 0.05 to 0.06 as for higher + #dimensional input, the dX on CPU for some values has max_rel_error + #slightly more than 0.05 + def test_check_grad_normal(self): + self.check_grad(['X', 'Filter'], 'Out', max_relative_error=0.06) + + def test_check_grad_ignore_x(self): + self.check_grad( + ['Filter'], 'Out', max_relative_error=0.06, no_grad_set=set('X')) + + def test_check_grad_ignore_wt(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.06, no_grad_set=set('Filter')) + + +if __name__ == '__main__': + unittest.main() From 35420cdf63dd1369972c26f70cac2d4d75b1492a Mon Sep 17 00:00:00 2001 From: kavyasrinet Date: Mon, 11 Dec 2017 15:33:28 -0800 Subject: [PATCH 19/35] Updating the Latex equation for Adagrad (#6009) * Updating the Latex equation for Adagrad * Fixing Latex euqations for adadelta, adam and adamax --- paddle/operators/adadelta_op.cc | 12 ++++++------ paddle/operators/adagrad_op.cc | 4 ++-- paddle/operators/adam_op.cc | 12 +++++++----- paddle/operators/adamax_op.cc | 8 ++++---- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc index 16a7794d5b..29434a0ee2 100644 --- a/paddle/operators/adadelta_op.cc +++ b/paddle/operators/adadelta_op.cc @@ -92,12 +92,12 @@ for gradient descent. Adadelta updates are as follows: -$$avgSquaredGradOut = \rho * avgSquaredGrad + (1 - \rho) * grad * grad \break -paramUpdate = - $\sqrt{((avgSquaredUpdate + \epsilon) / - (avgSquaredGrad_out + \epsilon))}$ * grad \break -avgSquaredUpdateOut = \rho * avgSquaredUpdate + (1 - \rho) * - {(paramUpdate)}^2 \break -paramOut = param + paramUpdate$$ +$$ +avg\_squared\_grad\_out = \rho * avg\_squared\_grad + (1 - \rho) * grad * grad \\ +param\_update = - \sqrt{\frac{avg\_squared\_update + \epsilon}{avg\_squared\_grad\_out + \epsilon}} * grad \\ +avg\_squared\_update\_out = \rho * avg\_squared\_update + (1 - \rho) * {param\_update}^2 \\ +param\_out = param + param\_update +$$ )DOC"); } diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index d6686e3ef3..d19602244b 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -80,8 +80,8 @@ Adaptive Gradient Algorithm (Adagrad). The update is done as follows: -$$momentOut = moment + grad * grad \break -paramOut = param - learningRate * grad / ($\sqrt{momentOut}$ + \epsilon) \break +$$moment\_out = moment + grad * grad \\ +param\_out = param - \frac{learning\_rate * grad}{\sqrt{moment\_out} + \epsilon} $$ The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf) diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc index 03faa2a7c5..a268d05484 100644 --- a/paddle/operators/adam_op.cc +++ b/paddle/operators/adam_op.cc @@ -112,11 +112,13 @@ adaptive estimates of lower-order moments. Adam updates: -$$moment_1_{out} = \beta_1 * moment_1 + (1 - \beta_1) * grad \break -moment_2_{out} = \beta_2 * moment_2 + (1 - \beta_2) * grad * grad \break -learningRate = learningRate * - $\sqrt{(1 - \beta_2_{pow})}$ / (1 - \beta_1_{pow}) \break -paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$ +$$ +moment\_1\_out = \beta_1 * moment\_1 + (1 - \beta_1) * grad \\ +moment\_2_\out = \beta_2 * moment\_2 + (1 - \beta_2) * grad * grad \\ +learning\_rate = learning\_rate * + \frac{\sqrt{1 - \beta_{2\_pow}}}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_1}{\sqrt{moment\_2} + \epsilon} +$$ )DOC"); } diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index 867ddd9790..9e7576c961 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -108,10 +108,10 @@ Adam algorithm based on the infinity norm. Adamax updates: $$ - momentOut = \beta_{1} * moment + (1 - \beta_{1}) * grad \\ - infNormOut = max(\beta_{2} * infNorm + \epsilon, |grad|) \\ - learningRate = \frac{learningRate}{1 - \beta_{1}^{Beta1Pow}} \\ - paramOut = param - learningRate * \frac{momentOut}{infNormOut} +moment\_out = \beta_1 * moment + (1 - \beta_1) * grad \\ +inf\_norm\_out = max(\beta_2 * inf\_norm + \epsilon, |grad|) \\ +learning\_rate = \frac{learning\_rate}{1 - \beta_{1\_pow}} \\ +param\_out = param - learning\_rate * \frac{moment\_out}{inf\_norm\_out} $$ The original paper does not have an epsilon attribute. From f4f17e539b70a6cdb4245267ae4027cd5202b0fa Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 12 Dec 2017 16:35:13 +0800 Subject: [PATCH 20/35] skip mkl setting in v1 with Mac --- paddle/scripts/submit_local.sh.in | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index d71cb84df3..43d2d1b410 100755 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -140,7 +140,11 @@ else: sys.exit(0) EOF -cpu_config +if [ "`uname -s`" == "Linux" ]; then + # only support on linux yet, with mac can use v2 + cpu_config +fi + # echo $KMP_AFFINITY $OMP_DYNAMIC case "$1" in From c175eeb387ee90d8ba8a549a2e29a1680c9f1f35 Mon Sep 17 00:00:00 2001 From: wangmeng28 Date: Tue, 12 Dec 2017 11:08:23 +0800 Subject: [PATCH 21/35] Fix wrong index in dataset downloading exception --- python/paddle/v2/dataset/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index e31e501ce9..191d9ecfb1 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -71,7 +71,7 @@ def download(url, module_name, md5sum): if retry < retry_limit: retry += 1 else: - raise RuntimeError("Cannot download {0} within retry limit {2}". + raise RuntimeError("Cannot download {0} within retry limit {1}". format(url, retry_limit)) print "Cache file %s not found, downloading %s" % (filename, url) r = requests.get(url, stream=True) From f3acdd3af94cb3018134b13893cffebdf09c827c Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 12 Dec 2017 11:17:33 +0800 Subject: [PATCH 22/35] fix warning in row_conv_op.cu --- paddle/operators/row_conv_op.cu | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu index e0d7ebda7e..79b7086b24 100644 --- a/paddle/operators/row_conv_op.cu +++ b/paddle/operators/row_conv_op.cu @@ -243,7 +243,6 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, int block_x, int block_y, const size_t *batch_indices, T *dfilter) { int blx = blockDim.x; - int bly = blockDim.y; int thx = threadIdx.x; int thy = threadIdx.y; int gx = blockIdx.x * blx; From 59c74fb14adcd23fc244275ef9e74b8430cc43fc Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Tue, 12 Dec 2017 11:35:38 +0800 Subject: [PATCH 23/35] change paddle:dev to paddle:latest-dev --- doc/howto/dev/contribute_to_paddle_cn.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md index 3eb477eb65..3e0bf7b397 100644 --- a/doc/howto/dev/contribute_to_paddle_cn.md +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -76,18 +76,18 @@ no changes added to commit (use "git add" and/or "git commit -a") ## 构建和测试 -编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:dev`来代替。 +编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家,我们的标准开发流程是把这些工具都装进一个Docker image,称为*开发镜像*,通常名字是 `paddle:latest-dev` 或者 `paddle:[version tag]-dev` 如 `paddle:0.11.0-dev`。然后所有用 `cmake && make` 的地方(比如IDE配置里)都用 `docker run paddle:latest-dev`来代替。 如要build这个开发镜像,在源码目录树的根目录中运行: ```bash -➜ docker build -t paddle:dev . +➜ docker build -t paddle:latest-dev . ``` 随后可以用这个开发镜像开始build PaddlePaddle的源码。比如如果要build一个不依赖GPU,但是支持AVX指令集,并且包括unit tests的PaddlePaddle,可以: ```bash -➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:dev +➜ docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" paddle:latest-dev ``` 这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`,并且输出一个 `./build/paddle.deb`文件之外,还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*(`paddle:prod`): @@ -99,7 +99,7 @@ no changes added to commit (use "git add" and/or "git commit -a") 如果要运行所有的单元测试,可以用如下命令: ```bash -➜ docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" +➜ docker run -it -v $(pwd):/paddle paddle:latest-dev bash -c "cd /paddle/build && ctest" ``` 关于构建和测试的更多信息,请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。 From 936f0546e3aaa772514bba721167e897769ec2a9 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 12 Dec 2017 10:24:50 +0800 Subject: [PATCH 24/35] fix img_pool maxout doc --- .../paddle/trainer_config_helpers/layers.py | 48 ++++++++++++------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index d0b14cf63c..3a82e858f7 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -2729,15 +2729,17 @@ def img_pool_layer(input, .. math:: - w = 1 + \frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} \\\\ - h = 1 + \frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} - ceil_mode=False: .. math:: - w = 1 + \frac{floor(input\_width + 2 * padding - pool\_size)}{stride} \\\\ - h = 1 + \frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} The example usage is: @@ -2870,17 +2872,21 @@ def img_pool3d_layer(input, .. math:: - w = 1 + \frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} \\\\ - h = 1 + \frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} \\\\ - d = 1 + \frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} + w & = 1 + \\frac{ceil(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{ceil(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + + d & = 1 + \\frac{ceil(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} - ceil_mode=False: .. math:: - w = 1 + \frac{floor(input\_width + 2 * padding - pool\_size)}{stride} \\\\ - h = 1 + \frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} \\\\ - d = 1 + \frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} \\\\ + w & = 1 + \\frac{floor(input\_width + 2 * padding - pool\_size)}{stride} + + h & = 1 + \\frac{floor(input\_height + 2 * padding\_y - pool\_size\_y)}{stride\_y} + + d & = 1 + \\frac{floor(input\_depth + 2 * padding\_z - pool\_size\_z)}{stride\_z} The example usage is: @@ -5437,13 +5443,21 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): .. math:: - out = \max_k (in[n, k, o_c , s]) \\\\ - out_{i * s + j} = \max_k in_{ k * o_{c} * s + i * s + j} \\\\ - s = \frac{input.size}{ num\_channels} \\\\ - o_{c} =\frac{num\_channels}{groups} \\\\ - 0 \le i < o_{c} \\\\ - 0 \le j < s \\\\ - 0 \le k < groups \\\\ + + out & = \max_k (in[n, k, o_c , s]) + + out_{i * s + j} & = \max_k in_{ k * o_{c} * s + i * s + j} + + s & = \\frac{input.size}{ num\_channels} + + o_{c} & = \\frac{num\_channels}{groups} + + 0 \le i & < o_{c} + + 0 \le j & < s + + 0 \le k & < groups + The simple usage is: From 7d8802938149df6b0dcf8934c0eb7fe0e4e5145c Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Tue, 12 Dec 2017 12:55:58 +0800 Subject: [PATCH 25/35] equation align --- python/paddle/trainer_config_helpers/layers.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 3a82e858f7..7e118b24a4 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -5444,19 +5444,19 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None): .. math:: - out & = \max_k (in[n, k, o_c , s]) + & out = \max_k (in[n, k, o_c , s]) - out_{i * s + j} & = \max_k in_{ k * o_{c} * s + i * s + j} + & out_{i * s + j} = \max_k in_{ k * o_{c} * s + i * s + j} - s & = \\frac{input.size}{ num\_channels} + & s = \\frac{input.size}{ num\_channels} - o_{c} & = \\frac{num\_channels}{groups} + & o_{c} = \\frac{num\_channels}{groups} - 0 \le i & < o_{c} + & 0 \le i < o_{c} - 0 \le j & < s + & 0 \le j < s - 0 \le k & < groups + & 0 \le k < groups The simple usage is: From 61ec0b951656fb402e61c4dc519e80ba3fbc61d0 Mon Sep 17 00:00:00 2001 From: QI JUN Date: Tue, 12 Dec 2017 14:00:28 +0800 Subject: [PATCH 26/35] Refine device context (#6433) There are mainly following fixes: - take `DeviceContext` as the template parameter of math functors and OpKernel instead of `Place` - remove `eigen_device` interface in base class `DeviceContext` - remove `GetEigenDevice` interface in `ExecutionContext` and base class `DeviceContext` - remove unused `platform::EigenDeviceConverter` - rename `REGISTER_OP_GPU_KERNEL` to `REGISTER_OP_CUDA_KERNEL` - rename `USE_GPU_ONLY_OP` to `USE_CUDA_ONLY_OP` --- paddle/framework/op_registry.h | 12 +- paddle/framework/operator.cc | 16 +- paddle/framework/operator.h | 26 +-- paddle/framework/operator_test.cc | 2 +- paddle/operators/CMakeLists.txt | 2 +- paddle/operators/accuracy_op.cc | 2 +- paddle/operators/accuracy_op.cu | 5 +- paddle/operators/accuracy_op.h | 2 +- paddle/operators/activation_op.cc | 21 +- paddle/operators/activation_op.cu | 23 +- paddle/operators/activation_op.h | 14 +- paddle/operators/adadelta_op.cc | 4 +- paddle/operators/adadelta_op.cu | 6 +- paddle/operators/adadelta_op.h | 4 +- paddle/operators/adagrad_op.cc | 18 +- paddle/operators/adagrad_op.cu | 20 +- paddle/operators/adagrad_op.h | 17 +- paddle/operators/adam_op.cc | 6 +- paddle/operators/adam_op.cu | 6 +- paddle/operators/adam_op.h | 10 +- paddle/operators/adamax_op.cc | 6 +- paddle/operators/adamax_op.cu | 6 +- paddle/operators/adamax_op.h | 10 +- paddle/operators/auc_op.h | 2 +- paddle/operators/batch_norm_op.cc | 14 +- paddle/operators/batch_norm_op.cu.cc | 32 +-- paddle/operators/batch_norm_op.h | 4 +- .../operators/bilinear_tensor_product_op.cc | 11 +- .../operators/bilinear_tensor_product_op.cu | 16 +- paddle/operators/bilinear_tensor_product_op.h | 41 ++-- paddle/operators/cast_op.cc | 2 +- paddle/operators/cast_op.cu | 6 +- paddle/operators/cast_op.h | 13 +- paddle/operators/chunk_eval_op.h | 2 +- paddle/operators/clip_by_norm_op.cc | 3 +- paddle/operators/clip_by_norm_op.cu | 5 +- paddle/operators/clip_by_norm_op.h | 5 +- paddle/operators/clip_op.cc | 8 +- paddle/operators/clip_op.cu | 8 +- paddle/operators/clip_op.h | 16 +- paddle/operators/compare_op.cu | 10 +- paddle/operators/compare_op.h | 31 ++- paddle/operators/concat_op.cu.cc | 9 +- paddle/operators/concat_op.h | 4 +- paddle/operators/conv_cudnn_op.cc | 22 +- paddle/operators/conv_cudnn_op.cu.cc | 32 +-- paddle/operators/conv_op.cc | 22 +- paddle/operators/conv_op.cu.cc | 26 ++- paddle/operators/conv_op.h | 55 +++-- paddle/operators/conv_shift_op.cu | 22 +- paddle/operators/conv_shift_op.h | 4 +- paddle/operators/conv_transpose_cudnn_op.cc | 18 +- .../operators/conv_transpose_cudnn_op.cu.cc | 32 +-- paddle/operators/conv_transpose_op.cc | 18 +- paddle/operators/conv_transpose_op.cu.cc | 28 ++- paddle/operators/conv_transpose_op.h | 58 ++--- paddle/operators/cos_sim_op.cc | 7 +- paddle/operators/cos_sim_op.cu | 9 +- paddle/operators/cos_sim_op.h | 10 +- paddle/operators/crf_decoding_op.cc | 5 +- paddle/operators/crf_decoding_op.h | 6 +- paddle/operators/crop_op.cc | 4 +- paddle/operators/crop_op.cu | 6 +- paddle/operators/crop_op.h | 19 +- paddle/operators/cross_entropy_op.cu | 23 +- paddle/operators/cross_entropy_op.h | 12 +- paddle/operators/decayed_adagrad_op.cc | 2 +- paddle/operators/decayed_adagrad_op.cu | 4 +- paddle/operators/decayed_adagrad_op.h | 4 +- paddle/operators/dropout_op.cc | 6 +- paddle/operators/dropout_op.cu | 12 +- paddle/operators/dropout_op.h | 10 +- paddle/operators/elementwise_add_op.cc | 16 +- paddle/operators/elementwise_add_op.cu | 21 +- paddle/operators/elementwise_add_op.h | 10 +- paddle/operators/elementwise_div_op.cc | 16 +- paddle/operators/elementwise_div_op.cu | 21 +- paddle/operators/elementwise_div_op.h | 8 +- paddle/operators/elementwise_mul_op.cc | 16 +- paddle/operators/elementwise_mul_op.cu | 21 +- paddle/operators/elementwise_mul_op.h | 8 +- paddle/operators/elementwise_op_function.h | 94 ++++---- paddle/operators/elementwise_sub_op.cc | 16 +- paddle/operators/elementwise_sub_op.cu | 21 +- paddle/operators/elementwise_sub_op.h | 8 +- paddle/operators/expand_op.cc | 7 +- paddle/operators/expand_op.cu | 9 +- paddle/operators/expand_op.h | 10 +- .../fill_constant_batch_size_like_op.cc | 11 +- .../fill_constant_batch_size_like_op.cu.cc | 13 +- .../fill_constant_batch_size_like_op.h | 7 +- paddle/operators/fill_zeros_like_op.cc | 11 +- paddle/operators/fill_zeros_like_op.cu.cc | 13 +- paddle/operators/fill_zeros_like_op.h | 7 +- paddle/operators/ftrl_op.cc | 4 +- paddle/operators/ftrl_op.cu | 4 +- paddle/operators/ftrl_op.h | 4 +- paddle/operators/gather.cu.h | 2 +- paddle/operators/gather_op.cu | 7 +- paddle/operators/gather_op.h | 3 +- paddle/operators/gaussian_random_op.cu | 4 +- paddle/operators/gru_op.cc | 11 +- paddle/operators/gru_op.cu.cc | 11 +- paddle/operators/gru_op.h | 48 ++-- paddle/operators/gru_unit_op.cc | 11 +- paddle/operators/gru_unit_op.cu | 13 +- paddle/operators/gru_unit_op.h | 64 +++--- paddle/operators/hinge_loss_op.cc | 7 +- paddle/operators/hinge_loss_op.cu | 9 +- paddle/operators/hinge_loss_op.h | 10 +- paddle/operators/huber_loss_op.cc | 7 +- paddle/operators/huber_loss_op.cu | 9 +- paddle/operators/huber_loss_op.h | 10 +- paddle/operators/l1_norm_op.cc | 7 +- paddle/operators/l1_norm_op.cu | 9 +- paddle/operators/l1_norm_op.h | 10 +- paddle/operators/linear_chain_crf_op.cc | 9 +- paddle/operators/linear_chain_crf_op.cu | 13 +- paddle/operators/linear_chain_crf_op.h | 24 +- paddle/operators/lod_reset_op.cu | 13 +- paddle/operators/lod_reset_op.h | 4 +- paddle/operators/log_loss_op.cc | 7 +- paddle/operators/log_loss_op.cu | 9 +- paddle/operators/log_loss_op.h | 8 +- paddle/operators/logical_op.cu | 8 +- paddle/operators/logical_op.h | 21 +- paddle/operators/lookup_table_op.cu | 20 +- paddle/operators/lrn_op.cc | 21 +- paddle/operators/lrn_op.cu | 30 +-- paddle/operators/lrn_op.h | 10 +- paddle/operators/lstm_op.cc | 11 +- paddle/operators/lstm_op.cu.cc | 11 +- paddle/operators/lstm_op.h | 94 ++++---- paddle/operators/lstm_unit_op.cu | 8 +- paddle/operators/lstm_unit_op.h | 4 +- paddle/operators/margin_rank_loss_op.cc | 4 +- paddle/operators/margin_rank_loss_op.cu | 8 +- paddle/operators/margin_rank_loss_op.h | 8 +- paddle/operators/math/context_project.cc | 4 +- paddle/operators/math/context_project.cu | 4 +- paddle/operators/math/context_project.h | 20 +- paddle/operators/math/cross_entropy.cc | 10 +- paddle/operators/math/cross_entropy.cu | 14 +- paddle/operators/math/cross_entropy.h | 6 +- paddle/operators/math/gru_compute.cc | 28 +-- paddle/operators/math/gru_compute.cu | 34 ++- paddle/operators/math/gru_compute.h | 13 +- paddle/operators/math/im2col.cc | 32 +-- paddle/operators/math/im2col.cu | 48 ++-- paddle/operators/math/im2col.h | 11 +- paddle/operators/math/im2col_test.cc | 27 +-- paddle/operators/math/lstm_compute.cc | 16 +- paddle/operators/math/lstm_compute.cu | 16 +- paddle/operators/math/lstm_compute.h | 13 +- paddle/operators/math/math_function.cc | 144 ++++++------ paddle/operators/math/math_function.cu | 207 ++++++++---------- paddle/operators/math/math_function.h | 81 ++++--- paddle/operators/math/math_function_impl.h | 39 ++-- paddle/operators/math/math_function_test.cc | 9 +- paddle/operators/math/math_function_test.cu | 10 +- paddle/operators/math/matmul.h | 19 +- paddle/operators/math/maxouting.cc | 16 +- paddle/operators/math/maxouting.cu | 33 ++- paddle/operators/math/maxouting.h | 14 +- paddle/operators/math/pooling.cc | 128 ++++++----- paddle/operators/math/pooling.cu | 182 +++++++-------- paddle/operators/math/pooling.h | 68 +++--- .../operators/math/selected_rows_functor.cc | 45 ++-- .../operators/math/selected_rows_functor.cu | 77 +++---- paddle/operators/math/selected_rows_functor.h | 16 +- .../math/selected_rows_functor_test.cc | 12 +- .../math/selected_rows_functor_test.cu | 12 +- paddle/operators/math/sequence2batch.cc | 16 +- paddle/operators/math/sequence2batch.cu | 19 +- paddle/operators/math/sequence2batch.h | 22 +- paddle/operators/math/sequence_pooling.cc | 18 +- paddle/operators/math/sequence_pooling.cu | 24 +- paddle/operators/math/sequence_pooling.h | 8 +- paddle/operators/math/softmax.cc | 8 +- paddle/operators/math/softmax.cu | 8 +- paddle/operators/math/softmax.h | 13 +- paddle/operators/math/softmax_impl.h | 32 ++- paddle/operators/math/unpooling.cc | 16 +- paddle/operators/math/unpooling.cu | 36 ++- paddle/operators/math/unpooling.h | 10 +- paddle/operators/math/vol2col.cc | 16 +- paddle/operators/math/vol2col.cu | 24 +- paddle/operators/math/vol2col.h | 10 +- paddle/operators/math/vol2col_test.cc | 24 +- paddle/operators/matmul_op.cc | 7 +- paddle/operators/matmul_op.cu.cc | 9 +- paddle/operators/matmul_op.h | 45 ++-- paddle/operators/maxout_op.cc | 7 +- paddle/operators/maxout_op.cu.cc | 13 +- paddle/operators/maxout_op.h | 18 +- paddle/operators/mean_op.cc | 11 +- paddle/operators/mean_op.cu | 11 +- paddle/operators/mean_op.h | 10 +- paddle/operators/minus_op.cc | 4 +- paddle/operators/minus_op.cu | 5 +- paddle/operators/minus_op.h | 5 +- paddle/operators/modified_huber_loss_op.cc | 2 +- paddle/operators/modified_huber_loss_op.cu | 8 +- paddle/operators/modified_huber_loss_op.h | 5 +- paddle/operators/momentum_op.cu | 4 +- paddle/operators/mul_op.cc | 7 +- paddle/operators/mul_op.cu.cc | 7 +- paddle/operators/mul_op.h | 18 +- paddle/operators/multiplex_op.cc | 5 +- paddle/operators/multiplex_op.cu | 16 +- paddle/operators/multiplex_op.h | 11 +- paddle/operators/nccl_op.cu.cc | 6 +- paddle/operators/nccl_op_test.cu.cc | 6 +- paddle/operators/nce_op.cc | 4 +- paddle/operators/nce_op.h | 8 +- paddle/operators/pad_op.cc | 7 +- paddle/operators/pad_op.cu | 7 +- paddle/operators/pad_op.h | 38 ++-- paddle/operators/pool_cudnn_op.cc | 26 ++- paddle/operators/pool_cudnn_op.cu.cc | 18 +- paddle/operators/pool_op.cc | 24 +- paddle/operators/pool_op.cu.cc | 26 ++- paddle/operators/pool_op.h | 59 ++--- paddle/operators/pool_with_index_op.cc | 22 +- paddle/operators/pool_with_index_op.cu.cc | 32 ++- paddle/operators/pool_with_index_op.h | 26 ++- paddle/operators/positive_negative_pair_op.h | 2 +- paddle/operators/precision_recall_op.h | 2 +- paddle/operators/prelu_op.cc | 9 +- paddle/operators/prelu_op.cu | 11 +- paddle/operators/prelu_op.h | 16 +- paddle/operators/proximal_adagrad_op.cc | 2 +- paddle/operators/proximal_adagrad_op.cu | 4 +- paddle/operators/proximal_adagrad_op.h | 10 +- paddle/operators/proximal_gd_op.cc | 3 +- paddle/operators/proximal_gd_op.cu | 5 +- paddle/operators/proximal_gd_op.h | 4 +- paddle/operators/rank_loss_op.cc | 7 +- paddle/operators/rank_loss_op.cu | 12 +- paddle/operators/rank_loss_op.h | 8 +- paddle/operators/reduce_op.cc | 15 +- paddle/operators/reduce_op.cu | 15 +- paddle/operators/reduce_op.h | 44 ++-- paddle/operators/reshape_op.cu | 4 +- paddle/operators/reshape_op.h | 4 +- paddle/operators/rmsprop_op.cc | 4 +- paddle/operators/rmsprop_op.cu | 4 +- paddle/operators/rmsprop_op.h | 4 +- paddle/operators/roi_pool_op.cc | 9 +- paddle/operators/roi_pool_op.cu | 15 +- paddle/operators/roi_pool_op.h | 9 +- paddle/operators/row_conv_op.cc | 13 +- paddle/operators/row_conv_op.cu | 17 +- paddle/operators/row_conv_op.h | 4 +- paddle/operators/scale_op.cc | 10 +- paddle/operators/scale_op.cu | 12 +- paddle/operators/scale_op.h | 5 +- paddle/operators/scatter_op.cu | 4 +- paddle/operators/seq_expand_op.cc | 7 +- paddle/operators/seq_expand_op.cu | 9 +- paddle/operators/seq_expand_op.h | 14 +- paddle/operators/sequence_concat_op.cc | 4 +- paddle/operators/sequence_concat_op.cu.cc | 10 +- paddle/operators/sequence_concat_op.h | 4 +- paddle/operators/sequence_conv_op.cc | 9 +- paddle/operators/sequence_conv_op.cu.cc | 13 +- paddle/operators/sequence_conv_op.h | 65 +++--- paddle/operators/sequence_pool_op.cc | 5 +- paddle/operators/sequence_pool_op.cu | 9 +- paddle/operators/sequence_pool_op.h | 26 ++- paddle/operators/sequence_slice_op.cc | 4 +- paddle/operators/sequence_slice_op.cu | 8 +- paddle/operators/sequence_slice_op.h | 9 +- paddle/operators/sequence_softmax_op.cc | 4 +- paddle/operators/sequence_softmax_op.cu.cc | 8 +- paddle/operators/sequence_softmax_op.h | 12 +- paddle/operators/sgd_op.cc | 13 +- paddle/operators/sgd_op.cu | 22 +- paddle/operators/sgd_op.h | 14 +- .../sigmoid_cross_entropy_with_logits_op.cc | 4 +- .../sigmoid_cross_entropy_with_logits_op.cu | 12 +- .../sigmoid_cross_entropy_with_logits_op.h | 9 +- paddle/operators/sign_op.cc | 4 +- paddle/operators/sign_op.cu | 5 +- paddle/operators/sign_op.h | 5 +- paddle/operators/smooth_l1_loss_op.cc | 5 +- paddle/operators/smooth_l1_loss_op.cu | 9 +- paddle/operators/smooth_l1_loss_op.h | 30 +-- paddle/operators/softmax_op.cc | 7 +- paddle/operators/softmax_op.cu.cc | 9 +- paddle/operators/softmax_op.h | 10 +- .../softmax_with_cross_entropy_op.cu | 40 ++-- .../operators/softmax_with_cross_entropy_op.h | 18 +- paddle/operators/split_op.cu.cc | 4 +- paddle/operators/split_op.h | 2 +- paddle/operators/squared_l2_distance_op.cc | 8 +- paddle/operators/squared_l2_distance_op.cu | 10 +- paddle/operators/squared_l2_distance_op.h | 10 +- paddle/operators/squared_l2_norm_op.cc | 4 +- paddle/operators/squared_l2_norm_op.cu | 8 +- paddle/operators/squared_l2_norm_op.h | 14 +- paddle/operators/sum_op.cc | 9 +- paddle/operators/sum_op.cu | 9 +- paddle/operators/sum_op.h | 23 +- paddle/operators/top_k_op.cu | 2 +- paddle/operators/top_k_op.h | 2 +- paddle/operators/transpose_op.cc | 6 +- paddle/operators/transpose_op.cu.cc | 9 +- paddle/operators/transpose_op.h | 29 +-- paddle/operators/uniform_random_op.cc | 2 +- paddle/operators/uniform_random_op.cu | 6 +- paddle/operators/unpool_op.cc | 11 +- paddle/operators/unpool_op.cu.cc | 13 +- paddle/operators/unpool_op.h | 22 +- paddle/platform/device_context.cc | 12 - paddle/platform/device_context.h | 17 -- paddle/platform/device_context_test.cc | 5 +- paddle/platform/transform.h | 24 +- paddle/platform/transform_test.cu | 8 +- 319 files changed, 2624 insertions(+), 2546 deletions(-) diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index daade439e5..b29238432b 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -181,8 +181,8 @@ class OpKernelRegistrar : public Registrar { return 0; \ } -#define REGISTER_OP_GPU_KERNEL(op_type, ...) \ - REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__) +#define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) @@ -217,7 +217,7 @@ class OpKernelRegistrar : public Registrar { #else #define USE_OP_KERNEL(op_type) \ USE_OP_DEVICE_KERNEL(op_type, CPU); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) + USE_OP_DEVICE_KERNEL(op_type, CUDA) #endif #define USE_NO_KERNEL_OP(op_type) USE_OP_ITSELF(op_type); @@ -226,9 +226,9 @@ class OpKernelRegistrar : public Registrar { USE_OP_ITSELF(op_type); \ USE_OP_DEVICE_KERNEL(op_type, CPU); -#define USE_GPU_ONLY_OP(op_type) \ - USE_OP_ITSELF(op_type); \ - USE_OP_DEVICE_KERNEL(op_type, GPU) +#define USE_CUDA_ONLY_OP(op_type) \ + USE_OP_ITSELF(op_type); \ + USE_OP_DEVICE_KERNEL(op_type, CUDA) #define USE_OP(op_type) \ USE_OP_ITSELF(op_type); \ diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index f1444eeee9..e83d754783 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -22,20 +22,6 @@ limitations under the License. */ namespace paddle { namespace framework { -template <> -Eigen::DefaultDevice& ExecutionContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return *device_context_.GetEigenDevice(); -} - -#ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice& -ExecutionContext::GetEigenDevice() const { - return *device_context_.GetEigenDevice(); -} -#endif - std::string OperatorBase::Input(const std::string& name) const { auto& ins = Inputs(name); PADDLE_ENFORCE_LE(ins.size(), 1UL, @@ -429,7 +415,7 @@ void OperatorWithKernel::Run(const Scope& scope, } OpKernelType OperatorWithKernel::GetKernelType( const ExecutionContext& ctx) const { - return OpKernelType(IndicateDataType(ctx), ctx.device_context()); + return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); } DataType OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index 60861d9293..e60dbfc313 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -276,17 +276,25 @@ class ExecutionContext { out_tensor->set_lod(in_tensor.lod()); } - template ::EigenDeviceType> - DeviceType& GetEigenDevice() const; - platform::Place GetPlace() const { return device_context_.GetPlace(); } + template + const DeviceContextType& device_context() const { + return *reinterpret_cast(&device_context_); + } + const platform::DeviceContext& device_context() const { return device_context_; } +#ifdef PADDLE_WITH_CUDA + const inline platform::CUDADeviceContext& cuda_device_context() const { + PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); + return *reinterpret_cast( + &device_context_); + } +#endif + //! Get actual name vector for this input. const std::vector& Inputs(const std::string& name) const { return op_.Inputs(name); @@ -297,14 +305,6 @@ class ExecutionContext { return op_.Outputs(name); } -#ifdef PADDLE_WITH_CUDA - const inline platform::CUDADeviceContext& cuda_device_context() const { - PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace())); - return *reinterpret_cast( - &device_context_); - } -#endif - private: const OperatorBase& op_; const Scope& scope_; diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index 59ddbc7791..b678178454 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -115,7 +115,7 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override {} OpKernelType GetKernelType(const ExecutionContext& ctx) const override { - return OpKernelType(DataType::FP32, ctx.device_context()); + return OpKernelType(DataType::FP32, ctx.GetPlace()); } }; diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 38b89b9eb1..5aaaf99332 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -138,7 +138,7 @@ function(op_library TARGET) if ("${TARGET}" STREQUAL "nccl_op") set(pybind_flag 1) # It's enough to just adding one operator to pybind - file(APPEND ${pybind_file} "USE_GPU_ONLY_OP(ncclAllReduce);\n") + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n") endif() # reduce_op contains several operators diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index 2785a8c6fb..76da21c472 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -57,7 +57,7 @@ class AccuracyOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index d2dcab4e54..539a935302 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -104,5 +104,6 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { // FIXME(typhoonzero): types of T is for inference data. // label data is always int64 -REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel, - paddle::operators::AccuracyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(accuracy, + paddle::operators::AccuracyOpCUDAKernel, + paddle::operators::AccuracyOpCUDAKernel); diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h index d060e6eddd..04104a695f 100644 --- a/paddle/operators/accuracy_op.h +++ b/paddle/operators/accuracy_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class AccuracyKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index 7f3118f176..63490f0ec9 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -611,16 +611,17 @@ REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker, REGISTER_OP(swish, ops::ActivationOp, ops::SwishOpMaker, swish_grad, ops::ActivationOpGrad); -#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_CPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CPU_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL); diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu index 97737857ab..856d3fc35d 100644 --- a/paddle/operators/activation_op.cu +++ b/paddle/operators/activation_op.cu @@ -17,16 +17,17 @@ namespace ops = paddle::operators; -#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>); \ - REGISTER_OP_GPU_KERNEL( \ - act_type##_grad, ops::ActivationGradKernel>, \ - ops::ActivationGradKernel>, \ + ops::ActivationKernel>); \ + REGISTER_OP_CUDA_KERNEL( \ + act_type##_grad, \ + ops::ActivationGradKernel>, \ + ops::ActivationGradKernel>); -FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL); +FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL); diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h index ac0e0a3b01..75eefca8b8 100644 --- a/paddle/operators/activation_op.h +++ b/paddle/operators/activation_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class ActivationKernel : public framework::OpKernel { public: @@ -32,18 +32,19 @@ class ActivationKernel auto x = framework::EigenVector::Flatten(*X); auto y = framework::EigenVector::Flatten(*Y); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y); + functor(*place, x, y); } }; -template +template class ActivationGradKernel : public framework::OpKernel { public: @@ -59,13 +60,14 @@ class ActivationGradKernel auto x = framework::EigenVector::Flatten(*X); auto y = framework::EigenVector::Flatten(*Y); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); Functor functor; auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { *attr.second = context.Attr(attr.first); } - functor(place, x, y, dy, dx); + functor(*place, x, y, dy, dx); } }; diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc index 29434a0ee2..507811e7b5 100644 --- a/paddle/operators/adadelta_op.cc +++ b/paddle/operators/adadelta_op.cc @@ -109,5 +109,5 @@ $$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); REGISTER_OP_CPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu index 9fb6185207..eee2d0a2f5 100644 --- a/paddle/operators/adadelta_op.cu +++ b/paddle/operators/adadelta_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adadelta_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adadelta, ops::AdadeltaOpKernel, - ops::AdadeltaOpKernel); +REGISTER_OP_CUDA_KERNEL( + adadelta, ops::AdadeltaOpKernel, + ops::AdadeltaOpKernel); diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h index a8c5f0c8aa..819d0845db 100644 --- a/paddle/operators/adadelta_op.h +++ b/paddle/operators/adadelta_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdadeltaOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,7 +51,7 @@ class AdadeltaOpKernel : public framework::OpKernel { framework::EigenVector::Flatten(*avg_squared_grad_out_tensor); auto avg_squared_update_out = framework::EigenVector::Flatten(*avg_squared_update_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); avg_squared_grad_out.device(place) = rho * avg_squared_grad + (1 - rho) * grad.square(); diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc index d19602244b..5d00716316 100644 --- a/paddle/operators/adagrad_op.cc +++ b/paddle/operators/adagrad_op.cc @@ -100,8 +100,8 @@ size_t FindPos(const std::vector& rows, int64_t value) { } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { @@ -120,7 +120,7 @@ struct SparseAdagradFunctor { {static_cast(merge_rows.size()), grad_width}), context.GetPlace()); - math::SetConstant constant_functor; + math::SetConstant constant_functor; constant_functor(context, grad_merge->mutable_value(), 0.0); auto* grad_merge_data = grad_merge->mutable_value()->data(); @@ -144,9 +144,9 @@ struct SparseAdagradFunctor { auto gs = framework::EigenVector::Flatten(*(grad_square->mutable_value())); auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; + gs.device(*context.eigen_device()) = gm * gm; - math::SelectedRowsAddToTensor functor; + math::SelectedRowsAddToTensor functor; functor(context, *grad_square, moment); // 3. update parameter @@ -164,13 +164,13 @@ struct SparseAdagradFunctor { } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker); REGISTER_OP_CPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu index 1c870214b2..585b2d9289 100644 --- a/paddle/operators/adagrad_op.cu +++ b/paddle/operators/adagrad_op.cu @@ -72,8 +72,8 @@ __global__ void SparseAdagradFunctorKernel(const T* grad, const int64_t* rows, } // namespace template -struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseAdagradFunctor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param) { @@ -92,7 +92,7 @@ struct SparseAdagradFunctor { {static_cast(merge_rows.size()), grad_width}), context.GetPlace()); - math::SetConstant constant_functor; + math::SetConstant constant_functor; constant_functor(context, grad_merge->mutable_value(), 0.0); auto* grad_merge_data = grad_merge->mutable_value()->data(); @@ -119,9 +119,9 @@ struct SparseAdagradFunctor { auto gs = framework::EigenVector::Flatten(*(grad_square->mutable_value())); auto gm = framework::EigenVector::Flatten(grad_merge->value()); - gs.device(*context.GetEigenDevice()) = gm * gm; + gs.device(*context.eigen_device()) = gm * gm; - math::SelectedRowsAddToTensor functor; + math::SelectedRowsAddToTensor functor; functor(context, *grad_square, moment); // 3. update parameter @@ -139,13 +139,13 @@ struct SparseAdagradFunctor { } }; -template struct SparseAdagradFunctor; -template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; +template struct SparseAdagradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - adagrad, ops::AdagradOpKernel, - ops::AdagradOpKernel); +REGISTER_OP_CUDA_KERNEL( + adagrad, ops::AdagradOpKernel, + ops::AdagradOpKernel); diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h index 4d4a6434c7..0d77dbcbac 100644 --- a/paddle/operators/adagrad_op.h +++ b/paddle/operators/adagrad_op.h @@ -19,15 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template struct SparseAdagradFunctor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& grad, const framework::Tensor& learning_rate, T epsilon, framework::Tensor* moment, framework::Tensor* param); }; -template +template class AdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,11 +52,11 @@ class AdagradOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = moment + grad * grad; + moment_out.device(*place) = moment + grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon); } else if (grad_var->IsType()) { auto* param_tensor = ctx.Input("Param"); @@ -65,8 +65,9 @@ class AdagradOpKernel : public framework::OpKernel { auto* moment_tensor = ctx.Input("Moment"); PADDLE_ENFORCE_EQ(moment_tensor, moment_out_tensor); - SparseAdagradFunctor functor; - functor(ctx.device_context(), *ctx.Input("Grad"), + SparseAdagradFunctor functor; + functor(ctx.template device_context(), + *ctx.Input("Grad"), *ctx.Input("LearningRate"), epsilon, moment_out_tensor, param_out_tensor); } else { diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc index a268d05484..cf6ef6dd53 100644 --- a/paddle/operators/adam_op.cc +++ b/paddle/operators/adam_op.cc @@ -128,6 +128,6 @@ $$ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker); -REGISTER_OP_CPU_KERNEL(adam, - ops::AdamOpKernel, - ops::AdamOpKernel); +REGISTER_OP_CPU_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu index 6e34f7818c..c135b37378 100644 --- a/paddle/operators/adam_op.cu +++ b/paddle/operators/adam_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adam_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adam, - ops::AdamOpKernel, - ops::AdamOpKernel); +REGISTER_OP_CUDA_KERNEL( + adam, ops::AdamOpKernel, + ops::AdamOpKernel); diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 7f7fa1da1c..45157842a6 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,17 +52,17 @@ class AdamOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment1_out = framework::EigenVector::Flatten(*moment1_out_tensor); auto moment2_out = framework::EigenVector::Flatten(*moment2_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad; - moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square(); + moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad; + moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square(); // All of these are tensors of 1 element auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow); // Eigen does not support automatic broadcast // Get dimensions of moment vector to broadcast lr_t Eigen::DSizes m_dsize(moment1_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr_t.broadcast(m_dsize) * (moment1_out / (moment2_out.sqrt() + epsilon)); diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc index 9e7576c961..49ce497bb7 100644 --- a/paddle/operators/adamax_op.cc +++ b/paddle/operators/adamax_op.cc @@ -127,6 +127,6 @@ division by 0 error. namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker); -REGISTER_OP_CPU_KERNEL(adamax, - ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +REGISTER_OP_CPU_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.cu b/paddle/operators/adamax_op.cu index 057ef39025..2d143905c4 100644 --- a/paddle/operators/adamax_op.cu +++ b/paddle/operators/adamax_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/adamax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(adamax, - ops::AdamaxOpKernel, - ops::AdamaxOpKernel); +REGISTER_OP_CUDA_KERNEL( + adamax, ops::AdamaxOpKernel, + ops::AdamaxOpKernel); diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h index bf36ed7860..172c179c5f 100644 --- a/paddle/operators/adamax_op.h +++ b/paddle/operators/adamax_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class AdamaxOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,14 +51,14 @@ class AdamaxOpKernel : public framework::OpKernel { auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); auto inf_norm_out = framework::EigenVector::Flatten(*inf_norm_out_tensor); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); - moment_out.device(place) = beta1 * moment + (1 - beta1) * grad; - inf_norm_out.device(place) = + moment_out.device(*place) = beta1 * moment + (1 - beta1) * grad; + inf_norm_out.device(*place) = grad.abs().cwiseMax((beta2 * inf_norm) + epsilon); auto lr_t = lr / (1 - beta1_pow); Eigen::DSizes m_dsize(moment_out_tensor->numel()); - param_out.device(place) = + param_out.device(*place) = param - lr_t.broadcast(m_dsize) * (moment_out / inf_norm_out); } }; diff --git a/paddle/operators/auc_op.h b/paddle/operators/auc_op.h index e5ac57b038..b80509e2a9 100644 --- a/paddle/operators/auc_op.h +++ b/paddle/operators/auc_op.h @@ -25,7 +25,7 @@ template using EigenVector = framework::EigenVector; -template +template class AucKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index ac97bd83ab..94a972b7ab 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -135,7 +135,8 @@ The required data format for this layer is one of the following: }; template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { const float epsilon = ctx.Attr("epsilon"); @@ -318,12 +319,12 @@ class BatchNormGradOp : public framework::OperatorWithKernel { PADDLE_THROW("can't find Y@GRAD"); } return framework::OpKernelType(framework::ToDataType(t->type()), - ctx.device_context()); + ctx.GetPlace()); } }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -436,8 +437,9 @@ class BatchNormGradKernel namespace ops = paddle::operators; REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker, batch_norm_grad, ops::BatchNormGradOp); -REGISTER_OP_CPU_KERNEL(batch_norm, - ops::BatchNormKernel); +REGISTER_OP_CPU_KERNEL( + batch_norm, + ops::BatchNormKernel); REGISTER_OP_CPU_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index 7b2f318700..c7adc3d80e 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -47,7 +47,8 @@ void ExtractNCWHD(const framework::DDim &dims, } template -class BatchNormKernel : public framework::OpKernel { +class BatchNormKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), @@ -121,11 +122,12 @@ class BatchNormKernel : public framework::OpKernel { saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); - math::SetConstant functor; - functor(ctx.device_context(), saved_mean, 0); - functor(ctx.device_context(), saved_variance, 0); + auto &dev_ctx = ctx.template device_context(); + math::SetConstant functor; + functor(dev_ctx, saved_mean, 0); + functor(dev_ctx, saved_variance, 0); - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto handle = dev_ctx.cudnn_handle(); // Now, depending on whether we are running test or not, we have two paths. if (is_test) { @@ -171,7 +173,7 @@ class BatchNormKernel : public framework::OpKernel { }; template -class BatchNormGradKernel +class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -244,11 +246,12 @@ class BatchNormGradKernel const void *saved_mean_data = saved_mean->template data(); const void *saved_var_data = saved_var->template data(); + auto &dev_ctx = ctx.template device_context(); CUDNN_ENFORCE(platform::dynload::cudnnBatchNormalizationBackward( - ctx.cuda_device_context().cudnn_handle(), mode_, - CudnnDataType::kOne(), CudnnDataType::kZero(), - CudnnDataType::kOne(), CudnnDataType::kZero(), data_desc_, - x->template data(), data_desc_, d_y->template data(), data_desc_, + dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), + CudnnDataType::kZero(), CudnnDataType::kOne(), + CudnnDataType::kZero(), data_desc_, x->template data(), + data_desc_, d_y->template data(), data_desc_, d_x->template mutable_data(ctx.GetPlace()), bn_param_desc_, scale->template data(), d_scale->template mutable_data(ctx.GetPlace()), @@ -266,8 +269,9 @@ class BatchNormGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(batch_norm, - ops::BatchNormKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + batch_norm, + ops::BatchNormKernel); +REGISTER_OP_CUDA_KERNEL( batch_norm_grad, - ops::BatchNormGradKernel); + ops::BatchNormGradKernel); diff --git a/paddle/operators/batch_norm_op.h b/paddle/operators/batch_norm_op.h index 4e80134a1a..8d99b68647 100644 --- a/paddle/operators/batch_norm_op.h +++ b/paddle/operators/batch_norm_op.h @@ -34,13 +34,13 @@ inline TensorFormat StringToTensorFormat(const std::string& str) { } } -template +template class BatchNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override; }; -template +template class BatchNormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override; diff --git a/paddle/operators/bilinear_tensor_product_op.cc b/paddle/operators/bilinear_tensor_product_op.cc index c88b2c9beb..217fd52366 100644 --- a/paddle/operators/bilinear_tensor_product_op.cc +++ b/paddle/operators/bilinear_tensor_product_op.cc @@ -159,9 +159,12 @@ REGISTER_OP(bilinear_tensor_product, ops::BilinearTensorProductOp, ops::BilinearTensorProductOpGrad); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); REGISTER_OP_CPU_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.cu b/paddle/operators/bilinear_tensor_product_op.cu index 858d2668d0..0f48010716 100644 --- a/paddle/operators/bilinear_tensor_product_op.cu +++ b/paddle/operators/bilinear_tensor_product_op.cu @@ -16,11 +16,15 @@ limitations under the License. */ #include "paddle/operators/bilinear_tensor_product_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product, - ops::BilinearTensorProductKernel, - ops::BilinearTensorProductKernel); -REGISTER_OP_GPU_KERNEL( + ops::BilinearTensorProductKernel, + ops::BilinearTensorProductKernel); +REGISTER_OP_CUDA_KERNEL( bilinear_tensor_product_grad, - ops::BilinearTensorProductGradKernel, - ops::BilinearTensorProductGradKernel); + ops::BilinearTensorProductGradKernel, + ops::BilinearTensorProductGradKernel); diff --git a/paddle/operators/bilinear_tensor_product_op.h b/paddle/operators/bilinear_tensor_product_op.h index 1113a4c6f3..ba9a2c5ce3 100644 --- a/paddle/operators/bilinear_tensor_product_op.h +++ b/paddle/operators/bilinear_tensor_product_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class BilinearTensorProductKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -46,7 +46,8 @@ class BilinearTensorProductKernel : public framework::OpKernel { int out_dim = weight_dims[0]; auto x_dim = weight_dims[1]; auto y_dim = weight_dims[2]; - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the result of // Input(X) multiplied by Input(Weight_i), the formula is: @@ -60,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel { auto output_col_vec = output_mat.chip(i, 1); Tensor weight_mat = weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim})); - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x->data(), - weight_mat.data(), 0, left_mul.data()); + math::gemm(dev_ctx, CblasNoTrans, CblasNoTrans, + batch_size, y_dim, x_dim, 1, x->data(), + weight_mat.data(), 0, left_mul.data()); output_col_vec.device(place) = (left_mul_mat * y_mat).sum(Eigen::DSizes(1)); } @@ -74,7 +75,7 @@ class BilinearTensorProductKernel : public framework::OpKernel { } }; -template +template class BilinearTensorProductGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -96,8 +97,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { auto x_mat = EigenMatrix::From(*x); auto y_mat = EigenMatrix::From(*y); auto d_out_mat = EigenMatrix::From(*d_out); - auto place = ctx.GetEigenDevice(); - + auto& place = *ctx.template device_context().eigen_device(); + auto& dev_ctx = ctx.template device_context(); // Create the intermediate variable to caculate the Output(Y@Grad). Tensor x_scale; x_scale.mutable_data(framework::make_ddim({batch_size, x_dim}), @@ -110,18 +111,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { ctx.GetPlace()); auto y_scale_mat = EigenMatrix::From(y_scale); - math::SetConstant set_zero; + math::SetConstant set_zero; // Set Output(X@Grad) be zero. if (d_x) { d_x->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_x, static_cast(0)); + set_zero(dev_ctx, d_x, static_cast(0)); } // Set Output(Y@Grad) be zero. if (d_y) { d_y->mutable_data(ctx.GetPlace()); - set_zero(ctx.device_context(), d_y, static_cast(0)); + set_zero(dev_ctx, d_y, static_cast(0)); } // Caculate the Output(X@Grad) and Output(Y@Grad). @@ -137,18 +138,18 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_x) * y_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasTrans, - batch_size, x_dim, y_dim, 1, y_scale.data(), - weight_i.data(), 1, d_x->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1, + y_scale.data(), weight_i.data(), 1, d_x->data()); } if (d_y) { x_scale_mat.device(place) = output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_y) * x_mat; - math::gemm(ctx.device_context(), CblasNoTrans, CblasNoTrans, - batch_size, y_dim, x_dim, 1, x_scale.data(), - weight_i.data(), 1, d_y->data()); + math::gemm( + dev_ctx, CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, + x_scale.data(), weight_i.data(), 1, d_y->data()); } } } @@ -165,9 +166,9 @@ class BilinearTensorProductGradKernel : public framework::OpKernel { output_vec.reshape(Eigen::DSizes(batch_size, 1)) .broadcast(bcast_for_weight) * x_mat; - math::gemm(ctx.device_context(), CblasTrans, CblasNoTrans, - x_dim, y_dim, batch_size, 1, x_scale.data(), - y->data(), 0, d_weight_i.data()); + math::gemm(dev_ctx, CblasTrans, CblasNoTrans, x_dim, + y_dim, batch_size, 1, x_scale.data(), + y->data(), 0, d_weight_i.data()); } } diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc index 3082a53ccf..42bff69a1e 100644 --- a/paddle/operators/cast_op.cc +++ b/paddle/operators/cast_op.cc @@ -68,7 +68,7 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { } // namespace paddle namespace ops = paddle::operators; -using CPU = paddle::platform::CPUPlace; +using CPU = paddle::platform::CPUDeviceContext; REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, ops::CastOpProtoMaker); REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu index fb75ddbabf..4681deaa62 100644 --- a/paddle/operators/cast_op.cu +++ b/paddle/operators/cast_op.cu @@ -16,7 +16,7 @@ template using CastOpKernel = - paddle::operators::CastOpKernel; + paddle::operators::CastOpKernel; -REGISTER_OP_GPU_KERNEL(cast, CastOpKernel, CastOpKernel, - CastOpKernel, CastOpKernel); +REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, + CastOpKernel, CastOpKernel); diff --git a/paddle/operators/cast_op.h b/paddle/operators/cast_op.h index 850dc8e349..a6773f13a8 100644 --- a/paddle/operators/cast_op.h +++ b/paddle/operators/cast_op.h @@ -27,13 +27,13 @@ struct CastOpTransformFunctor { HOSTDEVICE OutT operator()(InT in) const { return static_cast(in); } }; -template +template struct CastOpFunctor { const framework::Tensor* in_; framework::Tensor* out_; - const platform::DeviceContext& ctx_; + const DeviceContext& ctx_; CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, - const platform::DeviceContext& ctx) + const DeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} template @@ -42,13 +42,13 @@ struct CastOpFunctor { auto numel = in_->numel(); auto* in_end = in_begin + numel; auto* out_begin = out_->mutable_data(ctx_.GetPlace()); - platform::Transform trans; + platform::Transform trans; trans(ctx_, in_begin, in_end, out_begin, CastOpTransformFunctor()); } }; -template +template class CastOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,7 +56,8 @@ class CastOpKernel : public framework::OpKernel { auto* out = context.Output("Out"); framework::VisitDataType( static_cast(context.Attr("out_dtype")), - CastOpFunctor(in, out, context.device_context())); + CastOpFunctor( + in, out, context.template device_context())); } }; diff --git a/paddle/operators/chunk_eval_op.h b/paddle/operators/chunk_eval_op.h index dd88f2553b..9cd758a825 100644 --- a/paddle/operators/chunk_eval_op.h +++ b/paddle/operators/chunk_eval_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class ChunkEvalKernel : public framework::OpKernel { public: struct Segment { diff --git a/paddle/operators/clip_by_norm_op.cc b/paddle/operators/clip_by_norm_op.cc index f73d55bbe3..0b7975a63f 100644 --- a/paddle/operators/clip_by_norm_op.cc +++ b/paddle/operators/clip_by_norm_op.cc @@ -71,4 +71,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(clip_by_norm, ops::ClipByNormOp, ops::ClipByNormOpMaker); REGISTER_OP_CPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.cu b/paddle/operators/clip_by_norm_op.cu index 2593a24ebb..acd7543823 100644 --- a/paddle/operators/clip_by_norm_op.cu +++ b/paddle/operators/clip_by_norm_op.cu @@ -15,5 +15,6 @@ #include "paddle/operators/clip_by_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - clip_by_norm, ops::ClipByNormKernel); +REGISTER_OP_CUDA_KERNEL( + clip_by_norm, + ops::ClipByNormKernel); diff --git a/paddle/operators/clip_by_norm_op.h b/paddle/operators/clip_by_norm_op.h index b26476cae9..d8db1566b0 100644 --- a/paddle/operators/clip_by_norm_op.h +++ b/paddle/operators/clip_by_norm_op.h @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class ClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,7 +38,8 @@ class ClipByNormKernel : public framework::OpKernel { auto x = EigenVector::Flatten(*input); auto out = EigenVector::Flatten(*output); auto x_norm = x.square().sum().sqrt(); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto temp = (x_norm <= max_norm).template cast().eval(); auto scaling = temp + (static_cast(1) - temp) * max_norm / x_norm; diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc index 4ddf24dea3..6092212de4 100644 --- a/paddle/operators/clip_op.cc +++ b/paddle/operators/clip_op.cc @@ -83,7 +83,7 @@ class ClipOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(clip, ops::ClipOp, ops::ClipOpMaker, clip_grad, ops::ClipOpGrad); -REGISTER_OP_CPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_CPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CPU_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CPU_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.cu b/paddle/operators/clip_op.cu index ca9701298f..bb7dcc671a 100644 --- a/paddle/operators/clip_op.cu +++ b/paddle/operators/clip_op.cu @@ -15,7 +15,7 @@ #include "paddle/operators/clip_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(clip, - ops::ClipKernel); -REGISTER_OP_GPU_KERNEL(clip_grad, - ops::ClipGradKernel); +REGISTER_OP_CUDA_KERNEL( + clip, ops::ClipKernel); +REGISTER_OP_CUDA_KERNEL( + clip_grad, ops::ClipGradKernel); diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h index ac702e9935..0c40797410 100644 --- a/paddle/operators/clip_op.h +++ b/paddle/operators/clip_op.h @@ -55,7 +55,7 @@ class ClipGradFunctor { T max_; }; -template +template class ClipKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -66,13 +66,13 @@ class ClipKernel : public framework::OpKernel { T* out_data = out->mutable_data(context.GetPlace()); const T* x_data = x->data(); int64_t numel = x->numel(); - Transform trans; - trans(context.device_context(), x_data, x_data + numel, out_data, - ClipFunctor(min, max)); + Transform trans; + trans(context.template device_context(), x_data, + x_data + numel, out_data, ClipFunctor(min, max)); } }; -template +template class ClipGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -86,9 +86,9 @@ class ClipGradKernel : public framework::OpKernel { auto* d_x_data = d_x->mutable_data(context.GetPlace()); const T* d_out_data = d_out->data(); const T* x_data = x->data(); - Transform trans; - trans(context.device_context(), d_out_data, d_out_data + numel, x_data, - d_x_data, ClipGradFunctor(min, max)); + Transform trans; + trans(context.template device_context(), d_out_data, + d_out_data + numel, x_data, d_x_data, ClipGradFunctor(min, max)); } } }; diff --git a/paddle/operators/compare_op.cu b/paddle/operators/compare_op.cu index 6ac8c124b9..596a878bcf 100644 --- a/paddle/operators/compare_op.cu +++ b/paddle/operators/compare_op.cu @@ -14,10 +14,10 @@ #include "paddle/operators/compare_op.h" -REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor); -REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor); -REGISTER_LOGICAL_KERNEL(greater_than, GPU, +REGISTER_LOGICAL_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); +REGISTER_LOGICAL_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); +REGISTER_LOGICAL_KERNEL(greater_than, CUDA, paddle::operators::GreaterThanFunctor); -REGISTER_LOGICAL_KERNEL(greater_equal, GPU, +REGISTER_LOGICAL_KERNEL(greater_equal, CUDA, paddle::operators::GreaterEqualFunctor); -REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor); +REGISTER_LOGICAL_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h index afdf3ab3e0..a56536e155 100644 --- a/paddle/operators/compare_op.h +++ b/paddle/operators/compare_op.h @@ -59,7 +59,7 @@ struct EqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -69,24 +69,23 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; } // namespace operators } // namespace paddle -#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>, \ - ::paddle::operators::CompareOpKernel<::paddle::platform::dev##Place, \ - functor>); +#define REGISTER_LOGICAL_KERNEL(op_type, dev, functor) \ + REGISTER_OP_##dev##_KERNEL( \ + op_type, ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/concat_op.cu.cc b/paddle/operators/concat_op.cu.cc index ede832ddcd..7b46452d3d 100644 --- a/paddle/operators/concat_op.cu.cc +++ b/paddle/operators/concat_op.cu.cc @@ -14,7 +14,8 @@ limitations under the License. */ #include "paddle/operators/concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(concat, - ops::ConcatKernel); -REGISTER_OP_GPU_KERNEL( - concat_grad, ops::ConcatGradKernel); +REGISTER_OP_CUDA_KERNEL( + concat, ops::ConcatKernel); +REGISTER_OP_CUDA_KERNEL( + concat_grad, + ops::ConcatGradKernel); diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h index c113f19fb5..de4011585a 100644 --- a/paddle/operators/concat_op.h +++ b/paddle/operators/concat_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class ConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class ConcatKernel : public framework::OpKernel { } }; -template +template class ConcatGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc index 0dd8c13b2a..008bf01885 100644 --- a/paddle/operators/conv_cudnn_op.cc +++ b/paddle/operators/conv_cudnn_op.cc @@ -57,18 +57,20 @@ REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker, REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker, conv3d_cudnn_grad, ops::ConvOpGrad); -REGISTER_OP_CPU_KERNEL(conv2d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_cudnn, + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( conv2d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL(conv3d_cudnn, - ops::GemmConvKernel, - ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_cudnn, + ops::GemmConvKernel, + ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( conv3d_cudnn_grad, - ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index bc265dcc4f..3da0a9001a 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -118,7 +118,8 @@ class CudnnConvOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionFwdAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, @@ -238,7 +239,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel { workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( @@ -313,16 +315,16 @@ class CudnnConvGradOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(conv2d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); - -REGISTER_OP_GPU_KERNEL(conv3d_cudnn, - paddle::operators::CudnnConvOpKernel, - paddle::operators::CudnnConvOpKernel); -REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad, - paddle::operators::CudnnConvGradOpKernel, - paddle::operators::CudnnConvGradOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_cudnn, + paddle::operators::CudnnConvOpKernel, + paddle::operators::CudnnConvOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel, + paddle::operators::CudnnConvGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(conv3d_cudnn, + paddle::operators::CudnnConvOpKernel, + paddle::operators::CudnnConvOpKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel, + paddle::operators::CudnnConvGradOpKernel); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 462e6d9cbc..7ef805fd44 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -235,16 +235,18 @@ namespace ops = paddle::operators; REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad, ops::ConvOpGrad); -REGISTER_OP_CPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_CPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); REGISTER_OP_CPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CPU_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc index 546451234a..38615a8bef 100644 --- a/paddle/operators/conv_op.cu.cc +++ b/paddle/operators/conv_op.cu.cc @@ -16,16 +16,18 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv2d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv2d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); -REGISTER_OP_GPU_KERNEL(conv3d, - ops::GemmConvKernel, - ops::GemmConvKernel); -REGISTER_OP_GPU_KERNEL( - conv3d_grad, ops::GemmConvGradKernel, - ops::GemmConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d, ops::GemmConvKernel, + ops::GemmConvKernel); +REGISTER_OP_CUDA_KERNEL( + conv3d_grad, + ops::GemmConvGradKernel, + ops::GemmConvGradKernel); diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h index 09bff0a68d..749258183b 100644 --- a/paddle/operators/conv_op.h +++ b/paddle/operators/conv_op.h @@ -72,7 +72,7 @@ class ConvOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override; }; -template +template class GemmConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -141,9 +141,10 @@ class GemmConvKernel : public framework::OpKernel { int in_step = static_cast(input->dims()[1]) / groups; int out_step = static_cast(output->dims()[1]) / groups; - math::Vol2ColFunctor vol2col; - math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + auto& dev_ctx = context.template device_context(); for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape); Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape); @@ -157,27 +158,26 @@ class GemmConvKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } else if (data_dim == 2U) { // im2col - im2col(context.device_context(), in_slice, dilations, strides, + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); } else if (data_dim == 3U) { // vol2col - vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step); Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), filter_slice, false, - col_matrix, false, T(1.0), &out_slice, T(0.0)); + math::matmul(dev_ctx, filter_slice, false, col_matrix, + false, T(1.0), &out_slice, T(0.0)); } } } }; -template +template class GemmConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -256,14 +256,15 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); if (input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); - math::Col2VolFunctor col2vol; - math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = @@ -282,18 +283,17 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(in_grad_slice); col_matrix.Resize(col_matrix_shape); } - math::matmul(context.device_context(), filter_slice, true, - out_grad_slice, false, T(1.0), &col_matrix, - T(0.0)); + math::matmul(dev_ctx, filter_slice, true, + out_grad_slice, false, T(1.0), + &col_matrix, T(0.0)); if (is_expand && data_dim == 2U) { - col2im(context.device_context(), col, dilations, strides, + col2im(dev_ctx, col, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &in_grad_slice); } else if (is_expand && data_dim == 3U) { - col2vol(context.device_context(), col, dilations, strides, paddings, - &in_grad_slice); + col2vol(dev_ctx, col, dilations, strides, paddings, &in_grad_slice); } } } @@ -303,9 +303,9 @@ class GemmConvGradKernel : public framework::OpKernel { filter_grad->mutable_data(context.GetPlace()); Tensor filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); - set_zero(context.device_context(), filter_grad, static_cast(0)); - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; + set_zero(dev_ctx, filter_grad, static_cast(0)); + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = output_grad->Slice(i, i + 1).Resize(output_matrix_shape); @@ -321,21 +321,20 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.ShareDataWith(col); col_matrix.Resize(col_matrix_shape); } else if (data_dim == 2U) { - im2col(context.device_context(), in_slice, dilations, strides, + im2col(dev_ctx, in_slice, dilations, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, &col); } else if (data_dim == 3U) { - vol2col(context.device_context(), in_slice, dilations, strides, - paddings, &col); + vol2col(dev_ctx, in_slice, dilations, strides, paddings, &col); } // gemm Tensor filter_grad_slice = filter_grad_.Slice(g * out_step, (g + 1) * out_step); - math::matmul(context.device_context(), out_grad_slice, - false, col_matrix, true, T(1.0), - &filter_grad_slice, T(1.0)); + math::matmul(dev_ctx, out_grad_slice, false, + col_matrix, true, T(1.0), + &filter_grad_slice, T(1.0)); } } } diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu index 95e13c38a8..f7ca82ce26 100644 --- a/paddle/operators/conv_shift_op.cu +++ b/paddle/operators/conv_shift_op.cu @@ -111,7 +111,8 @@ __global__ void ConvShiftDy(const T *x, const T *dout, int x_width, int y_width, } // namespace template -class ConvShiftKernel : public framework::OpKernel { +class ConvShiftKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const Tensor *X = context.Input("X"); @@ -132,7 +133,8 @@ class ConvShiftKernel : public framework::OpKernel { dim3 grid_dim(num_x_blocks, batch_size); - auto stream = context.cuda_device_context().stream(); + auto stream = + context.template device_context().stream(); ConvShiftForward<<>>( x_data, y_data, x_width, y_width, y_half_width, batch_size, out_data); @@ -140,7 +142,7 @@ class ConvShiftKernel : public framework::OpKernel { }; template -class ConvShiftGradKernel +class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -159,8 +161,9 @@ class ConvShiftGradKernel int y_width = Y->dims()[1]; int y_half_width = (y_width - 1) / 2; - auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + auto &device_ctx = + context.template device_context(); + math::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); @@ -186,8 +189,9 @@ class ConvShiftGradKernel } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv_shift, - ops::ConvShiftKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + conv_shift, + ops::ConvShiftKernel); +REGISTER_OP_CUDA_KERNEL( conv_shift_grad, - ops::ConvShiftGradKernel); + ops::ConvShiftGradKernel); diff --git a/paddle/operators/conv_shift_op.h b/paddle/operators/conv_shift_op.h index 5a160b0f16..1a70b38a0d 100644 --- a/paddle/operators/conv_shift_op.h +++ b/paddle/operators/conv_shift_op.h @@ -18,13 +18,13 @@ namespace paddle { namespace operators { -template +template class ConvShiftKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; }; -template +template class ConvShiftGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; diff --git a/paddle/operators/conv_transpose_cudnn_op.cc b/paddle/operators/conv_transpose_cudnn_op.cc index 0192178ce3..4cb6a2ccff 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cc @@ -61,12 +61,13 @@ REGISTER_OP(conv2d_transpose_cudnn, ops::ConvTransposeOp, REGISTER_OP_CPU_KERNEL( conv2d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, ops::CudnnConv3DTransposeOpMaker, conv3d_transpose_cudnn_grad, @@ -74,9 +75,10 @@ REGISTER_OP(conv3d_transpose_cudnn, ops::ConvTransposeOp, REGISTER_OP_CPU_KERNEL( conv3d_transpose_cudnn, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv3d_transpose_cudnn_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc index 494904fe52..f0297f6c40 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc @@ -83,7 +83,8 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { } // ------------------- cudnn conv algorithm --------------------- cudnnConvolutionBwdDataAlgo_t algo; - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); // Get the algorithm PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, @@ -165,7 +166,8 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { workspace_size_limit = user_workspace_size * 1024 * 1024; } - auto handle = ctx.cuda_device_context().cudnn_handle(); + auto& dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); if (input_grad) { // choose backward algorithm for data PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( @@ -234,16 +236,16 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv2d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); - -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn, - ops::CudnnConvTransposeOpKernel, - ops::CudnnConvTransposeOpKernel); -REGISTER_OP_GPU_KERNEL(conv3d_transpose_cudnn_grad, - ops::CudnnConvTransposeGradOpKernel, - ops::CudnnConvTransposeGradOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn, + ops::CudnnConvTransposeOpKernel, + ops::CudnnConvTransposeOpKernel); +REGISTER_OP_CUDA_KERNEL(conv2d_transpose_cudnn_grad, + ops::CudnnConvTransposeGradOpKernel, + ops::CudnnConvTransposeGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn, + ops::CudnnConvTransposeOpKernel, + ops::CudnnConvTransposeOpKernel); +REGISTER_OP_CUDA_KERNEL(conv3d_transpose_cudnn_grad, + ops::CudnnConvTransposeGradOpKernel, + ops::CudnnConvTransposeGradOpKernel); diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 678b192dea..ca063e94bb 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -197,21 +197,23 @@ REGISTER_OP(conv2d_transpose, ops::ConvTransposeOp, ops::Conv2DTransposeOpMaker, REGISTER_OP_CPU_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); REGISTER_OP(conv3d_transpose, ops::ConvTransposeOp, ops::Conv3DTransposeOpMaker, conv3d_transpose_grad, ops::ConvTransposeOpGrad); REGISTER_OP_CPU_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); REGISTER_OP_CPU_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.cu.cc b/paddle/operators/conv_transpose_op.cu.cc index 4165eb0c7b..b91ebd7922 100644 --- a/paddle/operators/conv_transpose_op.cu.cc +++ b/paddle/operators/conv_transpose_op.cu.cc @@ -16,20 +16,24 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( conv2d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv2d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( conv3d_transpose, - ops::GemmConvTransposeKernel, - ops::GemmConvTransposeKernel); -REGISTER_OP_GPU_KERNEL( + ops::GemmConvTransposeKernel, + ops::GemmConvTransposeKernel); +REGISTER_OP_CUDA_KERNEL( conv3d_transpose_grad, - ops::GemmConvTransposeGradKernel, - ops::GemmConvTransposeGradKernel); + ops::GemmConvTransposeGradKernel, + ops::GemmConvTransposeGradKernel); diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 1cacb770e6..80600b5361 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -52,7 +52,7 @@ class ConvTransposeOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override; }; -template +template class GemmConvTransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -109,11 +109,12 @@ class GemmConvTransposeKernel : public framework::OpKernel { filter.Resize(filter_matrix_shape); output->mutable_data(context.GetPlace()); - math::SetConstant set_zero; - set_zero(context.device_context(), output, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, output, static_cast(0)); - math::Col2ImFunctor col2im; - math::Col2VolFunctor col2vol; + math::Col2ImFunctor col2im; + math::Col2VolFunctor col2vol; std::vector dilations({1, 1, 1}); // convolution transpose: gemm + col2im or col2vol (similar to conv-backward @@ -127,29 +128,27 @@ class GemmConvTransposeKernel : public framework::OpKernel { // col_matrix = filter * input_batch // of shape (c * k_h * k_w, h * w) or (c * k_d * k_h * k_w, d * h * w) - math::matmul(context.device_context(), filter, true, - input_batch, false, static_cast(1.0), - &col_matrix, static_cast(0.0)); + math::matmul(dev_ctx, filter, true, input_batch, false, + static_cast(1.0), &col_matrix, + static_cast(0.0)); if (data_dim == 2U) { // col2im: col_matrix -> dy // from (c * k_h * k_w, h * w) to (c, o_h, o_w) - col2im(context.device_context(), col, - std::vector{dilations[0], dilations[1]}, strides, - std::vector{paddings[0], paddings[1], paddings[0], - paddings[1]}, + col2im(dev_ctx, col, std::vector{dilations[0], dilations[1]}, + strides, std::vector{paddings[0], paddings[1], paddings[0], + paddings[1]}, &output_batch); } else if (data_dim == 3U) { // col2vol: col_matrix -> dy // from (c * k_d * k_h * k_w, d * h * w) to (c, o_d, o_h, o_w) - col2vol(context.device_context(), col, dilations, strides, paddings, - &output_batch); + col2vol(dev_ctx, col, dilations, strides, paddings, &output_batch); } } } }; -template +template class GemmConvTransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -206,6 +205,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // convolution transpose grad on input: // im2col + gemm (similar to conv-forward) // input need to compute gradient + auto& dev_ctx = context.template device_context(); if (input_grad || filter_grad) { Tensor col; col.mutable_data(col_shape, context.GetPlace()); @@ -217,19 +217,19 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); Tensor filter_grad_; - math::SetConstant set_zero; + math::SetConstant set_zero; - math::Im2ColFunctor im2col; - math::Vol2ColFunctor vol2col; + math::Im2ColFunctor im2col; + math::Vol2ColFunctor vol2col; std::vector dilations({1, 1, 1}); if (input_grad) { input_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), input_grad, static_cast(0)); + set_zero(dev_ctx, input_grad, static_cast(0)); } if (filter_grad) { // filter size (m, c, k_h, k_w) filter_grad->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_grad, static_cast(0)); + set_zero(dev_ctx, filter_grad, static_cast(0)); filter_grad_ = *filter_grad; filter_grad_.Resize(filter_matrix_shape); } @@ -242,7 +242,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { if (data_dim == 2U) { // im2col: dy -> col matrix // from (c, o_h, o_w) to (c * k_h * k_w, h * w) - im2col(context.device_context(), output_grad_batch, + im2col(dev_ctx, output_grad_batch, std::vector{dilations[0], dilations[1]}, strides, std::vector{paddings[0], paddings[1], paddings[0], paddings[1]}, @@ -250,8 +250,8 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { } else if (data_dim == 3U) { // vol2col: dy -> col_matrix // from (c, o_d, o_h, o_w) to (c * k_d * k_h * k_w, d * h * w) - vol2col(context.device_context(), output_grad_batch, dilations, - strides, paddings, &col); + vol2col(dev_ctx, output_grad_batch, dilations, strides, paddings, + &col); } if (input_grad) { @@ -263,9 +263,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, c * k_d * k_h * k_w) * (c * k_d * k_h * k_w, d * h * w) -> (m, // d, h, w) - math::matmul(context.device_context(), filter, false, - col_matrix, false, static_cast(1.0), - &input_grad_batch, static_cast(0.0)); + math::matmul( + dev_ctx, filter, false, col_matrix, false, static_cast(1.0), + &input_grad_batch, static_cast(0.0)); } if (filter_grad) { // input batch @@ -275,9 +275,9 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { // or // (m, d * h * w) * (d * h * w, c * k_d * k_h * k_w) -> (m, c * k_d * // k_h * k_w) - math::matmul(context.device_context(), in_batch, false, - col_matrix, true, static_cast(1.0), - &filter_grad_, static_cast(1.0)); + math::matmul(dev_ctx, in_batch, false, col_matrix, + true, static_cast(1.0), + &filter_grad_, static_cast(1.0)); } } } diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc index 312264ccd4..440c427cba 100644 --- a/paddle/operators/cos_sim_op.cc +++ b/paddle/operators/cos_sim_op.cc @@ -155,7 +155,8 @@ class CosSimOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(cos_sim, ops::CosSimOp, ops::CosSimOpMaker, cos_sim_grad, ops::CosSimOpGrad); -REGISTER_OP_CPU_KERNEL(cos_sim, - ops::CosSimKernel); REGISTER_OP_CPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); + cos_sim, ops::CosSimKernel); +REGISTER_OP_CPU_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.cu b/paddle/operators/cos_sim_op.cu index 0cb8fd26de..1cb01f5945 100644 --- a/paddle/operators/cos_sim_op.cu +++ b/paddle/operators/cos_sim_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/cos_sim_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cos_sim, - ops::CosSimKernel); -REGISTER_OP_GPU_KERNEL( - cos_sim_grad, ops::CosSimGradKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim, ops::CosSimKernel); +REGISTER_OP_CUDA_KERNEL( + cos_sim_grad, + ops::CosSimGradKernel); diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h index 62a4e484ec..fecb5a79b2 100644 --- a/paddle/operators/cos_sim_op.h +++ b/paddle/operators/cos_sim_op.h @@ -27,7 +27,7 @@ template using EigenVector = framework::EigenVector; -template +template class CosSimKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -51,7 +51,8 @@ class CosSimKernel : public framework::OpKernel { auto y_norm = EigenVector::Flatten(*out_y_norm); // compute - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto row_along = Eigen::array({{1}}); x_norm.device(place) = x.square().sum(row_along).sqrt(); y_norm.device(place) = y.square().sum(row_along).sqrt(); @@ -66,7 +67,7 @@ class CosSimKernel : public framework::OpKernel { } }; -template +template class CosSimGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -96,7 +97,8 @@ class CosSimGradKernel : public framework::OpKernel { auto z_bcast = z.broadcast(bcast_cols); auto dz_bcast = dz.broadcast(bcast_cols); auto x_snorm_bcast = x_norm.square().eval().broadcast(bcast_cols); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); if (rows_x == rows_y) { auto y_snorm_bcast = y_norm.square().eval().broadcast(bcast_cols); auto norm_prod_bcast = (x_norm * y_norm).eval().broadcast(bcast_cols); diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index 291b23ed1b..1ce189fa6e 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -135,5 +135,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(crf_decoding, ops::CRFDecodingOp, ops::CRFDecodingOpMaker); REGISTER_OP_CPU_KERNEL( - crf_decoding, ops::CRFDecodingOpKernel, - ops::CRFDecodingOpKernel); + crf_decoding, + ops::CRFDecodingOpKernel, + ops::CRFDecodingOpKernel); diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h index 57b5e21b3a..f6827b7b11 100644 --- a/paddle/operators/crf_decoding_op.h +++ b/paddle/operators/crf_decoding_op.h @@ -24,7 +24,7 @@ using framework::LoDTensor; using framework::LoD; using framework::Tensor; -template +template class CRFDecodingOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -44,8 +44,8 @@ class CRFDecodingOpKernel : public framework::OpKernel { const size_t seq_num = lod[level].size() - 1; int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - decoded_path, 0); + math::SetConstant()( + ctx.template device_context(), decoded_path, 0); for (size_t i = 0; i < seq_num; ++i) { int start_pos = static_cast(lod[level][i]); int end_pos = static_cast(lod[level][i + 1]); diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc index 6752eb8c1c..7c2a0ac7a7 100644 --- a/paddle/operators/crop_op.cc +++ b/paddle/operators/crop_op.cc @@ -133,5 +133,5 @@ class CropOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad); REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_CPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CPU_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu index f8ee18a1d6..90fd83ca10 100644 --- a/paddle/operators/crop_op.cu +++ b/paddle/operators/crop_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/crop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(crop, ops::CropKernel); -REGISTER_OP_GPU_KERNEL(crop_grad, - ops::CropGradKernel); +REGISTER_OP_CUDA_KERNEL(crop, ops::CropKernel); +REGISTER_OP_CUDA_KERNEL( + crop_grad, ops::CropGradKernel); diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h index 2e72583d68..d531a19c78 100644 --- a/paddle/operators/crop_op.h +++ b/paddle/operators/crop_op.h @@ -49,7 +49,7 @@ class CropKernel : public framework::OpKernel { } }; -template +template void CropGradFunction(const framework::ExecutionContext& context) { auto* d_x = context.Output(framework::GradVarName("X")); if (d_x != nullptr) { @@ -63,12 +63,13 @@ void CropGradFunction(const framework::ExecutionContext& context) { } auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - d_x_tensor.device(context.GetEigenDevice()) = + d_x_tensor.device( + *context.template device_context().eigen_device()) = d_out_tensor.pad(paddings, 0); } } -template +template class CropGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -76,22 +77,22 @@ class CropGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - CropGradFunction(context); + CropGradFunction(context); break; case 2: - CropGradFunction(context); + CropGradFunction(context); break; case 3: - CropGradFunction(context); + CropGradFunction(context); break; case 4: - CropGradFunction(context); + CropGradFunction(context); break; case 5: - CropGradFunction(context); + CropGradFunction(context); break; case 6: - CropGradFunction(context); + CropGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 6212e39dfd..0546964588 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -53,8 +53,9 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, label, ctx.Attr("soft_label")); + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, label, + ctx.Attr("soft_label")); } }; @@ -80,15 +81,17 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { int block = 512; int grid = (batch_size * class_num + block - 1) / block; - auto stream = ctx.cuda_device_context().stream(); + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); if (ctx.Attr("soft_label")) { auto* label_data = label->data(); SoftCrossEntropyGradientKernel<<>>( dx_data, dy_data, x_data, label_data, batch_size, class_num); } else { - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(dev_ctx, dx, 0); auto* label_data = label->data(); grid = (batch_size + block - 1) / block; CrossEntropyGradientKernel<<>>( @@ -101,8 +104,8 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, - ops::CrossEntropyOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(cross_entropy_grad, - ops::CrossEntropyGradientOpCUDAKernel, - ops::CrossEntropyGradientOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy, ops::CrossEntropyOpCUDAKernel, + ops::CrossEntropyOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(cross_entropy_grad, + ops::CrossEntropyGradientOpCUDAKernel, + ops::CrossEntropyGradientOpCUDAKernel); diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index 37db0a930a..5623d2ded1 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -37,8 +37,9 @@ class CrossEntropyOpKernel : public framework::OpKernel { Tensor* y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - math::CrossEntropyFunctor()( - ctx.device_context(), y, x, labels, ctx.Attr("soft_label")); + math::CrossEntropyFunctor()( + ctx.template device_context(), y, x, labels, + ctx.Attr("soft_label")); } }; @@ -61,7 +62,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { auto lbl_mat = EigenMatrix::From(*label); auto dx_mat = EigenMatrix::From(*dx); - dx_mat.device(ctx.GetEigenDevice()) = + dx_mat.device(*ctx.template device_context() + .eigen_device()) = -(lbl_mat * dy_mat.broadcast(Eigen::DSizes(1, class_num)) / x_mat); } else { @@ -70,8 +72,8 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel { const T* x_data = x->data(); const int64_t* label_data = label->data(); - math::SetConstant functor; - functor(ctx.device_context(), dx, 0); + math::SetConstant functor; + functor(ctx.template device_context(), dx, 0); for (int64_t i = 0; i < batch_size; ++i) { PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num); diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc index 640b4e7744..fd29c7270b 100644 --- a/paddle/operators/decayed_adagrad_op.cc +++ b/paddle/operators/decayed_adagrad_op.cc @@ -99,4 +99,4 @@ REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp, ops::DecayedAdagradOpMaker); REGISTER_OP_CPU_KERNEL( decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu index 6fce77fe4e..282b90f275 100644 --- a/paddle/operators/decayed_adagrad_op.cu +++ b/paddle/operators/decayed_adagrad_op.cu @@ -16,6 +16,6 @@ #include "paddle/operators/decayed_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( decayed_adagrad, - ops::DecayedAdagradOpKernel); + ops::DecayedAdagradOpKernel); diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h index 0fe0fc5acd..fec9705cfc 100644 --- a/paddle/operators/decayed_adagrad_op.h +++ b/paddle/operators/decayed_adagrad_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class DecayedAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -43,7 +43,7 @@ class DecayedAdagradOpKernel : public framework::OpKernel { auto param_out = framework::EigenVector::Flatten(*param_out_tensor); auto moment_out = framework::EigenVector::Flatten(*moment_out_tensor); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); moment_out.device(place) = decay * moment + (1 - decay) * grad * grad; Eigen::DSizes m_dsize(moment_out_tensor->numel()); diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc index 932c0bf8fb..acd526ae80 100644 --- a/paddle/operators/dropout_op.cc +++ b/paddle/operators/dropout_op.cc @@ -100,6 +100,8 @@ namespace ops = paddle::operators; REGISTER_OP(dropout, ops::DropoutOp, ops::DropoutOpMaker, dropout_grad, ops::DropoutOpGrad); REGISTER_OP_CPU_KERNEL( - dropout, ops::CPUDropoutKernel); + dropout, + ops::CPUDropoutKernel); REGISTER_OP_CPU_KERNEL( - dropout_grad, ops::DropoutGradKernel); + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu index db3578b9bf..10c670751d 100644 --- a/paddle/operators/dropout_op.cu +++ b/paddle/operators/dropout_op.cu @@ -58,7 +58,7 @@ class GPUDropoutKernel : public framework::OpKernel { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); + auto& place = *context.template device_context().eigen_device(); if (!context.Attr("is_test")) { auto* mask = context.Output("Mask"); auto* mask_data = mask->mutable_data(context.GetPlace()); @@ -80,7 +80,9 @@ class GPUDropoutKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - dropout, ops::GPUDropoutKernel); -REGISTER_OP_GPU_KERNEL( - dropout_grad, ops::DropoutGradKernel); +REGISTER_OP_CUDA_KERNEL( + dropout, + ops::GPUDropoutKernel); +REGISTER_OP_CUDA_KERNEL( + dropout_grad, + ops::DropoutGradKernel); diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h index d9a130fdc0..84ad39f0bb 100644 --- a/paddle/operators/dropout_op.h +++ b/paddle/operators/dropout_op.h @@ -25,7 +25,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CPUDropoutKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -55,13 +55,14 @@ class CPUDropoutKernel : public framework::OpKernel { } else { auto X = EigenMatrix::Reshape(*x, 1); auto Y = EigenMatrix::Reshape(*y, 1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Y.device(place) = X * dropout_prob; } } }; -template +template class DropoutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -77,7 +78,8 @@ class DropoutGradKernel : public framework::OpKernel { auto dX = EigenMatrix::Reshape(*grad_x, 1); auto dY = EigenMatrix::Reshape(*grad_y, 1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); dX.device(place) = dY * M; } }; diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc index 432b9ba6f7..a62eeeeb95 100644 --- a/paddle/operators/elementwise_add_op.cc +++ b/paddle/operators/elementwise_add_op.cc @@ -34,13 +34,13 @@ REGISTER_OP(elementwise_add, ops::ElementwiseOp, ops::ElementwiseAddOpMaker, elementwise_add_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); REGISTER_OP_CPU_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.cu b/paddle/operators/elementwise_add_op.cu index 7591428ac7..78642bb424 100644 --- a/paddle/operators/elementwise_add_op.cu +++ b/paddle/operators/elementwise_add_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_add, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel, - ops::ElementwiseAddKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel, + ops::ElementwiseAddKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_add_grad, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel, - ops::ElementwiseAddGradKernel); + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel, + ops::ElementwiseAddGradKernel); diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h index 921dc5f6a6..069bdaf0ab 100644 --- a/paddle/operators/elementwise_add_op.h +++ b/paddle/operators/elementwise_add_op.h @@ -24,7 +24,7 @@ struct AddFunctor { inline HOSTDEVICE T operator()(T a, T b) const { return a + b; } }; -template +template class ElementwiseAddKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -34,8 +34,8 @@ class ElementwiseAddKernel : public framework::OpKernel { auto* y = ctx.Input("Y"); auto* z = ctx.Output("Out"); z->mutable_data(ctx.GetPlace()); - TransformFunctor, T, Place> functor( - x, y, z, ctx.device_context(), AddFunctor()); + TransformFunctor, T, DeviceContext> functor( + x, y, z, ctx.template device_context(), AddFunctor()); auto x_dims = x->dims(); auto y_dims = y->dims(); @@ -137,11 +137,11 @@ struct ElementwiseAddBroadCast2GradFunctor { } }; -template +template class ElementwiseAddGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseAddOneGradFunctor, ElementwiseAddBroadCastGradFunctor, ElementwiseAddBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc index 7a325199bd..1c3e9e70ee 100644 --- a/paddle/operators/elementwise_div_op.cc +++ b/paddle/operators/elementwise_div_op.cc @@ -35,13 +35,13 @@ REGISTER_OP(elementwise_div, ops::ElementwiseOp, ops::ElementwiseDivOpMaker, elementwise_div_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); REGISTER_OP_CPU_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.cu b/paddle/operators/elementwise_div_op.cu index de4d0c3344..502c528936 100644 --- a/paddle/operators/elementwise_div_op.cu +++ b/paddle/operators/elementwise_div_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_div, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel, - ops::ElementwiseDivKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel, + ops::ElementwiseDivKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_div_grad, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel, - ops::ElementwiseDivGradKernel); + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel, + ops::ElementwiseDivGradKernel); diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h index 8946ff3d25..d91313db42 100644 --- a/paddle/operators/elementwise_div_op.h +++ b/paddle/operators/elementwise_div_op.h @@ -19,11 +19,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseDivKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -102,11 +102,11 @@ struct ElementwiseDivBroadCast2GradFunctor { } }; -template +template class ElementwiseDivGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseDivGradFunctor, ElementwiseDivBroadCastGradFunctor, ElementwiseDivBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc index 8851267a52..aadb95cbe3 100644 --- a/paddle/operators/elementwise_mul_op.cc +++ b/paddle/operators/elementwise_mul_op.cc @@ -36,13 +36,13 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker, elementwise_mul_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); REGISTER_OP_CPU_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu index b0dfdee1cc..089451b3e1 100644 --- a/paddle/operators/elementwise_mul_op.cu +++ b/paddle/operators/elementwise_mul_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_mul, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel, - ops::ElementwiseMulKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel, + ops::ElementwiseMulKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_mul_grad, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel, - ops::ElementwiseMulGradKernel); + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel, + ops::ElementwiseMulGradKernel); diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h index 4469b07eaa..16fa5ec4b3 100644 --- a/paddle/operators/elementwise_mul_op.h +++ b/paddle/operators/elementwise_mul_op.h @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -101,11 +101,11 @@ struct ElementwiseMulBroadCast2GradFunctor { } }; -template +template class ElementwiseMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseMulGradFunctor, ElementwiseMulBroadCastGradFunctor, ElementwiseMulBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index ca3542e783..7ebfc7df8c 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -59,17 +59,17 @@ inline void get_mid_dims(const framework::DDim& x_dims, } } -template +template class RowwiseTransformIterator; -template +template class MidWiseTransformIterator; template -class RowwiseTransformIterator { +class RowwiseTransformIterator { public: RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {} - RowwiseTransformIterator& operator++() { + RowwiseTransformIterator& operator++() { ++i_; if (UNLIKELY(i_ == n_)) { i_ = 0; @@ -77,13 +77,13 @@ class RowwiseTransformIterator { return *this; } - bool operator==( - const RowwiseTransformIterator& rhs) const { + bool operator==(const RowwiseTransformIterator& + rhs) const { return (ptr_ + i_) == &(*rhs); } - bool operator!=( - const RowwiseTransformIterator& rhs) const { + bool operator!=(const RowwiseTransformIterator& + rhs) const { return (ptr_ + i_) != &(*rhs); } @@ -96,12 +96,12 @@ class RowwiseTransformIterator { }; template -class MidWiseTransformIterator { +class MidWiseTransformIterator { public: MidWiseTransformIterator(const T* ptr, int n, int post) : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {} - MidWiseTransformIterator& operator++() { + MidWiseTransformIterator& operator++() { ++j_; i_ = j_ / post_; if (UNLIKELY(i_ == n_)) { @@ -111,13 +111,13 @@ class MidWiseTransformIterator { return *this; } - bool operator==( - const MidWiseTransformIterator& rhs) const { + bool operator==(const MidWiseTransformIterator& + rhs) const { return (ptr_ + i_) == &(*rhs); } - bool operator!=( - const MidWiseTransformIterator& rhs) const { + bool operator!=(const MidWiseTransformIterator& + rhs) const { return (ptr_ + i_) != &(*rhs); } @@ -133,12 +133,12 @@ class MidWiseTransformIterator { #ifdef __NVCC__ template -class RowwiseTransformIterator +class RowwiseTransformIterator : public thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> { + RowwiseTransformIterator, const T*> { public: typedef thrust::iterator_adaptor< - RowwiseTransformIterator, const T*> + RowwiseTransformIterator, const T*> super_t; HOSTDEVICE RowwiseTransformIterator(const T* x, int n) : super_t(x), begin_(x), n_(n){}; @@ -153,12 +153,12 @@ class RowwiseTransformIterator }; template -class MidWiseTransformIterator +class MidWiseTransformIterator : public thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> { + MidWiseTransformIterator, const T*> { public: typedef thrust::iterator_adaptor< - MidWiseTransformIterator, const T*> + MidWiseTransformIterator, const T*> super_t; HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post) : super_t(x), begin_(x), n_(n), post_(post){}; @@ -174,12 +174,11 @@ class MidWiseTransformIterator }; #endif -template +template class TransformFunctor { public: TransformFunctor(const framework::Tensor* x, const framework::Tensor* y, - framework::Tensor* z, const platform::DeviceContext& ctx, - Functor func) + framework::Tensor* z, const DeviceContext& ctx, Functor func) : x_(x->data()), y_(y->data()), z_(z->mutable_data(ctx.GetPlace())), @@ -188,20 +187,20 @@ class TransformFunctor { func_(func) {} inline void Run() const { - platform::Transform trans; + platform::Transform trans; trans(ctx_, x_, x_ + nx_, y_, z_, func_); } inline void RunRowWise(int n, int pre) const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), z_, - func_); + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator(y_, n), + z_, func_); } inline void RunMidWise(int n, int pre, int post) const { - platform::Transform trans; - trans(ctx_, x_, x_ + nx_, MidWiseTransformIterator(y_, n, post), - z_, func_); + platform::Transform trans; + trans(ctx_, x_, x_ + nx_, + MidWiseTransformIterator(y_, n, post), z_, func_); } private: @@ -209,22 +208,24 @@ class TransformFunctor { const T* y_; T* z_; int64_t nx_; - const platform::DeviceContext& ctx_; + const DeviceContext& ctx_; Functor func_; }; #define EIGEN_FUNCTOR(name, eigen_op) \ struct Eigen##name##Functor { \ - template \ + template \ inline void Run(const framework::Tensor* x, const framework::Tensor* y, \ framework::Tensor* z, \ const framework::ExecutionContext& ctx) { \ auto x_e = framework::EigenVector::Flatten(*x); \ auto y_e = framework::EigenVector::Flatten(*y); \ auto z_e = framework::EigenVector::Flatten(*z); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_e); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_e); \ } \ - template \ + template \ inline void RunBroadCast(const framework::Tensor* x, \ const framework::Tensor* y, framework::Tensor* z, \ const framework::ExecutionContext& ctx, int pre, \ @@ -235,9 +236,11 @@ class TransformFunctor { auto y_bcast = y_e.reshape(Eigen::DSizes(1, n)) \ .broadcast(Eigen::DSizes(pre, 1)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ - template \ + template \ inline void RunBroadCast2(const framework::Tensor* x, \ const framework::Tensor* y, \ framework::Tensor* z, \ @@ -249,11 +252,13 @@ class TransformFunctor { auto y_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) \ .broadcast(Eigen::DSizes(pre, 1, post)) \ .reshape(Eigen::DSizes(x_e.size())); \ - z_e.device(ctx.GetEigenDevice()) = eigen_op(x_e, y_bcast); \ + z_e.device( \ + *ctx.template device_context().eigen_device()) = \ + eigen_op(x_e, y_bcast); \ } \ } -template +template void ElementwiseCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -269,7 +274,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { if (x_dims == y_dims) { functor f; - f.template Run(x, y, z, ctx); + f.template Run(x, y, z, ctx); return; } @@ -282,11 +287,11 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { get_mid_dims(x_dims, y_dims, axis, pre, n, post); if (post == 1) { functor f; - f.template RunBroadCast(x, y, z, ctx, pre, n); + f.template RunBroadCast(x, y, z, ctx, pre, n); return; } else { functor f; - f.template RunBroadCast2(x, y, z, ctx, pre, n, post); + f.template RunBroadCast2(x, y, z, ctx, pre, n, post); return; } } @@ -303,8 +308,9 @@ EIGEN_FUNCTOR(Mul, EIGEN_MUL); #define EIGEN_DIV(x, y) ((x) / (y)) EIGEN_FUNCTOR(Div, EIGEN_DIV); -template +template void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { using Tensor = framework::Tensor; @@ -313,7 +319,7 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) { auto* out = ctx.Input("Out"); auto* dout = ctx.Input(framework::GradVarName("Out")); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); auto x_dims = x->dims(); auto y_dims = y->dims(); diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc index 95d7979e39..3e4d19361e 100644 --- a/paddle/operators/elementwise_sub_op.cc +++ b/paddle/operators/elementwise_sub_op.cc @@ -34,13 +34,13 @@ REGISTER_OP(elementwise_sub, ops::ElementwiseOp, ops::ElementwiseSubOpMaker, elementwise_sub_grad, ops::ElementwiseOpGrad); REGISTER_OP_CPU_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); REGISTER_OP_CPU_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.cu b/paddle/operators/elementwise_sub_op.cu index ec23bec35f..0b2f0f7d4d 100644 --- a/paddle/operators/elementwise_sub_op.cu +++ b/paddle/operators/elementwise_sub_op.cu @@ -17,15 +17,16 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( elementwise_sub, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel, - ops::ElementwiseSubKernel); -REGISTER_OP_GPU_KERNEL( + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel, + ops::ElementwiseSubKernel); +REGISTER_OP_CUDA_KERNEL( elementwise_sub_grad, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel, - ops::ElementwiseSubGradKernel); + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel, + ops::ElementwiseSubGradKernel); diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h index 3f40c1c5bc..731a30c5e3 100644 --- a/paddle/operators/elementwise_sub_op.h +++ b/paddle/operators/elementwise_sub_op.h @@ -18,11 +18,11 @@ namespace paddle { namespace operators { -template +template class ElementwiseSubKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseCompute(ctx); + ElementwiseCompute(ctx); } }; @@ -101,11 +101,11 @@ struct ElementwiseSubBroadCast2GradFunctor { } }; -template +template class ElementwiseSubGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - ElementwiseGradCompute, + ElementwiseGradCompute, ElementwiseSubOneGradFunctor, ElementwiseSubBroadCastGradFunctor, ElementwiseSubBroadCast2GradFunctor>(ctx); diff --git a/paddle/operators/expand_op.cc b/paddle/operators/expand_op.cc index 282775fcda..8b3cddbb94 100644 --- a/paddle/operators/expand_op.cc +++ b/paddle/operators/expand_op.cc @@ -130,7 +130,8 @@ class ExpandGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(expand, ops::ExpandOp, ops::ExpandOpMaker, expand_grad, ops::ExpandGradOp); -REGISTER_OP_CPU_KERNEL(expand, - ops::ExpandKernel); REGISTER_OP_CPU_KERNEL( - expand_grad, ops::ExpandGradKernel); + expand, ops::ExpandKernel); +REGISTER_OP_CPU_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.cu b/paddle/operators/expand_op.cu index 6744562b6c..99ee584d08 100644 --- a/paddle/operators/expand_op.cu +++ b/paddle/operators/expand_op.cu @@ -17,7 +17,8 @@ #include "paddle/operators/expand_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(expand, - ops::ExpandKernel); -REGISTER_OP_GPU_KERNEL( - expand_grad, ops::ExpandGradKernel); +REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel); diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h index 4d7996ad1e..14ef8b0912 100644 --- a/paddle/operators/expand_op.h +++ b/paddle/operators/expand_op.h @@ -56,7 +56,7 @@ template using EigenTensor = framework::EigenTensor; -template +template class ExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -83,12 +83,13 @@ class ExpandKernel : public framework::OpKernel { auto x = EigenTensor::From(*in0); out0->mutable_data(context.GetPlace()); auto y = EigenTensor::From(*out0); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); y.device(place) = x.broadcast(bcast_dims); } }; -template +template class ExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -164,7 +165,8 @@ class ExpandGradKernel : public framework::OpKernel { reduce_dims[i] = reduce_dims_vec[i]; } auto out_grad = EigenVector::Flatten(*in0); - x_grad.device(context.GetEigenDevice()) = + x_grad.device( + *context.template device_context().eigen_device()) = out_grad.reshape(reshape_dims).sum(reduce_dims).reshape(x.dimensions()); } }; diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 892922cd3a..7fb74e2b95 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -100,8 +100,11 @@ REGISTER_OPERATOR(fill_constant_batch_size_like, ops::FillConstantBatchSizeLikeOpMaker); REGISTER_OP_CPU_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/operators/fill_constant_batch_size_like_op.cu.cc index 9e7a1eeab8..2e0e15f36b 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cu.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cu.cc @@ -16,10 +16,13 @@ #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( fill_constant_batch_size_like, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, - ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel, + ops::FillConstantBatchSizeLikeOpKernel); diff --git a/paddle/operators/fill_constant_batch_size_like_op.h b/paddle/operators/fill_constant_batch_size_like_op.h index 339d97a30a..66da9d0307 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.h +++ b/paddle/operators/fill_constant_batch_size_like_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -27,8 +27,9 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); auto value = ctx.Attr("value"); - math::SetConstant setter; - setter(ctx.device_context(), out, static_cast(value)); + math::SetConstant setter; + setter(ctx.template device_context(), out, + static_cast(value)); } }; diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc index 95fb5932b8..720c11f5f1 100644 --- a/paddle/operators/fill_zeros_like_op.cc +++ b/paddle/operators/fill_zeros_like_op.cc @@ -54,8 +54,9 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker); REGISTER_OP_CPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.cu.cc b/paddle/operators/fill_zeros_like_op.cu.cc index 1501a17441..9f412306bb 100644 --- a/paddle/operators/fill_zeros_like_op.cu.cc +++ b/paddle/operators/fill_zeros_like_op.cu.cc @@ -16,9 +16,10 @@ #include "paddle/framework/op_registry.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - fill_zeros_like, ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel, - ops::FillZerosLikeKernel); +REGISTER_OP_CUDA_KERNEL( + fill_zeros_like, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel, + ops::FillZerosLikeKernel); diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h index 7e7d78eea2..a6e2941f52 100644 --- a/paddle/operators/fill_zeros_like_op.h +++ b/paddle/operators/fill_zeros_like_op.h @@ -19,15 +19,16 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class FillZerosLikeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* out = context.Output("Y"); out->mutable_data(context.GetPlace()); - math::SetConstant setter; - setter(context.device_context(), out, static_cast(0)); + math::SetConstant setter; + setter(context.template device_context(), out, + static_cast(0)); } }; diff --git a/paddle/operators/ftrl_op.cc b/paddle/operators/ftrl_op.cc index cb7ae69196..b14913ff21 100644 --- a/paddle/operators/ftrl_op.cc +++ b/paddle/operators/ftrl_op.cc @@ -135,5 +135,5 @@ The paper that proposed Follow The Regularized Leader (FTRL): namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(ftrl, ops::FTRLOp, ops::FTRLOpMaker); -REGISTER_OP_CPU_KERNEL(ftrl, - ops::FTRLOpKernel); +REGISTER_OP_CPU_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.cu b/paddle/operators/ftrl_op.cu index 97b36dade6..abbbe7adbe 100644 --- a/paddle/operators/ftrl_op.cu +++ b/paddle/operators/ftrl_op.cu @@ -15,5 +15,5 @@ specific language governing permissions and limitations under the License. */ #include "paddle/operators/ftrl_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ftrl, - ops::FTRLOpKernel); +REGISTER_OP_CUDA_KERNEL( + ftrl, ops::FTRLOpKernel); diff --git a/paddle/operators/ftrl_op.h b/paddle/operators/ftrl_op.h index b040162f8d..4eea04cd8d 100644 --- a/paddle/operators/ftrl_op.h +++ b/paddle/operators/ftrl_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class FTRLOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -53,7 +53,7 @@ class FTRLOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto s_acc_out = EigenVector::Flatten(*sq_accum_out); auto l_acc_out = EigenVector::Flatten(*lin_accum_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h index 8d04ecd284..c806aa5f05 100644 --- a/paddle/operators/gather.cu.h +++ b/paddle/operators/gather.cu.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { using framework::Tensor; -using platform::Place; +using platform::DeviceContext; #define CUDA_1D_KERNEL_LOOP(i, n) \ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu index 92219d6a43..b37f0576e2 100644 --- a/paddle/operators/gather_op.cu +++ b/paddle/operators/gather_op.cu @@ -49,7 +49,8 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); GPUScatterAssign(ctx.device_context(), *dO, *Index, dX); @@ -60,5 +61,5 @@ class GatherGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel); diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h index 8276ed0d3d..1a1ba0c41a 100644 --- a/paddle/operators/gather_op.h +++ b/paddle/operators/gather_op.h @@ -53,7 +53,8 @@ class GatherGradientOpKernel : public framework::OpKernel { dX->mutable_data(ctx.GetPlace()); auto dxt = framework::EigenVector::Flatten(*dX); - auto place = ctx.GetEigenDevice(); + auto &place = *ctx.template device_context() + .eigen_device(); dxt.device(place) = dxt.constant(static_cast(0)); ScatterAssign(ctx.device_context(), *dO, *Index, dX); diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu index 315560bf1b..ffce6f7138 100644 --- a/paddle/operators/gaussian_random_op.cu +++ b/paddle/operators/gaussian_random_op.cu @@ -60,5 +60,5 @@ class GPUGaussianRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(gaussian_random, - paddle::operators::GPUGaussianRandomKernel); +REGISTER_OP_CUDA_KERNEL(gaussian_random, + paddle::operators::GPUGaussianRandomKernel); diff --git a/paddle/operators/gru_op.cc b/paddle/operators/gru_op.cc index 5aa03f8916..311e7edcf1 100644 --- a/paddle/operators/gru_op.cc +++ b/paddle/operators/gru_op.cc @@ -213,8 +213,9 @@ class GRUGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru, ops::GRUOp, ops::GRUOpMaker, gru_grad, ops::GRUGradOp); -REGISTER_OP_CPU_KERNEL(gru, ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_CPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CPU_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CPU_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/operators/gru_op.cu.cc b/paddle/operators/gru_op.cu.cc index 0ceff94ec3..458630ca61 100644 --- a/paddle/operators/gru_op.cu.cc +++ b/paddle/operators/gru_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/gru_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru, ops::GRUKernel, - ops::GRUKernel); -REGISTER_OP_GPU_KERNEL(gru_grad, - ops::GRUGradKernel, - ops::GRUGradKernel); +REGISTER_OP_CUDA_KERNEL( + gru, ops::GRUKernel, + ops::GRUKernel); +REGISTER_OP_CUDA_KERNEL( + gru_grad, ops::GRUGradKernel, + ops::GRUGradKernel); diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index 564489d3a9..6d02dff578 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -27,16 +27,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class GRUKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -60,12 +60,12 @@ class GRUKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); bool is_reverse = context.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& dev_ctx = context.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& dev_ctx = context.template device_context(); to_batch(dev_ctx, *input, *batch_gate, true, is_reverse); if (bias) { - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } @@ -80,8 +80,9 @@ class GRUKernel : public framework::OpKernel { // Since the batch computing for GRU reorders the input sequences // according to their length. The initialized cell state also needs // to reorder. - ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); + ReorderInitState( + context.template device_context(), *h0, order, + &ordered_h0, true); gru_value.prev_out_value = ordered_h0.data(); } else { gru_value.prev_out_value = nullptr; @@ -99,14 +100,14 @@ class GRUKernel : public framework::OpKernel { gru_value.output_value = hidden_t.data(); gru_value.gate_value = gate_t.data(); gru_value.reset_output_value = reset_hidden_prev_t.data(); - math::GRUUnitFunctor::compute( + math::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); gru_value.prev_out_value = gru_value.output_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden->set_lod(batch_gate->lod()); to_seq(dev_ctx, *batch_hidden, *hidden); } @@ -116,7 +117,7 @@ class GRUKernel : public framework::OpKernel { } }; -template +template class GRUGradKernel : public framework::OpKernel { public: void BatchCompute(const framework::ExecutionContext& context) const { @@ -141,14 +142,14 @@ class GRUGradKernel : public framework::OpKernel { auto hidden_dims = hidden->dims(); int frame_size = hidden_dims[1]; - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; LoDTensor batch_hidden_grad, batch_gate_grad, batch_reset_hidden_prev_grad; batch_hidden_grad.mutable_data(hidden_dims, context.GetPlace()); batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); batch_reset_hidden_prev_grad.mutable_data(hidden_dims, context.GetPlace()); - math::SetConstant zero; - auto& dev_ctx = context.device_context(); + math::SetConstant zero; + auto& dev_ctx = context.template device_context(); zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); zero(dev_ctx, &batch_reset_hidden_prev_grad, static_cast(0.0)); @@ -156,12 +157,13 @@ class GRUGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_h0_grad; const size_t* order = batch_gate->lod()[2].data(); if (h0) { - ReorderInitState(context.device_context(), *h0, order, - &ordered_h0, true); + ReorderInitState(dev_ctx, *h0, order, &ordered_h0, + true); } if (h0_grad) { ordered_h0_grad.mutable_data(h0_grad->dims(), context.GetPlace()); - zero(context.device_context(), &ordered_h0_grad, static_cast(0.0)); + zero(context.template device_context(), &ordered_h0_grad, + static_cast(0.0)); } bool is_reverse = context.Attr("is_reverse"); @@ -216,25 +218,25 @@ class GRUGradKernel : public framework::OpKernel { gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } - math::GRUUnitGradFunctor::compute( + math::GRUUnitGradFunctor::compute( dev_ctx, gru_value, gru_grad, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); } if (input_grad) { input_grad->mutable_data(context.GetPlace()); - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_gate_grad.set_lod(batch_gate->lod()); to_seq(dev_ctx, batch_gate_grad, *input_grad); } if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(dev_ctx, batch_gate_grad, bias_grad); } if (h0 && h0_grad) { - ReorderInitState(context.device_context(), ordered_h0_grad, - order, h0_grad, false); + ReorderInitState(dev_ctx, ordered_h0_grad, order, + h0_grad, false); } } diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc index 877c969103..705de87be5 100644 --- a/paddle/operators/gru_unit_op.cc +++ b/paddle/operators/gru_unit_op.cc @@ -201,9 +201,10 @@ class GRUUnitGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad, ops::GRUUnitGradOp); -REGISTER_OP_CPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); REGISTER_OP_CPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CPU_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu index 821c8c6421..7c752db494 100644 --- a/paddle/operators/gru_unit_op.cu +++ b/paddle/operators/gru_unit_op.cu @@ -16,9 +16,10 @@ #include "paddle/operators/gru_unit_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(gru_unit, - ops::GRUUnitKernel, - ops::GRUUnitKernel); -REGISTER_OP_GPU_KERNEL( - gru_unit_grad, ops::GRUUnitGradKernel, - ops::GRUUnitGradKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit, ops::GRUUnitKernel, + ops::GRUUnitKernel); +REGISTER_OP_CUDA_KERNEL( + gru_unit_grad, + ops::GRUUnitGradKernel, + ops::GRUUnitGradKernel); diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h index 3398c0934e..8fe60c750d 100644 --- a/paddle/operators/gru_unit_op.h +++ b/paddle/operators/gru_unit_op.h @@ -34,7 +34,7 @@ using EigenVector = framework::EigenVector; enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; -template +template class GRUUnitKernel : public framework::OpKernel { public: template @@ -71,7 +71,8 @@ class GRUUnitKernel : public framework::OpKernel { auto g = EigenMatrix::From(*gate); auto r_h_p = EigenMatrix::From(*reset_hidden_prev); auto h = EigenMatrix::From(*hidden); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); // calculate unactivated gate outputs if (bias) { @@ -86,10 +87,10 @@ class GRUUnitKernel : public framework::OpKernel { const T* weight_data = weight->data(); T* gate_data = gate->data(); T* reset_hidden_prev_data = reset_hidden_prev->data(); - math::gemm(context.device_context(), false, false, batch_size, - 2 * frame_size, frame_size, 1, hidden_prev_data, - frame_size, weight_data, frame_size * 2, 1, gate_data, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data, frame_size, + weight_data, frame_size * 2, 1, gate_data, frame_size * 3); // calculate activited gate Eigen::array extents({{batch_size, frame_size}}); @@ -102,11 +103,11 @@ class GRUUnitKernel : public framework::OpKernel { g.slice(r_offsets, extents), g.slice(r_offsets, extents)); auto r = g.slice(r_offsets, extents); // reset gate r_h_p.device(place) = r * h_p; // reset previous hidden state - math::gemm(context.device_context(), false, false, batch_size, - frame_size, frame_size, 1, reset_hidden_prev_data, - frame_size, weight_data + frame_size * frame_size * 2, - frame_size, 1, gate_data + frame_size * 2, - frame_size * 3); + math::gemm( + context.template device_context(), false, false, + batch_size, frame_size, frame_size, 1, reset_hidden_prev_data, + frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1, + gate_data + frame_size * 2, frame_size * 3); Eigen::array c_offsets({{0, frame_size * 2}}); ActCompute(context.Attr("activation"), place, @@ -118,7 +119,7 @@ class GRUUnitKernel : public framework::OpKernel { } }; -template +template class GRUUnitGradKernel : public framework::OpKernel { public: template @@ -166,7 +167,8 @@ class GRUUnitGradKernel : public framework::OpKernel { auto d_h = EigenMatrix::From(*hidden_grad); auto d_g = EigenMatrix::From(gate_grad); auto d_r_h_p = EigenMatrix::From(reset_hidden_prev_grad); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); int batch_size = input->dims()[0]; int frame_size = hidden_prev->dims()[1]; @@ -186,11 +188,11 @@ class GRUUnitGradKernel : public framework::OpKernel { ActGradCompute(context.Attr("activation"), place, c, c, d_g.slice(c_offsets, extents), d_h * u); // backward for reset_hidden_prev - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size, 1, - gate_grad_data + frame_size * 2, frame_size * 3, - weight_data + frame_size * frame_size * 2, frame_size, - 0, reset_hidden_prev_grad_data, frame_size); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2, + frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size, + 0, reset_hidden_prev_grad_data, frame_size); // backward for unactivated reset gate ActGradCompute(context.Attr("gate_activation"), place, r, r, d_g.slice(r_offsets, extents), d_r_h_p * h_p); @@ -198,17 +200,18 @@ class GRUUnitGradKernel : public framework::OpKernel { if (weight_grad) { T* weight_grad_data = weight_grad->mutable_data(context.GetPlace()); // backward for state_weight - math::gemm( - context.device_context(), true, false, frame_size, frame_size, - batch_size, 1, reset_hidden_prev_data, frame_size, - gate_grad_data + frame_size * 2, frame_size * 3, 0, + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size, batch_size, 1, reset_hidden_prev_data, + frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0, weight_grad_data + frame_size * frame_size * 2, frame_size); // backward for update_gate_weight and reset_gate_weight - math::gemm(context.device_context(), true, false, frame_size, - frame_size * 2, batch_size, 1, hidden_prev_data, - frame_size, gate_grad_data, frame_size * 3, 0, - weight_grad_data, frame_size * 2); + math::gemm( + context.template device_context(), true, false, + frame_size, frame_size * 2, batch_size, 1, hidden_prev_data, + frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data, + frame_size * 2); } // backward for hidden_prev if (hidden_prev_grad) { @@ -216,10 +219,11 @@ class GRUUnitGradKernel : public framework::OpKernel { hidden_prev_grad->mutable_data(context.GetPlace()); auto d_h_p = EigenMatrix::From(*hidden_prev_grad); d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u); - math::gemm(context.device_context(), false, true, batch_size, - frame_size, frame_size * 2, 1, gate_grad_data, - frame_size * 3, weight_data, frame_size * 2, 1, - hidden_prev_grad_data, frame_size); + math::gemm( + context.template device_context(), false, true, + batch_size, frame_size, frame_size * 2, 1, gate_grad_data, + frame_size * 3, weight_data, frame_size * 2, 1, hidden_prev_grad_data, + frame_size); } // backward for input if (input_grad) { diff --git a/paddle/operators/hinge_loss_op.cc b/paddle/operators/hinge_loss_op.cc index 1e13897bb6..373b4d99b4 100644 --- a/paddle/operators/hinge_loss_op.cc +++ b/paddle/operators/hinge_loss_op.cc @@ -106,8 +106,9 @@ class HingeLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker, hinge_loss_grad, ops::HingeLossGradOp); -REGISTER_OP_CPU_KERNEL(hinge_loss, - ops::HingeLossKernel); +REGISTER_OP_CPU_KERNEL( + hinge_loss, + ops::HingeLossKernel); REGISTER_OP_CPU_KERNEL( hinge_loss_grad, - ops::HingeLossGradKernel); + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.cu b/paddle/operators/hinge_loss_op.cu index ec20b08e30..31a5bde292 100644 --- a/paddle/operators/hinge_loss_op.cu +++ b/paddle/operators/hinge_loss_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/hinge_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(hinge_loss, - ops::HingeLossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + hinge_loss, + ops::HingeLossKernel); +REGISTER_OP_CUDA_KERNEL( hinge_loss_grad, - ops::HingeLossGradKernel); + ops::HingeLossGradKernel); diff --git a/paddle/operators/hinge_loss_op.h b/paddle/operators/hinge_loss_op.h index c0be496f9c..91369cfb8a 100644 --- a/paddle/operators/hinge_loss_op.h +++ b/paddle/operators/hinge_loss_op.h @@ -19,14 +19,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class HingeLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* pred = context.Input("Logits"); auto* label = context.Input("Labels"); auto* loss = context.Output("Loss"); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = framework::EigenVector::Flatten(*pred); auto y = framework::EigenVector::Flatten(*label); @@ -38,7 +39,7 @@ class HingeLossKernel : public framework::OpKernel { } }; -template +template class HingeLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -48,7 +49,8 @@ class HingeLossGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss")); auto* dpred = context.Output(framework::GradVarName("Logits")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = framework::EigenVector::Flatten(*pred); auto y = framework::EigenVector::Flatten(*label); diff --git a/paddle/operators/huber_loss_op.cc b/paddle/operators/huber_loss_op.cc index 938803d5b3..11828d083a 100644 --- a/paddle/operators/huber_loss_op.cc +++ b/paddle/operators/huber_loss_op.cc @@ -124,8 +124,9 @@ class HuberLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(huber_loss, ops::HuberLossOp, ops::HuberLossOpMaker, huber_loss_grad, ops::HuberLossGradOp); -REGISTER_OP_CPU_KERNEL(huber_loss, - ops::HuberLossKernel); +REGISTER_OP_CPU_KERNEL( + huber_loss, + ops::HuberLossKernel); REGISTER_OP_CPU_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git a/paddle/operators/huber_loss_op.cu b/paddle/operators/huber_loss_op.cu index 317321dc6c..d49a4d9d42 100644 --- a/paddle/operators/huber_loss_op.cu +++ b/paddle/operators/huber_loss_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/huber_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(huber_loss, - ops::HuberLossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + huber_loss, + ops::HuberLossKernel); +REGISTER_OP_CUDA_KERNEL( huber_loss_grad, - ops::HuberLossGradKernel); + ops::HuberLossGradKernel); diff --git a/paddle/operators/huber_loss_op.h b/paddle/operators/huber_loss_op.h index 4e7bc55432..4dd20e8b08 100644 --- a/paddle/operators/huber_loss_op.h +++ b/paddle/operators/huber_loss_op.h @@ -41,7 +41,7 @@ struct HuberLossForward { T delta; }; -template +template class HuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -50,7 +50,8 @@ class HuberLossKernel : public framework::OpKernel { auto* out0 = context.Output("Residual"); auto* out1 = context.Output("Out"); auto delta = static_cast(context.Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = EigenVector::Flatten(*in0); auto y = EigenVector::Flatten(*in1); @@ -85,7 +86,7 @@ struct HuberLossBackward { T delta; }; -template +template class HuberLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -94,7 +95,8 @@ class HuberLossGradKernel : public framework::OpKernel { auto* out0 = context.Output(framework::GradVarName("X")); auto* out1 = context.Output(framework::GradVarName("Y")); auto delta = static_cast(context.op().Attr("delta")); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto residual = EigenVector::Flatten(*in0); auto out_grad = EigenVector::Flatten(*in1); diff --git a/paddle/operators/l1_norm_op.cc b/paddle/operators/l1_norm_op.cc index 02ebf02296..c0b51202c6 100644 --- a/paddle/operators/l1_norm_op.cc +++ b/paddle/operators/l1_norm_op.cc @@ -69,7 +69,8 @@ $$Out = \sum{|X|}$$ namespace ops = paddle::operators; REGISTER_OP(l1_norm, ops::L1NormOp, ops::L1NormOpMaker, l1_norm_grad, ops::L1NormGradOp); -REGISTER_OP_CPU_KERNEL(l1_norm, - ops::L1NormKernel); REGISTER_OP_CPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); + l1_norm, ops::L1NormKernel); +REGISTER_OP_CPU_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.cu b/paddle/operators/l1_norm_op.cu index 1c206e04cc..fd725f86f6 100644 --- a/paddle/operators/l1_norm_op.cu +++ b/paddle/operators/l1_norm_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/l1_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(l1_norm, - ops::L1NormKernel); -REGISTER_OP_GPU_KERNEL( - l1_norm_grad, ops::L1NormGradKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm, ops::L1NormKernel); +REGISTER_OP_CUDA_KERNEL( + l1_norm_grad, + ops::L1NormGradKernel); diff --git a/paddle/operators/l1_norm_op.h b/paddle/operators/l1_norm_op.h index 3c60dc3dc7..ae3878f2b7 100644 --- a/paddle/operators/l1_norm_op.h +++ b/paddle/operators/l1_norm_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(abs(X)) -template +template class L1NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class L1NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); out.device(place) = x.abs().sum(); } }; // dX = dout * sign(X) -template +template class L1NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -52,7 +53,8 @@ class L1NormGradKernel : public framework::OpKernel { auto x_eigen = framework::EigenVector::Flatten(*x); auto d_out_eigen = framework::EigenVector::Flatten(*d_out); auto dx_eigen = framework::EigenVector::Flatten(*dx); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); Eigen::DSizes x_dsize(x->numel()); dx_eigen.device(place) = d_out_eigen.broadcast(x_dsize) * x_eigen.sign(); diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index 8e079a14e0..896e3657d4 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -261,9 +261,10 @@ REGISTER_OP(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFOpMaker, linear_chain_crf_grad, ops::LinearChainCRFGradOp); REGISTER_OP_CPU_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); REGISTER_OP_CPU_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.cu b/paddle/operators/linear_chain_crf_op.cu index 6fc8995f4c..3b105ec341 100644 --- a/paddle/operators/linear_chain_crf_op.cu +++ b/paddle/operators/linear_chain_crf_op.cu @@ -16,11 +16,12 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( linear_chain_crf, - ops::LinearChainCRFOpKernel, - ops::LinearChainCRFOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::LinearChainCRFOpKernel, + ops::LinearChainCRFOpKernel); +REGISTER_OP_CUDA_KERNEL( linear_chain_crf_grad, - ops::LinearChainCRFGradOpKernel, - ops::LinearChainCRFGradOpKernel); + ops::LinearChainCRFGradOpKernel, + ops::LinearChainCRFGradOpKernel); diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index 014bbfa758..694584e79c 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -50,7 +50,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class LinearChainCRFOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -137,7 +137,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { framework::make_ddim({static_cast(batch_size), 1}), platform::CPUPlace()); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context() + .eigen_device(); auto x = EigenMatrix::From(*emission_weights); auto x_row_max = EigenMatrix::From(emission_row_max); x_row_max.device(place) = @@ -287,7 +288,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { } }; -template +template class LinearChainCRFGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -359,8 +360,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { emission_grad->mutable_data(platform::CPUPlace()); if (transition_grad) { transition_grad->mutable_data(platform::CPUPlace()); - math::SetConstant()(ctx.device_context(), - transition_grad, 0.); + math::set_constant(ctx.device_context(), transition_grad, 0.); } // Now, all the inputs and outputs should be on the CPU memory. @@ -384,10 +384,10 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor one_seq_beta = beta.Slice(start_pos, end_pos); Tensor one_seq_emission_grad = emission_grad->Slice(start_pos, end_pos); - BackwardOneSequence(ctx.device_context(), ll_grad[i], - one_seq_emission_exps, *transition_exps, - one_seq_alpha, one_seq_label, &one_seq_beta, - transition_grad, &one_seq_emission_grad); + BackwardOneSequence( + ctx.template device_context(), ll_grad[i], + one_seq_emission_exps, *transition_exps, one_seq_alpha, one_seq_label, + &one_seq_beta, transition_grad, &one_seq_emission_grad); } if (platform::is_gpu_place(ctx.GetPlace())) { @@ -441,8 +441,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { copyTensor(ctx, transition_grad_src, transition_grad_dst); } - void BackwardOneSequence(const platform::DeviceContext& ctx, const T ll_grad, - const Tensor& emission_exps, + void BackwardOneSequence(const platform::CPUDeviceContext& ctx, + const T ll_grad, const Tensor& emission_exps, const Tensor& transition_exps, const Tensor& alpha, const Tensor& label, Tensor* beta, Tensor* transition_grad, @@ -481,7 +481,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { auto alpha_mat = EigenMatrix::From(alpha); auto beta_mat = EigenMatrix::From(*beta); - auto* place = ctx.GetEigenDevice(); + auto* place = ctx.eigen_device(); auto prob = alpha_mat * beta_mat; auto row_sum = prob.sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(seq_length, 1)) diff --git a/paddle/operators/lod_reset_op.cu b/paddle/operators/lod_reset_op.cu index 5244a17c3a..f7c2358980 100644 --- a/paddle/operators/lod_reset_op.cu +++ b/paddle/operators/lod_reset_op.cu @@ -16,9 +16,10 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lod_reset, - ops::LoDResetKernel, - ops::LoDResetKernel); -REGISTER_OP_GPU_KERNEL( - lod_reset_grad, ops::LoDResetGradKernel, - ops::LoDResetGradKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset, ops::LoDResetKernel, + ops::LoDResetKernel); +REGISTER_OP_CUDA_KERNEL( + lod_reset_grad, + ops::LoDResetGradKernel, + ops::LoDResetGradKernel); diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h index cbcbf80adc..b86f8b1313 100644 --- a/paddle/operators/lod_reset_op.h +++ b/paddle/operators/lod_reset_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class LoDResetKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -65,7 +65,7 @@ class LoDResetKernel : public framework::OpKernel { } }; -template +template class LoDResetGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc index 257e5c8a49..4524229a33 100644 --- a/paddle/operators/log_loss_op.cc +++ b/paddle/operators/log_loss_op.cc @@ -109,7 +109,8 @@ class LogLossGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker, log_loss_grad, ops::LogLossGradOp); -REGISTER_OP_CPU_KERNEL(log_loss, - ops::LogLossKernel); REGISTER_OP_CPU_KERNEL( - log_loss_grad, ops::LogLossGradKernel); + log_loss, ops::LogLossKernel); +REGISTER_OP_CPU_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu index 6c189ef341..e87ac7d12a 100644 --- a/paddle/operators/log_loss_op.cu +++ b/paddle/operators/log_loss_op.cu @@ -16,7 +16,8 @@ #include "paddle/operators/log_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(log_loss, - ops::LogLossKernel); -REGISTER_OP_GPU_KERNEL( - log_loss_grad, ops::LogLossGradKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss, ops::LogLossKernel); +REGISTER_OP_CUDA_KERNEL( + log_loss_grad, + ops::LogLossGradKernel); diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h index 73404fce91..743eddb740 100644 --- a/paddle/operators/log_loss_op.h +++ b/paddle/operators/log_loss_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class LogLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -38,7 +38,7 @@ class LogLossKernel : public framework::OpKernel { auto label = EigenVector::Flatten(*ctx.Input("Labels")); auto loss = EigenVector::Flatten(*loss_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); loss.device(place) = (-(label * (prediction + epsilon).log()) - ((static_cast(1) - label) * @@ -46,7 +46,7 @@ class LogLossKernel : public framework::OpKernel { } }; -template +template class LogLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -59,7 +59,7 @@ class LogLossGradKernel : public framework::OpKernel { auto* dpred = ctx.Output(framework::GradVarName("Predicted")); auto dl = EigenVector::Flatten(*dloss); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); if (dpred) { dpred->mutable_data(ctx.GetPlace()); diff --git a/paddle/operators/logical_op.cu b/paddle/operators/logical_op.cu index d41239b2ca..7fef60e0c9 100644 --- a/paddle/operators/logical_op.cu +++ b/paddle/operators/logical_op.cu @@ -14,11 +14,11 @@ #include "paddle/operators/logical_op.h" -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CUDA, paddle::operators::LogicalAndFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CUDA, paddle::operators::LogicalOrFunctor); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, GPU, +REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CUDA, paddle::operators::LogicalNotFunctor); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, GPU, +REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CUDA, paddle::operators::LogicalXorFunctor); diff --git a/paddle/operators/logical_op.h b/paddle/operators/logical_op.h index 6e78a7d6ed..629388cac8 100644 --- a/paddle/operators/logical_op.h +++ b/paddle/operators/logical_op.h @@ -47,7 +47,7 @@ struct LogicalXorFunctor { } }; -template +template class BinaryLogicalOpKernel : public framework::OpKernel { public: @@ -57,14 +57,14 @@ class BinaryLogicalOpKernel auto* y = context.Input("Y"); auto* out = context.Output("Out"); Functor binary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), - y->data(), out->mutable_data(context.GetPlace()), - binary_func); + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), y->data(), + out->mutable_data(context.GetPlace()), binary_func); } }; -template +template class UnaryLogicalOpKernel : public framework::OpKernel { public: @@ -73,8 +73,9 @@ class UnaryLogicalOpKernel auto* x = context.Input("X"); auto* out = context.Output("Out"); Functor unary_func; - platform::Transform trans; - trans(context.device_context(), x->data(), x->data() + x->numel(), + platform::Transform trans; + trans(context.template device_context(), x->data(), + x->data() + x->numel(), out->mutable_data(context.GetPlace()), unary_func); } }; @@ -85,9 +86,9 @@ class UnaryLogicalOpKernel #define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); #define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ REGISTER_OP_##dev##_KERNEL( \ op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##Place, functor>); + ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 84b044184a..9431030a53 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -85,6 +85,8 @@ template class LookupTableGradCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { + auto& dev_ctx = + context.template device_context(); bool is_sparse = context.Attr("is_sparse"); if (is_sparse) { auto* ids = context.Input("Ids"); @@ -95,7 +97,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { auto* ids_data = ids->data(); auto ids_dim = ids->dims(); - auto stream = context.cuda_device_context().stream(); + auto stream = dev_ctx.stream(); // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_dim[0]); @@ -129,14 +131,11 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { T* d_table = d_table_t->mutable_data(context.GetPlace()); auto t = framework::EigenVector::Flatten(*d_table_t); - t.device(context.GetEigenDevice()) = - t.constant(static_cast(0)); + t.device(*dev_ctx.eigen_device()) = t.constant(static_cast(0)); dim3 threads(128, 8); dim3 grids(8, 1); - LookupTableGrad< - T, 128, 8, - 8><<>>( + LookupTableGrad<<>>( d_table, d_output, ids, N, K, D); } } @@ -146,7 +145,8 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lookup_table, ops::LookupTableCUDAKernel, - ops::LookupTableCUDAKernel); -REGISTER_OP_GPU_KERNEL(lookup_table_grad, ops::LookupTableGradCUDAKernel, - ops::LookupTableGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table, ops::LookupTableCUDAKernel, + ops::LookupTableCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lookup_table_grad, + ops::LookupTableGradCUDAKernel, + ops::LookupTableGradCUDAKernel); diff --git a/paddle/operators/lrn_op.cc b/paddle/operators/lrn_op.cc index e20340e77b..b5b7bc940a 100644 --- a/paddle/operators/lrn_op.cc +++ b/paddle/operators/lrn_op.cc @@ -20,7 +20,7 @@ namespace operators { using framework::Tensor; template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, framework::Tensor* mid, int N, int C, int H, int W, int n, @@ -55,11 +55,11 @@ struct LRNFunctor { out_e = x_v * e_mid.reshape(Eigen::DSizes(e_mid.size())).pow(-beta); } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, const framework::Tensor& mid, framework::Tensor* x_g, @@ -113,8 +113,8 @@ struct LRNGradFunctor { } } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; class LRNOp : public framework::OperatorWithKernel { public: @@ -204,7 +204,7 @@ Input(i, x, y), Output(i, x, y) represents an element in an image. C is the number of feature maps of one image. n is a hyper-parameter configured when operator is initialized. The sum in the denominator is the sum of the same positions in the neighboring maps. - + )DOC"); } }; @@ -230,6 +230,7 @@ class LRNOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lrn, ops::LRNOp, ops::LRNOpMaker, lrn_grad, ops::LRNOpGrad); -REGISTER_OP_CPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_CPU_KERNEL(lrn_grad, - ops::LRNGradKernel); +REGISTER_OP_CPU_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CPU_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.cu b/paddle/operators/lrn_op.cu index e9a8671233..c6857c2b6d 100644 --- a/paddle/operators/lrn_op.cu +++ b/paddle/operators/lrn_op.cu @@ -69,19 +69,18 @@ void CrossMapNormal(const framework::ExecutionContext& ctx, const T* inputs, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - KeCMRNormFillScale< - T><<>>( + auto& dev_ctx = ctx.template device_context(); + KeCMRNormFillScale<<>>( img_size, inputs, mid, C, H, W, n, k, alpha); int input_size = N * H * W * C; grid_size = (input_size + block_size - 1) / block_size; - KeCMRNormOutput< - T><<>>( + KeCMRNormOutput<<>>( input_size, inputs, mid, -beta, outputs); } template -struct LRNFunctor { +struct LRNFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& input, framework::Tensor* out, framework::Tensor* mid, int N, int C, int H, int W, int n, @@ -92,8 +91,8 @@ struct LRNFunctor { } }; -template struct LRNFunctor; -template struct LRNFunctor; +template struct LRNFunctor; +template struct LRNFunctor; template __global__ void KeCMRNormDiff(int img_size, const T* x, const T* out, @@ -148,14 +147,14 @@ void CrossMapNormalGrad(const framework::ExecutionContext& ctx, const T* x, const int block_size = 1024; int grid_size = (img_size + block_size - 1) / block_size; - KeCMRNormDiff< - T><<>>( + auto& dev_ctx = ctx.template device_context(); + KeCMRNormDiff<<>>( img_size, x, out, mid, x_g, out_g, C, H, W, n, -beta, 2.0f * alpha * beta); } template -struct LRNGradFunctor { +struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, const framework::Tensor& mid, framework::Tensor* x_g, @@ -167,12 +166,13 @@ struct LRNGradFunctor { } }; -template struct LRNGradFunctor; -template struct LRNGradFunctor; +template struct LRNGradFunctor; +template struct LRNGradFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lrn, ops::LRNKernel); -REGISTER_OP_GPU_KERNEL(lrn_grad, - ops::LRNGradKernel); +REGISTER_OP_CUDA_KERNEL( + lrn, ops::LRNKernel); +REGISTER_OP_CUDA_KERNEL( + lrn_grad, ops::LRNGradKernel); diff --git a/paddle/operators/lrn_op.h b/paddle/operators/lrn_op.h index aa7539db4a..44063d3e03 100644 --- a/paddle/operators/lrn_op.h +++ b/paddle/operators/lrn_op.h @@ -29,7 +29,7 @@ struct LRNFunctor { T k, T alpha, T beta); }; -template +template class LRNKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; @@ -65,12 +65,12 @@ class LRNKernel : public framework::OpKernel { PADDLE_ENFORCE(beta >= 0.0, "beta should >= 0.0"); PADDLE_ENFORCE(k >= 0.0, "k should >= 0.0"); - LRNFunctor f; + LRNFunctor f; f(ctx, x, out, mid, N, C, H, W, n, k, alpha, beta); } }; -template +template struct LRNGradFunctor { void operator()(const framework::ExecutionContext& ctx, const framework::Tensor& x, const framework::Tensor& out, @@ -98,7 +98,7 @@ struct LRNGradFunctor { * The upper and lower is the same as forward. The logic of the sum * is also the same as forward. */ -template +template class LRNGradKernel : public framework::OpKernel { public: using Tensor = framework::Tensor; @@ -121,7 +121,7 @@ class LRNGradKernel : public framework::OpKernel { T alpha = ctx.Attr("alpha"); T beta = ctx.Attr("beta"); - LRNGradFunctor f; + LRNGradFunctor f; f(ctx, x, out, mid, x_g, out_g, N, C, H, W, n, alpha, beta); } }; diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index fa8e5f2da8..2db7da30db 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -273,8 +273,9 @@ class LSTMGradOp : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp); -REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_CPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CPU_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CPU_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.cu.cc b/paddle/operators/lstm_op.cu.cc index 610cbb03e8..48519bed6f 100644 --- a/paddle/operators/lstm_op.cu.cc +++ b/paddle/operators/lstm_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/lstm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel, - ops::LSTMKernel); -REGISTER_OP_GPU_KERNEL(lstm_grad, - ops::LSTMGradKernel, - ops::LSTMGradKernel); +REGISTER_OP_CUDA_KERNEL( + lstm, ops::LSTMKernel, + ops::LSTMKernel); +REGISTER_OP_CUDA_KERNEL( + lstm_grad, ops::LSTMGradKernel, + ops::LSTMGradKernel); diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h index a78f548aaf..14abd4bf0a 100644 --- a/paddle/operators/lstm_op.h +++ b/paddle/operators/lstm_op.h @@ -24,16 +24,16 @@ namespace operators { using LoDTensor = framework::LoDTensor; using Tensor = framework::Tensor; -template -inline void ReorderInitState(const platform::DeviceContext& ctx, +template +inline void ReorderInitState(const DeviceContext& ctx, const framework::Tensor& src, const size_t* index, framework::Tensor* dst, bool indexed_src) { - math::CopyMatrixRowsFunctor row_shuffle; + math::CopyMatrixRowsFunctor row_shuffle; dst->mutable_data(src.dims(), ctx.GetPlace()); row_shuffle(ctx, src, index, *dst, indexed_src); } -template +template class LSTMKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,8 +52,8 @@ class LSTMKernel : public framework::OpKernel { cell_out->mutable_data(ctx.GetPlace()); bool is_reverse = ctx.Attr("is_reverse"); - math::LoDTensor2BatchFunctor to_batch; - auto& device_ctx = ctx.device_context(); + math::LoDTensor2BatchFunctor to_batch; + auto& device_ctx = ctx.template device_context(); to_batch(device_ctx, *input, *batch_gate, true, is_reverse); auto in_dims = input->dims(); @@ -64,7 +64,7 @@ class LSTMKernel : public framework::OpKernel { Tensor b = *bias; b.Resize({bias->numel(), 1}); Tensor gate_bias = b.Slice(0, 4 * frame_size); - math::RowwiseAdd add_bias; + math::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -88,8 +88,8 @@ class LSTMKernel : public framework::OpKernel { // Since the batch computing for LSTM reorders the input sequence // according to their length. The initialized cell state also needs // to reorder. - ReorderInitState(device_ctx, *cell_t0, order, &ordered_c0, - true); + ReorderInitState(device_ctx, *cell_t0, order, + &ordered_c0, true); lstm_value.prev_state_value = ordered_c0.data(); } @@ -121,9 +121,9 @@ class LSTMKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n - 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_t = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden_t, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden_t, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } else if (hidden_t0) { // If n == 0 and there is no initialized hidden state, that is to say // the H0 is zeros, the calculation W_h * H0 will be skiped. @@ -133,24 +133,24 @@ class LSTMKernel : public framework::OpKernel { // according to their length. The initialized hidden state also needs // to reorder. Tensor ordered_h0; - ReorderInitState(device_ctx, *hidden_t0, order, &ordered_h0, - true); - math::matmul(device_ctx, ordered_h0, false, *weight, false, - static_cast(1.0), &gate_t, - static_cast(1.0)); + ReorderInitState(device_ctx, *hidden_t0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, false, *weight, + false, static_cast(1.0), &gate_t, + static_cast(1.0)); } lstm_value.gate_value = gate_t.data(); lstm_value.output_value = out_t.data(); lstm_value.state_value = cell_t.data(); lstm_value.state_active_value = cell_pre_act_t.data(); - math::LstmUnitFunctor::compute(device_ctx, lstm_value, - frame_size, cur_batch_size, - gate_act, cell_act, cand_act); + math::LstmUnitFunctor::compute( + device_ctx, lstm_value, frame_size, cur_batch_size, gate_act, + cell_act, cand_act); lstm_value.prev_state_value = lstm_value.state_value; } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; batch_hidden.set_lod(batch_gate->lod()); // restore the output hidden in LoDTensor from the batch hidden to_seq(device_ctx, batch_hidden, *hidden_out); @@ -161,7 +161,7 @@ class LSTMKernel : public framework::OpKernel { } }; -template +template class LSTMGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -187,8 +187,8 @@ class LSTMGradKernel : public framework::OpKernel { auto* h0_g = ctx.Output(framework::GradVarName("H0")); auto* c0_g = ctx.Output(framework::GradVarName("C0")); - auto& device_ctx = ctx.device_context(); - math::SetConstant zero; + auto& device_ctx = ctx.template device_context(); + math::SetConstant zero; if (weight_g) { weight_g->mutable_data(ctx.GetPlace()); zero(device_ctx, weight_g, static_cast(0.0)); @@ -200,7 +200,8 @@ class LSTMGradKernel : public framework::OpKernel { Tensor ordered_h0, ordered_c0, ordered_h0_g, ordered_c0_g; const size_t* order = batch_gate->lod()[2].data(); if (c0) { - ReorderInitState(device_ctx, *c0, order, &ordered_c0, true); + ReorderInitState(device_ctx, *c0, order, &ordered_c0, + true); } if (c0 && c0_g) { ordered_c0_g.mutable_data(c0_g->dims(), ctx.GetPlace()); @@ -240,10 +241,10 @@ class LSTMGradKernel : public framework::OpKernel { lstm_grad.check_og_grad = nullptr; } - math::LoDTensor2BatchFunctor to_batch; + math::LoDTensor2BatchFunctor to_batch; auto ToBatch = [&batch_gate, &to_batch]( - const platform::DeviceContext& ctx, const framework::LoDTensor& src, + const DeviceContext& ctx, const framework::LoDTensor& src, const framework::DDim& dims, framework::LoDTensor& dst) { dst.mutable_data(dims, ctx.GetPlace()); dst.set_lod(batch_gate->lod()); @@ -299,7 +300,7 @@ class LSTMGradKernel : public framework::OpKernel { } int cur_batch_size = bend - bstart; - math::LstmUnitGradFunctor::compute( + math::LstmUnitGradFunctor::compute( device_ctx, lstm_value, lstm_grad, frame_size, cur_batch_size, gate_act, cell_act, cand_act); @@ -307,33 +308,34 @@ class LSTMGradKernel : public framework::OpKernel { int pre_h_start = static_cast(batch_starts[n - 1]); int pre_h_end = pre_h_start + cur_batch_size; auto pre_hidden_g = batch_hidden_g.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &pre_hidden_g, - static_cast(1.0)); + math::matmul(device_ctx, gate_g, false, *weight, true, + static_cast(1.0), &pre_hidden_g, + static_cast(1.0)); if (weight_g) { /* backward weight */ auto pre_hidden = batch_hidden.Slice(pre_h_start, pre_h_end); - math::matmul(device_ctx, pre_hidden, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + math::matmul(device_ctx, pre_hidden, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } } else { if (h0 && weight_g) { - ReorderInitState(device_ctx, *h0, order, &ordered_h0, true); - math::matmul(device_ctx, ordered_h0, true, gate_g, false, - static_cast(1.0), weight_g, - static_cast(1.0)); + ReorderInitState(device_ctx, *h0, order, + &ordered_h0, true); + math::matmul(device_ctx, ordered_h0, true, gate_g, + false, static_cast(1.0), weight_g, + static_cast(1.0)); } if (h0 && h0_g) { ordered_h0_g.mutable_data(h0_g->dims(), ctx.GetPlace()); - math::matmul(device_ctx, gate_g, false, *weight, true, - static_cast(1.0), &ordered_h0_g, - static_cast(0.0)); + math::matmul(device_ctx, gate_g, false, *weight, + true, static_cast(1.0), + &ordered_h0_g, static_cast(0.0)); } } } - math::Batch2LoDTensorFunctor to_seq; + math::Batch2LoDTensorFunctor to_seq; if (in_g) { /* backward data */ in_g->mutable_data(ctx.GetPlace()); @@ -344,15 +346,17 @@ class LSTMGradKernel : public framework::OpKernel { Tensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - math::ColwiseSum col_sum; + math::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } if (h0 && h0_g) { - ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, false); + ReorderInitState(device_ctx, ordered_h0_g, order, h0_g, + false); } if (c0 && c0_g) { - ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, false); + ReorderInitState(device_ctx, ordered_c0_g, order, c0_g, + false); } } }; diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu index e192283aa0..291f2c295e 100644 --- a/paddle/operators/lstm_unit_op.cu +++ b/paddle/operators/lstm_unit_op.cu @@ -173,7 +173,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, - ops::LstmUnitOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, - ops::LstmUnitGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel, + ops::LstmUnitOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel, + ops::LstmUnitGradOpCUDAKernel); diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h index 38cb298f92..61705675d9 100644 --- a/paddle/operators/lstm_unit_op.h +++ b/paddle/operators/lstm_unit_op.h @@ -35,7 +35,7 @@ inline T tanh(T x) { return 2. * sigmoid(2. * x) - 1.; } -template +template class LstmUnitKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -78,7 +78,7 @@ class LstmUnitKernel : public framework::OpKernel { } }; -template +template class LstmUnitGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc index d7e8a0ea76..42e8961c0e 100644 --- a/paddle/operators/margin_rank_loss_op.cc +++ b/paddle/operators/margin_rank_loss_op.cc @@ -117,7 +117,7 @@ REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp, ops::MarginRankLossGradOp); REGISTER_OP_CPU_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); + ops::MarginRankLossKernel); REGISTER_OP_CPU_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/operators/margin_rank_loss_op.cu index 3a639f25d4..1c2afccc5b 100644 --- a/paddle/operators/margin_rank_loss_op.cu +++ b/paddle/operators/margin_rank_loss_op.cu @@ -16,9 +16,9 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( margin_rank_loss, - ops::MarginRankLossKernel); -REGISTER_OP_GPU_KERNEL( + ops::MarginRankLossKernel); +REGISTER_OP_CUDA_KERNEL( margin_rank_loss_grad, - ops::MarginRankLossGradKernel); + ops::MarginRankLossGradKernel); diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/operators/margin_rank_loss_op.h index 8d0830147e..9c1f96cac1 100644 --- a/paddle/operators/margin_rank_loss_op.h +++ b/paddle/operators/margin_rank_loss_op.h @@ -34,7 +34,7 @@ struct Heaviside { } }; -template +template class MarginRankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -56,13 +56,13 @@ class MarginRankLossKernel : public framework::OpKernel { auto x1 = framework::EigenVector::Flatten(*x1_t); auto x2 = framework::EigenVector::Flatten(*x2_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU()); act.device(dev) = out.unaryExpr(Heaviside()); } }; -template +template class MarginRankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -78,7 +78,7 @@ class MarginRankLossGradKernel : public framework::OpKernel { auto d_out = framework::EigenVector::Flatten(*d_out_t); auto act = framework::EigenVector::Flatten(*act_t); auto label = framework::EigenVector::Flatten(*label_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); // compute d_x1 if (d_x1_t) { diff --git a/paddle/operators/math/context_project.cc b/paddle/operators/math/context_project.cc index f82ea5d7be..980dd90df8 100644 --- a/paddle/operators/math/context_project.cc +++ b/paddle/operators/math/context_project.cc @@ -18,8 +18,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.cu b/paddle/operators/math/context_project.cu index 04eeed543c..934e3df645 100644 --- a/paddle/operators/math/context_project.cu +++ b/paddle/operators/math/context_project.cu @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index d853507188..4036614086 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -81,17 +81,17 @@ using LoDTensor = framework::LoDTensor; * */ -template +template class ContextProjectFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, const Tensor& padding_data, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Im2ColFunctor im2col_ocf; + math::Im2ColFunctor im2col_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -188,17 +188,17 @@ class ContextProjectFunctor { } }; -template +template class ContextProjectGradFunctor { public: - void operator()(const platform::DeviceContext& context, const LoDTensor& in, + void operator()(const DeviceContext& context, const LoDTensor& in, bool padding_trainable, const int context_start, const int context_length, const int context_stride, const int up_pad, const int down_pad, bool pad_grad, bool input_grad, Tensor* padding_data, Tensor* col) { auto lod_level_0 = in.lod()[0]; - math::Col2ImFunctor col2im_ocf; + math::Col2ImFunctor col2im_ocf; std::vector dilation({1, 1}); std::vector padding({up_pad, 0, down_pad, 0}); @@ -258,8 +258,8 @@ class ContextProjectGradFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data->Slice(k, k + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } if (down_pad > 0) { @@ -290,8 +290,8 @@ class ContextProjectGradFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data->Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - axpy(context, w_sub.numel(), static_cast(1), - out_t_sub.data(), w_sub.data()); + axpy(context, w_sub.numel(), static_cast(1), + out_t_sub.data(), w_sub.data()); } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc index cf238a58e0..6011a196d4 100644 --- a/paddle/operators/math/cross_entropy.cc +++ b/paddle/operators/math/cross_entropy.cc @@ -24,9 +24,9 @@ template ; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, + void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out, const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel) { const int batch_size = prob->dims()[0]; @@ -35,7 +35,7 @@ class CrossEntropyFunctor { auto lbl = EigenMatrix::From(*labels); auto loss = EigenMatrix::From(*out); - loss.device(*ctx.GetEigenDevice()) = + loss.device(*ctx.eigen_device()) = -((lbl * in.log().unaryExpr(math::TolerableValue())) .sum(Eigen::DSizes(1)) .reshape(Eigen::DSizes(batch_size, 1))); @@ -53,8 +53,8 @@ class CrossEntropyFunctor { } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu index 651c08f740..2132d49c93 100644 --- a/paddle/operators/math/cross_entropy.cu +++ b/paddle/operators/math/cross_entropy.cu @@ -95,10 +95,10 @@ __global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label, using Tensor = framework::Tensor; template -class CrossEntropyFunctor { +class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& ctx, framework::Tensor* out, - const framework::Tensor* prob, + void operator()(const platform::CUDADeviceContext& ctx, + framework::Tensor* out, const framework::Tensor* prob, const framework::Tensor* labels, bool softLabel) { const T* prob_data = prob->data(); T* loss_data = out->mutable_data(ctx.GetPlace()); @@ -118,16 +118,14 @@ class CrossEntropyFunctor { const int64_t* label_data = labels->data(); int block = 512; int grid = (batch_size + block - 1) / block; - CrossEntropyKernel<<< - grid, block, 0, - reinterpret_cast(ctx).stream()>>>( + CrossEntropyKernel<<>>( loss_data, prob_data, label_data, batch_size, class_num); } } }; -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; +template class CrossEntropyFunctor; +template class CrossEntropyFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h index 70ed9ddd55..677adb5ada 100644 --- a/paddle/operators/math/cross_entropy.h +++ b/paddle/operators/math/cross_entropy.h @@ -33,11 +33,11 @@ struct TolerableValue { } }; -template +template class CrossEntropyFunctor { public: - void operator()(const platform::DeviceContext& context, - framework::Tensor* out, const framework::Tensor* prob, + void operator()(const DeviceContext& context, framework::Tensor* out, + const framework::Tensor* prob, const framework::Tensor* labels, const bool softLabel); }; } // namespace math diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc index ae4e47b014..d570c68cd4 100644 --- a/paddle/operators/math/gru_compute.cc +++ b/paddle/operators/math/gru_compute.cc @@ -19,14 +19,14 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitFunctor { + static void compute(const platform::CPUDeviceContext &context, hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { #ifndef __NVCC__ if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3); @@ -36,7 +36,7 @@ struct GRUUnitFunctor { frame_size, batch_size, active_gate); if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3); @@ -49,8 +49,8 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const platform::CPUDeviceContext &context, hl_gru_value value, hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, @@ -60,13 +60,13 @@ struct GRUUnitGradFunctor { grad, frame_size, batch_size, active_node); if (value.prev_out_value && grad.prev_out_grad) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size, 1, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, frame_size, 0, grad.reset_output_grad, frame_size); if (grad.state_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size, batch_size, 1, value.reset_output_value, frame_size, grad.gate_grad + frame_size * 2, frame_size * 3, 1, @@ -78,13 +78,13 @@ struct GRUUnitGradFunctor { grad, frame_size, batch_size, active_gate); if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size * 2, batch_size, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); @@ -94,10 +94,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu index 0252bdbdb6..dd518cd1e4 100644 --- a/paddle/operators/math/gru_compute.cu +++ b/paddle/operators/math/gru_compute.cu @@ -19,13 +19,12 @@ namespace operators { namespace math { template -struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitFunctor { + static void compute(const platform::CUDADeviceContext &context, hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); dim3 threads; dim3 grid; if (batch_size == 1) { @@ -39,7 +38,7 @@ struct GRUUnitFunctor { } if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size * 2, frame_size, 1, value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, 1, value.gate_value, frame_size * 3); @@ -62,7 +61,7 @@ struct GRUUnitFunctor { } if (value.prev_out_value) { - math::gemm( + math::gemm( context, false, false, batch_size, frame_size, frame_size, 1, value.reset_output_value, frame_size, value.state_weight, frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3); @@ -87,14 +86,13 @@ struct GRUUnitFunctor { }; template -struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, +struct GRUUnitGradFunctor { + static void compute(const platform::CUDADeviceContext &context, hl_gru_value value, hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); dim3 threads; dim3 grid; if (batch_size == 1) { @@ -124,13 +122,13 @@ struct GRUUnitGradFunctor { } if (value.prev_out_value && grad.prev_out_grad) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size, 1, grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, frame_size, 0, grad.reset_output_grad, frame_size); if (grad.state_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size, batch_size, 1, value.reset_output_value, frame_size, grad.gate_grad + frame_size * 2, frame_size * 3, 1, @@ -155,13 +153,13 @@ struct GRUUnitGradFunctor { } if (grad.prev_out_grad && value.prev_out_value) { - math::gemm( + math::gemm( context, false, true, batch_size, frame_size, frame_size * 2, 1, grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, grad.prev_out_grad, frame_size); if (grad.gate_weight_grad) { - math::gemm( + math::gemm( context, true, false, frame_size, frame_size * 2, batch_size, 1, value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2); @@ -170,10 +168,10 @@ struct GRUUnitGradFunctor { } }; -template struct GRUUnitFunctor; -template struct GRUUnitFunctor; -template struct GRUUnitGradFunctor; -template struct GRUUnitGradFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitFunctor; +template struct GRUUnitGradFunctor; +template struct GRUUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h index 58ea59f68e..ca1343cb2c 100644 --- a/paddle/operators/math/gru_compute.h +++ b/paddle/operators/math/gru_compute.h @@ -40,19 +40,18 @@ struct hl_gru_grad { T *prev_out_grad; }; -template +template struct GRUUnitFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frame_size, int batch_size, + static void compute(const DeviceContext &context, hl_gru_value value, + int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; -template +template struct GRUUnitGradFunctor { - static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, - int frame_size, int batch_size, + static void compute(const DeviceContext &context, hl_gru_value value, + hl_gru_grad grad, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc index c10c44c520..707ebf0596 100644 --- a/paddle/operators/math/im2col.cc +++ b/paddle/operators/math/im2col.cc @@ -25,9 +25,9 @@ namespace math { */ template class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -90,9 +90,9 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -149,13 +149,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; /* * im = [input_channels, input_height, input_width] @@ -164,9 +164,9 @@ template class Col2ImFunctor class Im2ColFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -235,9 +235,9 @@ class Im2ColFunctor class Col2ImFunctor { + platform::CPUDeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -300,13 +300,13 @@ class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Im2ColFunctor; + platform::CPUDeviceContext, double>; template class Col2ImFunctor; + platform::CPUDeviceContext, float>; template class Col2ImFunctor; + platform::CPUDeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu index bf78942439..a88e837b03 100644 --- a/paddle/operators/math/im2col.cu +++ b/paddle/operators/math/im2col.cu @@ -58,9 +58,9 @@ __global__ void im2col(const T* data_im, int num_outs, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -96,9 +96,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2col<<>>( im.data(), num_outputs, im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -160,9 +158,9 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -203,9 +201,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2im<<>>( num_kernels, col.data(), im_height, im_width, dilation[0], dilation[1], filter_height, filter_width, stride[0], stride[1], padding[0], padding[2], col_height, col_width, im->data()); @@ -213,13 +209,13 @@ class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; template __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, @@ -260,9 +256,9 @@ __global__ void im2colOCF(const T* im_data, int im_channels, int im_height, */ template class Im2ColFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& im, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col) { @@ -310,9 +306,7 @@ class Im2ColFunctor<<(context) - .stream()>>>( + im2colOCF<<>>( im.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, col->data()); @@ -358,9 +352,9 @@ __global__ void col2imOCF(const T* col_data, int im_channels, int im_height, */ template class Col2ImFunctor { + platform::CUDADeviceContext, T> { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, @@ -409,9 +403,7 @@ class Col2ImFunctor<<(context) - .stream()>>>( + col2imOCF<<>>( col.data(), im_channels, im_height, im_width, filter_height, filter_width, stride[0], stride[1], padding[0], padding[1], col_height, col_width, im->data()); @@ -419,13 +411,13 @@ class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Im2ColFunctor; + platform::CUDADeviceContext, double>; template class Col2ImFunctor; + platform::CUDADeviceContext, float>; template class Col2ImFunctor; + platform::CUDADeviceContext, double>; } // namespace math } // namespace operators diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index 24fd9a06e9..38f2c9fe0a 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -79,20 +79,19 @@ enum class ColFormat { kCFO = 0, kOCF = 1 }; * \note The caller needs to ensure that imShape.inputChannels is equal to * colShape.inputChannels. */ -template +template class Im2ColFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& im, const std::vector& dilation, + void operator()(const DeviceContext& context, const framework::Tensor& im, + const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* col); }; -template +template class Col2ImFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilation, const std::vector& stride, const std::vector& padding, framework::Tensor* im); diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index ae197a97ed..256f3bc9bd 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -template +template void testIm2col() { paddle::framework::Tensor input_tmp; paddle::framework::Tensor input; @@ -59,18 +59,7 @@ void testIm2col() { memcpy(input_ptr, arr, 6 * sizeof(float)); auto* place = new Place(); - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_WITH_CUDA - } + DeviceContext* context = new DeviceContext(*place); if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { @@ -83,10 +72,10 @@ void testIm2col() { // Im2Col paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> im2col; paddle::operators::math::Im2ColFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> im2col_ocf; im2col(*context, input, dilation, stride, padding, &output_cfo); @@ -119,10 +108,10 @@ void testIm2col() { // Col2Im: kCFO paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kCFO, Place, float> + paddle::operators::math::ColFormat::kCFO, DeviceContext, float> col2im; paddle::operators::math::Col2ImFunctor< - paddle::operators::math::ColFormat::kOCF, Place, float> + paddle::operators::math::ColFormat::kOCF, DeviceContext, float> col2im_ocf; float col2im_data[] = {0, 2, 2, 3, 8, 5}; @@ -168,8 +157,8 @@ void testIm2col() { } TEST(math, im2col) { - testIm2col(); + testIm2col(); #ifdef PADDLE_WITH_CUDA - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc index ad3a59bcdb..2c2e8bb82e 100644 --- a/paddle/operators/math/lstm_compute.cc +++ b/paddle/operators/math/lstm_compute.cc @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, const std::string& cand_act) { @@ -42,8 +42,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const platform::CPUDeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, @@ -72,10 +72,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu index b2122f2a5c..92b1f4228b 100644 --- a/paddle/operators/math/lstm_compute.cu +++ b/paddle/operators/math/lstm_compute.cu @@ -21,8 +21,8 @@ namespace operators { namespace math { template -struct LstmUnitFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitFunctor { + static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, const std::string& cand_act) { @@ -33,8 +33,8 @@ struct LstmUnitFunctor { }; template -struct LstmUnitGradFunctor { - static void compute(const platform::DeviceContext& context, +struct LstmUnitGradFunctor { + static void compute(const platform::CUDADeviceContext& context, LstmMetaValue value, LstmMetaGrad grad, int frame_size, int batch_size, const std::string& gate_act, const std::string& cell_act, @@ -45,10 +45,10 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; +template class LstmUnitFunctor; +template class LstmUnitFunctor; +template class LstmUnitGradFunctor; +template class LstmUnitGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h index 9652399d4c..5f74e27358 100644 --- a/paddle/operators/math/lstm_compute.h +++ b/paddle/operators/math/lstm_compute.h @@ -67,21 +67,20 @@ inline activation_mode_t ActiveType(const std::string &type) { } } -template +template class LstmUnitFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, int frame_size, int batch_size, + static void compute(const DeviceContext &context, LstmMetaValue value, + int frame_size, int batch_size, const std::string &gate_act, const std::string &cell_act, const std::string &cand_act); }; -template +template class LstmUnitGradFunctor { public: - static void compute(const platform::DeviceContext &context, - LstmMetaValue value, LstmMetaGrad grad, - int frame_size, int batch_size, + static void compute(const DeviceContext &context, LstmMetaValue value, + LstmMetaGrad grad, int frame_size, int batch_size, const std::string &gate_act, const std::string &cell_act, const std::string &cand_act); }; diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index e099a6a439..2b35e4532a 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -21,13 +21,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -36,13 +34,11 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { int lda = (transA == CblasNoTrans) ? K : M; int ldb = (transB == CblasNoTrans) ? N : K; int ldc = N; @@ -51,35 +47,32 @@ void gemm(const platform::DeviceContext& context, } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans, transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -99,15 +92,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -127,7 +121,7 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } @@ -135,8 +129,8 @@ void matmul( #ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -157,8 +151,8 @@ void batched_gemm( } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -183,8 +177,8 @@ void batched_gemm( // functions of Intel MKL are not available. In the future, this computation // should be parallelized. template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -192,14 +186,14 @@ void batched_gemm( const float* Ak = &A[k * strideA]; const float* Bk = &B[k * strideB]; float* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, Ak, - Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -207,55 +201,53 @@ void batched_gemm( const double* Ak = &A[k * strideA]; const double* Bk = &B[k * strideB]; double* Ck = &C[k * M * N]; - gemm(context, transA, transB, M, N, K, alpha, - Ak, Bk, beta, Ck); + gemm(context, transA, transB, M, N, K, + alpha, Ak, Bk, beta, Ck); } } #endif template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; cblas_sgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CPUDeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; cblas_dgemv(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const float alpha, + const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { +void axpy( + const platform::CPUDeviceContext& context, const int n, const double alpha, + const double* x, double* y) { cblas_daxpy(n, alpha, x, 1, y, 1); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; DEFINE_CPU_TRANS(1); DEFINE_CPU_TRANS(2); @@ -310,10 +302,10 @@ void set_constant(const platform::DeviceContext& context, #endif } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; } // namespace math } // namespace operators diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 3018e50a4f..1b560a7e2d 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -22,13 +22,11 @@ namespace operators { namespace math { template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const float alpha, const float* A, - const float* B, const float beta, - float* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float alpha, const float* A, const float* B, const float beta, + float* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -39,19 +37,16 @@ void gemm(const platform::DeviceContext& context, (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, - const CBLAS_TRANSPOSE transB, const int M, - const int N, const int K, - const double alpha, const double* A, - const double* B, const double beta, - double* C) { +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const double alpha, const double* A, const double* B, const double beta, + double* C) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. int lda = (transA == CblasNoTrans) ? K : M; @@ -61,51 +56,45 @@ void gemm(const platform::DeviceContext& context, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, N)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const float alpha, const float* A, - const int lda, const float* B, - const int ldb, const float beta, float* C, - const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, const float alpha, + const float* A, const int lda, const float* B, const int ldb, + const float beta, float* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasSgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void gemm(const platform::DeviceContext& context, - const bool transA, const bool transB, - const int M, const int N, const int K, - const double alpha, const double* A, - const int lda, const double* B, - const int ldb, const double beta, - double* C, const int ldc) { +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const double alpha, const double* A, const int lda, const double* B, + const int ldb, const double beta, double* C, const int ldc) { // Note that cublas follows fortran order, so the order is different from // the cblas convention. cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T; PADDLE_ENFORCE(platform::dynload::cublasDgemm( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, ldc)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, + lda, &beta, C, ldc)); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, float alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float alpha, framework::Tensor* matrix_out, float beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -125,15 +114,16 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void matmul( - const platform::DeviceContext& context, const framework::Tensor& matrix_a, - bool trans_a, const framework::Tensor& matrix_b, bool trans_b, double alpha, +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, double alpha, framework::Tensor* matrix_out, double beta) { auto dim_a = matrix_a.dims(); auto dim_b = matrix_b.dims(); @@ -153,14 +143,14 @@ void matmul( CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; - gemm( + gemm( context, transA, transB, M, N, K, alpha, matrix_a.data(), matrix_b.data(), beta, matrix_out->data()); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C, const int batchCount, const int strideA, const int strideB) { @@ -176,15 +166,13 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void batched_gemm( - const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const double alpha, const double* A, const double* B, const double beta, double* C, const int batchCount, const int strideA, const int strideB) { @@ -200,68 +188,58 @@ void batched_gemm( const int strideC = M * N; PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched( - reinterpret_cast(context) - .cublas_handle(), - cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA, - &beta, C, ldc, strideC, batchCount)); + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, + strideB, A, lda, strideA, &beta, C, ldc, strideC, batchCount)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const float alpha, - const float* A, const float* B, - const float beta, float* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const float alpha, const float* A, const float* B, + const float beta, float* C) { cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasSgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasSgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void gemv(const platform::DeviceContext& context, - const bool trans_a, const int M, - const int N, const double alpha, - const double* A, const double* B, - const double beta, double* C) { +void gemv( + const platform::CUDADeviceContext& context, const bool trans_a, const int M, + const int N, const double alpha, const double* A, const double* B, + const double beta, double* C) { cublasOperation_t cuTransA = (trans_a == false) ? CUBLAS_OP_T : CUBLAS_OP_N; - PADDLE_ENFORCE(platform::dynload::cublasDgemv( - reinterpret_cast(context) - .cublas_handle(), - cuTransA, N, M, &alpha, A, N, B, 1, &beta, C, 1)); + PADDLE_ENFORCE(platform::dynload::cublasDgemv(context.cublas_handle(), + cuTransA, N, M, &alpha, A, N, B, + 1, &beta, C, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const float alpha, - const float* x, float* y) { - PADDLE_ENFORCE(platform::dynload::cublasSaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const float alpha, + const float* x, float* y) { + PADDLE_ENFORCE(platform::dynload::cublasSaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } template <> -void axpy(const platform::DeviceContext& context, - const int n, const double alpha, - const double* x, double* y) { - PADDLE_ENFORCE(platform::dynload::cublasDaxpy( - reinterpret_cast(context) - .cublas_handle(), - n, &alpha, x, 1, y, 1)); +void axpy( + const platform::CUDADeviceContext& context, const int n, const double alpha, + const double* x, double* y) { + PADDLE_ENFORCE(platform::dynload::cublasDaxpy(context.cublas_handle(), n, + &alpha, x, 1, y, 1)); } -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; DEFINE_GPU_TRANS(1); DEFINE_GPU_TRANS(2); @@ -277,8 +255,9 @@ struct TensorSetConstantGPU { template void operator()() const { - SetConstant functor; - functor(context_, tensor_, static_cast(value_)); + SetConstant functor; + functor(reinterpret_cast(context_), + tensor_, static_cast(value_)); } const platform::DeviceContext& context_; @@ -294,27 +273,27 @@ void set_constant_with_place( TensorSetConstantGPU(context, tensor, value)); } -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -// template struct ColwiseSum; -// The ColwiseSum failed in debug mode, +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug mode, // and only failed for this case. So reimplemented it. template <> -void ColwiseSum::operator()( - const platform::DeviceContext& context, const framework::Tensor& input, +void ColwiseSum::operator()( + const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), size); framework::Tensor one; one.mutable_data({in_dims[0]}, context.GetPlace()); - SetConstant set; + SetConstant set; set(context, &one, static_cast(1.0)); - gemv(context, true, static_cast(in_dims[0]), - static_cast(in_dims[1]), 1.0, - input.data(), one.data(), - 0.0, vector->data()); + gemv( + context, true, static_cast(in_dims[0]), static_cast(in_dims[1]), + 1.0, input.data(), one.data(), 0.0, + vector->data()); } } // namespace math diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index f2b025b78b..8cc03c2ba0 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -62,53 +62,51 @@ namespace math { // Then matrixA: M * K, matrixB: K * N, matrixC : M * N // For more detailed info, please refer to // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html -template -void gemm(const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA, +template +void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, const T alpha, const T* A, const T* B, const T beta, T* C); // gemm wrapper with stride args for matrix uncontinuous in memory -template -void gemm(const platform::DeviceContext& context, const bool transA, - const bool transB, const int M, const int N, const int K, - const T alpha, const T* A, const int lda, const T* B, const int ldb, - const T beta, T* C, const int ldc); +template +void gemm(const DeviceContext& context, const bool transA, const bool transB, + const int M, const int N, const int K, const T alpha, const T* A, + const int lda, const T* B, const int ldb, const T beta, T* C, + const int ldc); // matrix multiply with continuous memory -template -void matmul(const platform::DeviceContext& context, - const framework::Tensor& matrix_a, bool trans_a, - const framework::Tensor& matrix_b, bool trans_b, T alpha, - framework::Tensor* matrix_out, T beta); +template +void matmul(const DeviceContext& context, const framework::Tensor& matrix_a, + bool trans_a, const framework::Tensor& matrix_b, bool trans_b, + T alpha, framework::Tensor* matrix_out, T beta); // Batched gemm -template -void batched_gemm(const platform::DeviceContext& context, - const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, - const int M, const int N, const int K, const T alpha, - const T* A, const T* B, const T beta, T* C, - const int batchCount, const int strideA, const int strideB); - -template -void gemv(const platform::DeviceContext& context, const bool trans_a, - const int M, const int N, const T alpha, const T* A, const T* B, - const T beta, T* C); - -template -void axpy(const platform::DeviceContext& context, const int n, const T alpha, - const T* x, T* y); - -template +template +void batched_gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, + const int K, const T alpha, const T* A, const T* B, + const T beta, T* C, const int batchCount, const int strideA, + const int strideB); + +template +void gemv(const DeviceContext& context, const bool trans_a, const int M, + const int N, const T alpha, const T* A, const T* B, const T beta, + T* C); + +template +void axpy(const DeviceContext& context, const int n, const T alpha, const T* x, + T* y); + +template struct Transpose { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& in, framework::Tensor* out, - const std::vector& axis); + void operator()(const DeviceContext& context, const framework::Tensor& in, + framework::Tensor* out, const std::vector& axis); }; -template +template struct SetConstant { - void operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num); + void operator()(const DeviceContext& context, framework::Tensor* tensor, + T num); }; template @@ -118,17 +116,16 @@ void set_constant_with_place(const platform::DeviceContext& context, void set_constant(const platform::DeviceContext& context, framework::Tensor* tensor, float value); -template +template struct RowwiseAdd { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, const framework::Tensor& vec, - framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + const framework::Tensor& vec, framework::Tensor* output); }; -template +template struct ColwiseSum { - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* vec); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* vec); }; } // namespace math diff --git a/paddle/operators/math/math_function_impl.h b/paddle/operators/math/math_function_impl.h index 4dc17a4e52..3e6d833865 100644 --- a/paddle/operators/math/math_function_impl.h +++ b/paddle/operators/math/math_function_impl.h @@ -20,16 +20,17 @@ namespace paddle { namespace operators { namespace math { -template -void SetConstant::operator()(const platform::DeviceContext& context, - framework::Tensor* tensor, T num) { +template +void SetConstant::operator()(const DeviceContext& context, + framework::Tensor* tensor, + T num) { auto t = framework::EigenVector::Flatten(*tensor); - t.device(*context.GetEigenDevice()) = t.constant(static_cast(num)); + t.device(*context.eigen_device()) = t.constant(static_cast(num)); } -template -void Transpose::operator()( - const platform::DeviceContext& context, const framework::Tensor& in, +template +void Transpose::operator()( + const DeviceContext& context, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { Eigen::array permute; for (int i = 0; i < Rank; i++) { @@ -40,15 +41,15 @@ void Transpose::operator()( auto eigen_in = framework::EigenTensor::From(in); auto eigen_out = framework::EigenTensor::From(*out); - auto* dev = context.GetEigenDevice(); + auto* dev = context.eigen_device(); eigen_out.device(*dev) = eigen_in.shuffle(permute); } -template -void RowwiseAdd::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, - framework::Tensor* output) { +template +void RowwiseAdd::operator()(const DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& vector, + framework::Tensor* output) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector.numel(), size); @@ -59,14 +60,14 @@ void RowwiseAdd::operator()(const platform::DeviceContext& context, auto out = framework::EigenMatrix::From(*output); Eigen::array shape({{1, static_cast(size)}}); Eigen::array bcast({{static_cast(in_dims[0]), 1}}); - out.device(*context.GetEigenDevice()) = + out.device(*context.eigen_device()) = in + vec.reshape(shape).broadcast(bcast); } -template -void ColwiseSum::operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* vector) { +template +void ColwiseSum::operator()(const DeviceContext& context, + const framework::Tensor& input, + framework::Tensor* vector) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; PADDLE_ENFORCE_EQ(vector->numel(), size); @@ -74,7 +75,7 @@ void ColwiseSum::operator()(const platform::DeviceContext& context, auto vec = framework::EigenMatrix::From(*vector); auto in = framework::EigenMatrix::From(input); Eigen::array shape({{1, static_cast(size)}}); - vec.reshape(shape).device(*context.GetEigenDevice()) = + vec.reshape(shape).device(*context.eigen_device()) = in.sum(Eigen::array({{0}})).reshape(shape); } diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc index 983c9fdcff..7c6f098ca9 100644 --- a/paddle/operators/math/math_function_test.cc +++ b/paddle/operators/math/math_function_test.cc @@ -21,7 +21,7 @@ TEST(math_function, gemm_notrans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, input1_ptr, 3, input2_ptr + 1, 4, 1, input3_ptr + 1, 4); @@ -55,7 +55,7 @@ TEST(math_function, gemm_trans_clbas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, input1_ptr, 3, input2_ptr + 3, 3, 1, input3_ptr + 1, 4); @@ -74,7 +74,8 @@ TEST(math_function, zero) { auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::SetConstant + paddle::operators::math::SetConstant functor; functor(context, &tensor, 0); EXPECT_EQ(t[0], 0); @@ -110,7 +111,7 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., data_a, data_b, 0., data_c); diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index d5d6f0c73b..32e96d9487 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -21,7 +21,7 @@ TEST(math_function, notrans_mul_trans) { out_gpu.mutable_data({2, 2}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); @@ -55,7 +55,7 @@ TEST(math_function, trans_mul_notrans) { out_gpu.mutable_data({3, 3}, *gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); @@ -106,7 +106,7 @@ TEST(math_function, gemm_notrans_cublas) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); @@ -161,7 +161,7 @@ TEST(math_function, gemm_trans_cublas) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); - paddle::operators::math::gemm( + paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); @@ -208,7 +208,7 @@ void GemvTest(int m, int n, bool trans) { paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a); paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); diff --git a/paddle/operators/math/matmul.h b/paddle/operators/math/matmul.h index 6ba9a0ba9a..7048e11e6f 100644 --- a/paddle/operators/math/matmul.h +++ b/paddle/operators/math/matmul.h @@ -26,13 +26,12 @@ namespace math { // // Both a & b can be 1- to 3-dimensional. Higher rank tensors are not supported // yet. -template +template class MatMulFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& a, bool trans_a, - const framework::Tensor& b, bool trans_b, T alpha, - framework::Tensor* out, T beta) { + void operator()(const DeviceContext& context, const framework::Tensor& a, + bool trans_a, const framework::Tensor& b, bool trans_b, + T alpha, framework::Tensor* out, T beta) { auto dim_a = a.dims(); auto dim_b = b.dims(); @@ -108,13 +107,13 @@ class MatMulFunctor { if (!batchCount) { // regular matrix multiplication - gemm(context, transA, transB, M, N, kA, alpha, a.data(), - b.data(), beta, out->data()); + gemm(context, transA, transB, M, N, kA, alpha, + a.data(), b.data(), beta, out->data()); } else { // batched matrix multiplication - batched_gemm(context, transA, transB, M, N, kA, alpha, - a.data(), b.data(), beta, out->data(), - batchCount, strideA, strideB); + batched_gemm( + context, transA, transB, M, N, kA, alpha, a.data(), b.data(), + beta, out->data(), batchCount, strideA, strideB); } } }; diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc index c9003962d3..fea86675f7 100644 --- a/paddle/operators/math/maxouting.cc +++ b/paddle/operators/math/maxouting.cc @@ -20,9 +20,9 @@ namespace math { // All tensors are in NCHW format, and the groups must be greater than 1 template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; @@ -54,9 +54,9 @@ class MaxOutFunctor { }; template -class MaxOutGradFunctor { +class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups) { @@ -91,10 +91,10 @@ class MaxOutGradFunctor { } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu index c3fabcae08..6056ad251c 100644 --- a/paddle/operators/math/maxouting.cu +++ b/paddle/operators/math/maxouting.cu @@ -78,9 +78,9 @@ __global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, * All tensors are in NCHW format. */ template -class MaxOutFunctor { +class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; @@ -98,20 +98,18 @@ class MaxOutFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxOut< - T><<(context) - .stream()>>>(nthreads, input_data, input_channels, - input_height, input_width, groups, output_data); + KernelMaxOut<<>>( + nthreads, input_data, input_channels, input_height, input_width, groups, + output_data); } }; /* * All tensors are in NCHW format. */ template -class MaxOutGradFunctor { +class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups) { @@ -132,20 +130,17 @@ class MaxOutGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxoutGrad< - T><<(context) - .stream()>>>(nthreads, input_data, output_data, - output_grad_data, input_grad_data, input_channels, - input_height, input_width, groups); + KernelMaxoutGrad<<>>( + nthreads, input_data, output_data, output_grad_data, input_grad_data, + input_channels, input_height, input_width, groups); } }; -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; +template class MaxOutGradFunctor; +template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; +template class MaxOutFunctor; +template class MaxOutFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h index 2d9069b0b3..68f4743db0 100644 --- a/paddle/operators/math/maxouting.h +++ b/paddle/operators/math/maxouting.h @@ -23,20 +23,18 @@ namespace math { #define FLT_MAX __FLT_MAX__ -template - +template class MaxOutFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* output, - int groups); + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* output, int groups); }; -template +template class MaxOutGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor* input_grad, + void operator()(const DeviceContext& context, const framework::Tensor& input, + framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups); }; diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc index 135984586a..150de6fd59 100644 --- a/paddle/operators/math/pooling.cc +++ b/paddle/operators/math/pooling.cc @@ -24,9 +24,9 @@ namespace math { * height and width, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -84,9 +84,9 @@ class Pool2dFunctor { * and width, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -152,9 +152,9 @@ class Pool2dGradFunctor { * height and width, respectively. */ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -213,25 +213,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; /* * All tensors are in NCDHW format. @@ -239,9 +243,9 @@ template class Pool2dGradFunctor< * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -314,9 +318,9 @@ class Pool3dFunctor { * depth, height and width, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -398,9 +402,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -473,25 +477,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::CPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; /* * All tensors are in NCHW format. @@ -499,9 +507,9 @@ template class Pool3dGradFunctor< * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -564,9 +572,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -602,10 +610,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; /* * All tensors are in NCDHW format. @@ -613,9 +625,9 @@ template class MaxPool2dWithIndexGradFunctor; * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -692,9 +704,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. */ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -735,10 +747,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu index ca3560f264..0243cf8316 100644 --- a/paddle/operators/math/pooling.cu +++ b/paddle/operators/math/pooling.cu @@ -155,9 +155,9 @@ __global__ void KernelMaxPool2DGrad( * height and width, respectively. */ template -class Pool2dFunctor { +class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -183,11 +183,7 @@ class Pool2dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2D<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, output_data); @@ -200,9 +196,9 @@ class Pool2dFunctor { * height and width, respectively. */ template -class Pool2dGradFunctor { +class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -231,11 +227,7 @@ class Pool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool2DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -249,9 +241,9 @@ class Pool2dGradFunctor { * height and width, respectively. */ template -class MaxPool2dGradFunctor { +class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -281,10 +273,7 @@ class MaxPool2dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool2DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -292,25 +281,29 @@ class MaxPool2dGradFunctor { } }; -template class MaxPool2dGradFunctor; -template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; -template class Pool2dFunctor, float>; -template class Pool2dFunctor, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool2dFunctor, + float>; +template class Pool2dGradFunctor, + float>; +template class Pool2dFunctor, double>; -template class Pool2dFunctor, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool2dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool2dGradFunctor, + double>; +template class Pool2dGradFunctor, + double>; template __global__ void KernelPool3D(const int nthreads, const T* input_data, @@ -478,9 +471,9 @@ __global__ void KernelMaxPool3DGrad( * depth, height and width, respectively. */ template -class Pool3dFunctor { +class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_process, framework::Tensor* output) { @@ -512,11 +505,7 @@ class Pool3dFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3D< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3D<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -531,9 +520,9 @@ class Pool3dFunctor { * depth, height and width, respectively. */ template -class Pool3dGradFunctor { +class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -569,11 +558,7 @@ class Pool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelPool3DGrad< - PoolProcess, - T><<(context) - .stream()>>>( + KernelPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -588,9 +573,9 @@ class Pool3dGradFunctor { * depth, height and width, respectively. */ template -class MaxPool3dGradFunctor { +class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, @@ -626,10 +611,7 @@ class MaxPool3dGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DGrad< - T><<(context) - .stream()>>>( + KernelMaxPool3DGrad<<>>( nthreads, input_data, output_data, output_grad_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, @@ -638,25 +620,29 @@ class MaxPool3dGradFunctor { } }; -template class MaxPool3dGradFunctor; -template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; -template class Pool3dFunctor, float>; -template class Pool3dFunctor, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, float>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, float>; -template class Pool3dFunctor, + float>; +template class Pool3dGradFunctor, + float>; +template class Pool3dFunctor, double>; -template class Pool3dFunctor, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::MaxPoolGrad, double>; -template class Pool3dGradFunctor< - platform::GPUPlace, paddle::operators::math::AvgPoolGrad, double>; +template class Pool3dGradFunctor, + double>; +template class Pool3dGradFunctor, + double>; template __global__ void KernelMaxPool2dWithIdx( @@ -747,9 +733,9 @@ __global__ void KernelMaxPool2DWithIdxGrad( * height and width, respectively. */ template -class MaxPool2dWithIndexFunctor { +class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -776,10 +762,7 @@ class MaxPool2dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2dWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, output_data, mask_data); @@ -792,9 +775,9 @@ class MaxPool2dWithIndexFunctor { * height and width, respectively. */ template -class MaxPool2dWithIndexGradFunctor { +class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -821,10 +804,7 @@ class MaxPool2dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool2DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, @@ -832,10 +812,14 @@ class MaxPool2dWithIndexGradFunctor { } }; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; -template class MaxPool2dWithIndexFunctor; -template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; +template class MaxPool2dWithIndexFunctor; +template class MaxPool2dWithIndexGradFunctor; template __global__ void KernelMaxPool3DWithIdx( @@ -950,9 +934,9 @@ __global__ void KernelMaxPool3DWithIdxGrad( * depth, height and width, respectively. */ template -class MaxPool3dWithIndexFunctor { +class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* output, framework::Tensor* mask) { @@ -985,10 +969,7 @@ class MaxPool3dWithIndexFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdx< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdx<<>>( nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, @@ -1002,9 +983,9 @@ class MaxPool3dWithIndexFunctor { * depth, height and width, respectively. */ template -class MaxPool3dWithIndexGradFunctor { +class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -1037,10 +1018,7 @@ class MaxPool3dWithIndexGradFunctor { dim3 threads(1024, 1); dim3 grid(blocks, 1); - KernelMaxPool3DWithIdxGrad< - T1, T2><<(context) - .stream()>>>( + KernelMaxPool3DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, @@ -1049,10 +1027,14 @@ class MaxPool3dWithIndexGradFunctor { } }; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; -template class MaxPool3dWithIndexFunctor; -template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; +template class MaxPool3dWithIndexFunctor; +template class MaxPool3dWithIndexGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h index 19fbd8b4bb..2759f06cb6 100644 --- a/paddle/operators/math/pooling.h +++ b/paddle/operators/math/pooling.h @@ -84,62 +84,58 @@ class AvgPoolGrad { * This is different from average pooling. So we rewrite the max_pool_grad: * MaxPool2dGradFunctor, MaxPool3dGradFunctor. */ -template +template class Pool2dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool2dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class Pool3dFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - PoolProcess pool_compute, framework::Tensor* output); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, PoolProcess pool_compute, + framework::Tensor* output); }; -template +template class Pool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, PoolProcess pool_compute, framework::Tensor* input_grad); }; -template +template class MaxPool3dGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& output, const framework::Tensor& output_grad, std::vector& ksize, std::vector& strides, std::vector& paddings, @@ -153,38 +149,38 @@ class MaxPool3dGradFunctor { * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in * NCDHW format. */ -template +template class MaxPool2dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool2dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, framework::Tensor* input_grad); }; -template +template class MaxPool3dWithIndexFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, std::vector& ksize, - std::vector& strides, std::vector& paddings, - framework::Tensor* output, framework::Tensor* mask); + void operator()(const DeviceContext& context, const framework::Tensor& input, + std::vector& ksize, std::vector& strides, + std::vector& paddings, framework::Tensor* output, + framework::Tensor* mask); }; -template +template class MaxPool3dWithIndexGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& output_grad, const framework::Tensor& mask, std::vector& ksize, std::vector& strides, std::vector& paddings, diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc index 514f2adef2..ab758d1e7f 100644 --- a/paddle/operators/math/selected_rows_functor.cc +++ b/paddle/operators/math/selected_rows_functor.cc @@ -19,8 +19,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -67,12 +67,12 @@ struct SelectedRowsAdd { } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -88,7 +88,7 @@ struct SelectedRowsAddTensor { PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height); PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); auto* in1_data = in1_value.data(); @@ -103,17 +103,16 @@ struct SelectedRowsAddTensor { auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -143,14 +142,14 @@ struct SelectedRowsAddTo { } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -175,10 +174,10 @@ struct SelectedRowsAddToTensor { } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index c1dd323ba2..c44577e00a 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -20,8 +20,8 @@ namespace paddle { namespace operators { namespace math { template -struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAdd { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output) { @@ -64,16 +64,15 @@ struct SelectedRowsAdd { reinterpret_cast(context).stream()); auto* in2_data = in2_value.data(); - memory::Copy( - boost::get(out_place), out_data + in1_value.numel(), - boost::get(in2_place), in2_data, - in2_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(out_place), + out_data + in1_value.numel(), + boost::get(in2_place), in2_data, + in2_value.numel() * sizeof(T), context.stream()); } }; -template struct SelectedRowsAdd; -template struct SelectedRowsAdd; +template struct SelectedRowsAdd; +template struct SelectedRowsAdd; namespace { template @@ -96,8 +95,8 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTensor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output) { auto in1_height = input1.height(); @@ -117,30 +116,28 @@ struct SelectedRowsAddTensor { auto* in2_data = input2.data(); auto* out_data = output->data(); - SetConstant functor; + SetConstant functor; functor(context, output, 0.0); const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), out_data, in1_row_numel); + SelectedRowsAddTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), out_data, in1_row_numel); auto out_eigen = framework::EigenVector::Flatten(*output); auto in2_eigen = framework::EigenVector::Flatten(input2); - out_eigen.device(*context.GetEigenDevice()) = - out_eigen + in2_eigen; + out_eigen.device(*context.eigen_device()) = out_eigen + in2_eigen; } }; -template struct SelectedRowsAddTensor; -template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; +template struct SelectedRowsAddTensor; template -struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddTo { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2) { @@ -163,18 +160,17 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy( - boost::get(in2_place), in2_data + input2_offset, - boost::get(in1_place), in1_data, - in1_value.numel() * sizeof(T), - reinterpret_cast(context).stream()); + memory::Copy(boost::get(in2_place), + in2_data + input2_offset, + boost::get(in1_place), in1_data, + in1_value.numel() * sizeof(T), context.stream()); } }; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; -template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; +template struct SelectedRowsAddTo; namespace { template @@ -197,8 +193,8 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows, } // namespace template -struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, +struct SelectedRowsAddToTensor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2) { auto in1_height = input1.height(); @@ -216,17 +212,16 @@ struct SelectedRowsAddToTensor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in1_rows.size()); - SelectedRowsAddToTensorKernel<<< - grid, threads, 0, - reinterpret_cast(context) - .stream()>>>(in1_data, in1_rows.data(), in2_data, in1_row_numel); + SelectedRowsAddToTensorKernel< + T, block_size><<>>( + in1_data, in1_rows.data(), in2_data, in1_row_numel); } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h index d6dc6c03c9..1149075abf 100644 --- a/paddle/operators/math/selected_rows_functor.h +++ b/paddle/operators/math/selected_rows_functor.h @@ -21,33 +21,33 @@ namespace math { // SelectedRows + SelectedRows will simplely concat value and rows. // The real computation happens in dealing with LoDTensor. -template +template struct SelectedRowsAdd { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const framework::SelectedRows& input2, framework::SelectedRows* output); }; -template +template struct SelectedRowsAddTensor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const framework::Tensor& input2, framework::Tensor* output); }; // input2 = input1 + input2 -template +template struct SelectedRowsAddTo { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, const int64_t input2_offset, framework::SelectedRows* input2); }; // input2 = input1 + input2 -template +template struct SelectedRowsAddToTensor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input1, framework::Tensor* input2); }; diff --git a/paddle/operators/math/selected_rows_functor_test.cc b/paddle/operators/math/selected_rows_functor_test.cc index a3649b6875..8c74cab0a1 100644 --- a/paddle/operators/math/selected_rows_functor_test.cc +++ b/paddle/operators/math/selected_rows_functor_test.cc @@ -23,7 +23,7 @@ TEST(selected_rows_functor, cpu_add) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -47,7 +47,7 @@ TEST(selected_rows_functor, cpu_add) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAdd add_functor; + SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -85,7 +85,7 @@ TEST(selected_rows_functor, cpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), cpu_place); - SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); auto* tensor2_data = tensor2->data(); @@ -112,7 +112,7 @@ TEST(selected_rows_functor, cpu_add_to) { CPUPlace cpu_place; CPUDeviceContext ctx(cpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -137,7 +137,7 @@ TEST(selected_rows_functor, cpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), cpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -173,7 +173,7 @@ TEST(selected_rows_functor, cpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), cpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); auto* tensor1_data = tensor1->data(); diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 7de9291c17..777caf5635 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -24,7 +24,7 @@ TEST(selected_rows_functor, gpu_add) { GPUPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -48,7 +48,7 @@ TEST(selected_rows_functor, gpu_add) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAdd add_functor; + SelectedRowsAdd add_functor; add_functor(ctx, *selected_rows1, *selected_rows2, output.get()); auto out_height = output->height(); @@ -90,7 +90,7 @@ TEST(selected_rows_functor, gpu_add) { std::unique_ptr tensor2{new Tensor()}; tensor2->mutable_data(make_ddim({height, row_numel}), gpu_place); - SelectedRowsAddTensor add_tensor_functor; + SelectedRowsAddTensor add_tensor_functor; add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; @@ -122,7 +122,7 @@ TEST(selected_rows_functor, gpu_add_to) { GPUPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); - SetConstant functor; + SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -147,7 +147,7 @@ TEST(selected_rows_functor, gpu_add_to) { // simplely concat two SelectedRows out_value->mutable_data(make_ddim({7, 10}), gpu_place); - SelectedRowsAddTo add_to_functor; + SelectedRowsAddTo add_to_functor; add_to_functor(ctx, *selected_rows1, 0, output.get()); add_to_functor(ctx, *selected_rows2, in1_value->numel(), output.get()); @@ -187,7 +187,7 @@ TEST(selected_rows_functor, gpu_add_to) { tensor1->mutable_data(make_ddim({height, row_numel}), gpu_place); functor(ctx, tensor1.get(), 3.0); - SelectedRowsAddToTensor add_to_tensor_functor; + SelectedRowsAddToTensor add_to_tensor_functor; add_to_tensor_functor(ctx, *output, tensor1.get()); Tensor tensor1_cpu; diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc index 5b3bde02fb..88977be1f8 100644 --- a/paddle/operators/math/sequence2batch.cc +++ b/paddle/operators/math/sequence2batch.cc @@ -19,9 +19,9 @@ namespace operators { namespace math { template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& src, const size_t* index, framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -48,13 +48,13 @@ class CopyMatrixRowsFunctor { } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu index c5d968aeb2..452ae89510 100644 --- a/paddle/operators/math/sequence2batch.cu +++ b/paddle/operators/math/sequence2batch.cu @@ -39,9 +39,9 @@ __global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index, } template -class CopyMatrixRowsFunctor { +class CopyMatrixRowsFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& src, const size_t* index, framework::Tensor& dst, bool is_src_index) { auto src_dims = src.dims(); @@ -59,20 +59,19 @@ class CopyMatrixRowsFunctor { dim3 threads(128, 8); dim3 grid(8, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); CopyMatrixRowsKernel<<>>( src_data, dst_data, index, height, width, is_src_index); } }; -template class CopyMatrixRowsFunctor; -template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; +template class CopyMatrixRowsFunctor; -template class LoDTensor2BatchFunctor; -template class LoDTensor2BatchFunctor; -template class Batch2LoDTensorFunctor; -template class Batch2LoDTensorFunctor; +template class LoDTensor2BatchFunctor; +template class LoDTensor2BatchFunctor; +template class Batch2LoDTensorFunctor; +template class Batch2LoDTensorFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h index 73295ddbcb..a5c43a2c7d 100644 --- a/paddle/operators/math/sequence2batch.h +++ b/paddle/operators/math/sequence2batch.h @@ -26,7 +26,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class CopyMatrixRowsFunctor { public: // If is_src_index is true, @@ -34,12 +34,12 @@ class CopyMatrixRowsFunctor { // If is_src_index is false, // copy the input src to the indexed rows of output dst. // The indexed rows are based on the input index. - void operator()(const platform::DeviceContext& context, - const framework::Tensor& src, const size_t* index, - framework::Tensor& dst, bool is_src_index); + void operator()(const DeviceContext& context, const framework::Tensor& src, + const size_t* index, framework::Tensor& dst, + bool is_src_index); }; -template +template class LoDTensor2BatchFunctor { // Calculate the length of each sequence and // sort sequence index by the length. @@ -56,7 +56,7 @@ class LoDTensor2BatchFunctor { }; public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& lod_tensor, framework::LoDTensor& batch, bool is_cal_batch_lod, bool is_reverse = false) const { @@ -65,7 +65,7 @@ class LoDTensor2BatchFunctor { PADDLE_ENFORCE_GT(lods.size(), 2UL); PADDLE_ENFORCE_EQ(lods[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, lods[1].data(), batch, true); return; } @@ -143,22 +143,22 @@ class LoDTensor2BatchFunctor { } batch.set_lod(batch_lods); - CopyMatrixRowsFunctor to_batch; + CopyMatrixRowsFunctor to_batch; to_batch(context, lod_tensor, seq2batch_idx, batch, true); } }; -template +template class Batch2LoDTensorFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& batch, framework::LoDTensor& lod_tensor) const { auto in_lod = batch.lod(); PADDLE_ENFORCE_GT(in_lod.size(), 2UL); PADDLE_ENFORCE_EQ(in_lod[1].size(), static_cast(lod_tensor.dims()[0])); - CopyMatrixRowsFunctor to_seq; + CopyMatrixRowsFunctor to_seq; size_t* index = in_lod[1].data(); to_seq(context, batch, index, lod_tensor, false); } diff --git a/paddle/operators/math/sequence_pooling.cc b/paddle/operators/math/sequence_pooling.cc index 5913c99fdb..8fb92b1a13 100644 --- a/paddle/operators/math/sequence_pooling.cc +++ b/paddle/operators/math/sequence_pooling.cc @@ -20,9 +20,9 @@ namespace operators { namespace math { template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { auto in_dims = input.dims(); @@ -60,9 +60,9 @@ class MaxSeqPoolFunctor { }; template -class MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -80,7 +80,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -93,10 +93,10 @@ class MaxSeqPoolGradFunctor { } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.cu b/paddle/operators/math/sequence_pooling.cu index 5ed951402f..4c9e6b375c 100644 --- a/paddle/operators/math/sequence_pooling.cu +++ b/paddle/operators/math/sequence_pooling.cu @@ -46,9 +46,9 @@ __global__ void KeMaxSequencePool(const T* input, const size_t* starts, } template -class MaxSeqPoolFunctor { +class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index) { auto in_dims = input.dims(); @@ -71,8 +71,7 @@ class MaxSeqPoolFunctor { dim3 threads(256, 1); dim3 grid(num_seq, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePool<<>>( in_data, starts.data(), out_data, max_index, num_seq, dim); } @@ -91,9 +90,9 @@ __global__ void KeMaxSequencePoolGrad(const T* out_grad, const int* max_index, } template -class MaxSeqPoolGradFunctor { +class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad) { @@ -111,7 +110,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -119,17 +118,16 @@ class MaxSeqPoolGradFunctor { unsigned int blocks = (num_seq * dim + 128 - 1) / 128; dim3 threads(128, 1); dim3 grid(blocks, 1); - auto stream = - reinterpret_cast(context).stream(); + auto stream = context.stream(); KeMaxSequencePoolGrad<<>>( og_data, max_index, ig_data, num_seq, dim); } }; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolFunctor; -template class MaxSeqPoolGradFunctor; -template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolFunctor; +template class MaxSeqPoolGradFunctor; +template class MaxSeqPoolGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/sequence_pooling.h b/paddle/operators/math/sequence_pooling.h index 35dfe26de1..13ffb2ebef 100644 --- a/paddle/operators/math/sequence_pooling.h +++ b/paddle/operators/math/sequence_pooling.h @@ -23,18 +23,18 @@ namespace math { #define FLT_MAX __FLT_MAX__ -template +template class MaxSeqPoolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::LoDTensor& input, framework::Tensor* output, framework::Tensor* index); }; -template +template class MaxSeqPoolGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::Tensor& out_grad, const framework::Tensor& index, framework::LoDTensor* in_grad); diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc index 3e2f15d6c2..72f10f35f4 100644 --- a/paddle/operators/math/softmax.cc +++ b/paddle/operators/math/softmax.cc @@ -19,10 +19,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu index 4dbab51d46..9e73f6a371 100644 --- a/paddle/operators/math/softmax.cu +++ b/paddle/operators/math/softmax.cu @@ -21,10 +21,10 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; +template class SoftmaxFunctor; +template class SoftmaxFunctor; +template class SoftmaxGradFunctor; +template class SoftmaxGradFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h index fe10746502..471f44d340 100644 --- a/paddle/operators/math/softmax.h +++ b/paddle/operators/math/softmax.h @@ -19,19 +19,18 @@ namespace paddle { namespace operators { namespace math { -template +template class SoftmaxFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* X, framework::Tensor* Y); + void operator()(const DeviceContext& context, const framework::Tensor* X, + framework::Tensor* Y); }; -template +template class SoftmaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor* y, const framework::Tensor* y_grad, - framework::Tensor* x_grad); + void operator()(const DeviceContext& context, const framework::Tensor* y, + const framework::Tensor* y_grad, framework::Tensor* x_grad); }; } // namespace math diff --git a/paddle/operators/math/softmax_impl.h b/paddle/operators/math/softmax_impl.h index 05793eeb3e..82f597ff79 100644 --- a/paddle/operators/math/softmax_impl.h +++ b/paddle/operators/math/softmax_impl.h @@ -32,10 +32,10 @@ struct ValueClip { } }; -template -void SoftmaxFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* X, - framework::Tensor* Y) { +template +void SoftmaxFunctor::operator()(const DeviceContext& context, + const framework::Tensor* X, + framework::Tensor* Y) { auto logits = EigenMatrix::From(*X); auto softmax = EigenMatrix::From(*Y); @@ -56,19 +56,18 @@ void SoftmaxFunctor::operator()( .broadcast(one_by_class)) .unaryExpr(ValueClip()); - softmax.device(*context.GetEigenDevice()) = shifted_logits.exp(); - softmax.device(*context.GetEigenDevice()) = - (softmax * - softmax.sum(along_class) - .inverse() - .eval() - .reshape(batch_by_one) - .broadcast(one_by_class)); + softmax.device(*context.eigen_device()) = shifted_logits.exp(); + softmax.device(*context.eigen_device()) = (softmax * + softmax.sum(along_class) + .inverse() + .eval() + .reshape(batch_by_one) + .broadcast(one_by_class)); } -template -void SoftmaxGradFunctor::operator()( - const platform::DeviceContext& context, const framework::Tensor* y, +template +void SoftmaxGradFunctor::operator()( + const DeviceContext& context, const framework::Tensor* y, const framework::Tensor* y_grad, framework::Tensor* x_grad) { auto softmax = EigenMatrix::From(*y); auto softmax_grad = EigenMatrix::From(*y_grad); @@ -89,8 +88,7 @@ void SoftmaxGradFunctor::operator()( .eval() .reshape(batch_by_one) .broadcast(one_by_class); - logits_grad.device(*context.GetEigenDevice()) = - (softmax_grad - dot) * softmax; + logits_grad.device(*context.eigen_device()) = (softmax_grad - dot) * softmax; } } // namespace math diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc index b57d3dc141..ecd3a647e0 100644 --- a/paddle/operators/math/unpooling.cc +++ b/paddle/operators/math/unpooling.cc @@ -17,9 +17,9 @@ namespace paddle { namespace operators { namespace math { template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; @@ -48,9 +48,9 @@ class Unpool2dMaxFunctor { } }; template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -82,10 +82,10 @@ class Unpool2dMaxGradFunctor { } } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu index 37c3c8b689..ecbde0f6a7 100644 --- a/paddle/operators/math/unpooling.cu +++ b/paddle/operators/math/unpooling.cu @@ -67,9 +67,9 @@ __global__ void KernelUnpool2dMaxGrad( * All tensors are in NCHW format. */ template -class Unpool2dMaxFunctor { +class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output) { const int batch_size = input.dims()[0]; @@ -83,21 +83,18 @@ class Unpool2dMaxFunctor { T* output_data = output->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMax< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_height, output_width); + KernelUnpool2dMax<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_height, output_width); } }; /* * All tensors are in NCHW format. */ template -class Unpool2dMaxGradFunctor { +class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, @@ -116,19 +113,16 @@ class Unpool2dMaxGradFunctor { T* input_grad_data = input_grad->mutable_data(context.GetPlace()); int threads = 1024; int grid = (input.numel() + threads - 1) / threads; - KernelUnpool2dMaxGrad< - T><<(context) - .stream()>>>(input.numel(), input_data, indices_data, - input_height, input_width, output_channels, - output_data, output_grad_data, output_height, - output_width, input_grad_data); + KernelUnpool2dMaxGrad<<>>( + input.numel(), input_data, indices_data, input_height, input_width, + output_channels, output_data, output_grad_data, output_height, + output_width, input_grad_data); } }; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxGradFunctor; -template class Unpool2dMaxFunctor; -template class Unpool2dMaxFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h index 7077d7c227..0f0ff1371e 100644 --- a/paddle/operators/math/unpooling.h +++ b/paddle/operators/math/unpooling.h @@ -18,18 +18,16 @@ limitations under the License. */ namespace paddle { namespace operators { namespace math { -template +template class Unpool2dMaxFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, framework::Tensor* output); }; -template +template class Unpool2dMaxGradFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, + void operator()(const DeviceContext& context, const framework::Tensor& input, const framework::Tensor& indices, const framework::Tensor& output, const framework::Tensor& output_grad, diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc index 99eb7fd46d..d574ed9234 100644 --- a/paddle/operators/math/vol2col.cc +++ b/paddle/operators/math/vol2col.cc @@ -25,9 +25,9 @@ namespace math { * output_depth, output_height, output_width] */ template -class Vol2ColFunctor { +class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, @@ -111,9 +111,9 @@ class Vol2ColFunctor { * output_depth, output_height, output_width] */ template -class Col2VolFunctor { +class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, @@ -190,10 +190,10 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu index dae3be858e..b029442fe4 100644 --- a/paddle/operators/math/vol2col.cu +++ b/paddle/operators/math/vol2col.cu @@ -68,9 +68,9 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth, * output_depth, output_height, output_width] */ template -class Vol2ColFunctor { +class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, @@ -117,9 +117,7 @@ class Vol2ColFunctor { const int threads = 1024; const int blocks = (num_outputs + 1024 - 1) / 1024; - vol2col<<(context) - .stream()>>>( + vol2col<<>>( num_outputs, vol.data(), input_depth, input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], @@ -196,9 +194,9 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth, * output_depth, output_height, output_width] */ template -class Col2VolFunctor { +class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, + void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, @@ -245,9 +243,7 @@ class Col2VolFunctor { const int threads = 1024; const int blocks = (num_kernels + 1024 - 1) / 1024; - col2vol<<(context) - .stream()>>>( + col2vol<<>>( num_kernels, col.data(), input_depth, input_height, input_width, dilations[0], dilations[1], dilations[2], filter_depth, filter_height, filter_width, strides[0], strides[1], strides[2], paddings[0], @@ -256,10 +252,10 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; +template class Vol2ColFunctor; +template class Vol2ColFunctor; +template class Col2VolFunctor; +template class Col2VolFunctor; } // namespace math } // namespace operators diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h index dc64d1d977..dcd80370e8 100644 --- a/paddle/operators/math/vol2col.h +++ b/paddle/operators/math/vol2col.h @@ -63,22 +63,20 @@ namespace math { * \note The caller needs to ensure that volShape.inputChannels is equal to * colShape.inputChannels. */ -template +template class Vol2ColFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& vol, + void operator()(const DeviceContext& context, const framework::Tensor& vol, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, framework::Tensor* col) const; }; -template +template class Col2VolFunctor { public: - void operator()(const platform::DeviceContext& context, - const framework::Tensor& col, + void operator()(const DeviceContext& context, const framework::Tensor& col, const std::vector& dilations, const std::vector& strides, const std::vector& paddings, diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index 62c3152304..f46db3c567 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include #include -template +template void testVol2col() { paddle::framework::Tensor input; paddle::framework::Tensor input_tmp; @@ -24,18 +24,7 @@ void testVol2col() { paddle::framework::Tensor output_tmp; auto* place = new Place(); - paddle::platform::DeviceContext* context; - if (paddle::platform::is_cpu_place(*place)) { - context = - new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace()); - } else { -#ifdef PADDLE_WITH_CUDA - context = - new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace()); -#else - PADDLE_THROW("no GPU support"); -#endif // PADDLE_WITH_CUDA - } + DeviceContext* context = new DeviceContext(*place); /** * input = [[0, 1, 2, @@ -88,7 +77,7 @@ void testVol2col() { output_depth, output_height, output_width}, *place); - paddle::operators::math::Vol2ColFunctor vol2col; + paddle::operators::math::Vol2ColFunctor vol2col; vol2col(*context, input, dilations, strides, paddings, &output); float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11}; @@ -113,7 +102,7 @@ void testVol2col() { CopyFrom(input_tmp, *place, *context, &input); } - paddle::operators::math::Col2VolFunctor col2vol; + paddle::operators::math::Col2VolFunctor col2vol; col2vol(*context, output, dilations, strides, paddings, &input); float* in_ptr; @@ -130,8 +119,9 @@ void testVol2col() { } TEST(math, vol2col) { - testVol2col(); + testVol2col(); #ifdef PADDLE_WITH_CUDA - testVol2col(); + testVol2col(); #endif // PADDLE_WITH_CUDA } diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc index 5a1a615420..ee0bc0c370 100644 --- a/paddle/operators/matmul_op.cc +++ b/paddle/operators/matmul_op.cc @@ -206,7 +206,8 @@ class MatMulOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad, ops::MatMulOpGrad); -REGISTER_OP_CPU_KERNEL(matmul, - ops::MatMulKernel); REGISTER_OP_CPU_KERNEL( - matmul_grad, ops::MatMulGradKernel); + matmul, ops::MatMulKernel); +REGISTER_OP_CPU_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.cu.cc b/paddle/operators/matmul_op.cu.cc index b7e66382f0..6a3772c004 100644 --- a/paddle/operators/matmul_op.cu.cc +++ b/paddle/operators/matmul_op.cu.cc @@ -15,7 +15,8 @@ #include "paddle/operators/matmul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(matmul, - ops::MatMulKernel); -REGISTER_OP_GPU_KERNEL( - matmul_grad, ops::MatMulGradKernel); +REGISTER_OP_CUDA_KERNEL( + matmul, ops::MatMulKernel); +REGISTER_OP_CUDA_KERNEL( + matmul_grad, + ops::MatMulGradKernel); diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h index 1e4aa48b70..de9da487b3 100644 --- a/paddle/operators/matmul_op.h +++ b/paddle/operators/matmul_op.h @@ -27,7 +27,7 @@ using DDim = framework::DDim; using framework::make_ddim; using framework::vectorize; -template +template class MatMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,8 +38,9 @@ class MatMulKernel : public framework::OpKernel { bool transpose_x = context.Attr("transpose_X"); bool transpose_y = context.Attr("transpose_Y"); - math::MatMulFunctor()(context.device_context(), x, transpose_x, y, - transpose_y, T(1), out, T(0)); + math::MatMulFunctor()( + context.template device_context(), x, transpose_x, y, + transpose_y, T(1), out, T(0)); } }; @@ -68,17 +69,16 @@ Tensor CombineBatchAndM(const Tensor& input) { // Reshape a rank-3 tensor from P x M x N to M x (P * N). // (Warning: This requires transposing data and writes into new memory.) // Identity op if the tensor is not of rank 3. -template -Tensor CombineBatchAndN(const framework::ExecutionContext& context, - const Tensor& input) { +template +Tensor CombineBatchAndN(const DeviceContext& context, const Tensor& input) { Tensor output; auto in_dims = input.dims(); if (in_dims.size() == 3) { output.Resize({in_dims[1], in_dims[0], in_dims[2]}); output.mutable_data(context.GetPlace()); std::vector axis = {1, 0, 2}; - math::Transpose trans; - trans(context.device_context(), input, &output, axis); + math::Transpose trans; + trans(context, input, &output, axis); std::vector out_dims = {in_dims[1], in_dims[0] * in_dims[2]}; output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); } else { @@ -112,7 +112,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context, // // To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N // to X: (P * M) x K, dOut: (P * M) x N. -template +template class MatMulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -178,24 +178,23 @@ class MatMulGradKernel : public framework::OpKernel { Tensor Y = Reshape(y, make_ddim(y_dims)); Tensor dOut = Reshape(dout, make_ddim(dout_dims)); + auto& dev_ctx = context.template device_context(); if (dx) { dx->mutable_data(context.GetPlace()); const Tensor& dOut_for_dX = (x_dims.size() == 2 && y_dims.size() == 3) - ? CombineBatchAndN(context, dOut) + ? CombineBatchAndN(dev_ctx, dOut) : dOut; if (x_dims.size() == 2 && y_dims.size() == 3) { Y = transpose_y ? CombineBatchAndM(Y) - : CombineBatchAndN(context, Y); + : CombineBatchAndN(dev_ctx, Y); } if (transpose_x) { - math::MatMulFunctor()(context.device_context(), Y, - transpose_y, dOut_for_dX, transpose_x, - T(1), dx, T(0)); + math::MatMulFunctor()( + dev_ctx, Y, transpose_y, dOut_for_dX, transpose_x, T(1), dx, T(0)); } else { - math::MatMulFunctor()(context.device_context(), dOut_for_dX, - transpose_x, Y, !transpose_y, T(1), dx, - T(0)); + math::MatMulFunctor()( + dev_ctx, dOut_for_dX, transpose_x, Y, !transpose_y, T(1), dx, T(0)); } } @@ -205,18 +204,16 @@ class MatMulGradKernel : public framework::OpKernel { ? CombineBatchAndM(dOut) : dOut; if (y_dims.size() == 2 && x_dims.size() == 3) { - X = transpose_x ? CombineBatchAndN(context, X) + X = transpose_x ? CombineBatchAndN(dev_ctx, X) : CombineBatchAndM(X); dOut = CombineBatchAndM(dOut); } if (transpose_y) { - math::MatMulFunctor()(context.device_context(), dOut_for_dY, - transpose_y, X, transpose_x, T(1), dy, - T(0)); + math::MatMulFunctor()( + dev_ctx, dOut_for_dY, transpose_y, X, transpose_x, T(1), dy, T(0)); } else { - math::MatMulFunctor()(context.device_context(), X, - !transpose_x, dOut_for_dY, transpose_y, - T(1), dy, T(0)); + math::MatMulFunctor()( + dev_ctx, X, !transpose_x, dOut_for_dY, transpose_y, T(1), dy, T(0)); } } } diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc index 44bf402e95..011616e615 100644 --- a/paddle/operators/maxout_op.cc +++ b/paddle/operators/maxout_op.cc @@ -101,7 +101,8 @@ class MaxOutOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad, ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL(maxout, - ops::MaxOutKernel); REGISTER_OP_CPU_KERNEL( - maxout_grad, ops::MaxOutGradKernel); + maxout, ops::MaxOutKernel); +REGISTER_OP_CPU_KERNEL( + maxout_grad, + ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc index decd43913d..2904f0ff96 100644 --- a/paddle/operators/maxout_op.cu.cc +++ b/paddle/operators/maxout_op.cu.cc @@ -15,9 +15,10 @@ #include "paddle/operators/maxout_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(maxout, - ops::MaxOutKernel, - ops::MaxOutKernel); -REGISTER_OP_GPU_KERNEL( - maxout_grad, ops::MaxOutGradKernel, - ops::MaxOutGradKernel); +REGISTER_OP_CUDA_KERNEL( + maxout, ops::MaxOutKernel, + ops::MaxOutKernel); +REGISTER_OP_CUDA_KERNEL( + maxout_grad, + ops::MaxOutGradKernel, + ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h index 44a0d073dd..e8b12552b9 100644 --- a/paddle/operators/maxout_op.h +++ b/paddle/operators/maxout_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MaxOutKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -31,12 +31,13 @@ class MaxOutKernel : public framework::OpKernel { Tensor* out = context.Output("Out"); int groups = context.template Attr("groups"); - math::MaxOutFunctor maxout_forward; - maxout_forward(context.device_context(), *in_x, out, groups); + math::MaxOutFunctor maxout_forward; + maxout_forward(context.template device_context(), *in_x, out, + groups); } }; -template +template class MaxOutGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,14 +47,13 @@ class MaxOutGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out")); Tensor* in_x_grad = context.Output(framework::GradVarName("X")); int groups = context.template Attr("groups"); - auto& device_ctx = context.device_context(); - math::SetConstant zero; + auto& device_ctx = context.template device_context(); + math::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0.0)); - math::MaxOutGradFunctor maxout_backward; - maxout_backward(context.device_context(), *in_x, in_x_grad, *out, - *out_grad, groups); + math::MaxOutGradFunctor maxout_backward; + maxout_backward(device_ctx, *in_x, in_x_grad, *out, *out_grad, groups); } } }; diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc index dcc5b4286f..8932d700c2 100644 --- a/paddle/operators/mean_op.cc +++ b/paddle/operators/mean_op.cc @@ -76,8 +76,9 @@ class MeanGradMaker : public framework::SingleGradOpDescMaker { namespace ops = paddle::operators; REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker); REGISTER_OPERATOR(mean_grad, ops::MeanGradOp); -REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel, - ops::MeanKernel); -REGISTER_OP_CPU_KERNEL(mean_grad, - ops::MeanGradKernel, - ops::MeanGradKernel); +REGISTER_OP_CPU_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CPU_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu index ca089938c0..93062bf540 100644 --- a/paddle/operators/mean_op.cu +++ b/paddle/operators/mean_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/mean_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel, - ops::MeanKernel); -REGISTER_OP_GPU_KERNEL(mean_grad, - ops::MeanGradKernel, - ops::MeanGradKernel); +REGISTER_OP_CUDA_KERNEL( + mean, ops::MeanKernel, + ops::MeanKernel); +REGISTER_OP_CUDA_KERNEL( + mean_grad, ops::MeanGradKernel, + ops::MeanGradKernel); diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h index c99286a5b9..351b345959 100644 --- a/paddle/operators/mean_op.h +++ b/paddle/operators/mean_op.h @@ -27,7 +27,7 @@ template using EigenVector = framework::EigenVector; -template +template class MeanKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -38,13 +38,14 @@ class MeanKernel : public framework::OpKernel { auto X = EigenVector::Flatten(*input); auto y = EigenScalar::From(*output); - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); y.device(place) = X.mean(); } }; -template +template class MeanGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,7 +57,8 @@ class MeanGradKernel : public framework::OpKernel { T ig_size = static_cast(IG->numel()); Eigen::DSizes bcast(ig_size); - EigenVector::Flatten(*IG).device(context.GetEigenDevice()) = + EigenVector::Flatten(*IG).device( + *context.template device_context().eigen_device()) = (EigenVector::From(*OG) / ig_size).broadcast(bcast); } }; diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc index 4684c20208..27f0c8de20 100644 --- a/paddle/operators/minus_op.cc +++ b/paddle/operators/minus_op.cc @@ -102,5 +102,5 @@ class MinusGradMaker : public framework::GradOpDescMakerBase { namespace ops = paddle::operators; REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker); -REGISTER_OP_CPU_KERNEL(minus, - ops::MinusKernel); +REGISTER_OP_CPU_KERNEL( + minus, ops::MinusKernel); diff --git a/paddle/operators/minus_op.cu b/paddle/operators/minus_op.cu index a8375cc630..3b202ea92e 100644 --- a/paddle/operators/minus_op.cu +++ b/paddle/operators/minus_op.cu @@ -14,5 +14,6 @@ #include "paddle/operators/minus_op.h" -REGISTER_OP_GPU_KERNEL( - minus, paddle::operators::MinusKernel); +REGISTER_OP_CUDA_KERNEL( + minus, + paddle::operators::MinusKernel); diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h index bd9a2790aa..78e1e1be6d 100644 --- a/paddle/operators/minus_op.h +++ b/paddle/operators/minus_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class MinusKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -28,7 +28,8 @@ class MinusKernel : public framework::OpKernel { auto* out_tensor = context.Output("Out"); out_tensor->mutable_data(context.GetPlace()); - auto& dev = context.GetEigenDevice(); + auto& dev = + *context.template device_context().eigen_device(); framework::EigenVector::Flatten(*out_tensor).device(dev) = framework::EigenVector::Flatten(*left_tensor) - framework::EigenVector::Flatten(*right_tensor); diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc index 28528848af..f0a42491bf 100644 --- a/paddle/operators/modified_huber_loss_op.cc +++ b/paddle/operators/modified_huber_loss_op.cc @@ -115,6 +115,6 @@ REGISTER_OP(modified_huber_loss, ops::ModifiedHuberLossOp, REGISTER_OP_CPU_KERNEL( modified_huber_loss, - ops::ModifiedHuberLossKernel); + ops::ModifiedHuberLossKernel); REGISTER_OP_CPU_KERNEL(modified_huber_loss_grad, ops::ModifiedHuberLossGradCPUKernel); diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu index 8854e166cd..40a8447da4 100644 --- a/paddle/operators/modified_huber_loss_op.cu +++ b/paddle/operators/modified_huber_loss_op.cu @@ -71,8 +71,8 @@ class ModifiedHuberLossGradGPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( modified_huber_loss, - ops::ModifiedHuberLossKernel); -REGISTER_OP_GPU_KERNEL(modified_huber_loss_grad, - ops::ModifiedHuberLossGradGPUKernel); + ops::ModifiedHuberLossKernel); +REGISTER_OP_CUDA_KERNEL(modified_huber_loss_grad, + ops::ModifiedHuberLossGradGPUKernel); diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h index aba75efad9..157ae0682e 100644 --- a/paddle/operators/modified_huber_loss_op.h +++ b/paddle/operators/modified_huber_loss_op.h @@ -46,7 +46,7 @@ struct ModifiedHuberLossForward { } }; -template +template class ModifiedHuberLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -57,7 +57,8 @@ class ModifiedHuberLossKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x = EigenVector::Flatten(*in0); auto y = EigenVector::Flatten(*in1); diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu index be0c8ea071..00f1253465 100644 --- a/paddle/operators/momentum_op.cu +++ b/paddle/operators/momentum_op.cu @@ -74,5 +74,5 @@ class MomentumOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(momentum, ops::MomentumOpCUDAKernel, - ops::MomentumOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(momentum, ops::MomentumOpCUDAKernel, + ops::MomentumOpCUDAKernel); diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index 3c39ae10dc..bc4a5fdf0b 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -149,6 +149,7 @@ REGISTER_OPERATOR(mul, paddle::framework::OperatorWithKernel, ops::MulOpMaker, ops::MulOpShapeInference, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(mul_grad, ops::MulOpGrad); -REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CPU_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CPU_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.cu.cc b/paddle/operators/mul_op.cu.cc index 66dc3d6d10..6095de58d0 100644 --- a/paddle/operators/mul_op.cu.cc +++ b/paddle/operators/mul_op.cu.cc @@ -15,6 +15,7 @@ #include "paddle/operators/mul_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); -REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); +REGISTER_OP_CUDA_KERNEL( + mul, ops::MulKernel); +REGISTER_OP_CUDA_KERNEL( + mul_grad, ops::MulGradKernel); diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index 0eb9df41e9..1b467dca83 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,15 +46,16 @@ class MulKernel : public framework::OpKernel { if (z_dim.size() != 2) { z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); } - math::matmul(context.device_context(), x_matrix, false, y_matrix, - false, 1, z, 0); + math::matmul( + context.template device_context(), x_matrix, false, + y_matrix, false, 1, z, 0); if (z_dim.size() != 2) { z->Resize(z_dim); } } }; -template +template class MulGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -77,6 +78,7 @@ class MulGradKernel : public framework::OpKernel { Tensor* dx = ctx.Output(framework::GradVarName("X")); Tensor* dy = ctx.Output(framework::GradVarName("Y")); + auto& dev_ctx = ctx.template device_context(); if (dx) { dx->mutable_data(ctx.GetPlace()); Tensor dx_matrix = dx->dims().size() > 2 @@ -84,8 +86,8 @@ class MulGradKernel : public framework::OpKernel { : *dx; // dx = dout * y'. dx: M x K, dout : M x N, y : K x N - math::matmul(ctx.device_context(), dout_mat, false, y_matrix, - true, 1, &dx_matrix, 0); + math::matmul(dev_ctx, dout_mat, false, y_matrix, true, + 1, &dx_matrix, 0); } if (dy) { dy->mutable_data(ctx.GetPlace()); @@ -93,8 +95,8 @@ class MulGradKernel : public framework::OpKernel { ? framework::ReshapeToMatrix(*dy, y_num_col_dims) : *dy; // dy = x' * dout. dy K x N, dout : M x N, x : M x K - math::matmul(ctx.device_context(), x_matrix, true, dout_mat, - false, 1, &dy_matrix, 0); + math::matmul(dev_ctx, x_matrix, true, dout_mat, false, + 1, &dy_matrix, 0); } } }; diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index 8e7f544e0d..b1ee8051c4 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -119,7 +119,8 @@ REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, paddle::framework::DefaultGradOpDescMaker); REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp); REGISTER_OP_CPU_KERNEL( - multiplex, ops::MultiplexCPUKernel); + multiplex, + ops::MultiplexCPUKernel); REGISTER_OP_CPU_KERNEL( multiplex_grad, - ops::MultiplexGradCPUKernel); + ops::MultiplexGradCPUKernel); diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 10dff8d021..47986e9ff8 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel { CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + platform::GPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -60,7 +60,8 @@ class MultiplexGradGPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } @@ -72,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - Place place = boost::get(ctx.GetPlace()); + platform::GPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { @@ -87,8 +88,9 @@ class MultiplexGradGPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - multiplex, ops::MultiplexGPUKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + multiplex, + ops::MultiplexGPUKernel); +REGISTER_OP_CUDA_KERNEL( multiplex_grad, - ops::MultiplexGradGPUKernel); + ops::MultiplexGradGPUKernel); diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h index ab3cafaa32..3443151161 100644 --- a/paddle/operators/multiplex_op.h +++ b/paddle/operators/multiplex_op.h @@ -22,7 +22,7 @@ namespace paddle { namespace operators { -template +template class MultiplexCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,7 +35,7 @@ class MultiplexCPUKernel : public framework::OpKernel { auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto index = ids->data(); - Place place = boost::get(ctx.GetPlace()); + platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -47,7 +47,7 @@ class MultiplexCPUKernel : public framework::OpKernel { } }; -template +template class MultiplexGradCPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -60,14 +60,15 @@ class MultiplexGradCPUKernel : public framework::OpKernel { if (d_ins[i]) { d_ins[i]->mutable_data(ctx.GetPlace()); auto t = framework::EigenVector::Flatten(*d_ins[i]); - t.device(ctx.GetEigenDevice()) = t.constant(static_cast(0)); + t.device(*ctx.template device_context().eigen_device()) = + t.constant(static_cast(0)); } } auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto* index = ids->data(); - Place place = boost::get(ctx.GetPlace()); + platform::CPUPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc index 4f0a2a79ed..6ca6db7253 100644 --- a/paddle/operators/nccl_op.cu.cc +++ b/paddle/operators/nccl_op.cu.cc @@ -204,6 +204,6 @@ class NCCLBcastKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); -REGISTER_OP_GPU_KERNEL(ncclBcast, ops::NCCLBcastKernel); -REGISTER_OP_GPU_KERNEL(ncclReduce, ops::NCCLReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclAllReduce, ops::NCCLAllReduceKernel); +REGISTER_OP_CUDA_KERNEL(ncclBcast, ops::NCCLBcastKernel); +REGISTER_OP_CUDA_KERNEL(ncclReduce, ops::NCCLReduceKernel); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index bb7ae20286..d747cc0cf5 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -33,9 +33,9 @@ #include "paddle/platform/place.h" USE_NO_KERNEL_OP(ncclInit); -USE_GPU_ONLY_OP(ncclAllReduce); -USE_GPU_ONLY_OP(ncclReduce); -USE_GPU_ONLY_OP(ncclBcast); +USE_CUDA_ONLY_OP(ncclAllReduce); +USE_CUDA_ONLY_OP(ncclReduce); +USE_CUDA_ONLY_OP(ncclBcast); namespace f = paddle::framework; namespace p = paddle::platform; diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index 952da10434..5ad1610fde 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -67,7 +67,7 @@ class NCEOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; @@ -170,7 +170,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/nce_op.h b/paddle/operators/nce_op.h index 0a8a95de5f..6636dad060 100644 --- a/paddle/operators/nce_op.h +++ b/paddle/operators/nce_op.h @@ -28,7 +28,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template void PrepareSamples(const framework::ExecutionContext& context) { auto label = context.Input("Label"); const int64_t* label_data = label->data(); @@ -67,11 +67,11 @@ void PrepareSamples(const framework::ExecutionContext& context) { } } -template +template class NCEKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PrepareSamples(context); + PrepareSamples(context); auto sample_labels = context.Output("SampleLabels"); const int64_t* sample_labels_data = sample_labels->data(); auto sample_out = context.Output("SampleLogits"); @@ -135,7 +135,7 @@ class NCEKernel : public framework::OpKernel { } }; -template +template class NCEGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc index adb75df6ef..936dde22c3 100644 --- a/paddle/operators/pad_op.cc +++ b/paddle/operators/pad_op.cc @@ -134,6 +134,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker); REGISTER_OPERATOR(pad_grad, ops::PadOpGrad); -REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_CPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CPU_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CPU_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.cu b/paddle/operators/pad_op.cu index 555a7dba23..c309fb625c 100644 --- a/paddle/operators/pad_op.cu +++ b/paddle/operators/pad_op.cu @@ -16,6 +16,7 @@ #include "paddle/operators/pad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pad, ops::PadKernel); -REGISTER_OP_GPU_KERNEL(pad_grad, - ops::PadGradKernel); +REGISTER_OP_CUDA_KERNEL( + pad, ops::PadKernel); +REGISTER_OP_CUDA_KERNEL( + pad_grad, ops::PadGradKernel); diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h index 9534dbf545..1b95942af3 100644 --- a/paddle/operators/pad_op.h +++ b/paddle/operators/pad_op.h @@ -26,7 +26,7 @@ template using EigenTensor = framework::EigenTensor; -template +template void PadFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -42,33 +42,34 @@ void PadFunction(const framework::ExecutionContext& context) { auto x_tensor = EigenTensor::From(*x); auto out_tensor = EigenTensor::From(*out); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); out_tensor.device(place) = x_tensor.pad(paddings, pad_value); } -template +template class PadKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { int rank = context.Input("X")->dims().size(); switch (rank) { case 1: - PadFunction(context); + PadFunction(context); break; case 2: - PadFunction(context); + PadFunction(context); break; case 3: - PadFunction(context); + PadFunction(context); break; case 4: - PadFunction(context); + PadFunction(context); break; case 5: - PadFunction(context); + PadFunction(context); break; case 6: - PadFunction(context); + PadFunction(context); break; default: PADDLE_THROW( @@ -77,7 +78,7 @@ class PadKernel : public framework::OpKernel { } }; -template +template void PadGradFunction(const framework::ExecutionContext& context) { auto pads = context.Attr>("paddings"); Eigen::array, D> paddings; @@ -91,12 +92,13 @@ void PadGradFunction(const framework::ExecutionContext& context) { d_x->mutable_data(context.GetPlace()); auto d_x_tensor = EigenTensor::From(*d_x); auto d_out_tensor = EigenTensor::From(*d_out); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0); } } -template +template class PadGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -104,22 +106,22 @@ class PadGradKernel : public framework::OpKernel { context.Input(framework::GradVarName("Out"))->dims().size(); switch (rank) { case 1: - PadGradFunction(context); + PadGradFunction(context); break; case 2: - PadGradFunction(context); + PadGradFunction(context); break; case 3: - PadGradFunction(context); + PadGradFunction(context); break; case 4: - PadGradFunction(context); + PadGradFunction(context); break; case 5: - PadGradFunction(context); + PadGradFunction(context); break; case 6: - PadGradFunction(context); + PadGradFunction(context); break; default: PADDLE_THROW( diff --git a/paddle/operators/pool_cudnn_op.cc b/paddle/operators/pool_cudnn_op.cc index be9fcc5661..77407f5cdf 100644 --- a/paddle/operators/pool_cudnn_op.cc +++ b/paddle/operators/pool_cudnn_op.cc @@ -19,19 +19,21 @@ namespace ops = paddle::operators; REGISTER_OP(pool2d_cudnn, ops::PoolOp, ops::Pool2dOpMaker, pool2d_cudnn_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool2d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool2d_cudnn, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_cudnn_grad, + ops::PoolGradKernel, + ops::PoolGradKernel) REGISTER_OP(pool3d_cudnn, ops::PoolOp, ops::Pool3dOpMaker, pool3d_cudnn_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool3d_cudnn, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_cudnn_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool3d_cudnn, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_cudnn_grad, + ops::PoolGradKernel, + ops::PoolGradKernel) diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc index 66dd194ccd..fc2b37bd0f 100644 --- a/paddle/operators/pool_cudnn_op.cu.cc +++ b/paddle/operators/pool_cudnn_op.cu.cc @@ -162,12 +162,12 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); - -REGISTER_OP_GPU_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, - ops::PoolCudnnOpKernel); -REGISTER_OP_GPU_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, - ops::PoolCudnnGradOpKernel); +REGISTER_OP_CUDA_KERNEL(pool2d_cudnn, ops::PoolCudnnOpKernel, + ops::PoolCudnnOpKernel); +REGISTER_OP_CUDA_KERNEL(pool2d_cudnn_grad, ops::PoolCudnnGradOpKernel, + ops::PoolCudnnGradOpKernel); + +REGISTER_OP_CUDA_KERNEL(pool3d_cudnn, ops::PoolCudnnOpKernel, + ops::PoolCudnnOpKernel); +REGISTER_OP_CUDA_KERNEL(pool3d_cudnn_grad, ops::PoolCudnnGradOpKernel, + ops::PoolCudnnGradOpKernel); diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index e26ffd86e5..45fa20280c 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -216,19 +216,19 @@ namespace ops = paddle::operators; REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel) +REGISTER_OP_CPU_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool2d_grad, ops::PoolGradKernel, + ops::PoolGradKernel) REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad, ops::PoolOpGrad); -REGISTER_OP_CPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_CPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CPU_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CPU_KERNEL( + pool3d_grad, ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.cu.cc b/paddle/operators/pool_op.cu.cc index 1010cb7622..39a9dfbf79 100644 --- a/paddle/operators/pool_op.cu.cc +++ b/paddle/operators/pool_op.cu.cc @@ -16,16 +16,18 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(pool2d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool2d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool2d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); -REGISTER_OP_GPU_KERNEL(pool3d, - ops::PoolKernel, - ops::PoolKernel); -REGISTER_OP_GPU_KERNEL(pool3d_grad, - ops::PoolGradKernel, - ops::PoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d, ops::PoolKernel, + ops::PoolKernel); +REGISTER_OP_CUDA_KERNEL( + pool3d_grad, + ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h index 63492a89e8..ab85d587a3 100644 --- a/paddle/operators/pool_op.h +++ b/paddle/operators/pool_op.h @@ -50,7 +50,7 @@ class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker); }; -template +template class PoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -67,41 +67,41 @@ class PoolKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = context.template device_context(); switch (ksize.size()) { case 2: { if (pooling_type == "max") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool2d_forward; paddle::operators::math::MaxPool pool_process; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool2d_forward; paddle::operators::math::AvgPool pool_process; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; case 3: { if (pooling_type == "max") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::MaxPool, T> + DeviceContext, paddle::operators::math::MaxPool, T> pool3d_forward; paddle::operators::math::MaxPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< - Place, paddle::operators::math::AvgPool, T> + DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, pool_process, out); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, + out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -109,7 +109,7 @@ class PoolKernel : public framework::OpKernel { } }; -template +template class PoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -130,42 +130,43 @@ class PoolGradKernel : public framework::OpKernel { ksize[i] = static_cast(in_x->dims()[i + 2]); } } - + auto& dev_ctx = context.template device_context(); if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); auto temp = framework::EigenVector::Flatten(*in_x_grad); - temp.device(context.GetEigenDevice()) = + temp.device( + *context.template device_context().eigen_device()) = temp.constant(static_cast(0)); switch (ksize.size()) { case 2: { if (pooling_type == "max") { - paddle::operators::math::MaxPool2dGradFunctor + paddle::operators::math::MaxPool2dGradFunctor pool2d_backward; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool2d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; case 3: { if (pooling_type == "max") { - paddle::operators::math::MaxPool3dGradFunctor + paddle::operators::math::MaxPool3dGradFunctor pool3d_backward; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, in_x_grad); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dGradFunctor< - Place, paddle::operators::math::AvgPoolGrad, T> + DeviceContext, paddle::operators::math::AvgPoolGrad, T> pool3d_backward; paddle::operators::math::AvgPoolGrad pool_process; - pool3d_backward(context.device_context(), *in_x, *out, *out_grad, - ksize, strides, paddings, pool_process, in_x_grad); + pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, + paddings, pool_process, in_x_grad); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index b9c42a6912..1a2383f8b8 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -266,12 +266,15 @@ REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad, @@ -279,9 +282,12 @@ REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp, REGISTER_OP_CPU_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); REGISTER_OP_CPU_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.cu.cc b/paddle/operators/pool_with_index_op.cu.cc index 335064a7ee..4c9804da63 100644 --- a/paddle/operators/pool_with_index_op.cu.cc +++ b/paddle/operators/pool_with_index_op.cu.cc @@ -16,20 +16,28 @@ limitations under the License. */ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool2d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index, - ops::MaxPoolWithIndexKernel, - ops::MaxPoolWithIndexKernel); -REGISTER_OP_GPU_KERNEL( + ops::MaxPoolWithIndexKernel, + ops::MaxPoolWithIndexKernel); +REGISTER_OP_CUDA_KERNEL( max_pool3d_with_index_grad, - ops::MaxPoolWithIndexGradKernel, - ops::MaxPoolWithIndexGradKernel) + ops::MaxPoolWithIndexGradKernel, + ops::MaxPoolWithIndexGradKernel) diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h index 40766c7e82..4f4087d1dd 100644 --- a/paddle/operators/pool_with_index_op.h +++ b/paddle/operators/pool_with_index_op.h @@ -24,7 +24,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class MaxPoolWithIndexKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -35,6 +35,8 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + + auto& dev_ctx = context.template device_context(); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -44,23 +46,23 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexFunctor + paddle::operators::math::MaxPool2dWithIndexFunctor pool2d_forward; - pool2d_forward(context.device_context(), *in_x, ksize, strides, - paddings, out, mask); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexFunctor + paddle::operators::math::MaxPool3dWithIndexFunctor pool3d_forward; - pool3d_forward(context.device_context(), *in_x, ksize, strides, - paddings, out, mask); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } } }; -template +template class MaxPoolWithIndexGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -81,18 +83,20 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - auto& device_ctx = context.device_context(); + auto& device_ctx = context.template device_context(); math::set_constant(device_ctx, in_x_grad, 0); switch (ksize.size()) { case 2: { - paddle::operators::math::MaxPool2dWithIndexGradFunctor + paddle::operators::math::MaxPool2dWithIndexGradFunctor pool2d_backward; pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); } break; case 3: { - paddle::operators::math::MaxPool3dWithIndexGradFunctor + paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, paddings, in_x_grad); diff --git a/paddle/operators/positive_negative_pair_op.h b/paddle/operators/positive_negative_pair_op.h index 2efd3777e0..977e59b7d2 100644 --- a/paddle/operators/positive_negative_pair_op.h +++ b/paddle/operators/positive_negative_pair_op.h @@ -22,7 +22,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class PositiveNegativePairKernel : public framework::OpKernel { public: struct PredictionResult { diff --git a/paddle/operators/precision_recall_op.h b/paddle/operators/precision_recall_op.h index 4a871ce674..c0d55405a3 100644 --- a/paddle/operators/precision_recall_op.h +++ b/paddle/operators/precision_recall_op.h @@ -26,7 +26,7 @@ using EigenMatrix = framework::EigenMatrix; enum StateVariable { TP = 0, FP, TN, FN }; -template +template class PrecisionRecallKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc index 055c471b45..317a2a4015 100644 --- a/paddle/operators/prelu_op.cc +++ b/paddle/operators/prelu_op.cc @@ -85,7 +85,8 @@ namespace ops = paddle::operators; REGISTER_OP(prelu, ops::PReluOp, ops::PReluOpMaker, prelu_grad, ops::PReluGradOp); -REGISTER_OP_CPU_KERNEL(prelu, - ops::PReluKernel); -REGISTER_OP_CPU_KERNEL(prelu_grad, - ops::PReluGradKernel); +REGISTER_OP_CPU_KERNEL( + prelu, ops::PReluKernel); +REGISTER_OP_CPU_KERNEL( + prelu_grad, + ops::PReluGradKernel); diff --git a/paddle/operators/prelu_op.cu b/paddle/operators/prelu_op.cu index 9e391dabae..12033dee0e 100644 --- a/paddle/operators/prelu_op.cu +++ b/paddle/operators/prelu_op.cu @@ -14,8 +14,9 @@ #include "paddle/operators/prelu_op.h" -REGISTER_OP_GPU_KERNEL( - prelu, paddle::operators::PReluKernel); -REGISTER_OP_GPU_KERNEL( - prelu_grad, - paddle::operators::PReluGradKernel); +REGISTER_OP_CUDA_KERNEL( + prelu, + paddle::operators::PReluKernel); +REGISTER_OP_CUDA_KERNEL(prelu_grad, + paddle::operators::PReluGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h index 5ad31c2203..56f9a553ec 100644 --- a/paddle/operators/prelu_op.h +++ b/paddle/operators/prelu_op.h @@ -39,7 +39,7 @@ class PReluFunctor { const T* alpha_; }; -template +template class PReluKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,9 +54,9 @@ class PReluKernel : public framework::OpKernel { int numel = x->numel(); - Transform trans; - trans(context.device_context(), x_ptr, x_ptr + numel, o_ptr, - PReluFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), x_ptr, + x_ptr + numel, o_ptr, PReluFunctor(alpha_ptr)); } }; @@ -76,7 +76,7 @@ class PReluGradFunctor { const T* alpha_; }; -template +template class PReluGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -92,9 +92,9 @@ class PReluGradKernel : public framework::OpKernel { const T* out_ptr = out->data(); int numel = dx->numel(); - Transform trans; - trans(context.device_context(), out_ptr, out_ptr + numel, dout_ptr, dx_ptr, - PReluGradFunctor(alpha_ptr)); + Transform trans; + trans(context.template device_context(), out_ptr, + out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor(alpha_ptr)); // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready } diff --git a/paddle/operators/proximal_adagrad_op.cc b/paddle/operators/proximal_adagrad_op.cc index 36e460103a..cc350f6d26 100644 --- a/paddle/operators/proximal_adagrad_op.cc +++ b/paddle/operators/proximal_adagrad_op.cc @@ -114,4 +114,4 @@ REGISTER_OP_WITHOUT_GRADIENT(proximal_adagrad, ops::ProximalAdagradOp, ops::ProximalAdagradOpMaker); REGISTER_OP_CPU_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel); + ops::ProximalAdagradOpKernel); diff --git a/paddle/operators/proximal_adagrad_op.cu b/paddle/operators/proximal_adagrad_op.cu index d0ae039518..42a178f94b 100644 --- a/paddle/operators/proximal_adagrad_op.cu +++ b/paddle/operators/proximal_adagrad_op.cu @@ -15,6 +15,6 @@ specific language governing permissions and limitations under the License. */ #include "paddle/operators/proximal_adagrad_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( proximal_adagrad, - ops::ProximalAdagradOpKernel); + ops::ProximalAdagradOpKernel); diff --git a/paddle/operators/proximal_adagrad_op.h b/paddle/operators/proximal_adagrad_op.h index 7a1560e8cb..523924d80e 100644 --- a/paddle/operators/proximal_adagrad_op.h +++ b/paddle/operators/proximal_adagrad_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalAdagradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -45,20 +45,20 @@ class ProximalAdagradOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto m_out = EigenVector::Flatten(*moment_out); - auto place = ctx.GetEigenDevice(); + auto* place = ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); - m_out.device(place) = m + g * g; + m_out.device(*place) = m + g * g; auto prox_param = p - lr.broadcast(grad_dsize) * g / m_out.sqrt(); if (l1 > static_cast(0)) { - p_out.device(place) = + p_out.device(*place) = prox_param.sign() * (((prox_param.abs() - (lr * l1).broadcast(grad_dsize)) .cwiseMax(static_cast(0.0))) / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize))); } else { - p_out.device(place) = + p_out.device(*place) = prox_param / (static_cast(1.0) + (lr * l2).broadcast(grad_dsize)); } } diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc index 5693d0ec9e..0b26beb3ac 100644 --- a/paddle/operators/proximal_gd_op.cc +++ b/paddle/operators/proximal_gd_op.cc @@ -94,4 +94,5 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp, ops::ProximalGDOpMaker); REGISTER_OP_CPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.cu b/paddle/operators/proximal_gd_op.cu index 26f4ebaa0f..b7dd840d19 100644 --- a/paddle/operators/proximal_gd_op.cu +++ b/paddle/operators/proximal_gd_op.cu @@ -15,5 +15,6 @@ specific language governing permissions and limitations under the License. */ #include "paddle/operators/proximal_gd_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - proximal_gd, ops::ProximalGDOpKernel); +REGISTER_OP_CUDA_KERNEL( + proximal_gd, + ops::ProximalGDOpKernel); diff --git a/paddle/operators/proximal_gd_op.h b/paddle/operators/proximal_gd_op.h index bebda02041..64648b3cca 100644 --- a/paddle/operators/proximal_gd_op.h +++ b/paddle/operators/proximal_gd_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class ProximalGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -42,7 +42,7 @@ class ProximalGDOpKernel : public framework::OpKernel { auto lr = EigenVector::Flatten(*ctx.Input("LearningRate")); auto p_out = EigenVector::Flatten(*param_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 912f88f455..b80b175792 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -123,7 +123,8 @@ namespace ops = paddle::operators; REGISTER_OP(rank_loss, ops::RankLossOp, ops::RankLossOpMaker, rank_loss_grad, ops::RankLossGradOp); -REGISTER_OP_CPU_KERNEL(rank_loss, - ops::RankLossKernel); REGISTER_OP_CPU_KERNEL( - rank_loss_grad, ops::RankLossGradKernel); + rank_loss, ops::RankLossKernel); +REGISTER_OP_CPU_KERNEL( + rank_loss_grad, + ops::RankLossGradKernel); diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu index 5382e3a629..5aee66443d 100644 --- a/paddle/operators/rank_loss_op.cu +++ b/paddle/operators/rank_loss_op.cu @@ -14,9 +14,9 @@ #include "paddle/operators/rank_loss_op.h" -REGISTER_OP_GPU_KERNEL( - rank_loss, - paddle::operators::RankLossKernel); -REGISTER_OP_GPU_KERNEL( - rank_loss_grad, - paddle::operators::RankLossGradKernel); +REGISTER_OP_CUDA_KERNEL(rank_loss, + paddle::operators::RankLossKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(rank_loss_grad, + paddle::operators::RankLossGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h index 703c77a0b2..ea24b61fd9 100644 --- a/paddle/operators/rank_loss_op.h +++ b/paddle/operators/rank_loss_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class RankLossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -35,13 +35,13 @@ class RankLossKernel : public framework::OpKernel { auto left = framework::EigenVector::Flatten(*left_t); auto right = framework::EigenVector::Flatten(*right_t); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); out.device(dev) = (1. + (left - right).exp()).log() - label * (left - right); } }; -template +template class RankLossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -55,7 +55,7 @@ class RankLossGradKernel : public framework::OpKernel { auto* left_t = ctx.Input("Left"); auto* right_t = ctx.Input("Right"); - auto& dev = ctx.GetEigenDevice(); + auto& dev = *ctx.template device_context().eigen_device(); auto d_out = framework::EigenVector::Flatten(*d_out_t); auto label = framework::EigenVector::Flatten(*label_t); auto left = framework::EigenVector::Flatten(*left_t); diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc index 2589a54cfc..b754637bf2 100644 --- a/paddle/operators/reduce_op.cc +++ b/paddle/operators/reduce_op.cc @@ -180,12 +180,13 @@ REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad, REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad, ops::ReduceGradOp); -#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_CPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_CPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CPU_KERNEL(reduce_type, \ + ops::ReduceKernel); \ + REGISTER_OP_CPU_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL); diff --git a/paddle/operators/reduce_op.cu b/paddle/operators/reduce_op.cu index d306e1a240..a10ace5253 100644 --- a/paddle/operators/reduce_op.cu +++ b/paddle/operators/reduce_op.cu @@ -17,12 +17,13 @@ namespace ops = paddle::operators; -#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ - REGISTER_OP_GPU_KERNEL( \ - reduce_type, \ - ops::ReduceKernel); \ - REGISTER_OP_GPU_KERNEL(reduce_type##_grad, \ - ops::ReduceGradKernel); +#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor) \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type, ops::ReduceKernel); \ + REGISTER_OP_CUDA_KERNEL( \ + reduce_type##_grad, \ + ops::ReduceGradKernel); FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL); diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h index dd6547542d..47ce910f28 100644 --- a/paddle/operators/reduce_op.h +++ b/paddle/operators/reduce_op.h @@ -32,55 +32,55 @@ template ; struct SumFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.sum(dim); } }; struct SumGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim); } }; struct MeanFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.mean(dim); } }; struct MeanGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { dx.device(place) = dy.broadcast(dim) / dx.constant(size); } }; struct MaxFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.maximum(dim); } }; struct MinFunctor { - template - void operator()(const Place& place, X& x, Y& y, const Dim& dim) { + template + void operator()(const DeviceContext& place, X& x, Y& y, const Dim& dim) { y.device(place) = x.minimum(dim); } }; struct MaxOrMinGradFunctor { - template - void operator()(const Place& place, X& x, Y& y, DX& dx, DY& dy, + template + void operator()(const DeviceContext& place, X& x, Y& y, DX& dx, DY& dy, const Dim& dim, int size) { auto equals = x == y.broadcast(dim); auto ones = dx.constant(1); @@ -91,7 +91,7 @@ struct MaxOrMinGradFunctor { } }; -template +template class ReduceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -139,7 +139,8 @@ class ReduceKernel : public framework::OpKernel { dims = framework::make_ddim(dims_vector); } - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Functor functor; if (D == 1) { @@ -152,7 +153,7 @@ class ReduceKernel : public framework::OpKernel { } }; -template +template class ReduceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -201,7 +202,8 @@ class ReduceGradKernel : public framework::OpKernel { Eigen::array broadcast_dim; for (size_t i = 0; i < D; ++i) broadcast_dim[i] = 1; broadcast_dim[dim] = input0->dims()[dim]; - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); Functor functor; functor(place, x, x_reduce, x_grad, x_reduce_grad, broadcast_dim, broadcast_dim[dim]); diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu index dca6c15007..b7329238c0 100644 --- a/paddle/operators/reshape_op.cu +++ b/paddle/operators/reshape_op.cu @@ -14,9 +14,9 @@ #include "paddle/operators/reshape_op.h" -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( reshape, paddle::operators::ReshapeKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( reshape_grad, paddle::operators::ReshapeGradKernel); diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index 73fd1da642..92d8cbbb56 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { -template +template class ReshapeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { @@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel { } }; -template +template class ReshapeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc index a9c45f639c..fc3f9b8988 100644 --- a/paddle/operators/rmsprop_op.cc +++ b/paddle/operators/rmsprop_op.cc @@ -116,5 +116,5 @@ http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf) namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker); -REGISTER_OP_CPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CPU_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu index 52634a5481..2a9fd6e104 100644 --- a/paddle/operators/rmsprop_op.cu +++ b/paddle/operators/rmsprop_op.cu @@ -16,5 +16,5 @@ #include "paddle/operators/rmsprop_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(rmsprop, - ops::RmspropOpKernel); +REGISTER_OP_CUDA_KERNEL( + rmsprop, ops::RmspropOpKernel); diff --git a/paddle/operators/rmsprop_op.h b/paddle/operators/rmsprop_op.h index 7bf2129010..16a561835d 100644 --- a/paddle/operators/rmsprop_op.h +++ b/paddle/operators/rmsprop_op.h @@ -24,7 +24,7 @@ template using EigenVector = framework::EigenVector; -template +template class RmspropOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -51,7 +51,7 @@ class RmspropOpKernel : public framework::OpKernel { auto p_out = EigenVector::Flatten(*param_out); auto mom_out = EigenVector::Flatten(*moment_out); auto ms_out = EigenVector::Flatten(*mean_square_out); - auto place = ctx.GetEigenDevice(); + auto& place = *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc index 2b5e66c96b..75fcea8401 100644 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -157,9 +157,10 @@ namespace ops = paddle::operators; REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( - roi_pool, ops::CPUROIPoolOpKernel, - ops::CPUROIPoolOpKernel); + roi_pool, + ops::CPUROIPoolOpKernel, + ops::CPUROIPoolOpKernel); REGISTER_OP_CPU_KERNEL( roi_pool_grad, - ops::CPUROIPoolGradOpKernel, - ops::CPUROIPoolOpKernel); + ops::CPUROIPoolGradOpKernel, + ops::CPUROIPoolOpKernel); diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu index 9a4c8ca752..a874befe4d 100644 --- a/paddle/operators/roi_pool_op.cu +++ b/paddle/operators/roi_pool_op.cu @@ -177,7 +177,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + set_zero(ctx.cuda_device_context(), x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); int blocks = NumBlocks(output_grad_size); @@ -199,10 +199,11 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - roi_pool, ops::GPUROIPoolOpKernel, - ops::GPUROIPoolOpKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + roi_pool, + ops::GPUROIPoolOpKernel, + ops::GPUROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( roi_pool_grad, - ops::GPUROIPoolGradOpKernel, - ops::GPUROIPoolOpKernel); + ops::GPUROIPoolGradOpKernel, + ops::GPUROIPoolOpKernel); diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h index 3812c66c65..09a9d3d870 100644 --- a/paddle/operators/roi_pool_op.h +++ b/paddle/operators/roi_pool_op.h @@ -19,7 +19,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class CPUROIPoolOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -126,7 +126,7 @@ class CPUROIPoolOpKernel : public framework::OpKernel { } }; -template +template class CPUROIPoolGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -145,8 +145,9 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { const T* out_grad_data = out_grad->data(); const int64_t* argmax_data = argmax->data(); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), in_grad, static_cast(0)); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), in_grad, + static_cast(0)); auto in_stride = framework::stride(in->dims()); auto argmax_stride = framework::stride(argmax->dims()); diff --git a/paddle/operators/row_conv_op.cc b/paddle/operators/row_conv_op.cc index ea0bb99f8d..5203a5079c 100644 --- a/paddle/operators/row_conv_op.cc +++ b/paddle/operators/row_conv_op.cc @@ -124,7 +124,8 @@ $$ }; template -class RowConvKernel : public framework::OpKernel { +class RowConvKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -169,7 +170,8 @@ class RowConvKernel : public framework::OpKernel { }; template -class RowConvGradKernel : public framework::OpKernel { +class RowConvGradKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *x = context.Input("X"); @@ -251,7 +253,8 @@ class RowConvGradKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP(row_conv, ops::RowConvOp, ops::RowConvOpMaker, row_conv_grad, ops::RowConvGradOp); -REGISTER_OP_CPU_KERNEL(row_conv, - ops::RowConvKernel); REGISTER_OP_CPU_KERNEL( - row_conv_grad, ops::RowConvGradKernel); + row_conv, ops::RowConvKernel); +REGISTER_OP_CPU_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.cu b/paddle/operators/row_conv_op.cu index e0d7ebda7e..3fc5eabcf5 100644 --- a/paddle/operators/row_conv_op.cu +++ b/paddle/operators/row_conv_op.cu @@ -292,7 +292,8 @@ __global__ void RowConvGradFilter(const T *in, const T *dout, int num_sequence, } // namespace template -class RowConvKernel : public framework::OpKernel { +class RowConvKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -327,7 +328,8 @@ class RowConvKernel : public framework::OpKernel { }; template -class RowConvGradKernel : public framework::OpKernel { +class RowConvGradKernel + : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { auto *X = context.Input("X"); @@ -347,7 +349,7 @@ class RowConvGradKernel : public framework::OpKernel { size_t *idx = batch_indices.data(); auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + math::SetConstant zero; if (dFilter) { T *dfilter = dFilter->mutable_data(context.GetPlace()); @@ -402,7 +404,8 @@ class RowConvGradKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(row_conv, - ops::RowConvKernel); -REGISTER_OP_GPU_KERNEL( - row_conv_grad, ops::RowConvGradKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv, ops::RowConvKernel); +REGISTER_OP_CUDA_KERNEL( + row_conv_grad, + ops::RowConvGradKernel); diff --git a/paddle/operators/row_conv_op.h b/paddle/operators/row_conv_op.h index 525e83908d..80912ad8f7 100644 --- a/paddle/operators/row_conv_op.h +++ b/paddle/operators/row_conv_op.h @@ -18,13 +18,13 @@ namespace paddle { namespace operators { -template +template class RowConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; }; -template +template class RowConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override; diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index e5c10fec4d..d848be823e 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -75,8 +75,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); -REGISTER_OP_CPU_KERNEL(scale, - ops::ScaleKernel, - ops::ScaleKernel, - ops::ScaleKernel, - ops::ScaleKernel); +REGISTER_OP_CPU_KERNEL( + scale, ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu index 0d70775159..0c7980430f 100644 --- a/paddle/operators/scale_op.cu +++ b/paddle/operators/scale_op.cu @@ -14,8 +14,10 @@ #include "paddle/operators/scale_op.h" -REGISTER_OP_GPU_KERNEL( - scale, paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); +REGISTER_OP_CUDA_KERNEL( + scale, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h index 4931294c9d..02a8c97a83 100644 --- a/paddle/operators/scale_op.h +++ b/paddle/operators/scale_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class ScaleKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -31,7 +31,8 @@ class ScaleKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*tensor); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& dev = context.GetEigenDevice(); + auto& dev = + *context.template device_context().eigen_device(); eigen_out.device(dev) = scale * eigen_in; } }; diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu index 3b32ae2fb7..6b43a1389f 100644 --- a/paddle/operators/scatter_op.cu +++ b/paddle/operators/scatter_op.cu @@ -59,5 +59,5 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel); -REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter, ops::ScatterOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel); diff --git a/paddle/operators/seq_expand_op.cc b/paddle/operators/seq_expand_op.cc index b862056ad4..ede9754697 100644 --- a/paddle/operators/seq_expand_op.cc +++ b/paddle/operators/seq_expand_op.cc @@ -148,8 +148,9 @@ class SeqExpandOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(seq_expand, ops::SeqExpandOp, ops::SeqExpandOpMaker, seq_expand_grad, ops::SeqExpandOpGrad); -REGISTER_OP_CPU_KERNEL(seq_expand, - ops::SeqExpandKernel); +REGISTER_OP_CPU_KERNEL( + seq_expand, + ops::SeqExpandKernel); REGISTER_OP_CPU_KERNEL( seq_expand_grad, - ops::SeqExpandGradKernel); + ops::SeqExpandGradKernel); diff --git a/paddle/operators/seq_expand_op.cu b/paddle/operators/seq_expand_op.cu index f1e4b82a76..8e67ce9ccb 100644 --- a/paddle/operators/seq_expand_op.cu +++ b/paddle/operators/seq_expand_op.cu @@ -16,8 +16,9 @@ #include "paddle/operators/seq_expand_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(seq_expand, - ops::SeqExpandKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + seq_expand, + ops::SeqExpandKernel); +REGISTER_OP_CUDA_KERNEL( seq_expand_grad, - ops::SeqExpandGradKernel); + ops::SeqExpandGradKernel); diff --git a/paddle/operators/seq_expand_op.h b/paddle/operators/seq_expand_op.h index 4ef0d02cf8..fbee0db454 100644 --- a/paddle/operators/seq_expand_op.h +++ b/paddle/operators/seq_expand_op.h @@ -23,7 +23,7 @@ namespace operators { using LoDTensor = framework::LoDTensor; -template +template class SeqExpandKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -37,7 +37,8 @@ class SeqExpandKernel : public framework::OpKernel { "The size of last lod level in Input(Y)" "must be equal to dims[0] of Input(X)."); out->set_lod(y->lod()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); size_t element_len = framework::product(x_dims) / x_dims[0]; T* out_data = out->mutable_data(context.GetPlace()); auto out_starts = out->lod().back(); @@ -50,7 +51,7 @@ class SeqExpandKernel : public framework::OpKernel { Eigen::TensorMap> out_t(out_data, scale, element_len); Eigen::array cast({{scale, 1}}); - out_t.device(place) = x_t.broadcast(cast); + out_t.device(*place) = x_t.broadcast(cast); x_data += element_len; out_data += element_len * scale; } @@ -69,7 +70,7 @@ class SeqExpandKernel : public framework::OpKernel { * Grad(X).lod = Input(X).lod * * */ -template +template class SeqExpandGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -89,8 +90,9 @@ class SeqExpandGradKernel : public framework::OpKernel { d_out_t(d_out_data, static_cast(repeat), element_len); Eigen::TensorMap> d_x_t(d_x_data, static_cast(element_len)); - auto place = context.GetEigenDevice(); - d_x_t.device(place) = d_out_t.sum(Eigen::array({{0}})); + auto place = + context.template device_context().eigen_device(); + d_x_t.device(*place) = d_out_t.sum(Eigen::array({{0}})); d_out_data += (repeat * element_len); d_x_data += element_len; } diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc index d1de0b4447..9c7e5456e8 100644 --- a/paddle/operators/sequence_concat_op.cc +++ b/paddle/operators/sequence_concat_op.cc @@ -129,7 +129,7 @@ REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker, sequence_concat_grad, ops::SequenceConcatGradOp); REGISTER_OP_CPU_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); + ops::SequenceConcatOpKernel); REGISTER_OP_CPU_KERNEL( sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatGradOpKernel); diff --git a/paddle/operators/sequence_concat_op.cu.cc b/paddle/operators/sequence_concat_op.cu.cc index 9ca99c2258..144bdb5af6 100644 --- a/paddle/operators/sequence_concat_op.cu.cc +++ b/paddle/operators/sequence_concat_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/operators/sequence_concat_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_concat, - ops::SequenceConcatOpKernel); -REGISTER_OP_GPU_KERNEL( - sequence_concat_grad, - ops::SequenceConcatGradOpKernel); + ops::SequenceConcatOpKernel); +REGISTER_OP_CUDA_KERNEL(sequence_concat_grad, + ops::SequenceConcatGradOpKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h index 09212070aa..8445224f46 100644 --- a/paddle/operators/sequence_concat_op.h +++ b/paddle/operators/sequence_concat_op.h @@ -59,7 +59,7 @@ LoD ConcatLoD(const std::vector ins, const size_t level) { return out_lod; } -template +template class SequenceConcatOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -119,7 +119,7 @@ class SequenceConcatOpKernel : public framework::OpKernel { } }; -template +template class SequenceConcatGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/sequence_conv_op.cc b/paddle/operators/sequence_conv_op.cc index c5533732d4..f5c4f1c133 100644 --- a/paddle/operators/sequence_conv_op.cc +++ b/paddle/operators/sequence_conv_op.cc @@ -179,9 +179,10 @@ REGISTER_OP(sequence_conv, ops::SequenceConvOp, ops::SequenceConvOpMaker, sequence_conv_grad, ops::SequenceConvGradOp); REGISTER_OP_CPU_KERNEL( - sequence_conv, ops::SequenceConvKernel, - ops::SequenceConvKernel); + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); REGISTER_OP_CPU_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.cu.cc b/paddle/operators/sequence_conv_op.cu.cc index c8136dbcb3..eacba79ace 100644 --- a/paddle/operators/sequence_conv_op.cu.cc +++ b/paddle/operators/sequence_conv_op.cu.cc @@ -15,10 +15,11 @@ #include "paddle/operators/sequence_conv_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_conv, ops::SequenceConvKernel, - ops::SequenceConvKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_conv, + ops::SequenceConvKernel, + ops::SequenceConvKernel); +REGISTER_OP_CUDA_KERNEL( sequence_conv_grad, - ops::SequenceConvGradKernel, - ops::SequenceConvGradKernel); + ops::SequenceConvGradKernel, + ops::SequenceConvGradKernel); diff --git a/paddle/operators/sequence_conv_op.h b/paddle/operators/sequence_conv_op.h index b8fbe2647c..bb584b7bfa 100644 --- a/paddle/operators/sequence_conv_op.h +++ b/paddle/operators/sequence_conv_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,21 +56,23 @@ class SequenceConvKernel : public framework::OpKernel { Tensor col; col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. - math::SetConstant set_zero; - set_zero(context.device_context(), &col, static_cast(0)); + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); + set_zero(dev_ctx, &col, static_cast(0)); - math::ContextProjectFunctor seq_project_functor; + math::ContextProjectFunctor seq_project_functor; - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, false, filter, false, - static_cast(1.0), out, static_cast(0.0)); + math::matmul(dev_ctx, col, false, filter, false, + static_cast(1.0), out, + static_cast(0.0)); } }; -template +template class SequenceConvGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -95,7 +97,8 @@ class SequenceConvGradKernel : public framework::OpKernel { int down_pad = std::max(0, context_start + context_length - 1); int sequence_width = static_cast(in->dims()[1]); - math::SetConstant set_zero; + math::SetConstant set_zero; + auto& dev_ctx = context.template device_context(); // use col_shape in the im2col calculation framework::DDim col_shape = {in->dims()[0], sequence_width * context_length}; @@ -104,38 +107,36 @@ class SequenceConvGradKernel : public framework::OpKernel { if (in_g || filter_g || (padding_trainable && padding_data_g)) { col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. - set_zero(context.device_context(), &col, static_cast(0)); - math::matmul(context.device_context(), *out_g, false, *filter, - true, T(1.0), &col, T(1.0)); + set_zero(dev_ctx, &col, static_cast(0)); + math::matmul(dev_ctx, *out_g, false, *filter, true, + T(1.0), &col, T(1.0)); } - math::ContextProjectFunctor seq_project_functor; - math::ContextProjectGradFunctor seq_project_grad_functor; + math::ContextProjectFunctor seq_project_functor; + math::ContextProjectGradFunctor seq_project_grad_functor; if (in_g) { in_g->mutable_data(context.GetPlace()); in_g->set_lod(in->lod()); - set_zero(context.device_context(), in_g, static_cast(0)); + set_zero(dev_ctx, in_g, static_cast(0)); - seq_project_grad_functor(context.device_context(), *in_g, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, false, true, - padding_data_g, &col); + seq_project_grad_functor(dev_ctx, *in_g, padding_trainable, context_start, + context_length, context_stride, up_pad, down_pad, + false, true, padding_data_g, &col); } if (padding_trainable && padding_data_g) { padding_data_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), padding_data_g, static_cast(0)); + set_zero(dev_ctx, padding_data_g, static_cast(0)); LoDTensor* input = const_cast(in); - seq_project_grad_functor(context.device_context(), *input, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, true, false, - padding_data_g, &col); + seq_project_grad_functor( + dev_ctx, *input, padding_trainable, context_start, context_length, + context_stride, up_pad, down_pad, true, false, padding_data_g, &col); } if (filter_g) { filter_g->mutable_data(context.GetPlace()); - set_zero(context.device_context(), filter_g, static_cast(0)); + set_zero(dev_ctx, filter_g, static_cast(0)); Tensor filter_grad = *filter_g; LoDTensor out_grad = *out_g; @@ -145,12 +146,12 @@ class SequenceConvGradKernel : public framework::OpKernel { padding_data = context.Input("PaddingData"); } - seq_project_functor(context.device_context(), *in, *padding_data, - padding_trainable, context_start, context_length, - context_stride, up_pad, down_pad, &col); + seq_project_functor(dev_ctx, *in, *padding_data, padding_trainable, + context_start, context_length, context_stride, up_pad, + down_pad, &col); - math::matmul(context.device_context(), col, true, out_grad, - false, T(1.0), &filter_grad, T(1.0)); + math::matmul(dev_ctx, col, true, out_grad, false, + T(1.0), &filter_grad, T(1.0)); } } }; diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index bfda8649cd..3526e45a1b 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -123,7 +123,8 @@ namespace ops = paddle::operators; REGISTER_OP(sequence_pool, ops::SequencePoolOp, ops::SequencePoolOpMaker, sequence_pool_grad, ops::SequencePoolGradOp); REGISTER_OP_CPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); + sequence_pool, + ops::SequencePoolKernel); REGISTER_OP_CPU_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.cu b/paddle/operators/sequence_pool_op.cu index 66850772d5..fcd6508435 100644 --- a/paddle/operators/sequence_pool_op.cu +++ b/paddle/operators/sequence_pool_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/sequence_pool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - sequence_pool, ops::SequencePoolKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + sequence_pool, + ops::SequencePoolKernel); +REGISTER_OP_CUDA_KERNEL( sequence_pool_grad, - ops::SequencePoolGradKernel); + ops::SequencePoolGradKernel); diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h index 7f136d8cf0..7519aa1d72 100644 --- a/paddle/operators/sequence_pool_op.h +++ b/paddle/operators/sequence_pool_op.h @@ -30,7 +30,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SequencePoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -54,17 +54,18 @@ class SequencePoolKernel : public framework::OpKernel { auto lod_level_0 = lod[0]; out->mutable_data(context.GetPlace()); - + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolFunctor max_pool; + math::MaxSeqPoolFunctor max_pool; auto* index = context.Output("MaxIndex"); index->Resize({dims}); index->mutable_data(context.GetPlace()); - max_pool(context.device_context(), *in, out, index); + max_pool(dev_ctx, *in, out, index); return; } - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); for (int i = 0; i < static_cast(lod_level_0.size()) - 1; ++i) { Tensor in_t = in->Slice(static_cast(lod_level_0[i]), static_cast(lod_level_0[i + 1])); @@ -91,7 +92,7 @@ class SequencePoolKernel : public framework::OpKernel { } }; -template +template class SequencePoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -105,20 +106,23 @@ class SequencePoolGradKernel : public framework::OpKernel { int64_t w = in->numel() / dims[0]; in_g->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); if (pooltype == "MAX") { - math::MaxSeqPoolGradFunctor max_pool_grad; + math::MaxSeqPoolGradFunctor max_pool_grad; auto* index = context.Input("MaxIndex"); - max_pool_grad(context.device_context(), *out_g, *index, in_g); + max_pool_grad(dev_ctx, *out_g, *index, in_g); return; } if (pooltype == "LAST" || pooltype == "FIRST") { // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; - functor(context.device_context(), in_g, 0); + math::SetConstant functor; + functor(dev_ctx, in_g, 0); } - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); + for (int i = 0; i < static_cast(lod.size()) - 1; ++i) { auto in_g_t = in_g->Slice(static_cast(lod[i]), static_cast(lod[i + 1])); diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc index 255683a572..481db8f9e5 100644 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -125,7 +125,7 @@ REGISTER_OP(sequence_slice, ops::SequenceSliceOp, ops::SequenceSliceOpMaker, sequence_slice_grad, ops::SequenceSliceGradOp); REGISTER_OP_CPU_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); + ops::SequenceSliceOpKernel); REGISTER_OP_CPU_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.cu b/paddle/operators/sequence_slice_op.cu index a9f59dadba..43a21d619f 100755 --- a/paddle/operators/sequence_slice_op.cu +++ b/paddle/operators/sequence_slice_op.cu @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/operators/sequence_slice_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_slice, - ops::SequenceSliceOpKernel); -REGISTER_OP_GPU_KERNEL( + ops::SequenceSliceOpKernel); +REGISTER_OP_CUDA_KERNEL( sequence_slice_grad, - ops::SequenceSliceGradOpKernel); + ops::SequenceSliceGradOpKernel); diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h index 428ef556da..14bcaebbb4 100644 --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -39,7 +39,7 @@ inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, return out_lod; } -template +template class SequenceSliceOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -108,7 +108,7 @@ class SequenceSliceOpKernel : public framework::OpKernel { } }; -template +template class SequenceSliceGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -143,8 +143,9 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); x_grad->set_lod(in->lod()); - math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + math::SetConstant set_zero; + set_zero(ctx.template device_context(), x_grad, + static_cast(0)); auto out_grad_stride = framework::stride(out_grad->dims()); diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc index 32c1502566..37d5452e6b 100644 --- a/paddle/operators/sequence_softmax_op.cc +++ b/paddle/operators/sequence_softmax_op.cc @@ -99,7 +99,7 @@ REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp, ops::SequenceSoftmaxGradOp); REGISTER_OP_CPU_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel); + ops::SequenceSoftmaxKernel); REGISTER_OP_CPU_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.cu.cc b/paddle/operators/sequence_softmax_op.cu.cc index 7023795a3b..5f65b4daf9 100644 --- a/paddle/operators/sequence_softmax_op.cu.cc +++ b/paddle/operators/sequence_softmax_op.cu.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include "paddle/operators/sequence_softmax_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( sequence_softmax, - ops::SequenceSoftmaxKernel) -REGISTER_OP_GPU_KERNEL( + ops::SequenceSoftmaxKernel) +REGISTER_OP_CUDA_KERNEL( sequence_softmax_grad, - ops::SequenceSoftmaxGradKernel); + ops::SequenceSoftmaxGradKernel); diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h index 1b68dd0662..e889e88cb3 100644 --- a/paddle/operators/sequence_softmax_op.h +++ b/paddle/operators/sequence_softmax_op.h @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -template +template class SequenceSoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -52,12 +52,13 @@ class SequenceSoftmaxKernel : public framework::OpKernel { framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos}); x_i.Resize(dims_i); out_i.Resize(dims_i); - math::SoftmaxFunctor()(ctx.device_context(), &x_i, &out_i); + math::SoftmaxFunctor()( + ctx.template device_context(), &x_i, &out_i); } } }; -template +template class SequenceSoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -83,8 +84,9 @@ class SequenceSoftmaxGradKernel : public framework::OpKernel { out_i.Resize(dims_i); out_grad_i.Resize(dims_i); x_grad_i.Resize(dims_i); - math::SoftmaxGradFunctor()(ctx.device_context(), &out_i, - &out_grad_i, &x_grad_i); + math::SoftmaxGradFunctor()( + ctx.template device_context(), &out_i, &out_grad_i, + &x_grad_i); } } }; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 5576d7b8be..121bf60b27 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -62,8 +62,8 @@ $$param\_out = param - learning\_rate * grad$$ }; template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseSGDFunctor { + void operator()(const platform::CPUDeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output) { @@ -90,13 +90,14 @@ struct SparseSGDFunctor { } }; -template struct SparseSGDFunctor; -template struct SparseSGDFunctor; +template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(sgd, ops::SGDOp, ops::SGDOpMaker); -REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CPU_KERNEL( + sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 7b6c5ec306..a3c0db7e50 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -41,8 +41,8 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows, } // namespace template -struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, +struct SparseSGDFunctor { + void operator()(const platform::CUDADeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output) { @@ -62,21 +62,19 @@ struct SparseSGDFunctor { const int block_size = 256; dim3 threads(block_size, 1); dim3 grid(1, in_rows.size()); - SparseSGDFunctorKernel< - T, 256><<(context) - .stream()>>>(in_data, in_rows.data(), - learning_rate.data(), out_data, - in_row_numel); + SparseSGDFunctorKernel<<>>( + in_data, in_rows.data(), learning_rate.data(), out_data, + in_row_numel); } }; -template struct SparseSGDFunctor; -template struct SparseSGDFunctor; +template struct SparseSGDFunctor; +template struct SparseSGDFunctor; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel, - ops::SGDOpKernel); +REGISTER_OP_CUDA_KERNEL( + sgd, ops::SGDOpKernel, + ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index 78b595fc6c..c920025a91 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -20,15 +20,15 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template struct SparseSGDFunctor { - void operator()(const platform::DeviceContext& context, + void operator()(const DeviceContext& context, const framework::SelectedRows& input, const framework::Tensor& learning_rate, framework::Tensor* output); }; -template +template class SGDOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -46,7 +46,8 @@ class SGDOpKernel : public framework::OpKernel { auto g = framework::EigenVector::Flatten(*grad); auto o = framework::EigenVector::Flatten(*param_out); auto lr = framework::EigenVector::Flatten(*learning_rate); - auto place = ctx.GetEigenDevice(); + auto& place = + *ctx.template device_context().eigen_device(); Eigen::DSizes grad_dsize(grad->numel()); o.device(place) = p - lr.broadcast(grad_dsize) * g; @@ -56,8 +57,9 @@ class SGDOpKernel : public framework::OpKernel { // It's better to find a more elegant solution. PADDLE_ENFORCE_EQ(param, param_out); auto* grad = ctx.Input("Grad"); - SparseSGDFunctor functor; - functor(ctx.device_context(), *grad, *learning_rate, param_out); + SparseSGDFunctor functor; + functor(ctx.template device_context(), *grad, + *learning_rate, param_out); } else { PADDLE_THROW("Unsupported Variable Type of Grad"); } diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc index 782f4c7936..b8a1bf122a 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -142,7 +142,7 @@ REGISTER_OP(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsGradOp); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits, ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::CPUPlace, float>); + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu index 32a39956a1..1b569c93ed 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu @@ -16,9 +16,9 @@ #include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits, - ops::SigmoidCrossEntropyWithLogitsKernel< - paddle::platform::GPUPlace, float>); -REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits_grad, - ops::SigmoidCrossEntropyWithLogitsGradKernel< - paddle::platform::GPUPlace, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits, + ops::SigmoidCrossEntropyWithLogitsKernel< + paddle::platform::CUDADeviceContext, float>); +REGISTER_OP_CUDA_KERNEL(sigmoid_cross_entropy_with_logits_grad, + ops::SigmoidCrossEntropyWithLogitsGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h index 2a9d9bbc77..8fe7c5ba82 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) -template +template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -32,7 +32,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto labels = framework::EigenVector::Flatten(*Labels); auto out = framework::EigenVector::Flatten(*Out); - auto place = context.GetEigenDevice(); + auto &place = *context.device_context().eigen_device(); // term1 = max(x, 0) auto term1 = x.cwiseMax(static_cast(0)); @@ -46,7 +46,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { }; // dX = sigmoid(X) - labels -template +template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -62,7 +62,8 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { auto labels = framework::EigenVector::Flatten(*Labels); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto &place = + *context.template device_context().eigen_device(); auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); dx.device(place) = dout * (sigmoid_x - labels); diff --git a/paddle/operators/sign_op.cc b/paddle/operators/sign_op.cc index 08bf2e4e7c..d5a7ccb77e 100644 --- a/paddle/operators/sign_op.cc +++ b/paddle/operators/sign_op.cc @@ -67,5 +67,5 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sign, ops::SignOp, ops::SignOpMaker, ops::SignGradMaker); -REGISTER_OP_CPU_KERNEL(sign, - ops::SignKernel); +REGISTER_OP_CPU_KERNEL( + sign, ops::SignKernel); diff --git a/paddle/operators/sign_op.cu b/paddle/operators/sign_op.cu index 4d0638cb97..9bc1c65d21 100644 --- a/paddle/operators/sign_op.cu +++ b/paddle/operators/sign_op.cu @@ -14,5 +14,6 @@ #include "paddle/operators/sign_op.h" -REGISTER_OP_GPU_KERNEL( - sign, paddle::operators::SignKernel); +REGISTER_OP_CUDA_KERNEL( + sign, + paddle::operators::SignKernel); diff --git a/paddle/operators/sign_op.h b/paddle/operators/sign_op.h index ab5cd4bac0..2e476ed665 100644 --- a/paddle/operators/sign_op.h +++ b/paddle/operators/sign_op.h @@ -19,7 +19,7 @@ namespace paddle { namespace operators { -template +template class SignKernel : public framework::OpKernel { public: virtual void Compute(const framework::ExecutionContext& context) const { @@ -29,7 +29,8 @@ class SignKernel : public framework::OpKernel { auto eigen_out = framework::EigenVector::Flatten(*out); auto eigen_in = framework::EigenVector::Flatten(*in); - auto& place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); eigen_out.device(place) = eigen_in.sign(); } }; diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index 50543fcc14..56e8d9058f 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -138,7 +138,8 @@ REGISTER_OP(smooth_l1_loss, ops::SmoothL1LossOp, ops::SmoothL1LossOpMaker, smooth_l1_loss_grad, ops::SmoothL1LossGradOp); REGISTER_OP_CPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); + smooth_l1_loss, + ops::SmoothL1LossKernel); REGISTER_OP_CPU_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git a/paddle/operators/smooth_l1_loss_op.cu b/paddle/operators/smooth_l1_loss_op.cu index 1c3172f438..8e94ebac64 100644 --- a/paddle/operators/smooth_l1_loss_op.cu +++ b/paddle/operators/smooth_l1_loss_op.cu @@ -17,8 +17,9 @@ #include "paddle/operators/smooth_l1_loss_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( - smooth_l1_loss, ops::SmoothL1LossKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + smooth_l1_loss, + ops::SmoothL1LossKernel); +REGISTER_OP_CUDA_KERNEL( smooth_l1_loss_grad, - ops::SmoothL1LossGradKernel); + ops::SmoothL1LossGradKernel); diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h index 39d0070b6c..1a70c9c63c 100644 --- a/paddle/operators/smooth_l1_loss_op.h +++ b/paddle/operators/smooth_l1_loss_op.h @@ -44,7 +44,7 @@ struct SmoothL1LossForward { T sigma2; }; -template +template class SmoothL1LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -57,7 +57,8 @@ class SmoothL1LossKernel : public framework::OpKernel { out0->mutable_data(context.GetPlace()); out1->mutable_data(context.GetPlace()); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto sigma = static_cast(context.Attr("sigma")); T sigma2 = sigma * sigma; @@ -67,12 +68,12 @@ class SmoothL1LossKernel : public framework::OpKernel { auto y = EigenVector::Flatten(*in1); auto diff = EigenVector::Flatten(*out0); - diff.device(place) = x - y; + diff.device(*place) = x - y; // multiply inside weight if (has_weight) { auto inside_weight = EigenVector::Flatten(*in2); // cache diff, reused in bp - diff.device(place) = diff * inside_weight; + diff.device(*place) = diff * inside_weight; } auto in_counts = in0->numel(); @@ -81,12 +82,12 @@ class SmoothL1LossKernel : public framework::OpKernel { context.GetPlace()); auto errors = EigenVector::Flatten(ptensor_errors); // apply smooth l1 forward - errors.device(place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); + errors.device(*place) = diff.unaryExpr(SmoothL1LossForward(sigma2)); // multiply outside weight if (has_weight) { auto outside_weight = EigenVector::Flatten(*in3); - errors.device(place) = errors * outside_weight; + errors.device(*place) = errors * outside_weight; } auto loss = EigenVector::Flatten(*out1); // first dimension of 'X' is the number of samples @@ -94,7 +95,7 @@ class SmoothL1LossKernel : public framework::OpKernel { framework::make_ddim({static_cast(in0->dims()[0]), static_cast(in_counts / in0->dims()[0])}); auto errors_mat_view = EigenMatrix::From(ptensor_errors, mat_dims); - loss.device(place) = errors_mat_view.sum(Eigen::array({{1}})); + loss.device(*place) = errors_mat_view.sum(Eigen::array({{1}})); } }; @@ -114,7 +115,7 @@ struct SmoothL1LossBackward { T sigma2; }; -template +template class SmoothL1LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -126,7 +127,8 @@ class SmoothL1LossGradKernel : public framework::OpKernel { T sigma2 = sigma * sigma; bool has_weight = (in0 != nullptr) && (in1 != nullptr); - auto place = context.GetEigenDevice(); + auto* place = + context.template device_context().eigen_device(); auto in_dims = in2->dims(); auto counts = in2->numel(); @@ -139,7 +141,7 @@ class SmoothL1LossGradKernel : public framework::OpKernel { context.GetPlace()); auto diff = EigenVector::Flatten(ptensor_diff); // apply smooth l1 backwoard - diff.device(place) = EigenVector::Flatten(*in2).unaryExpr( + diff.device(*place) = EigenVector::Flatten(*in2).unaryExpr( SmoothL1LossBackward(sigma2)); // compute weights @@ -147,11 +149,11 @@ class SmoothL1LossGradKernel : public framework::OpKernel { ptensor_weights.mutable_data(mat_dims, context.GetPlace()); auto weights = EigenMatrix::From(ptensor_weights); // initialize to 1.0 - weights.device(place) = weights.constant(static_cast(1.0)); + weights.device(*place) = weights.constant(static_cast(1.0)); if (has_weight) { auto inside_weight = EigenMatrix::From(*in0, mat_dims); auto outside_weight = EigenMatrix::From(*in1, mat_dims); - weights.device(place) = inside_weight * outside_weight; + weights.device(*place) = inside_weight * outside_weight; } // compute gradients @@ -167,13 +169,13 @@ class SmoothL1LossGradKernel : public framework::OpKernel { if (out0) { out0->mutable_data(context.GetPlace()); auto x_grad = EigenMatrix::From(*out0, mat_dims); - x_grad.device(place) = gradients; + x_grad.device(*place) = gradients; } if (out1) { out1->mutable_data(context.GetPlace()); auto y_grad = EigenMatrix::From(*out1, mat_dims); - y_grad.device(place) = -1 * gradients; + y_grad.device(*place) = -1 * gradients; } } }; diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc index 93e0525bad..0988c83d43 100644 --- a/paddle/operators/softmax_op.cc +++ b/paddle/operators/softmax_op.cc @@ -89,7 +89,8 @@ namespace ops = paddle::operators; REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, softmax_grad, ops::SoftmaxOpGrad); -REGISTER_OP_CPU_KERNEL(softmax, - ops::SoftmaxKernel); REGISTER_OP_CPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); + softmax, ops::SoftmaxKernel); +REGISTER_OP_CPU_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.cu.cc b/paddle/operators/softmax_op.cu.cc index 013ace19ae..7b9882cbcf 100644 --- a/paddle/operators/softmax_op.cu.cc +++ b/paddle/operators/softmax_op.cu.cc @@ -16,7 +16,8 @@ namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax, - ops::SoftmaxKernel); -REGISTER_OP_GPU_KERNEL( - softmax_grad, ops::SoftmaxGradKernel); +REGISTER_OP_CUDA_KERNEL( + softmax, ops::SoftmaxKernel); +REGISTER_OP_CUDA_KERNEL( + softmax_grad, + ops::SoftmaxGradKernel); diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h index 44d1e63f1b..0f8998b99e 100644 --- a/paddle/operators/softmax_op.h +++ b/paddle/operators/softmax_op.h @@ -21,7 +21,7 @@ namespace operators { using Tensor = framework::Tensor; -template +template class SoftmaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -31,11 +31,12 @@ class SoftmaxKernel : public framework::OpKernel { // allocate memory on device. Y->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), X, Y); + math::SoftmaxFunctor()( + context.template device_context(), X, Y); } }; -template +template class SoftmaxGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -46,7 +47,8 @@ class SoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. dX->mutable_data(context.GetPlace()); - math::SoftmaxGradFunctor()(context.device_context(), Y, dY, dX); + math::SoftmaxGradFunctor()( + context.template device_context(), Y, dY, dX); } }; diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu index b1faddac3f..6100c63f9a 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/operators/softmax_with_cross_entropy_op.cu @@ -69,10 +69,10 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, + math::SoftmaxFunctor()( + context.cuda_device_context(), logits, softmax); + math::CrossEntropyFunctor()( + context.cuda_device_context(), loss, softmax, labels, context.Attr("soft_label")); } }; @@ -98,18 +98,18 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { if (context.Attr("soft_label")) { const T* label_data = labels->data(); - SoftCrossEntropyGradientKernel<<< - grid, block, 0, reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + SoftCrossEntropyGradientKernel< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } else { const int64_t* label_data = labels->data(); - CrossEntropyGrad<<< - grid, block, 0, reinterpret_cast( - context.device_context()) - .stream()>>>(logit_grad_data, loss_grad_data, - label_data, batch_size, class_num); + CrossEntropyGrad< + T><<() + .stream()>>>(logit_grad_data, loss_grad_data, label_data, + batch_size, class_num); } } }; @@ -118,9 +118,9 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy, - ops::SoftmaxWithCrossEntropyCUDAKernel, - ops::SoftmaxWithCrossEntropyCUDAKernel); -REGISTER_OP_GPU_KERNEL(softmax_with_cross_entropy_grad, - ops::SoftmaxWithCrossEntropyGradCUDAKernel, - ops::SoftmaxWithCrossEntropyGradCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy, + ops::SoftmaxWithCrossEntropyCUDAKernel, + ops::SoftmaxWithCrossEntropyCUDAKernel); +REGISTER_OP_CUDA_KERNEL(softmax_with_cross_entropy_grad, + ops::SoftmaxWithCrossEntropyGradCUDAKernel, + ops::SoftmaxWithCrossEntropyGradCUDAKernel); diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h index c4ab3f74b4..9c3431605b 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.h +++ b/paddle/operators/softmax_with_cross_entropy_op.h @@ -40,11 +40,12 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { softmax->mutable_data(context.GetPlace()); loss->mutable_data(context.GetPlace()); - math::SoftmaxFunctor()(context.device_context(), - logits, softmax); - math::CrossEntropyFunctor()( - context.device_context(), loss, softmax, labels, - context.Attr("soft_label")); + auto& dev_ctx = + context.template device_context(); + math::SoftmaxFunctor()(dev_ctx, logits, + softmax); + math::CrossEntropyFunctor()( + dev_ctx, loss, softmax, labels, context.Attr("soft_label")); } }; @@ -62,14 +63,15 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const int class_num = logit_grad->dims()[1]; auto out_grad_mat = EigenMatrix::From(*out_grad); auto logit_grad_mat = EigenMatrix::From(*logit_grad); - + auto& place = *context.template device_context() + .eigen_device(); if (context.Attr("soft_label")) { auto lbl_mat = EigenMatrix::From(*labels); - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = out_grad_mat.broadcast(Eigen::DSizes(1, class_num)) * (logit_grad_mat - lbl_mat); } else { - logit_grad_mat.device(context.GetEigenDevice()) = + logit_grad_mat.device(place) = logit_grad_mat * out_grad_mat.broadcast(Eigen::DSizes(1, class_num)); diff --git a/paddle/operators/split_op.cu.cc b/paddle/operators/split_op.cu.cc index 93d1fc3c44..dbad0bbf68 100644 --- a/paddle/operators/split_op.cu.cc +++ b/paddle/operators/split_op.cu.cc @@ -14,5 +14,5 @@ limitations under the License. */ #include "paddle/operators/split_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(split, - ops::SplitOpKernel); +REGISTER_OP_CUDA_KERNEL( + split, ops::SplitOpKernel); diff --git a/paddle/operators/split_op.h b/paddle/operators/split_op.h index fa26e5f677..a38c435d53 100644 --- a/paddle/operators/split_op.h +++ b/paddle/operators/split_op.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class SplitOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc index bec2a2c18a..50bc6da196 100644 --- a/paddle/operators/squared_l2_distance_op.cc +++ b/paddle/operators/squared_l2_distance_op.cc @@ -115,7 +115,7 @@ REGISTER_OP(squared_l2_distance, ops::SquaredL2DistanceOp, ops::SquaredL2DistanceGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_CPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CPU_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CPUDeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.cu b/paddle/operators/squared_l2_distance_op.cu index 3fe62f1a9c..ecc82ed1e4 100644 --- a/paddle/operators/squared_l2_distance_op.cu +++ b/paddle/operators/squared_l2_distance_op.cu @@ -17,9 +17,9 @@ #include "paddle/operators/squared_l2_distance_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_distance, - ops::SquaredL2DistanceKernel); -REGISTER_OP_GPU_KERNEL( - squared_l2_distance_grad, - ops::SquaredL2DistanceGradKernel); + ops::SquaredL2DistanceKernel); +REGISTER_OP_CUDA_KERNEL(squared_l2_distance_grad, + ops::SquaredL2DistanceGradKernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h index 259ef40296..5bd5f4819a 100644 --- a/paddle/operators/squared_l2_distance_op.h +++ b/paddle/operators/squared_l2_distance_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class SquaredL2DistanceKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -51,7 +51,8 @@ class SquaredL2DistanceKernel : public framework::OpKernel { auto sub_result = EigenMatrix::From(*out0); auto z = EigenVector::Flatten(*out1); - auto place = context.GetEigenDevice(); + auto& place = + *context.template device_context().eigen_device(); auto x_dims = x.dimensions(); auto y_dims = y.dimensions(); // buffer the substraction result @@ -67,7 +68,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel { } }; -template +template class SquaredL2DistanceGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -89,7 +90,8 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { sub_result; // propagate back to input - auto eigen_place = context.GetEigenDevice(); + auto& eigen_place = + *context.template device_context().eigen_device(); if (x_g) { x_g->mutable_data(context.GetPlace()); // eigen matrix diff --git a/paddle/operators/squared_l2_norm_op.cc b/paddle/operators/squared_l2_norm_op.cc index 3c10e6159f..3cff61a02f 100644 --- a/paddle/operators/squared_l2_norm_op.cc +++ b/paddle/operators/squared_l2_norm_op.cc @@ -72,7 +72,7 @@ REGISTER_OP(squared_l2_norm, ops::SquaredL2NormOp, ops::SquaredL2NormOpMaker, squared_l2_norm_grad, ops::SquaredL2NormGradOp); REGISTER_OP_CPU_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); + ops::SquaredL2NormKernel); REGISTER_OP_CPU_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.cu b/paddle/operators/squared_l2_norm_op.cu index d384e9c28c..2d6567d090 100644 --- a/paddle/operators/squared_l2_norm_op.cu +++ b/paddle/operators/squared_l2_norm_op.cu @@ -16,9 +16,9 @@ #include "paddle/operators/squared_l2_norm_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( squared_l2_norm, - ops::SquaredL2NormKernel); -REGISTER_OP_GPU_KERNEL( + ops::SquaredL2NormKernel); +REGISTER_OP_CUDA_KERNEL( squared_l2_norm_grad, - ops::SquaredL2NormGradKernel); + ops::SquaredL2NormGradKernel); diff --git a/paddle/operators/squared_l2_norm_op.h b/paddle/operators/squared_l2_norm_op.h index 48d7b1c2d5..0ced7e7d70 100644 --- a/paddle/operators/squared_l2_norm_op.h +++ b/paddle/operators/squared_l2_norm_op.h @@ -20,7 +20,7 @@ namespace paddle { namespace operators { // Out = sum(square(X)) -template +template class SquaredL2NormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -30,14 +30,15 @@ class SquaredL2NormKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto out = framework::EigenScalar::From(*Out); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); - out.device(place) = x.square().sum(); + out.device(*place) = x.square().sum(); } }; // dX = X -template +template class SquaredL2NormGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -53,10 +54,11 @@ class SquaredL2NormGradKernel : public framework::OpKernel { auto x = framework::EigenVector::Flatten(*X); auto dout = framework::EigenVector::Flatten(*dOut); auto dx = framework::EigenVector::Flatten(*dX); - auto place = context.GetEigenDevice(); + auto *place = + context.template device_context().eigen_device(); Eigen::DSizes x_dsize(X->numel()); - dx.device(place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); + dx.device(*place) = (dout.broadcast(x_dsize) * x) * static_cast(2.0); } }; diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 744b2fe3f2..cd52672f78 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -195,7 +195,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker, ops::SumOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CPU_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu index 5c30dd4d47..873155076c 100644 --- a/paddle/operators/sum_op.cu +++ b/paddle/operators/sum_op.cu @@ -13,7 +13,8 @@ limitations under the License. */ #include "paddle/operators/sum_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); +REGISTER_OP_CUDA_KERNEL( + sum, ops::SumKernel, + ops::SumKernel, + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index ed6c80ce60..eaa36aa1ae 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -26,7 +26,7 @@ template using EigenVector = framework::EigenVector; -template +template class SumKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { @@ -43,12 +43,14 @@ class SumKernel : public framework::OpKernel { auto result = EigenVector::Flatten(*out); if (!in_place) { - math::SetConstant constant_functor; - constant_functor(context.device_context(), out, 0.0); + math::SetConstant constant_functor; + constant_functor(context.template device_context(), out, + 0.0); } - math::SelectedRowsAddToTensor functor; - auto place = context.GetEigenDevice(); + math::SelectedRowsAddToTensor functor; + auto &place = + *context.template device_context().eigen_device(); // If in_place, just skip the first tensor for (int i = in_place ? 1 : 0; i < N; i++) { if (in_vars[i]->IsType()) { @@ -60,7 +62,7 @@ class SumKernel : public framework::OpKernel { result.device(place) = result + in; } else if (in_vars[i]->IsType()) { auto &in_t = in_vars[i]->Get(); - functor(context.device_context(), in_t, out); + functor(context.template device_context(), in_t, out); } else { PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); } @@ -82,14 +84,14 @@ class SumKernel : public framework::OpKernel { out_value->Resize(framework::make_ddim(in_dim_vec)); out_value->mutable_data(context.GetPlace()); - math::SelectedRowsAddTo functor; + math::SelectedRowsAddTo functor; int64_t offset = 0; for (int i = 0; i < N; i++) { PADDLE_ENFORCE_EQ(out->height(), in_vars[i]->Get().height()); - functor(context.device_context(), in_vars[i]->Get(), - offset, out); + functor(context.template device_context(), + in_vars[i]->Get(), offset, out); offset += in_vars[i]->Get().value().numel(); } } else if (out_var->IsType()) { @@ -112,7 +114,8 @@ class SumKernel : public framework::OpKernel { PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); auto in = EigenVector::Flatten(in_array[i]); auto result = EigenVector::Flatten(out_array[i]); - result.device(context.GetEigenDevice()) = result + in; + result.device(*context.template device_context() + .eigen_device()) = result + in; } } } diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index 7851c71bbe..453bd07267 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -317,4 +317,4 @@ class TopkOpCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel); diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h index bc8563717a..e9cd9bbd4d 100644 --- a/paddle/operators/top_k_op.h +++ b/paddle/operators/top_k_op.h @@ -27,7 +27,7 @@ template using EigenMatrix = framework::EigenMatrix; -template +template class TopkKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc index 94de3d5069..de5ff561ad 100644 --- a/paddle/operators/transpose_op.cc +++ b/paddle/operators/transpose_op.cc @@ -112,8 +112,8 @@ class TransposeOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(transpose, ops::TransposeOp, ops::TransposeOpMaker, transpose_grad, ops::TransposeOpGrad); -REGISTER_OP_CPU_KERNEL(transpose, - ops::TransposeKernel); +REGISTER_OP_CPU_KERNEL( + transpose, ops::TransposeKernel); REGISTER_OP_CPU_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel); diff --git a/paddle/operators/transpose_op.cu.cc b/paddle/operators/transpose_op.cu.cc index af3f581462..7d23f1493e 100644 --- a/paddle/operators/transpose_op.cu.cc +++ b/paddle/operators/transpose_op.cu.cc @@ -15,8 +15,9 @@ #include "paddle/operators/transpose_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(transpose, - ops::TransposeKernel); -REGISTER_OP_GPU_KERNEL( +REGISTER_OP_CUDA_KERNEL( + transpose, + ops::TransposeKernel); +REGISTER_OP_CUDA_KERNEL( transpose_grad, - ops::TransposeGradKernel); + ops::TransposeGradKernel); diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h index e296032f41..d995271a6b 100644 --- a/paddle/operators/transpose_op.h +++ b/paddle/operators/transpose_op.h @@ -20,33 +20,33 @@ namespace paddle { namespace operators { -template -inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, +template +inline void TransCompute(const int dim, const DeviceContext& dev_ctx, const framework::Tensor& in, framework::Tensor* out, const std::vector& axis) { switch (dim) { case 1: - math::Transpose trans1; + math::Transpose trans1; trans1(dev_ctx, in, out, axis); break; case 2: - math::Transpose trans2; + math::Transpose trans2; trans2(dev_ctx, in, out, axis); break; case 3: - math::Transpose trans3; + math::Transpose trans3; trans3(dev_ctx, in, out, axis); break; case 4: - math::Transpose trans4; + math::Transpose trans4; trans4(dev_ctx, in, out, axis); break; case 5: - math::Transpose trans5; + math::Transpose trans5; trans5(dev_ctx, in, out, axis); break; case 6: - math::Transpose trans6; + math::Transpose trans6; trans6(dev_ctx, in, out, axis); break; default: @@ -54,7 +54,7 @@ inline void TransCompute(const int dim, const platform::DeviceContext& dev_ctx, } } -template +template class TransposeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -64,12 +64,12 @@ class TransposeKernel : public framework::OpKernel { std::vector axis = context.Attr>("axis"); int ndims = axis.size(); - auto& dev_ctx = context.device_context(); - TransCompute(ndims, dev_ctx, *x, out, axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *x, out, axis); } }; -template +template class TransposeGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -88,8 +88,9 @@ class TransposeGradKernel : public framework::OpKernel { } int ndims = axis.size(); - auto& dev_ctx = context.device_context(); - TransCompute(ndims, dev_ctx, *out_grad, x_grad, reversed_axis); + auto& dev_ctx = context.template device_context(); + TransCompute(ndims, dev_ctx, *out_grad, x_grad, + reversed_axis); } }; diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index fff1dc7ccd..2a49ee471f 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -67,7 +67,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), - ctx.device_context()); + ctx.GetPlace()); } }; diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu index 8b20bb8287..cfe9d293cf 100644 --- a/paddle/operators/uniform_random_op.cu +++ b/paddle/operators/uniform_random_op.cu @@ -63,6 +63,6 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); +REGISTER_OP_CUDA_KERNEL(uniform_random, + paddle::operators::GPUUniformRandomKernel, + paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 89c48e071c..49df2a530c 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -135,9 +135,10 @@ class UnpoolOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, ops::UnpoolOpGrad); -REGISTER_OP_CPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); REGISTER_OP_CPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CPU_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc index 18aafb7dc7..9b002e35c4 100644 --- a/paddle/operators/unpool_op.cu.cc +++ b/paddle/operators/unpool_op.cu.cc @@ -15,9 +15,10 @@ limitations under the License. */ #include "paddle/operators/unpool_op.h" namespace ops = paddle::operators; -REGISTER_OP_GPU_KERNEL(unpool, - ops::UnpoolKernel, - ops::UnpoolKernel); -REGISTER_OP_GPU_KERNEL( - unpool_grad, ops::UnpoolGradKernel, - ops::UnpoolGradKernel); +REGISTER_OP_CUDA_KERNEL( + unpool, ops::UnpoolKernel, + ops::UnpoolKernel); +REGISTER_OP_CUDA_KERNEL( + unpool_grad, + ops::UnpoolGradKernel, + ops::UnpoolGradKernel); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h index 243eb7e532..ee18b118c9 100644 --- a/paddle/operators/unpool_op.h +++ b/paddle/operators/unpool_op.h @@ -20,7 +20,7 @@ limitations under the License. */ namespace paddle { namespace operators { -template +template class UnpoolKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -32,15 +32,16 @@ class UnpoolKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); T* output_data = out->mutable_data(context.GetPlace()); + auto& dev_ctx = context.template device_context(); if (output_data) { - math::SetConstant set_zero; - set_zero(context.device_context(), out, static_cast(0)); + math::SetConstant set_zero; + set_zero(dev_ctx, out, static_cast(0)); } - math::Unpool2dMaxFunctor unpool2d_max_forward; - unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(dev_ctx, *in_x, *in_y, out); } }; -template +template class UnpoolGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -56,15 +57,14 @@ class UnpoolGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); - auto& device_ctx = context.device_context(); - math::SetConstant zero; + auto& device_ctx = context.template device_context(); + math::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); } - math::Unpool2dMaxGradFunctor unpool2d_max_backward; - unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, - *out_grad, in_x_grad); + math::Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(device_ctx, *in_x, *in_y, *out, *out_grad, in_x_grad); } }; } // namespace operators diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index ae4f0bf896..2c7f964216 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -15,12 +15,6 @@ limitations under the License. */ namespace paddle { namespace platform { -template <> -Eigen::DefaultDevice* DeviceContext::GetEigenDevice< - platform::CPUPlace, Eigen::DefaultDevice>() const { - return reinterpret_cast(this)->eigen_device(); -} - CPUDeviceContext::CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } @@ -37,12 +31,6 @@ Place CPUDeviceContext::GetPlace() const { return CPUPlace(); } #ifdef PADDLE_WITH_CUDA -template <> -Eigen::GpuDevice* -DeviceContext::GetEigenDevice() const { - return reinterpret_cast(this)->eigen_device(); -} - class EigenCudaStreamDevice : public Eigen::StreamInterface { public: EigenCudaStreamDevice() : scratch_(nullptr), semaphore_(nullptr) { diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index ef5f19214d..596d9d0bba 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -27,24 +27,11 @@ limitations under the License. */ namespace paddle { namespace platform { -template -struct EigenDeviceConverter; - -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::DefaultDevice; -}; - class DeviceContext { public: virtual ~DeviceContext() {} virtual Place GetPlace() const = 0; - template ::EigenDeviceType> - DeviceType* GetEigenDevice() const; - virtual void Wait() const {} }; @@ -62,10 +49,6 @@ class CPUDeviceContext : public DeviceContext { }; #ifdef PADDLE_WITH_CUDA -template <> -struct EigenDeviceConverter { - using EigenDeviceType = Eigen::GpuDevice; -}; class EigenCudaStreamDevice; diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 8bf5174c4a..4893cd92f6 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -22,9 +22,8 @@ TEST(Device, Init) { int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); - Eigen::GpuDevice* gpu_device = - device_context->template GetEigenDevice(); + CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; } diff --git a/paddle/platform/transform.h b/paddle/platform/transform.h index bb9d59ec0a..148ebaed3d 100644 --- a/paddle/platform/transform.h +++ b/paddle/platform/transform.h @@ -31,7 +31,7 @@ namespace paddle { namespace platform { // Transform on host or device. It provides the same API in std library. -template +template struct Transform { template void operator()(const DeviceContext& context, InputIter first, InputIter last, @@ -45,16 +45,16 @@ struct Transform { }; template <> -struct Transform { +struct Transform { template - void operator()(const DeviceContext& context, InputIter first, InputIter last, - OutputIter result, UnaryOperation op) { + void operator()(const platform::CPUDeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { std::transform(first, last, result, op); } template - void operator()(const DeviceContext& context, InputIter1 first1, + void operator()(const platform::CPUDeviceContext& context, InputIter1 first1, InputIter1 last1, InputIter2 first2, OutputIter result, BinaryOperation op) { std::transform(first1, last1, first2, result, op); @@ -63,27 +63,25 @@ struct Transform { #ifdef __NVCC__ template <> -struct Transform { +struct Transform { template - void operator()(const DeviceContext& context, InputIter first, InputIter last, - OutputIter result, UnaryOperation op) { + void operator()(const platform::CUDADeviceContext& context, InputIter first, + InputIter last, OutputIter result, UnaryOperation op) { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); - auto& ctx = reinterpret_cast(context); - thrust::transform(thrust::cuda::par.on(ctx.stream()), + thrust::transform(thrust::cuda::par.on(context.stream()), details::DevPtrCast(first), details::DevPtrCast(last), details::DevPtrCast(result), op); } template - void operator()(const DeviceContext& context, InputIter1 first1, + void operator()(const platform::CUDADeviceContext& context, InputIter1 first1, InputIter1 last1, InputIter2 first2, OutputIter result, BinaryOperation op) { auto place = context.GetPlace(); PADDLE_ENFORCE(is_gpu_place(place), "It must use GPU place."); - auto& ctx = reinterpret_cast(context); - thrust::transform(thrust::cuda::par.on(ctx.stream()), + thrust::transform(thrust::cuda::par.on(context.stream()), details::DevPtrCast(first1), details::DevPtrCast(last1), details::DevPtrCast(first2), details::DevPtrCast(result), op); diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu index c76cab80e4..d36eac8379 100644 --- a/paddle/platform/transform_test.cu +++ b/paddle/platform/transform_test.cu @@ -39,7 +39,7 @@ TEST(Transform, CPUUnary) { using namespace paddle::platform; CPUDeviceContext ctx; float buf[4] = {0.1, 0.2, 0.3, 0.4}; - Transform trans; + Transform trans; trans(ctx, buf, buf + 4, buf, Scale(10)); for (int i = 0; i < 4; ++i) { ASSERT_NEAR(buf[i], static_cast(i + 1), 1e-5); @@ -54,7 +54,7 @@ TEST(Transform, GPUUnary) { float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf)); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale(10)); ctx.Wait(); Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf)); @@ -68,7 +68,7 @@ TEST(Transform, CPUBinary) { using namespace paddle::platform; using namespace paddle::memory; int buf[4] = {1, 2, 3, 4}; - Transform trans; + Transform trans; CPUDeviceContext ctx; trans(ctx, buf, buf + 4, buf, buf, Multiply()); for (int i = 0; i < 4; ++i) { @@ -84,7 +84,7 @@ TEST(Transform, GPUBinary) { CUDADeviceContext ctx(gpu0); int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf)); - Transform trans; + Transform trans; trans(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply()); ctx.Wait(); Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf)); From de8c4627776b2b9a60dbcc64c48e858c80a2715f Mon Sep 17 00:00:00 2001 From: QI JUN Date: Tue, 12 Dec 2017 14:42:33 +0800 Subject: [PATCH 27/35] Update new op docs (#6505) * update docs about how to add a new operator --- doc/howto/dev/new_op_cn.md | 98 ++++++++++++++++---------------------- doc/howto/dev/new_op_en.md | 93 ++++++++++++++++-------------------- 2 files changed, 80 insertions(+), 111 deletions(-) diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index 6cfc9536f2..44dbeecbbd 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -30,8 +30,8 @@ -------------- | :---------------------- OpProtoMake定义 | `.cc`文件,Backward Op不需要定义OpProtoMake Op定义 | `.cc`文件 -Kernel实现 | CPU、GPU共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,GPU 实现在`.cu`文件中。 -注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,GPU实现在`.cu`文件中 +Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU 实现在`.cc`文件中,CUDA 实现在`.cu`文件中。 +注册Op | Op注册实现在`.cc`文件;Kernel注册CPU实现在`.cc`文件中,CUDA实现在`.cu`文件中 实现新的op都添加至目录[paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators)下,文件命名以`*_op.h`(如有) 、 `*_op.cc` 、`*_op.cu`(如有)结尾。**系统会根据文件名自动构建op和其对应的Python扩展。** @@ -153,7 +153,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, `MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: -- `typename Place`: 表示设备类型,不同设备(CPU、GPU)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 +- `typename DeviceContext`: 表示设备类型,不同设备(CPU、CUDA)共享同一个Kernel时,需加该模板参数,不共享则不加,一个不共享的例子是[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 - `typename T` : 表示数据类型,如`float`, `double`等。 @@ -165,7 +165,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 下面是 `MulKernel` `Compute`的实现: ```cpp - template + template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -173,18 +173,16 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, auto* Y = context.Input("Y"); auto* Z = context.Output("Out"); Z->mutable_data(context.GetPlace()); - auto* device_context = - const_cast(context.device_context_); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; - ``` -需要注意:**不同设备(CPU、GPU)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** +需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** -`MulOp`的CPU、GPU实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 +`MulOp`的CPU、CUDA实现共享同一个`Kernel`。`OpKernel`不共享的例子可以参考:[`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43)。 -为了使`OpKernel`的计算过程书写更加简单,并且CPU、GPU的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 +为了使`OpKernel`的计算过程书写更加简单,并且CPU、CUDA的代码可以复用,我们通常借助 Eigen unsupported Tensor模块来实现`Compute`接口。关于在PaddlePaddle中如何使用Eigen库,请参考[使用文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md)。 到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 @@ -197,9 +195,9 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, ```cpp namespace ops = paddle::operators; REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel); ``` 在上面的代码中: @@ -209,17 +207,17 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, - `REGISTER_OP_CPU_KERNEL` :注册`ops::MulKernel`类,并特化模板参数为`paddle::platform::CPUPlace`和`float`类型,同理,注册`ops::MulGradKernel`类。 -- 在 `.cu`文件中注册GPU Kernel。 - - 请注意,如果GPU Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: +- 在 `.cu`文件中注册CUDA Kernel。 + - 请注意,如果CUDA Kernel的实现基于Eigen unsupported模块,那么在 `.cu`的开始请加上宏定义 `#define EIGEN_USE_GPU`,代码示例如下: ```cpp // if use Eigen unsupported module before include head files - // #define EIGEN_USE_GPU + #define EIGEN_USE_GPU namespace ops = paddle::operators; - REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); ``` ### 5. 编译 @@ -236,71 +234,55 @@ make mul_op ## 实现单元测试 -单测包括对比前向Op不同设备(CPU、GPU)的实现、对比反向OP不同设备(CPU、GPU)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 +单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 -### 前向Operator单元测试 -前向Op单元测试继承自`unittest.TestCase`,并定义元类`__metaclass__ = OpTestMeta`。各项更加具体的单元测试在`OpTestMeta`里完成。测试前向Operator,需要: +Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要: 1. 在`setUp`函数定义输入、输出,以及相关的属性参数。 2. 生成随机的输入数据。 3. 在Python脚本中实现与前向operator相同的计算逻辑,得到输出值,与operator前向计算的输出进行对比。 +4. 反向计算已经自动集成进测试框架,直接调用相应接口即可。 ```python import unittest import numpy as np - from gradient_checker import GradientChecker, create_op - from op_test_util import OpTestMeta + from op_test import OpTest - class TestMulOp(unittest.TestCase): - __metaclass__ = OpTestMeta + class TestMulOp(OpTest): def setUp(self): - self.type = "mul" + self.op_type = "mul" self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - ``` -上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: - -- `self.type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 -- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 -- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 - - -### 反向Operator单元测试 + def test_check_output(self): + self.check_output() -反向Op单元测试继承自`GradientChecker`,而`GradientChecker`继承自`unittest.TestCase`,因此,**反向单元测试函数需要以`test_`开头**。 + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) -```python -class TestMulGradOp(GradientChecker): - def setUp(self): - self.op = create_op("mul") - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) - def test_check_grad_normal(self): - # mul op will enlarge the relative error - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) + ``` - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) -``` +上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: -下面解释代码中一些关键的地方: +- `self.op_type = "mul" ` : 定义类型,与operator注册时注册的类型一致。 +- `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 +- `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 -- 调用`create_op("mul")`创建反向Op对应的前向Op。 +而反向测试中: - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。 @@ -328,5 +310,5 @@ ctest -R test_mul_op - 为每个Op创建单独的`*_op.h`(如有)、`*_op.cc`和`*_op.cu`(如有)。不允许一个文件中包含多个Op,这将会导致编译出错。 - 注册Op时的类型名,需要和该Op的名字一样。即不允许在`A_op.cc`里面,注册`REGISTER_OP(B, ...)`等,这将会导致单元测试出错。 -- 如果Op没有实现GPU Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 +- 如果Op没有实现CUDA Kernel,请不要创建空的`*_op.cu`,这将会导致单元测试出错。 - 如果多个Op依赖一些共用的函数,可以创建非`*_op.*`格式的文件来存放,如`gather.h`文件。 diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md index 1e88e1f5b4..510233306c 100644 --- a/doc/howto/dev/new_op_en.md +++ b/doc/howto/dev/new_op_en.md @@ -28,8 +28,8 @@ An operator can be differentiated by whether in has kernel methods. An operator -------------- | :---------------------- OpProtoMake definition | `.cc`files, Backward Op does not need an OpProtoMake interface. Op definition | `.cc` files -Kernel implementation | The kernel methods shared between CPU and GPU are defined in `.h` files. CPU-specific kernels live in `.cc` files, while GPU-specific kernels are implemented in `.cu`files. -Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the GPU implementation. +Kernel implementation | The kernel methods shared between CPU and CUDA are defined in `.h` files. CPU-specific kernels live in `.cc` files, while CUDA-specific kernels are implemented in `.cu`files. +Registering the Op | Ops are registered in `.cc` files; For Kernel registration, `.cc` files contain the CPU implementation, while `.cu` files contain the CUDA implementation. New Operator implementations are added to the list [paddle/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators), with file names in the format `*_op.h` (if applicable), `*_op.cc`, `*_op.cu` (if applicable).** The system will use the naming scheme to automatically build operators and their corresponding Python extensions. ** @@ -151,7 +151,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w `MulKernel` inherits `framework::OpKernel`, which includes the following templates: -- `typename Place` denotes device type. When different devices, namely the CPU and the GPU, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +- `typename DeviceContext` denotes device context type. When different devices, namely the CPUDeviceContext and the CUDADeviceContext, share the same kernel, this template needs to be added. If they don't share kernels, this must not be added. An example of a non-sharing kernel is [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). - `typename T` denotes data type, such as `float` or `double`. @@ -163,7 +163,7 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w `MulKernel`'s implementation of `Compute` is as follows: ```cpp - template + template class MulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -171,16 +171,15 @@ Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, w auto* Y = context.Input("Y"); auto* Z = context.Output("Out"); Z->mutable_data(context.GetPlace()); - auto* device_context = - const_cast(context.device_context_); - math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); + auto& device_context = context.template device_context(); + math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; ``` -Note that **different devices (CPU, GPU)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** +Note that **different devices (CPU, CUDA)share an Op definition; whether or not they share the same `OpKernel` depends on whether `Compute` calls functions that support both devices.** -`MulOp`'s CPU and GPU share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). +`MulOp`'s CPU and CUDA share the same `Kernel`. A non-sharing `OpKernel` example can be seen in [`OnehotCrossEntropyOpKernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/cross_entropy_op.h#L43). To ease the writing of `OpKernel` compute, and for reusing code cross-device, [`Eigen-unsupported Tensor`](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md?fileviewer=file-view-default) module is used to implement `Compute` interface. To learn about how the Eigen library is used in PaddlePaddle, please see [usage document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/use_eigen_cn.md). @@ -196,9 +195,9 @@ The definition of its corresponding backward operator, if applicable, is similar ```cpp namespace ops = paddle::operators; REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker, mul_grad, ops::MulOpGrad); - REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); REGISTER_OP_CPU_KERNEL(mul_grad, - ops::MulGradKernel); + ops::MulGradKernel); ``` In that code block, @@ -208,17 +207,17 @@ The definition of its corresponding backward operator, if applicable, is similar - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`. -- Registering GPU Kernel in `.cu` files - - Note that if GPU Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as +- Registering CUDA Kernel in `.cu` files + - Note that if CUDA Kernel is implemented using the `Eigen unsupported` module, then on top of `.cu`, a macro definition `#define EIGEN_USE_GPU` is needed, such as ```cpp // if use Eigen unsupported module before include head files #define EIGEN_USE_GPU namespace ops = paddle::operators; - REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); - REGISTER_OP_GPU_KERNEL(mul_grad, - ops::MulGradKernel); + REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel); + REGISTER_OP_CUDA_KERNEL(mul_grad, + ops::MulGradKernel); ``` ### 5. Compilation @@ -253,62 +252,50 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass 2. Generating random input data. -3. Implementing the same computation logic in a Python script: +3. Implementing the same computation logic in a Python script. + +4. Call check gradient function to check the backward operator. ```python import unittest import numpy as np - from gradient_checker import GradientChecker, create_op - from op_test_util import OpTestMeta + from op_test import OpTest - class TestMulOp(unittest.TestCase): - __metaclass__ = OpTestMeta + class TestMulOp(OpTest): def setUp(self): - self.type = "mul" + self.op_type = "mul" self.inputs = { 'X': np.random.random((32, 84)).astype("float32"), 'Y': np.random.random((84, 100)).astype("float32") } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} - ``` -Get its output, and compare it with the forward operator's own output. - -The code above first loads required packages. In addition, we have - -- `self.type = "mul" ` defines the type that is identical to what the operator's registered type. -- `self.inputs` defines input, with type `numpy.array` and initializes it. -- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. -### Testing Backward Operators + def test_check_output(self): + self.check_output() + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) -A backward operator unit test inherits `GradientChecker`, which inherits `unittest.TestCase`. As a result, **a backward operator unit test needs to be have the prefix `test_`**. + def test_check_grad_ingore_x(self): + self.check_grad( + ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) -```python -class TestMulGradOp(GradientChecker): - def setUp(self): - self.op = create_op("mul") - self.inputs = { - 'X': np.random.random((32, 84)).astype("float32"), - 'Y': np.random.random((84, 100)).astype("float32") - } + def test_check_grad_ingore_y(self): + self.check_grad( + ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - def test_check_grad_normal(self): - # mul op will enlarge the relative error - self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5) + ``` +Get its output, and compare it with the forward operator's own output. - def test_check_grad_ingore_x(self): - self.check_grad( - ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X")) +The code above first loads required packages. In addition, we have - def test_check_grad_ingore_y(self): - self.check_grad( - ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) -``` +- `self.op_type = "mul" ` defines the type that is identical to what the operator's registered type. +- `self.inputs` defines input, with type `numpy.array` and initializes it. +- `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. -Some key points in the code above include: +Some key points in checking gradient above include: -- `create_op("mul")` creates the backward operator's corresponding forward operator. - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested. - The second variable `"Out"` points to the network's final output target `Out`. @@ -338,5 +325,5 @@ ctest -R test_mul_op - Every `*_op.h` (if applicable), `*_op.cc`, and `*_op.cu` (if applicable) must be created for a unique Op. Compiling will fail if multiple operators are included per file. - The type with which an operator is registered needs to be identical to the Op's name. Registering `REGISTER_OP(B, ...)` in `A_op.cc` will cause unit testing failures. -- If the operator does not implement a GPU kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. +- If the operator does not implement a CUDA kernel, please refrain from creating an empty `*_op.cu` file, or else unit tests will fail. - If multiple operators rely on some shared methods, a file NOT named `*_op.*` can be created to store them, such as `gather.h`. From d918ccded35d972eba42b07767972bc4c2dd0921 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 12 Dec 2017 15:22:52 +0800 Subject: [PATCH 28/35] Add fill_op (#6477) * Add fill_op * Fix bug --- paddle/operators/fill_op.cc | 111 +++++++++++++++++++ python/paddle/v2/fluid/tests/test_fill_op.py | 24 ++++ 2 files changed, 135 insertions(+) create mode 100644 paddle/operators/fill_op.cc create mode 100644 python/paddle/v2/fluid/tests/test_fill_op.py diff --git a/paddle/operators/fill_op.cc b/paddle/operators/fill_op.cc new file mode 100644 index 0000000000..382e161c5d --- /dev/null +++ b/paddle/operators/fill_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/data_type.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" + +namespace paddle { +namespace operators { + +struct FillOpVisitor { + FillOpVisitor(framework::LoDTensor *tensor, const std::vector &value) + : tensor_(tensor), value_(value) {} + + template + void operator()() const { + platform::CPUPlace cpu; + auto *data = tensor_->mutable_data(cpu); + std::transform(value_.data(), value_.data() + tensor_->numel(), data, + [](float dat) { return static_cast(dat); }); + } + + framework::LoDTensor *tensor_; + const std::vector &value_; +}; + +class FillOp : public framework::OperatorBase { + public: + FillOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + auto &out = + detail::Ref(detail::Ref(scope.FindVar(Output("Out")), + "Cannot find variable %s", Output("Out")) + .GetMutable()); + out.Resize(framework::make_ddim(Attr>("shape"))); + auto dtype = static_cast(Attr("dtype")); + platform::CPUPlace cpu; + auto force_cpu = Attr("force_cpu"); + out.mutable_data(force_cpu ? cpu : dev_ctx.GetPlace(), + framework::ToTypeIndex(dtype)); + + framework::LoDTensor tensor; + + if (force_cpu || platform::is_cpu_place(dev_ctx.GetPlace())) { + tensor.ShareDataWith(out); + } else { + // Always make tensor in CPU memory. + tensor.Resize(out.dims()); + tensor.mutable_data(cpu, framework::ToTypeIndex(dtype)); + } + + framework::VisitDataType( + dtype, FillOpVisitor(&tensor, Attr>("value"))); + + if (!force_cpu && platform::is_gpu_place(dev_ctx.GetPlace())) { + // Copy tensor to out + framework::CopyFrom(tensor, dev_ctx.GetPlace(), dev_ctx, &out); + } + } +}; + +class FillOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FillOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddComment(R"DOC(Fill operator + +Fill an tensor with `value` and `shape`. The type of the tensor is specify by +`dtype`. +)DOC"); + AddOutput("Out", "(LoDTensor) The output tensor."); + AddAttr>( + "value", "The float values of tensor, which are flatten in row major"); + AddAttr>("shape", "The shape of output tensor"); + AddAttr("dtype", "The data type of output tensor, Default is float") + .SetDefault(framework::DataType::FP32); + AddAttr("force_cpu", + "Whether the output tensor must be at CPU memory or not. " + "Default is false.") + .SetDefault(false); + } +}; + +class FillOpInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + context->SetOutputDim( + "Out", + framework::make_ddim(context->Attrs().Get>("shape"))); + } +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; +REGISTER_OPERATOR(fill, ops::FillOp, ops::FillOpInferShape, ops::FillOpMaker); diff --git a/python/paddle/v2/fluid/tests/test_fill_op.py b/python/paddle/v2/fluid/tests/test_fill_op.py new file mode 100644 index 0000000000..88337598c8 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_fill_op.py @@ -0,0 +1,24 @@ +import unittest +import numpy as np +from op_test import OpTest +import paddle.v2.fluid.core as core + + +class TestFillOp(OpTest): + def setUp(self): + self.op_type = "fill" + val = np.random.random(size=[100, 200]) + self.inputs = {} + self.attrs = { + 'value': val.flatten().tolist(), + 'shape': [100, 200], + 'dtype': int(core.DataType.FP64) + } + self.outputs = {'Out': val.astype('float64')} + + def test_check_output(self): + self.check_output() + + +if __name__ == '__main__': + unittest.main() From 8f7d0b18145c8310fb36ad017aa0f1d4bce98c65 Mon Sep 17 00:00:00 2001 From: fengjiayi Date: Tue, 12 Dec 2017 15:54:38 +0800 Subject: [PATCH 29/35] add param_attr for nets (#6509) --- python/paddle/v2/fluid/layers.py | 6 ++++-- python/paddle/v2/fluid/nets.py | 7 +++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index fd8a2ed18c..1f45487902 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -1732,8 +1732,10 @@ def conv2d_transpose(input, h_in = input.shape[2] w_in = input.shape[3] - filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0] - filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1] + filter_size_h = output_size[0] - \ + (h_in - 1) * stride[0] + 2 * padding[0] + filter_size_w = output_size[1] - \ + (w_in - 1) * stride[1] + 2 * padding[1] filter_size = [filter_size_h, filter_size_w] elif isinstance(filter_size, int): filter_size = [filter_size, filter_size] diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py index 05728ad75a..7ef524318e 100644 --- a/python/paddle/v2/fluid/nets.py +++ b/python/paddle/v2/fluid/nets.py @@ -9,6 +9,7 @@ def simple_img_conv_pool(input, pool_size, pool_stride, act, + param_attr=None, pool_type='max', main_program=None, startup_program=None): @@ -16,6 +17,7 @@ def simple_img_conv_pool(input, input=input, num_filters=num_filters, filter_size=filter_size, + param_attr=param_attr, act=act, main_program=main_program, startup_program=startup_program) @@ -36,6 +38,7 @@ def img_conv_group(input, conv_padding=1, conv_filter_size=3, conv_act=None, + param_attr=None, conv_with_batchnorm=False, conv_batchnorm_drop_rate=None, pool_stride=1, @@ -57,6 +60,7 @@ def img_conv_group(input, conv_padding = __extend_list__(conv_padding) conv_filter_size = __extend_list__(conv_filter_size) + param_attr = __extend_list__(param_attr) conv_with_batchnorm = __extend_list__(conv_with_batchnorm) conv_batchnorm_drop_rate = __extend_list__(conv_batchnorm_drop_rate) @@ -70,6 +74,7 @@ def img_conv_group(input, num_filters=conv_num_filter[i], filter_size=conv_filter_size[i], padding=conv_padding[i], + param_attr=param_attr[i], act=local_conv_act, main_program=main_program, startup_program=startup_program) @@ -101,6 +106,7 @@ def img_conv_group(input, def sequence_conv_pool(input, num_filters, filter_size, + param_attr=None, act="sigmoid", pool_type="max", main_program=None, @@ -109,6 +115,7 @@ def sequence_conv_pool(input, input=input, num_filters=num_filters, filter_size=filter_size, + param_attr=param_attr, act=act, main_program=main_program, startup_program=startup_program) From 3ef8ec37bb80427ab8abe23384c42a9e9056c87d Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 13 Dec 2017 10:35:57 +0800 Subject: [PATCH 30/35] fix the new_op doc (#6540) * fix the ending symbol * fix invalid content link --- doc/howto/dev/new_op_cn.md | 36 ++++++++++++++++++++---------------- doc/howto/dev/new_op_en.md | 19 ++++++++++--------- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md index 44dbeecbbd..757a5840bc 100644 --- a/doc/howto/dev/new_op_cn.md +++ b/doc/howto/dev/new_op_cn.md @@ -1,17 +1,18 @@ # 如何写新的Operator - [概念简介](#概念简介) - - [实现C++类](#实现C++类) - - [定义ProtoMaker类](#定义ProtoMaker类) - - [定义Operator类](#定义Operator类) - - [定义OpKernel类](#定义OpKernel类) - - [注册Operator](#注册Operator) + - [实现C++类](#实现c类) + - [定义ProtoMaker类](#定义protomaker类) + - [定义Operator类](#定义operator类) + - [定义OpKernel类](#定义opkernel类) + - [注册Operator](#注册operator) - [编译](#编译) - - [绑定Python](#绑定Python) + - [绑定Python](#绑定python) - [实现单元测试](#实现单元测试) - - [前向Operator单测](#前向Operator单测) - - [反向Operator单测](#反向Operator单测) + - [前向Operator单测](#前向operator单测) + - [反向Operator单测](#反向operator单测) - [编译和执行](#编译和执行) + - [注意事项](#注意事项) ## 概念简介 @@ -43,7 +44,7 @@ Kernel实现 | CPU、CUDA共享Kernel实现在`.h`文件中,否则,CPU ## 实现C++类 -### 1. 定义ProtoMaker类 +### 定义ProtoMaker类 矩阵乘法的公式:$Out = X * Y$, 可见该计算由两个输入,一个输出组成。 @@ -100,7 +101,7 @@ The equation is: Out = scale*X - `AddAttr("scale", "...").SetDefault(1.0);` : 增加`scale`系数,作为参数属性,并且设置默认值为1.0。 -### 2. 定义Operator类 +### 定义Operator类 下面的点实现了MulOp的定义: @@ -149,7 +150,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 通常`OpProtoMaker`和`Op`类的定义写在`.cc`文件中,和下面将要介绍的注册函数一起放在`.cc`中 -### 3. 定义OpKernel类 +### 定义OpKernel类 `MulKernel`继承自`framework::OpKernel`,带有下面两个模板参数: @@ -177,6 +178,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, math::matmul(*X, false, *Y, false, 1, Z, 0, device_context); } }; + ``` 需要注意:**不同设备(CPU、CUDA)共享一个Op定义,是否则共享同一个`OpKernel`,取决于`Compute`调用的函数是否支持不同设备。** @@ -188,7 +190,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, 到此,前向Op实现完成。接下来,需要在`.cc`文件中注册该op和kernel。 反向Op类的定义,反向OpKernel的定义与前向Op类似,这里不再赘述。**但需注意反向Op没有`ProtoMaker`**。 -### 4. 注册Operator +### 注册Operator - 在`.cc`文件中注册前向、反向Op类,注册CPU Kernel。 @@ -220,7 +222,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, ops::MulGradKernel); ``` -### 5. 编译 +### 编译 运行下面命令可以进行编译: @@ -236,6 +238,7 @@ make mul_op 单测包括对比前向Op不同设备(CPU、CUDA)的实现、对比反向OP不同设备(CPU、CUDA)的实现、反向Op的梯度测试。下面介绍介绍[`MulOp`的单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/tests/test_mul_op.py)。 +### 前向Operator单测 Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp`里完成。测试Operator,需要: @@ -273,8 +276,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp def test_check_grad_ingore_y(self): self.check_grad( ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - - ``` + ``` 上面的代码首先导入依赖的包,下面是对`setUp`函数中操作的重要变量的详细解释: @@ -282,6 +284,8 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp - `self.inputs` : 定义输入,类型为`numpy.array`,并初始化。 - `self.outputs` : 定义输出,并在Python脚本中完成与operator同样的计算逻辑,返回Python端的计算结果。 +### 反向operator单测 + 而反向测试中: - `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。 - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。 @@ -290,7 +294,7 @@ Op单元测试继承自`OpTest`。各项更加具体的单元测试在`TestMulOp - `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。 -### 编译和执行单元测试 +### 编译和执行 `python/paddle/v2/framework/tests` 目录下新增的 `test_*.py` 单元测试会被自动加入工程进行编译。 diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md index 510233306c..fe86936bc1 100644 --- a/doc/howto/dev/new_op_en.md +++ b/doc/howto/dev/new_op_en.md @@ -1,8 +1,8 @@ # How to write a new operator - [Background](#background) - - [Implementing C++ Types](#implementing-c++-types) - - [Defining ProtoMaker](#defining-protoMaker) + - [Implementing C++ Types](#implementing-c-types) + - [Defining ProtoMaker](#defining-protomaker) - [Defining Operator](#defining-operator) - [Registering Operator](#registering-operator) - [Compilation](#compilation) @@ -41,7 +41,7 @@ Let's take matrix multiplication operator, [MulOp](https://github.com/PaddlePadd ## Implementing C++ Types -### 1. Defining Class ProtoMaker +### Defining ProtoMaker Matrix Multiplication can be written as $Out = X * Y$, meaning that the operation consists of two inputs and pne output. @@ -98,7 +98,7 @@ There are two changes in this example: - `AddAttr("scale", "...").SetDefault(1.0);` adds `scale`constant as an attribute, and sets the default value to 1.0. -### 2. Defining Operator +### Defining Operator The following code defines the interface for MulOp: @@ -147,7 +147,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs, Usually `OpProtoMaker` and `Op`'s type definitions are written in `.cc` files, which also include the registration methods introduced later. -### 3. Defining OpKernel +### Defining OpKernel `MulKernel` inherits `framework::OpKernel`, which includes the following templates: @@ -188,7 +188,7 @@ This concludes the forward implementation of an operator. Next its operation and The definition of its corresponding backward operator, if applicable, is similar to that of an forward operator. **Note that a backward operator does not include a `ProtoMaker`**. -### 4. Registering Operator +### Registering Operator - In `.cc` files, register forward and backward operator classes and the CPU kernel. @@ -220,7 +220,7 @@ The definition of its corresponding backward operator, if applicable, is similar ops::MulGradKernel); ``` -### 5. Compilation +### Compilation Run the following commands to compile. @@ -284,8 +284,7 @@ A forward operator unit test inherits `unittest.TestCase` and defines metaclass def test_check_grad_ingore_y(self): self.check_grad( ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y')) - - ``` + ``` Get its output, and compare it with the forward operator's own output. The code above first loads required packages. In addition, we have @@ -294,6 +293,8 @@ The code above first loads required packages. In addition, we have - `self.inputs` defines input, with type `numpy.array` and initializes it. - `self.outputs` defines output and completes the same operator computation in the Python script, and returns its result from the Python script. +### Testing Backward Operators + Some key points in checking gradient above include: - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods. From 8ad36cdb5d2c3a79c82c36cd7c2e79bc2d4cc4bf Mon Sep 17 00:00:00 2001 From: "Yang Yang(Tony)" Date: Wed, 13 Dec 2017 10:41:31 +0800 Subject: [PATCH 31/35] PaddlePaddle Fluid Source Overview (#6485) * first commit * Update read_source.md --- doc/howto/read_source.md | 67 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 doc/howto/read_source.md diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md new file mode 100644 index 0000000000..383acb0c82 --- /dev/null +++ b/doc/howto/read_source.md @@ -0,0 +1,67 @@ +# PaddlePaddle Fluid Source Code Overview + +Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/tests/book + +Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework + +Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators + +Optimizer: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer + +Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory + +# Compile Time + +The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). + +```python +x = fluid.layers.data(name='x', shape=[13], dtype='float32') +y = fluid.layers.data(name='y', shape=[1], dtype='float32') + +y_predict = fluid.layers.fc(input=x, size=1, act=None) +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) + +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) +``` + +- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93) +- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py) + - Every Layer has one or more operators and variables/parameters + - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files: + - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h) + - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) + - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h) +- Optimizer: `fluid.optimizer.SGD`. It does the following + - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)] + - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)] + +# Run Time + +The following **evaluates** the NN. Instantiates all the variables, operators. + +```python +place = fluid.CPUPlace() +feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) +exe = fluid.Executor(place) + +# Allocate memory. Initialize Parameter. +exe.run(fluid.default_startup_program()) + +# Allocate memory. Do computation. +exe.run(fluid.default_main_program(), + feed=feeder.feed(data), + fetch_list=[avg_cost]) +``` + +- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h) + - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h) +- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)] + - Feeds the data: `feed=feeder.feed(data)` + - Evaluates all the operators + - Fetches the result: `fetch_list=[avg_cost]` +- Other worth looking files: + - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live + - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live + - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory) From 697facc92f58e6b65c76db2f1b6efc282b3c57a0 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Wed, 13 Dec 2017 10:43:20 +0800 Subject: [PATCH 32/35] "add registry interface" (#6449) * "add registry interface" * "move function to registry" * "rename with meaningful name" * "add exposed layers" * "fixed based on comments" * "remove unsed comments" --- python/paddle/v2/fluid/layers.py | 185 ++--------------- python/paddle/v2/fluid/registry.py | 186 ++++++++++++++++++ python/paddle/v2/fluid/tests/test_registry.py | 22 +++ 3 files changed, 221 insertions(+), 172 deletions(-) create mode 100644 python/paddle/v2/fluid/registry.py create mode 100644 python/paddle/v2/fluid/tests/test_registry.py diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 1f45487902..9f5a219b20 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -1,12 +1,12 @@ -import core +import contextlib + import proto.framework_pb2 as framework_pb2 +import core from framework import OpProtoHolder, Variable, Program, Operator from initializer import Constant, Normal, Xavier, Initializer from paddle.v2.fluid.layer_helper import LayerHelper, unique_name -import re -import cStringIO +from registry import register_layer from param_attr import ParamAttr -import contextlib __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', @@ -14,6 +14,15 @@ __all__ = [ 'batch_norm', 'accuracy', 'split_lod_tensor', 'While' ] +_REGISTER_LAYER_FROM_OPS = [ + 'mean', 'mul', 'elementwise_add', 'elementwise_div', 'dropout', 'reshape', + 'sigmoid', 'scale', 'transpose', 'sigmoid_cross_entropy_with_logits' +] + +for _OP in set(_REGISTER_LAYER_FROM_OPS): + globals()[_OP] = register_layer(_OP) + __all__.append(_OP) + def fc(input, size, @@ -309,174 +318,6 @@ def create_tensor(dtype, name=None, main_program=None, startup_program=None): return helper.create_variable(name=helper.name, dtype=dtype) -def _convert_(name): - """ - Formatting. - - Args: - name: The name/alias - - This function takes in a name and converts it to a standard format of - group1_group2. Where as per the regular expression, group1 can have - alphabets and numbers and group2 has capital alphabets. - - """ - s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) - return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() - - -def _generate_doc_string_(op_proto): - """ - Generate docstring by OpProto - - Args: - op_proto (framework_pb2.OpProto): a protobuf message typed OpProto - - Returns: - str: the document string - """ - - def _type_to_str_(tp): - return framework_pb2.AttrType.Name(tp) - - if not isinstance(op_proto, framework_pb2.OpProto): - raise TypeError("OpProto should be `framework_pb2.OpProto`") - - buf = cStringIO.StringIO() - buf.write(op_proto.comment) - buf.write('\nArgs:\n') - for each_input in op_proto.inputs: - line_begin = ' {0}: '.format(_convert_(each_input.name)) - buf.write(line_begin) - buf.write(each_input.comment) - buf.write('\n') - buf.write(' ' * len(line_begin)) - buf.write('Duplicable: ') - buf.write(str(each_input.duplicable)) - buf.write(' Optional: ') - buf.write(str(each_input.dispensable)) - buf.write('\n') - - for each_attr in op_proto.attrs: - buf.write(' ') - buf.write(each_attr.name) - buf.write(' (') - buf.write(_type_to_str_(each_attr.type)) - buf.write('): ') - buf.write(each_attr.comment) - buf.write('\n') - - if len(op_proto.outputs) != 0: - buf.write('\nReturns:\n') - buf.write(' ') - for each_opt in op_proto.outputs: - if not each_opt.intermediate: - break - buf.write(each_opt.comment) - - return buf.getvalue() - - -def _create_op_func_(op_type): - """ - Create an Operator for a Function. - - Args: - op_type: The name of the operator to be created - - This function takes in the operator type (sigmoid, mean , average etc) and - creates the operator functionality. - - """ - op_proto = OpProtoHolder.instance().get_op_proto(op_type) - not_intermediate_outputs = \ - filter(lambda output: not output.intermediate, op_proto.outputs) - intermediate_outputs = \ - filter(lambda output: output.intermediate, op_proto.outputs) - - if len(not_intermediate_outputs) != 1: - raise ValueError("Only one non intermediate output operator can be", - "automatically generated") - - if not_intermediate_outputs[0].duplicable: - raise ValueError( - "Only non duplicable op can be automatically generated") - - for output in intermediate_outputs: - if output.duplicable: - raise ValueError("The op can be automatically generated only when ", - "all intermediate ops are not duplicable") - - o_name = not_intermediate_outputs[0].name - intermediate_output_names = [output.name for output in intermediate_outputs] - - def infer_and_check_dtype(op_proto, **kwargs): - """ - This function performs the sanity check for dtype and - instance type. - """ - dtype = None - for ipt in op_proto.inputs: - name = _convert_(ipt.name) - val = kwargs.pop(name, []) - if not isinstance(val, list) and not isinstance(val, tuple): - val = [val] - for each in val: - if not isinstance(each, Variable): - raise ValueError("input of {0} must be variable".format( - op_type)) - - if dtype is None: - dtype = each.dtype - elif dtype != each.dtype: - raise ValueError( - "operator {0} must input same dtype. {1} vs {2}".format( - op_type, dtype, each.dtype)) - - return dtype - - def func(**kwargs): - helper = LayerHelper(op_type, **kwargs) - - dtype = infer_and_check_dtype(op_proto, **kwargs) - - inputs = dict() - for ipt in op_proto.inputs: - name = _convert_(ipt.name) - val = kwargs.pop(name, []) - if not isinstance(val, list) and not isinstance(val, tuple): - val = [val] - inputs[ipt.name] = val - - outputs = dict() - out = helper.create_tmp_variable(dtype=dtype) - outputs[o_name] = [out] - for name in intermediate_output_names: - outputs[name] = [helper.create_tmp_variable(dtype=dtype)] - helper.append_op( - type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) - return helper.append_activation(out) - - func.__name__ = op_type - globals()[op_type] = func - func.__doc__ = _generate_doc_string_(op_proto) - global __all__ - __all__.append(op_type) - - -_create_op_func_('mean') -_create_op_func_('mul') -_create_op_func_('elementwise_add') -_create_op_func_('elementwise_div') -_create_op_func_('dropout') -_create_op_func_('reshape') -_create_op_func_('sigmoid') -_create_op_func_('scale') -_create_op_func_('reshape') -_create_op_func_('transpose') -_create_op_func_('sigmoid_cross_entropy_with_logits') - - def cast(x, dtype, main_program=None): """ This function takes in the input with input_dtype diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py new file mode 100644 index 0000000000..6f5dd365de --- /dev/null +++ b/python/paddle/v2/fluid/registry.py @@ -0,0 +1,186 @@ +import re +import cStringIO +import warnings +import functools +import inspect + +import proto.framework_pb2 as framework_pb2 +from framework import OpProtoHolder, Variable, Program, Operator +from paddle.v2.fluid.layer_helper import LayerHelper, unique_name + +__all__ = ['deprecated', 'register_layer'] + + +def _convert_(name): + """ + Formatting. + + Args: + name: The name/alias + + This function takes in a name and converts it to a standard format of + group1_group2. Where as per the regular expression, group1 can have + alphabets and numbers and group2 has capital alphabets. + + """ + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() + + +def _generate_doc_string_(op_proto): + """ + Generate docstring by OpProto + + Args: + op_proto (framework_pb2.OpProto): a protobuf message typed OpProto + + Returns: + str: the document string + """ + + def _type_to_str_(tp): + return framework_pb2.AttrType.Name(tp) + + if not isinstance(op_proto, framework_pb2.OpProto): + raise TypeError("OpProto should be `framework_pb2.OpProto`") + + buf = cStringIO.StringIO() + buf.write(op_proto.comment) + buf.write('\nArgs:\n') + for each_input in op_proto.inputs: + line_begin = ' {0}: '.format(_convert_(each_input.name)) + buf.write(line_begin) + buf.write(each_input.comment) + buf.write('\n') + buf.write(' ' * len(line_begin)) + buf.write('Duplicable: ') + buf.write(str(each_input.duplicable)) + buf.write(' Optional: ') + buf.write(str(each_input.dispensable)) + buf.write('\n') + + for each_attr in op_proto.attrs: + buf.write(' ') + buf.write(each_attr.name) + buf.write(' (') + buf.write(_type_to_str_(each_attr.type)) + buf.write('): ') + buf.write(each_attr.comment) + buf.write('\n') + + if len(op_proto.outputs) != 0: + buf.write('\nReturns:\n') + buf.write(' ') + for each_opt in op_proto.outputs: + if not each_opt.intermediate: + break + buf.write(each_opt.comment) + + return buf.getvalue() + + +def register_layer(op_type): + """ + Register an Python layer for an Operator + + Args: + op_type: The name of the operator to be created + + This function takes in the operator type (sigmoid, mean , average etc) and + creates the operator functionality. + + """ + op_proto = OpProtoHolder.instance().get_op_proto(op_type) + not_intermediate_outputs = \ + filter(lambda output: not output.intermediate, op_proto.outputs) + intermediate_outputs = \ + filter(lambda output: output.intermediate, op_proto.outputs) + + if len(not_intermediate_outputs) != 1: + raise ValueError("Only one non intermediate output operator can be", + "automatically generated") + + if not_intermediate_outputs[0].duplicable: + raise ValueError( + "Only non duplicable op can be automatically generated") + + for output in intermediate_outputs: + if output.duplicable: + raise ValueError("The op can be automatically generated only when ", + "all intermediate ops are not duplicable") + + o_name = not_intermediate_outputs[0].name + intermediate_output_names = [output.name for output in intermediate_outputs] + + def infer_and_check_dtype(op_proto, **kwargs): + """ + This function performs the sanity check for dtype and + instance type. + """ + dtype = None + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + for each in val: + if not isinstance(each, Variable): + raise ValueError("input of {0} must be variable".format( + op_type)) + + if dtype is None: + dtype = each.dtype + elif dtype != each.dtype: + raise ValueError( + "operator {0} must input same dtype. {1} vs {2}".format( + op_type, dtype, each.dtype)) + + return dtype + + def func(**kwargs): + helper = LayerHelper(op_type, **kwargs) + + dtype = infer_and_check_dtype(op_proto, **kwargs) + + inputs = dict() + for ipt in op_proto.inputs: + name = _convert_(ipt.name) + val = kwargs.pop(name, []) + if not isinstance(val, list) and not isinstance(val, tuple): + val = [val] + inputs[ipt.name] = val + + outputs = dict() + out = helper.create_tmp_variable(dtype=dtype) + outputs[o_name] = [out] + for name in intermediate_output_names: + outputs[name] = [helper.create_tmp_variable(dtype=dtype)] + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=kwargs) + return helper.append_activation(out) + + func.__name__ = op_type + func.__doc__ = _generate_doc_string_(op_proto) + return func + + +def deprecated(func_or_class): + """ + Deprecated warning decorator. It will result a warning message. + Should be used before class or function, member function + """ + + @functools.wraps(func) + def func_wrapper(*args, **kwargs): + """ + Wrap func with deprecated warning + """ + warnings.simplefilter('always', DeprecationWarning) #turn off filter + warnings.warn( + "Call to deprecated function {}.".format(func.__name__), + category=DeprecationWarning, + stacklevel=2) + warnings.simplefilter('default', DeprecationWarning) #reset filter + return func(*args, **kwargs) + + return func_wrapper diff --git a/python/paddle/v2/fluid/tests/test_registry.py b/python/paddle/v2/fluid/tests/test_registry.py new file mode 100644 index 0000000000..f8328f31cf --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_registry.py @@ -0,0 +1,22 @@ +import unittest +import warnings + +import paddle.v2.fluid as fluid +import paddle.v2.fluid.framework as framework +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.registry as registry + + +class TestRegistry(unittest.TestCase): + def test_registry_layer(self): + self.layer_type = "mean" + program = framework.Program() + + x = fluid.layers.data(name='X', shape=[10, 10], dtype='float32') + output = layers.mean(x) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + + X = np.random.random((10, 10)).astype("float32") + mean_out = exe.run(program, feed={"X": X}, fetch_list=[output]) + self.assertAlmostEqual(np.mean(X), mean_out) From 1ba8f7fe71fc8df40669a24fc5af88d0904240f1 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 13 Dec 2017 14:18:19 +0800 Subject: [PATCH 33/35] The comments in reshape_op is wrong (#6565) --- paddle/operators/reshape_op.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index 39bf2118d6..7fd33bf662 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -84,9 +84,9 @@ Given a 2-D tensor X with 2 rows and 2 columns [[1, 2], [3, 4]] and target shape = [1, 4], the reshape operator will transform -the tensor X into a 1-D tensor: +the tensor X into a 2-D tensor: - [1, 2, 3, 4] + [[1, 2, 3, 4]] )DOC"); } From d069f2ca0a92dd07eb06c9a02528c41b0702f131 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 13 Dec 2017 14:37:05 +0800 Subject: [PATCH 34/35] Make fluid.layers.fc support multiple param_attr (#6532) Fix #6531 --- python/paddle/v2/fluid/param_attr.py | 2 ++ python/paddle/v2/fluid/tests/test_layers.py | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py index 86088fdd7c..7952a5ea51 100644 --- a/python/paddle/v2/fluid/param_attr.py +++ b/python/paddle/v2/fluid/param_attr.py @@ -36,6 +36,8 @@ class ParamAttr(object): def to_attr(arg): if arg is None: return ParamAttr() + elif isinstance(arg, list) or isinstance(arg, tuple): + return [ParamAttr.to_attr(a) for a in arg] elif isinstance(arg, ParamAttr): return arg elif isinstance(arg, str) or isinstance(arg, unicode): diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 57f6a362de..9b88080158 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -29,7 +29,10 @@ class TestBook(unittest.TestCase): label = layers.data(name='label', shape=[1], dtype='int32') hidden1 = layers.fc(input=images, size=128, act='relu') hidden2 = layers.fc(input=hidden1, size=64, act='relu') - predict = layers.fc(input=hidden2, size=10, act='softmax') + predict = layers.fc(input=[hidden2, hidden1], + size=10, + act='softmax', + param_attr=["sftmax.w1", "sftmax.w2"]) cost = layers.cross_entropy(input=predict, label=label) avg_cost = layers.mean(x=cost) self.assertIsNotNone(avg_cost) From 0a8addf802e2198084b9cc66e49ca4ae2fdd8125 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Wed, 13 Dec 2017 15:24:53 +0800 Subject: [PATCH 35/35] Make cast op support bool (#6562) Also add `elemwise_sub/mul/abs/clip` layers --- paddle/operators/cast_op.cc | 3 ++- paddle/operators/cast_op.cu | 3 ++- python/paddle/v2/fluid/layers.py | 5 +++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/paddle/operators/cast_op.cc b/paddle/operators/cast_op.cc index 42bff69a1e..d641b8fc9f 100644 --- a/paddle/operators/cast_op.cc +++ b/paddle/operators/cast_op.cc @@ -74,4 +74,5 @@ REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, - ops::CastOpKernel); + ops::CastOpKernel, + ops::CastOpKernel); diff --git a/paddle/operators/cast_op.cu b/paddle/operators/cast_op.cu index 4681deaa62..91e6fb391c 100644 --- a/paddle/operators/cast_op.cu +++ b/paddle/operators/cast_op.cu @@ -19,4 +19,5 @@ using CastOpKernel = paddle::operators::CastOpKernel; REGISTER_OP_CUDA_KERNEL(cast, CastOpKernel, CastOpKernel, - CastOpKernel, CastOpKernel); + CastOpKernel, CastOpKernel, + CastOpKernel); diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index 9f5a219b20..f67d6d08c7 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -15,8 +15,9 @@ __all__ = [ ] _REGISTER_LAYER_FROM_OPS = [ - 'mean', 'mul', 'elementwise_add', 'elementwise_div', 'dropout', 'reshape', - 'sigmoid', 'scale', 'transpose', 'sigmoid_cross_entropy_with_logits' + 'mean', 'mul', 'dropout', 'reshape', 'sigmoid', 'scale', 'transpose', + 'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div', + 'elementwise_sub', 'elementwise_mul', 'clip', 'abs' ] for _OP in set(_REGISTER_LAYER_FROM_OPS):