From 521db98bc980bd5d2a5c150b7bd9eccc4d884ddd Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Mon, 18 Dec 2017 18:53:29 +0800 Subject: [PATCH 01/24] Refine CUDA profiler and delete the test file. --- paddle/platform/cuda_profiler.h | 18 +-------------- python/paddle/v2/fluid/profiler.py | 22 ++++++++++++++++--- python/paddle/v2/fluid/tests/test_profiler.py | 5 ++++- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h index b6311cb23d..67d5f626d4 100644 --- a/paddle/platform/cuda_profiler.h +++ b/paddle/platform/cuda_profiler.h @@ -22,23 +22,7 @@ namespace paddle { namespace platform { void CudaProfilerInit(std::string output_file, std::string output_mode, - std::vector config_flags) { - std::array buf; - std::string tmpl = "/tmp/cuda_profile_config.XXXXXX"; - PADDLE_ENFORCE_LT(tmpl.size(), buf.size()); - memcpy(buf.data(), tmpl.data(), tmpl.size()); - auto result = mktemp(buf.data()); - PADDLE_ENFORCE(strlen(result) != 0); - std::string config_file = result; - - { - std::ofstream ofs(config_file, std::ios::out | std::ios::trunc); - PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate()); - for (const auto& line : config_flags) { - ofs << line << std::endl; - } - } - + std::string config_file) { PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair; PADDLE_ENFORCE( diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index 2069b713fa..fb21ec4f34 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -1,5 +1,6 @@ import paddle.v2.fluid.core as core from contextlib import contextmanager +import os __all__ = ['CudaProfiler'] @@ -8,9 +9,20 @@ NVPROF_CONFIG = [ "gpuendtimestamp", "gridsize3d", "threadblocksize", + "dynsmemperblock", + "stasmemperblock", + "regperthread", + "memtransfersize", + "memtransferdir", + "memtransferhostmemtype", "streamid", + "cacheconfigrequested", + "cacheconfigrequested", + "cacheconfigrequested", "enableonstart 0", "conckerneltrace", + "active_warps", + "active_warps", ] @@ -30,17 +42,21 @@ def cuda_profiler(output_file, output_mode=None, config=None): written into this file. output_mode (string) : The output mode has Key-Value pair format and Comma separated values format. It should be 'kvp' or 'csv'. - config (string) : The profiler options and counters can refer to - "Compute Command Line Profiler User Guide". + config (list of string) : The profiler options and counters can refer + to "Compute Command Line Profiler User Guide". """ if output_mode is None: output_mode = 'csv' if output_mode not in ['kvp', 'csv']: raise ValueError("The output mode must be 'kvp' or 'csv'.") config = NVPROF_CONFIG if config is None else config - core.nvprof_init(output_file, output_mode, config) + config_file = 'nvprof_config_file' + with open(config_file, 'wb') as fp: + fp.writelines(["%s\n" % item for item in config]) + core.nvprof_init(output_file, output_mode, config_file) # Enables profiler collection by the active CUDA profiling tool. core.nvprof_start() yield # Disables profiler collection. core.nvprof_stop() + os.remove(config_file) diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py index 395d0dc36a..d01e257449 100644 --- a/python/paddle/v2/fluid/tests/test_profiler.py +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -3,6 +3,7 @@ import numpy as np import paddle.v2.fluid as fluid import paddle.v2.fluid.profiler as profiler import paddle.v2.fluid.layers as layers +import os class TestProfiler(unittest.TestCase): @@ -18,10 +19,12 @@ class TestProfiler(unittest.TestCase): exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + output_file = 'cuda_profiler.txt' + with profiler.cuda_profiler(output_file, 'csv') as nvprof: for i in range(epoc): input = np.random.random(dshape).astype('float32') exe.run(fluid.default_main_program(), feed={'data': input}) + os.remove(output_file) if __name__ == '__main__': From 22446fe1c83de392bae8ae06f1a15750b0120632 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Tue, 19 Dec 2017 13:35:03 +0800 Subject: [PATCH 02/24] Fix the config arguments. --- python/paddle/v2/fluid/profiler.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index fb21ec4f34..5093c269f7 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -17,12 +17,11 @@ NVPROF_CONFIG = [ "memtransferhostmemtype", "streamid", "cacheconfigrequested", - "cacheconfigrequested", - "cacheconfigrequested", + "cacheconfigexecuted", + "countermodeaggregate", "enableonstart 0", - "conckerneltrace", - "active_warps", "active_warps", + "active_cycles", ] From 3a09ebef5314a60f6db03ac3ef43729e7cfecca2 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 14:15:25 +0800 Subject: [PATCH 03/24] update alexnet training data --- benchmark/IntelOptimizedPaddle.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 8ee7fd28c5..94a79c3c8e 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -22,6 +22,7 @@ On each machine, we will test and compare the performance of training on single #### Training Test on batch size 64, 128, 256 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz +Pay attetion that the speed below includes forward, backward and parameter update time. So we can not directly compare the data with the benchmark of caffe `time` [command](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/caffe/image/run.sh#L9), which only contain forward and backward. The updating time of parameter would become very heavy when the weight size are large, especially on alexnet. Input image size - 3 * 224 * 224, Time: images/second @@ -55,6 +56,16 @@ Input image size - 3 * 224 * 224, Time: images/second +- Alexnet + +| BatchSize | 64 | 128 | 256 | +|--------------|--------| ------ | -------| +| OpenBLAS | 0.85 | 1.03 | 1.17 | +| MKLML | 71.26 | 106.94 | 155.18 | +| MKL-DNN     | 362.66 | 497.66 | 610.73 | + +chart TBD + #### Inference Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz - VGG-19 From 661f03b42dc14d521d625b8b193a1f3b0a1b7c6c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 16:16:38 +0800 Subject: [PATCH 04/24] add mkldnn fluid design doc --- doc/design/mkl/mkldnn_fluid.md | 150 +++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 doc/design/mkl/mkldnn_fluid.md diff --git a/doc/design/mkl/mkldnn_fluid.md b/doc/design/mkl/mkldnn_fluid.md new file mode 100644 index 0000000000..4ad5b39ebd --- /dev/null +++ b/doc/design/mkl/mkldnn_fluid.md @@ -0,0 +1,150 @@ +# Design Doc: Add MKLDNN Kernel in Fluid Operator + + +## Principles + +First of all, we should follow some basical principles like: +1. [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc. +2. [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h). +3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure the data synchronization among different divices, which is the topic #6549. So basically we should override `GetActualKernelType` and `trans` functions to support switching kernels. +4. [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal conception which can record the `Place`, `Library`, `DataType` and `Layout`. + +## Sulution + +In general, there are three part we should follow to run a MKL-DNN primitive. +- create a primitive descriptor that describe this operator +- create a memory that handle all memory buffers needed +- stream the primitive create by first two + +We do not want to see the first two would be re-initialized every iteration again and again. \ +So we plan to create a map to record all the `primitive` and `memory`, which should not take too much memories as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822). + +Assuming that three condition would be confirmed: +1. there is a unique key for each operator instance. May be the actual name of `Output Tensor`. +2. the `Input Tensor` inside `Compute` function is the one after converted. +3. we can get the phase(eg. `is_test`) inside `Compute` function, otherwise we need to expose this attribue to user. + +### Compute +The algorithm of `Compute` would be described as follow, let's take conv like an example. + +```c++ + + PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library."); + + auto& ctx = executionContext.template device_context(); + + // find primitive by unique key from mkldnn context + // the op_key should be a unique name of this op instance + auto& p = ctx.findPrimitive(op_key + "_fwd"); + + // assuming the input tensor inside this compute function is the one after converted + // this point should be guarantee by another mechanism + auto& i = ctx.findMemory(op_key + "_input"); + + if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) { + auto fwd_primitive_desc = createPrimitiveDesc(ctx); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + shared_ptr in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data())); + shared_ptr wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data())); + shared_ptr out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data(ctx.GetPlace()))); + shared_ptr fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out) + ); + + ctx.addMemory(op_key+"_input", in); + ctx.addMemory(op_key+"_output", out); + ctx.addMemory(op_key+"_filer", wgt); + ctx.addPrimitive(op_key+"_fwd", fwd_primitive); + ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc); + } + + p = ctx.findPrimitive(op_key + "_fwd"); + + PADDLE_ENFORCE(p, "Should have Forward Primitive"); + PADDLE_ENFORCE(ctx.findMemory(op_unique_key+"_input"), "Should have input memory"); + PADDLE_ENFORCE(ctx.findMemory(op_unique_key+"_output"), "Should have output memory"); + PADDLE_ENFORCE(ctx.findMemory(op_unique_key+"_filter"), "Should have filter memory"); + PADDLE_ENFORCE(ctx.findPrimitiveDesc(op_unique_key+"_fwd_PD"), "Should have forward PrimitiveDesc"); + ctx.submit(p); + ctx.execute(); // the convert primitive should have already contained. + +``` + +The `createPrimitiveDesc` returns the primitive descripotor of this operator, would be like this: +```c++ + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + algorithm algo = static_cast(ctx.Attr("convolution_algorithm_option")); + prop_kind pk = ctx.Attr("is_test") ? prop_kind::forward_inference : prop_kind::forward_training; + + auto fwd_desc = mkldnn::conv_fwd::desc(/* all the setting above*/); + shared_ptr fwd_primitive_desc(new mkldnn::conv_fwd::primitive_desc(fwd_desc, ctx.getEngine())); + + return fwd_primitive_desc; + } +``` + +### MKLDNNDeviceContext +`MKLDNNDeviceContext`, which is very straightforward, should contain some base information like: `stream`, `engine` and the map needed. + + +### mkldnn_helper +Some functions would be put in `paddle/platform/mkldnn_helper.h`. +- create MKLDNN memories +- create MKLDNN primitives +- error check function +- etc + + +### Kernel Switch +We should `reorder` the different Layout from other device or to other device. `GetExpectedKernelType` and `trans` functions can help us to implement it. + +`GetExpectedKernelType` should get the context, and this operator can return the best `KernelType`. +`trans` would be like this: + +```c++ +void trans(inputs, ctx) override { + if (NoNeedTrasn()) { + return; + } + + // find reorder primitive by op_key from context + auto& p = ctx.findPrimitive(op_key + "_reorder_input"); + auto& i = ctx.findMemory(op_key + "_src_input"); + + if (p == nullptr || i == nullptr || changeSized(i, input)) { + auto prim = createPrimitiveDesc(ctx); + auto src = createMemory(memoryDesc(input->dims(), actual_layout), input->data); + auto newbuffer = paddle::memory::Alloc(ctx.GetPlace(), input->size_in_bytes()); + auto dst = createMemory(p->expected_desc(), newbuffer->data); + auto reorder_primitive(new mkldnn::reorder(src, dst)); + + ctx.addMemory(op_key+"_src_input", src); + ctx.addMemory(op_key+"_input", dst); + ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive); + } + + p = ctx.findPrimitive(op_key + "_reorder_input"); + PADDLE_ENFORCE(p, "Should have Reorder Primitive"); + ctx.submit(p); + if (! this->isMKLDNNKernel()) { + // execute immediately only if this is not mkldnn kernel function. + // otherwise, it can be executed with the operator primitive in Compute + ctx.stream(); + } + // after submit, the input tensor in ctx should be changes as the converted one + // there should be another mechanism to ensure this +} +``` + +### Unit Test +All the functions should be tested corresponding。 +TBD From 544ff7848a3eb34125d5f65cdcf513e3e9f807c2 Mon Sep 17 00:00:00 2001 From: dangqingqing Date: Fri, 22 Dec 2017 17:00:22 +0800 Subject: [PATCH 05/24] Resume the nvprof config. --- python/paddle/v2/fluid/profiler.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py index 5093c269f7..dcecd76224 100644 --- a/python/paddle/v2/fluid/profiler.py +++ b/python/paddle/v2/fluid/profiler.py @@ -9,19 +9,9 @@ NVPROF_CONFIG = [ "gpuendtimestamp", "gridsize3d", "threadblocksize", - "dynsmemperblock", - "stasmemperblock", - "regperthread", - "memtransfersize", - "memtransferdir", - "memtransferhostmemtype", "streamid", - "cacheconfigrequested", - "cacheconfigexecuted", - "countermodeaggregate", "enableonstart 0", - "active_warps", - "active_cycles", + "conckerneltrace", ] From da5422665b78f438f585932246f63795fd7c2cf4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 22 Dec 2017 17:52:18 +0800 Subject: [PATCH 06/24] follow comments and refine --- doc/design/mkl/mkldnn_fluid.md | 79 +++++++++++++++++----------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/doc/design/mkl/mkldnn_fluid.md b/doc/design/mkl/mkldnn_fluid.md index 4ad5b39ebd..bef126f3f0 100644 --- a/doc/design/mkl/mkldnn_fluid.md +++ b/doc/design/mkl/mkldnn_fluid.md @@ -1,25 +1,26 @@ # Design Doc: Add MKLDNN Kernel in Fluid Operator - ## Principles First of all, we should follow some basical principles like: 1. [How to write a new operator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md). We are trying to add a new kind of kernel into operators, so basically we should follow this doc. 2. [Supporting new Device/Library](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/support_new_device.md). Since MKLDNN is a new library to fluid, we should add `MKLDNNDeviceContext` and maybe `mkldnn_helper.h`, just like [cudnn_helper.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/cudnn_helper.h). -3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure the data synchronization among different divices, which is the topic #6549. So basically we should override `GetActualKernelType` and `trans` functions to support switching kernels. +3. [Switch Kernel](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). Another important point is that we should ensure the data synchronization between different kernel types, which is this [topic](https://github.com/PaddlePaddle/Paddle/issues/6549). So basically we should override `GetExpectedKernelType` and `trans` functions to support switching kernels. 4. [The Keys of Operator Kernel Type](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). Kernel Type is a pivotal conception which can record the `Place`, `Library`, `DataType` and `Layout`. ## Sulution -In general, there are three part we should follow to run a MKL-DNN primitive. -- create a primitive descriptor that describe this operator -- create a memory that handle all memory buffers needed -- stream the primitive create by first two +In general, there are four parts we should follow to run a MKL-DNN primitive. +- Create a primitive descriptor that describe this operator +- Create a primitive itself by primitive descriptor and the engine +- Create all memory buffers that primitive needed +- Launch a stream to execute the primitive created +More details can refer to [here](http://01org.github.io/mkl-dnn). -We do not want to see the first two would be re-initialized every iteration again and again. \ +It's better to avoid reinitialization of primitives and memory handles in the first three stages in every iteration. \ So we plan to create a map to record all the `primitive` and `memory`, which should not take too much memories as discussed [here](https://github.com/PaddlePaddle/Paddle/issues/6822). -Assuming that three condition would be confirmed: +It's assumed that following three conditions should be satisfied. 1. there is a unique key for each operator instance. May be the actual name of `Output Tensor`. 2. the `Input Tensor` inside `Compute` function is the one after converted. 3. we can get the phase(eg. `is_test`) inside `Compute` function, otherwise we need to expose this attribue to user. @@ -32,44 +33,42 @@ The algorithm of `Compute` would be described as follow, let's take conv like an PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); PADDLE_ENFORCE(platform::is_mkldnn_library(ctx.GetLibrary()), "It must use MKLDNN Library."); - auto& ctx = executionContext.template device_context(); + auto& dev_ctx = ctx.template device_context(); // find primitive by unique key from mkldnn context // the op_key should be a unique name of this op instance - auto& p = ctx.findPrimitive(op_key + "_fwd"); + auto& p = dev_ctx.findPrimitive(op_key + "_fwd"); // assuming the input tensor inside this compute function is the one after converted // this point should be guarantee by another mechanism - auto& i = ctx.findMemory(op_key + "_input"); + auto& i = dev_ctx.findMemory(op_key + "_input"); if (p == nullptr || i == nullptr || inputSizeChanged(p, i)) { auto fwd_primitive_desc = createPrimitiveDesc(ctx); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); - shared_ptr in(new mkldnn::memory(fwd_primitive_desc->src_primitive_desc(), input->data())); shared_ptr wgt(new mkldnn::memory(fwd_primitive_desc->weights_primitive_desc(), filter->data())); shared_ptr out(new mkldnn::memory(fwd_primitive_desc->dst_primitive_desc(), output->mutable_data(ctx.GetPlace()))); - shared_ptr fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out) - ); - - ctx.addMemory(op_key+"_input", in); - ctx.addMemory(op_key+"_output", out); - ctx.addMemory(op_key+"_filer", wgt); - ctx.addPrimitive(op_key+"_fwd", fwd_primitive); - ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc); + shared_ptr fwd_primitive(new mkldnn::conv_fwd(*fwd_primitive_desc, *in, *wgt, *out)); + + dev_ctx.addMemory(op_key+"_input", in); + dev_ctx.addMemory(op_key+"_output", out); + dev_ctx.addMemory(op_key+"_filer", wgt); + dev_ctx.addPrimitive(op_key+"_fwd", fwd_primitive); + dev_ctx.addPrimitiveDesc(op_key+"_fwd_PD", fwd_primitive_desc); } - p = ctx.findPrimitive(op_key + "_fwd"); + p = dev_ctx.findPrimitive(op_key + "_fwd"); - PADDLE_ENFORCE(p, "Should have Forward Primitive"); - PADDLE_ENFORCE(ctx.findMemory(op_unique_key+"_input"), "Should have input memory"); - PADDLE_ENFORCE(ctx.findMemory(op_unique_key+"_output"), "Should have output memory"); - PADDLE_ENFORCE(ctx.findMemory(op_unique_key+"_filter"), "Should have filter memory"); - PADDLE_ENFORCE(ctx.findPrimitiveDesc(op_unique_key+"_fwd_PD"), "Should have forward PrimitiveDesc"); - ctx.submit(p); - ctx.execute(); // the convert primitive should have already contained. + PADDLE_ENFORCE(p, "Should have forward Primitive"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_input"), "Should have input memory"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_output"), "Should have output memory"); + PADDLE_ENFORCE(dev_ctx.findMemory(op_unique_key+"_filter"), "Should have filter memory"); + PADDLE_ENFORCE(dev_ctx.findPrimitiveDesc(op_unique_key+"_fwd_PD"), "Should have forward PrimitiveDesc"); + dev_ctx.submit(p); + dev_ctx.execute(); // the convert primitive should have already contained. ``` @@ -112,13 +111,13 @@ We should `reorder` the different Layout from other device or to other device. ` ```c++ void trans(inputs, ctx) override { - if (NoNeedTrasn()) { + if (NoNeedTrans()) { return; } - // find reorder primitive by op_key from context - auto& p = ctx.findPrimitive(op_key + "_reorder_input"); - auto& i = ctx.findMemory(op_key + "_src_input"); + auto& dev_ctx = ctx.template device_context(); + auto& p = dev_ctx.findPrimitive(op_key + "_reorder_input"); + auto& i = dev_ctx.findMemory(op_key + "_src_input"); if (p == nullptr || i == nullptr || changeSized(i, input)) { auto prim = createPrimitiveDesc(ctx); @@ -127,24 +126,24 @@ void trans(inputs, ctx) override { auto dst = createMemory(p->expected_desc(), newbuffer->data); auto reorder_primitive(new mkldnn::reorder(src, dst)); - ctx.addMemory(op_key+"_src_input", src); - ctx.addMemory(op_key+"_input", dst); - ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive); + dev_ctx.addMemory(op_key+"_src_input", src); + dev_ctx.addMemory(op_key+"_input", dst); + dev_ctx.addPrimitive(op_key+"_reorder_input", reorder_primitive); } - p = ctx.findPrimitive(op_key + "_reorder_input"); + p = dev_ctx.findPrimitive(op_key + "_reorder_input"); PADDLE_ENFORCE(p, "Should have Reorder Primitive"); - ctx.submit(p); + dev_ctx.submit(p); if (! this->isMKLDNNKernel()) { // execute immediately only if this is not mkldnn kernel function. // otherwise, it can be executed with the operator primitive in Compute - ctx.stream(); + dev_ctx.stream(); } - // after submit, the input tensor in ctx should be changes as the converted one + // after submit, the input tensor in ExecutionContext should be changed as the converted one // there should be another mechanism to ensure this } ``` ### Unit Test -All the functions should be tested corresponding。 +All the functions should be tested corresponding. TBD From 313afc9ccef6c85483e5d9c1255f60a22b43eb3c Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Mon, 25 Dec 2017 00:10:46 +0800 Subject: [PATCH 07/24] add op_kernel_type_test --- paddle/framework/CMakeLists.txt | 2 + paddle/framework/data_layout.h | 21 ++++++++++ paddle/framework/library_type.h | 18 +++++++++ paddle/framework/op_kernel_type.h | 9 +++++ paddle/framework/op_kernel_type_test.cc | 51 +++++++++++++++++++++++++ paddle/framework/operator.cc | 7 ---- paddle/framework/operator.h | 2 - 7 files changed, 101 insertions(+), 9 deletions(-) create mode 100644 paddle/framework/op_kernel_type_test.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index be9c01fb04..5f826aeb83 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -61,3 +61,5 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece) cc_test(init_test SRCS init_test.cc DEPS init) + +cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context) diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h index 7429de7ee3..7d7a444cf0 100644 --- a/paddle/framework/data_layout.h +++ b/paddle/framework/data_layout.h @@ -14,6 +14,9 @@ limitations under the License. */ #pragma once +#include +#include "paddle/platform/enforce.h" + namespace paddle { namespace framework { @@ -33,5 +36,23 @@ inline DataLayout StringToDataLayout(const std::string& str) { } } +inline std::string DataLayoutToString(const DataLayout& data_layout) { + switch (data_layout) { + case kNHWC: + return "NHWC"; + case kNCHW: + return "NCHW"; + case kAnyLayout: + return "ANY_LAYOUT"; + default: + PADDLE_THROW("unknown DataLayou %d", data_layout); + } +} + +inline std::ostream& operator<<(std::ostream& out, DataLayout l) { + out << DataLayoutToString(l); + return out; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h index 49b273656b..aa66cf00f3 100644 --- a/paddle/framework/library_type.h +++ b/paddle/framework/library_type.h @@ -22,5 +22,23 @@ namespace framework { enum LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 }; +inline std::string LibraryTypeToString(const LibraryType& library_type) { + switch (library_type) { + case kPlain: + return "PLAIN"; + case kMKLDNN: + return "MKLDNN"; + case kCUDNN: + return "CUDNN"; + default: + PADDLE_THROW("unknown LibraryType %d", library_type); + } +} + +inline std::ostream& operator<<(std::ostream& out, LibraryType l) { + out << LibraryTypeToString(l); + return out; +} + } // namespace } // framework diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h index a1dea0d9d8..e9c45b958c 100644 --- a/paddle/framework/op_kernel_type.h +++ b/paddle/framework/op_kernel_type.h @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/framework/data_layout.h" #include "paddle/framework/data_type.h" #include "paddle/framework/library_type.h" +#include "paddle/platform/device_context.h" #include "paddle/platform/place.h" namespace paddle { @@ -68,5 +69,13 @@ struct OpKernelType { } }; +inline std::ostream& operator<<(std::ostream& os, + const OpKernelType& kernel_key) { + os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" + << kernel_key.data_layout_ << "]:place[" << kernel_key.place_ + << "]:library_type[" << kernel_key.library_type_ << "]"; + return os; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc new file mode 100644 index 0000000000..899676b5c1 --- /dev/null +++ b/paddle/framework/op_kernel_type_test.cc @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/op_kernel_type.h" +#include +#include + +TEST(OpKernelType, ToString) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + + std::ostringstream stream; + stream << op_kernel_type; + ASSERT_EQ( + stream.str(), + "data_type[5]:data_layout[NCHW]:place[CPUPlace]:library_type[CUDNN]"); +} + +TEST(OpKernelType, Hash) { + using OpKernelType = paddle::framework::OpKernelType; + using DataType = paddle::framework::proto::DataType; + using CPUPlace = paddle::platform::CPUPlace; + using GPUPlace = paddle::platform::GPUPlace; + using DataLayout = paddle::framework::DataLayout; + using LibraryType = paddle::framework::LibraryType; + + OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW, + LibraryType::kCUDNN); + OpKernelType op_kernel_type_2(DataType::FP32, GPUPlace(0), DataLayout::kNCHW, + LibraryType::kCUDNN); + + OpKernelType::Hash hasher; + ASSERT_NE(hasher(op_kernel_type_1), hasher(op_kernel_type_2)); +} \ No newline at end of file diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index 06184f6ba9..f147cc5a6e 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -242,13 +242,6 @@ std::vector ExecutionContext::MultiOutput( return res; } -std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key) { - os << "data_type[" << kernel_key.data_type_ << "]:data_layout[" - << kernel_key.data_layout_ << "]:place[" << kernel_key.place_ - << "]:library_type[" << kernel_key.library_type_ << "]"; - return os; -} - bool OpSupportGPU(const std::string& op_type) { auto& all_kernels = OperatorWithKernel::AllOpKernels(); auto it = all_kernels.find(op_type); diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index aba34c5bcb..b592eea1b9 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -381,8 +381,6 @@ class OperatorWithKernel : public OperatorBase { proto::DataType IndicateDataType(const ExecutionContext& ctx) const; }; -std::ostream& operator<<(std::ostream& os, const OpKernelType& kernel_key); - extern bool OpSupportGPU(const std::string& op_type); } // namespace framework From 9e7c0686770f939bc6a9370724edb1fe491480b1 Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 25 Dec 2017 09:49:24 +0800 Subject: [PATCH 08/24] fix embedding example --- python/paddle/v2/fluid/layers/nn.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 2adce99d05..941675ec3e 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -163,8 +163,9 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): Examples: .. code-block:: python + dict_size = len(dataset.ids) data = fluid.layers.data(name='ids', shape=[32, 32], dtype='float32') - fc = fluid.layers.embedding(input=data, size=16) + fc = fluid.layers.embedding(input=data, size=[dict_size, 16]) """ helper = LayerHelper('embedding', **locals()) From 127bc2e09c3e014a116c2e86f2f8abae8add10e6 Mon Sep 17 00:00:00 2001 From: Yancey Date: Mon, 25 Dec 2017 11:15:33 +0800 Subject: [PATCH 09/24] Implement a simple threadpool (#6684) * implement a simple threadpool * unlock before cv.notify * add done function * add lock with GetAvailable function * delete done_ * using call_once in GetInstance * update by comment * update comment * enhance unit test for multi threads task --- paddle/framework/CMakeLists.txt | 1 + paddle/framework/threadpool.h | 161 ++++++++++++++++++++++++++++ paddle/framework/threadpool_test.cc | 58 ++++++++++ 3 files changed, 220 insertions(+) create mode 100644 paddle/framework/threadpool.h create mode 100644 paddle/framework/threadpool_test.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 5f826aeb83..25a0db2768 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -59,6 +59,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) +cc_test(threadpool_test SRCS threadpool_test.cc) cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece) cc_test(init_test SRCS init_test.cc DEPS init) diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h new file mode 100644 index 0000000000..9a1ece3ae8 --- /dev/null +++ b/paddle/framework/threadpool.h @@ -0,0 +1,161 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/platform/call_once.h" +#include "paddle/platform/enforce.h" + +namespace paddle { +namespace framework { + +typedef std::function Task; + +class ThreadPool { + public: + /** + * @brief Get a instance of threadpool, the thread number will + * be specified as the number of hardware thread contexts + */ + static ThreadPool* GetInstance() { + std::call_once(init_flag, &ThreadPool::Init); + return threadpool.get(); + } + + ~ThreadPool() { + { + // notify all threads to stop running + running_ = false; + scheduled_.notify_all(); + } + + for (auto& t : threads_) { + t->join(); + t.reset(nullptr); + } + } + + int GetNumThreads() const { return num_threads_; } + + int GetAvailable() { + std::unique_lock lock(mutex_); + return available_; + } + + /** + * @brief Push a function to the queue, and will be scheduled and + * executed if a thread is available. + * @param[in] Task will be pushed to the task queue. + */ + void Run(const Task& fn) { + std::unique_lock lock(mutex_); + tasks_.push(fn); + lock.unlock(); + scheduled_.notify_one(); + } + + /** + * @brief Wait until all the tasks are completed. + */ + void Wait() { + std::unique_lock lock(mutex_); + completed_.wait(lock, [=] { return Done() == true; }); + } + + private: + ThreadPool& operator=(const ThreadPool&) = delete; + ThreadPool(const ThreadPool&) = delete; + + ThreadPool(int num_threads) + : num_threads_(num_threads), available_(num_threads), running_(true) { + threads_.resize(num_threads); + for (auto& thread : threads_) { + // TODO(Yancey1989): binding the thread on the specify CPU number + thread.reset(new std::thread(std::bind(&ThreadPool::TaskLoop, this))); + } + } + + /** + * @brief If the task queue is empty and avaialbe + * is equal to the number of threads, means that + * all tasks are completed. + * + * Note: this function is not thread-safe. + * + * @return true if all tasks are completed. + */ + bool Done() { return tasks_.empty() && available_ == num_threads_; } + + void TaskLoop() { + while (running_) { + std::unique_lock lock(mutex_); + scheduled_.wait(lock, [=] { return !tasks_.empty() || !running_; }); + + if (!running_) { + break; + } + // pop a task from the task queue + auto task = tasks_.front(); + tasks_.pop(); + + --available_; + lock.unlock(); + + // run the task + task(); + + { + std::unique_lock lock(mutex_); + ++available_; + if (Done()) { + completed_.notify_all(); + } + } + } + } + + static void Init() { + if (threadpool.get() == nullptr) { + // TODO(Yancey1989): specify the max threads number + int num_threads = std::thread::hardware_concurrency(); + PADDLE_ENFORCE_GT(num_threads, 0); + threadpool.reset(new ThreadPool(num_threads)); + } + } + + private: + static std::unique_ptr threadpool; + static std::once_flag init_flag; + + int num_threads_; + int available_; + bool running_; + std::queue tasks_; + std::vector> threads_; + std::mutex mutex_; + std::condition_variable scheduled_; + std::condition_variable completed_; +}; + +std::unique_ptr ThreadPool::threadpool(nullptr); +std::once_flag ThreadPool::init_flag; +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc new file mode 100644 index 0000000000..78c762608e --- /dev/null +++ b/paddle/framework/threadpool_test.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "threadpool.h" +#include +#include +#include +#include +#include + +namespace framework = paddle::framework; + +void do_sum(framework::ThreadPool* pool, std::atomic& sum, int cnt) { + for (int i = 0; i < cnt; ++i) { + pool->Run([&sum]() { sum.fetch_add(1); }); + } +} + +TEST(ThreadPool, ConcurrentInit) { + framework::ThreadPool* pool; + int concurrent_cnt = 50; + std::vector threads; + for (int i = 0; i < concurrent_cnt; ++i) { + std::thread t([&pool]() { pool = framework::ThreadPool::GetInstance(); }); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } +} + +TEST(ThreadPool, ConcurrentStart) { + framework::ThreadPool* pool = framework::ThreadPool::GetInstance(); + std::atomic sum(0); + std::vector threads; + int concurrent_cnt = 50; + // sum = (n * (n + 1)) / 2 + for (int i = 1; i <= concurrent_cnt; ++i) { + std::thread t(do_sum, pool, std::ref(sum), i); + threads.push_back(std::move(t)); + } + for (auto& t : threads) { + t.join(); + } + pool->Wait(); + EXPECT_EQ(sum, ((concurrent_cnt + 1) * concurrent_cnt) / 2); +} From 39ef5736c2f796079403e2483930236a2be45444 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Mon, 25 Dec 2017 12:15:54 +0800 Subject: [PATCH 10/24] Use tranform to rewrite adam --- paddle/operators/adam_op.h | 132 ++++++++++++++++++++++++++----------- 1 file changed, 93 insertions(+), 39 deletions(-) diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 45157842a6..887258530c 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -13,59 +13,113 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "paddle/framework/eigen.h" +#include // for sqrt in CPU and CUDA #include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/safe_ref.h" +#include "paddle/platform/transform.h" namespace paddle { namespace operators { +template +struct AdamFunctor { + T beta1_; + T beta2_; + T epsilon_; + + const T* beta1_pow_; + const T* beta2_pow_; + const T* moment1_; + T* moment1_out_; + const T* moment2_; + T* moment2_out_; + const T* lr_; + const T* grad_; + const T* param_; + + AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, + const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, + T* mom2_out, const T* lr, const T* grad, const T* param) + : beta1_(beta1), + beta2_(beta2), + epsilon_(epsilon), + beta1_pow_(beta1_pow), + beta2_pow_(beta2_pow), + moment1_(mom1), + moment1_out_(mom1_out), + moment2_(mom2), + moment2_out_(mom2_out), + lr_(lr), + grad_(grad), + param_(param) {} + + // From param[i] --> param_out[i]; + inline HOSTDEVICE T operator()(const T& p) const { + size_t i = &p - param_; + // Merge all memory access together. + T g = grad_[i]; + T mom1 = moment1_[i]; + T mom2 = moment2_[i]; + T lr = *lr_; + T beta1_pow = *beta1_pow_; + T beta2_pow = *beta2_pow_; + + // Calculation + lr = lr * sqrt(1 - beta2_pow) / (1 - beta1_pow); + mom1 = beta1_ * mom1 + (1 - beta1_) * g; + mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; + T new_p = p - lr * (mom1 / (sqrt(mom2) + epsilon_)); + + // Write back to global memory + moment1_out_[i] = mom1; + moment2_out_[i] = mom2; + return new_p; + } +}; + template class AdamOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out_tensor = ctx.Output("ParamOut"); - auto moment1_out_tensor = ctx.Output("Moment1Out"); - auto moment2_out_tensor = ctx.Output("Moment2Out"); - - param_out_tensor->mutable_data(ctx.GetPlace()); - moment1_out_tensor->mutable_data(ctx.GetPlace()); - moment2_out_tensor->mutable_data(ctx.GetPlace()); + using paddle::framework::LoDTensor; + using paddle::operators::detail::Ref; T beta1 = static_cast(ctx.Attr("beta1")); T beta2 = static_cast(ctx.Attr("beta2")); T epsilon = static_cast(ctx.Attr("epsilon")); + auto& param = Ref(ctx.Input("Param"), "Must set Param"); + auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); + auto& mom1 = Ref(ctx.Input("Moment1"), "Must set Moment1"); + auto& mom2 = Ref(ctx.Input("Moment2"), "Must set Moment2"); + auto& lr = + Ref(ctx.Input("LearningRate"), "Must set LearningRate"); + + auto& beta1_pow = + Ref(ctx.Input("Beta1Pow"), "Must set Beta1Pow"); + auto& beta2_pow = + Ref(ctx.Input("Beta2Pow"), "Must set Beta2Pow"); + + auto& param_out = + Ref(ctx.Output("ParamOut"), "Must set ParamOut"); + auto& mom1_out = + Ref(ctx.Output("Moment1Out"), "Must set Moment1Out"); + auto& mom2_out = + Ref(ctx.Output("Moment2Out"), "Must set Moment1Out"); + + AdamFunctor functor(beta1, beta2, epsilon, beta1_pow.template data(), + beta2_pow.template data(), + mom1.template data(), + mom1_out.template mutable_data(ctx.GetPlace()), + mom2.template data(), + mom2_out.template mutable_data(ctx.GetPlace()), + lr.template data(), grad.template data(), + param.template data()); - auto param = framework::EigenVector::Flatten( - *ctx.Input("Param")); - auto grad = framework::EigenVector::Flatten( - *ctx.Input("Grad")); - auto moment1 = framework::EigenVector::Flatten( - *ctx.Input("Moment1")); - auto moment2 = framework::EigenVector::Flatten( - *ctx.Input("Moment2")); - auto lr = framework::EigenVector::Flatten( - *ctx.Input("LearningRate")); - auto beta1_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta1Pow")); - auto beta2_pow = framework::EigenVector::Flatten( - *ctx.Input("Beta2Pow")); - auto param_out = framework::EigenVector::Flatten(*param_out_tensor); - auto moment1_out = framework::EigenVector::Flatten(*moment1_out_tensor); - auto moment2_out = framework::EigenVector::Flatten(*moment2_out_tensor); - auto* place = ctx.template device_context().eigen_device(); - - moment1_out.device(*place) = beta1 * moment1 + (1 - beta1) * grad; - moment2_out.device(*place) = beta2 * moment2 + (1 - beta2) * grad.square(); - - // All of these are tensors of 1 element - auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow); - // Eigen does not support automatic broadcast - // Get dimensions of moment vector to broadcast lr_t - Eigen::DSizes m_dsize(moment1_out_tensor->numel()); - param_out.device(*place) = - param - - lr_t.broadcast(m_dsize) * - (moment1_out / (moment2_out.sqrt() + epsilon)); + const T* in_ptr = param.template data(); + T* out_ptr = param_out.template mutable_data(ctx.GetPlace()); + platform::Transform trans; + trans(static_cast(ctx.device_context()), in_ptr, + in_ptr + param_out.numel(), out_ptr, functor); } }; From 0d2235aadf87a22773d6ffe8322126715f42d3aa Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 25 Dec 2017 12:44:31 +0800 Subject: [PATCH 11/24] GPUPlace to CUDAPlace (#6960) --- paddle/framework/init.cc | 2 +- paddle/framework/lod_tensor.cc | 2 +- paddle/framework/lod_tensor_test.cu | 2 +- paddle/framework/op_kernel_type_test.cc | 4 +-- paddle/framework/op_registry.h | 2 +- paddle/framework/tensor.md | 2 +- paddle/framework/tensor_impl.h | 6 ++-- paddle/framework/tensor_test.cc | 16 +++++------ paddle/framework/tensor_util.h | 18 ++++++------ paddle/framework/tensor_util_test.cc | 6 ++-- paddle/memory/README.md | 10 +++---- paddle/memory/memcpy.cc | 24 ++++++---------- paddle/memory/memory.cc | 8 +++--- paddle/memory/memory_test.cc | 6 ++-- paddle/operators/accuracy_op.cu | 2 +- paddle/operators/batch_norm_op.cu.cc | 4 +-- paddle/operators/conv_cudnn_op.cu.cc | 8 +++--- .../operators/conv_transpose_cudnn_op.cu.cc | 8 +++--- paddle/operators/detail/strided_memcpy.h | 2 +- paddle/operators/linear_chain_crf_op.h | 8 +++--- paddle/operators/lookup_table_op.cu | 2 +- paddle/operators/lstm_unit_op.cu | 4 +-- paddle/operators/math/im2col_test.cc | 3 +- paddle/operators/math/math_function.cu | 8 +++--- paddle/operators/math/math_function_test.cu | 10 +++---- .../operators/math/selected_rows_functor.cu | 12 ++++---- .../math/selected_rows_functor_test.cu | 4 +-- paddle/operators/math/vol2col_test.cc | 2 +- paddle/operators/multiplex_op.cu | 4 +-- paddle/operators/nccl_op.cu.cc | 6 ++-- paddle/operators/nccl_op_test.cu.cc | 18 ++++++------ paddle/operators/pool_cudnn_op.cu.cc | 4 +-- paddle/operators/reshape_op.cu | 4 +-- paddle/operators/strided_memcpy_test.cc | 4 +-- paddle/operators/top_k_op.cu | 2 +- paddle/platform/device_context.cc | 10 +++---- paddle/platform/device_context.h | 6 ++-- paddle/platform/device_context_test.cu | 18 ++++++------ paddle/platform/nccl_test.cu | 4 +-- paddle/platform/place.cc | 8 ++++-- paddle/platform/place.h | 28 ++++++++++--------- paddle/platform/place_test.cc | 6 ++-- paddle/platform/transform_test.cu | 4 +-- paddle/pybind/pybind.cc | 16 +++++------ paddle/pybind/tensor_py.h | 4 +-- paddle/testing/paddle_gtest_main.cc | 2 +- python/paddle/v2/fluid/__init__.py | 4 +-- python/paddle/v2/fluid/executor.py | 2 +- .../tests/book/test_recommender_system.py | 2 +- python/paddle/v2/fluid/tests/op_test.py | 4 +-- .../paddle/v2/fluid/tests/test_adagrad_op.py | 2 +- .../v2/fluid/tests/test_batch_norm_op.py | 4 +-- .../v2/fluid/tests/test_gaussian_random_op.py | 2 +- python/paddle/v2/fluid/tests/test_profiler.py | 2 +- python/paddle/v2/fluid/tests/test_sgd_op.py | 2 +- .../v2/fluid/tests/test_uniform_random_op.py | 2 +- 56 files changed, 179 insertions(+), 180 deletions(-) diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc index 4deb4fa903..3ff2da3446 100644 --- a/paddle/framework/init.cc +++ b/paddle/framework/init.cc @@ -54,7 +54,7 @@ bool InitDevices(const std::vector &devices) { #ifdef PADDLE_WITH_CUDA auto pos = string::RFind(p, ':', string::Piece::npos); auto number = device.substr(pos + 1); - places.emplace_back(platform::GPUPlace(std::stoi(number))); + places.emplace_back(platform::CUDAPlace(std::stoi(number))); #else LOG(WARNING) << "'GPU' is not supported, Please re-compile with WITH_GPU option"; diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index 465f8c62b5..d766d3c416 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -224,7 +224,7 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, while (size != 0) { size_t size_to_write = std::min(kBufSize, static_cast(size)); memory::Copy(cpu, buf.get(), - boost::get(tensor.place()), + boost::get(tensor.place()), reinterpret_cast(data), size_to_write, gpu_dev_ctx.stream()); gpu_dev_ctx.Wait(); diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu index 5b90fbfca7..e8508ad265 100644 --- a/paddle/framework/lod_tensor_test.cu +++ b/paddle/framework/lod_tensor_test.cu @@ -27,7 +27,7 @@ __global__ void test(size_t* a, int size) { TEST(LoDTensor, LoDInGPU) { paddle::framework::LoDTensor lod_tensor; - paddle::platform::GPUPlace place(0); + paddle::platform::CUDAPlace place(0); paddle::framework::LoD src_lod; src_lod.push_back(std::vector{0, 2, 4, 6, 8, 10, 12, 14}); diff --git a/paddle/framework/op_kernel_type_test.cc b/paddle/framework/op_kernel_type_test.cc index 899676b5c1..8753d7cc37 100644 --- a/paddle/framework/op_kernel_type_test.cc +++ b/paddle/framework/op_kernel_type_test.cc @@ -37,13 +37,13 @@ TEST(OpKernelType, Hash) { using OpKernelType = paddle::framework::OpKernelType; using DataType = paddle::framework::proto::DataType; using CPUPlace = paddle::platform::CPUPlace; - using GPUPlace = paddle::platform::GPUPlace; + using CUDAPlace = paddle::platform::CUDAPlace; using DataLayout = paddle::framework::DataLayout; using LibraryType = paddle::framework::LibraryType; OpKernelType op_kernel_type_1(DataType::FP32, CPUPlace(), DataLayout::kNCHW, LibraryType::kCUDNN); - OpKernelType op_kernel_type_2(DataType::FP32, GPUPlace(0), DataLayout::kNCHW, + OpKernelType op_kernel_type_2(DataType::FP32, CUDAPlace(0), DataLayout::kNCHW, LibraryType::kCUDNN); OpKernelType::Hash hasher; diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 244c117465..9bb2a3b5c2 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -188,7 +188,7 @@ class OpKernelRegistrar : public Registrar { } #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ - REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::GPUPlace, __VA_ARGS__) + REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) diff --git a/paddle/framework/tensor.md b/paddle/framework/tensor.md index 7a80816d8e..0a27ac9bb6 100644 --- a/paddle/framework/tensor.md +++ b/paddle/framework/tensor.md @@ -71,7 +71,7 @@ private: ``` ```c++ -typedef boost::variant Place; +typedef boost::variant Place; typedef boost::variant, Dim<2>, Dim<3>, Dim<4>, Dim<5>, Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar; typedef boost::variant< diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index aba1f9f093..3d93b7808b 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -125,11 +125,11 @@ inline void* Tensor::mutable_data(platform::Place place, std::type_index type) { boost::get(place), size, type)); } else if (platform::is_gpu_place(place)) { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); } #else - holder_.reset(new PlaceholderImpl( - boost::get(place), size, type)); + holder_.reset(new PlaceholderImpl( + boost::get(place), size, type)); } #endif offset_ = 0; diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index ceca64365a..f347981f2e 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -80,20 +80,20 @@ TEST(Tensor, MutableData) { float* p1 = nullptr; float* p2 = nullptr; // initialization - p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), GPUPlace()); + p1 = src_tensor.mutable_data(make_ddim({1, 2, 3}), CUDAPlace()); EXPECT_NE(p1, nullptr); // set src_tensor a new dim with large size // momery is supposed to be re-allocated - p2 = src_tensor.mutable_data(make_ddim({3, 4}), GPUPlace()); + p2 = src_tensor.mutable_data(make_ddim({3, 4}), CUDAPlace()); EXPECT_NE(p2, nullptr); EXPECT_NE(p1, p2); // set src_tensor a new dim with same size // momery block is supposed to be unchanged - p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), GPUPlace()); + p1 = src_tensor.mutable_data(make_ddim({2, 2, 3}), CUDAPlace()); EXPECT_EQ(p1, p2); // set src_tensor a new dim with smaller size // momery block is supposed to be unchanged - p2 = src_tensor.mutable_data(make_ddim({2, 2}), GPUPlace()); + p2 = src_tensor.mutable_data(make_ddim({2, 2}), CUDAPlace()); EXPECT_EQ(p1, p2); } #endif @@ -130,7 +130,7 @@ TEST(Tensor, ShareDataWith) { { Tensor src_tensor; Tensor dst_tensor; - src_tensor.mutable_data(make_ddim({2, 3, 4}), GPUPlace()); + src_tensor.mutable_data(make_ddim({2, 3, 4}), CUDAPlace()); dst_tensor.ShareDataWith(src_tensor); ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } @@ -166,7 +166,7 @@ TEST(Tensor, Slice) { #ifdef PADDLE_WITH_CUDA { Tensor src_tensor; - src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); + src_tensor.mutable_data(make_ddim({6, 9}), CUDAPlace()); Tensor slice_tensor = src_tensor.Slice(2, 6); DDim slice_dims = slice_tensor.dims(); ASSERT_EQ(arity(slice_dims), 2); @@ -176,11 +176,11 @@ TEST(Tensor, Slice) { uintptr_t src_data_address = reinterpret_cast(src_tensor.data()); uintptr_t src_mutable_data_address = reinterpret_cast( - src_tensor.mutable_data(src_tensor.dims(), GPUPlace())); + src_tensor.mutable_data(src_tensor.dims(), CUDAPlace())); uintptr_t slice_data_address = reinterpret_cast(slice_tensor.data()); uintptr_t slice_mutable_data_address = reinterpret_cast( - slice_tensor.mutable_data(slice_tensor.dims(), GPUPlace())); + slice_tensor.mutable_data(slice_tensor.dims(), CUDAPlace())); EXPECT_EQ(src_data_address, src_mutable_data_address); EXPECT_EQ(slice_data_address, slice_mutable_data_address); EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address); diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index 4e34b90d57..5b474e4aef 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -47,11 +47,11 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(src_place) && // NOLINT platform::is_cpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); + auto src_gpu_place = boost::get(src_place); auto dst_cpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); + auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); memory::Copy( dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, @@ -59,21 +59,21 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, } else if (platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place)) { auto src_cpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); + auto dst_gpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); + auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); memory::Copy( dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, reinterpret_cast(ctx).stream()); } else if (platform::is_gpu_place(src_place) && platform::is_gpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); + auto src_gpu_place = boost::get(src_place); + auto dst_gpu_place = boost::get(dst_place); auto ctx_place = ctx.GetPlace(); PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); + auto ctx_gpu_place = boost::get(ctx_place); PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); memory::Copy( dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, @@ -108,7 +108,7 @@ inline void CopyFromVector(const std::vector& src, #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(dst_place)) { // NOLINT memory::Copy( - boost::get(dst_place), dst_ptr, src_place, src_ptr, + boost::get(dst_place), dst_ptr, src_place, src_ptr, size, reinterpret_cast(ctx).stream()); } @@ -141,7 +141,7 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx, #ifdef PADDLE_WITH_CUDA else if (platform::is_gpu_place(src.place())) { // NOLINT memory::Copy( - dst_place, dst_ptr, boost::get(src.place()), + dst_place, dst_ptr, boost::get(src.place()), src_ptr, size, reinterpret_cast(ctx).stream()); } diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index 03a70de182..3afb98a4a5 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -58,7 +58,7 @@ TEST(CopyFrom, Tensor) { memcpy(src_ptr, arr, 9 * sizeof(int)); // CPU Tensor to GPU Tensor - auto gpu_place = new platform::GPUPlace(0); + auto gpu_place = new platform::CUDAPlace(0); platform::CUDADeviceContext gpu_ctx(*gpu_place); CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor); @@ -143,7 +143,7 @@ TEST(CopyFromVector, Tensor) { // Copy to GPUTensor gpu_tensor.Resize(make_ddim({3, 3})); - auto gpu_place = new paddle::platform::GPUPlace(); + auto gpu_place = new paddle::platform::CUDAPlace(); CUDADeviceContext gpu_ctx(*gpu_place); CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); // Copy from GPU to CPU tensor for comparison @@ -210,7 +210,7 @@ TEST(CopyToVector, Tensor) { { std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; Tensor gpu_tensor; - GPUPlace place; + CUDAPlace place; CUDADeviceContext gpu_ctx(place); CopyFromVector(src_vec, gpu_ctx, &gpu_tensor); diff --git a/paddle/memory/README.md b/paddle/memory/README.md index 6cb003c50b..7cf61d089b 100644 --- a/paddle/memory/README.md +++ b/paddle/memory/README.md @@ -12,13 +12,13 @@ p = memory::Alloc(platform::CPUPlace(), 4*1024); To allocate 4KB memory on the 3rd GPU: ```cpp -p = memory::Alloc(platform::GPUPlace(2), 4*1024); +p = memory::Alloc(platform::CUDAPlace(2), 4*1024); ``` To free memory and check the so-far used amount of memory on a place: ```cpp -auto pl = platform::GPUPlace(0); +auto pl = platform::CUDAPlace(0); p = memory::Alloc(pl, 4*1024); cout << memory::Used(pl); memory::Free(pl, p); @@ -36,7 +36,7 @@ template size_t Used(Place); } // namespace memory ``` -These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`: +These function templates have specializations on either `platform::CPUPlace` or `platform::CUDAPlace`: ```cpp template<> @@ -49,7 +49,7 @@ and ```cpp template<> -void Alloc(GPUPlace p, size_t size) { +void Alloc(CUDAPlace p, size_t size) { return GetGPUBuddyAllocator(p.id)->Alloc(size); } ``` @@ -122,7 +122,7 @@ There are two implementations of `Context`: 1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory. -1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. +1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::CUDAPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. ### Majel diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index 5c629dc3d2..b46141aafd 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -28,31 +28,25 @@ void Copy(platform::CPUPlace, void* dst, #ifdef PADDLE_WITH_CUDA template <> -void Copy(platform::CPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { +void Copy( + platform::CPUPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::CPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CPUPlace src_place, + const void* src, size_t num, cudaStream_t stream) { platform::SetDeviceId(dst_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } template <> -void Copy(platform::GPUPlace dst_place, - void* dst, - platform::GPUPlace src_place, - const void* src, size_t num, - cudaStream_t stream) { +void Copy( + platform::CUDAPlace dst_place, void* dst, platform::CUDAPlace src_place, + const void* src, size_t num, cudaStream_t stream) { if (dst_place == src_place) { platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 9cafdfda75..c4bb6baee7 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -83,12 +83,12 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { } template <> -size_t Used(platform::GPUPlace place) { +size_t Used(platform::CUDAPlace place) { return GetGPUBuddyAllocator(place.device)->Used(); } template <> -void* Alloc(platform::GPUPlace place, size_t size) { +void* Alloc(platform::CUDAPlace place, size_t size) { auto* buddy_allocator = GetGPUBuddyAllocator(place.device); auto* ptr = buddy_allocator->Alloc(size); if (ptr == nullptr) { @@ -101,14 +101,14 @@ void* Alloc(platform::GPUPlace place, size_t size) { LOG(WARNING) << "total " << total; LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); - LOG(WARNING) << "GPU memory used: " << Used(place); + LOG(WARNING) << "GPU memory used: " << Used(place); platform::SetDeviceId(cur_dev); } return ptr; } template <> -void Free(platform::GPUPlace place, void* p) { +void Free(platform::CUDAPlace place, void* p) { GetGPUBuddyAllocator(place.device)->Free(p); } diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc index 2444931e26..f476bf7126 100644 --- a/paddle/memory/memory_test.cc +++ b/paddle/memory/memory_test.cc @@ -82,7 +82,7 @@ TEST(BuddyAllocator, CPUMultAlloc) { #ifdef PADDLE_WITH_CUDA -size_t align(size_t size, paddle::platform::GPUPlace place) { +size_t align(size_t size, paddle::platform::CUDAPlace place) { size += sizeof(paddle::memory::detail::Metadata); size_t alignment = paddle::platform::GpuMinChunkSize(); size_t remaining = size % alignment; @@ -94,7 +94,7 @@ TEST(BuddyAllocator, GPUAllocation) { EXPECT_EQ(p, nullptr); - paddle::platform::GPUPlace gpu(0); + paddle::platform::CUDAPlace gpu(0); p = paddle::memory::Alloc(gpu, 4096); EXPECT_NE(p, nullptr); @@ -103,7 +103,7 @@ TEST(BuddyAllocator, GPUAllocation) { } TEST(BuddyAllocator, GPUMultAlloc) { - paddle::platform::GPUPlace gpu; + paddle::platform::CUDAPlace gpu; std::unordered_map ps; diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu index dd51aad105..0aadd5af41 100644 --- a/paddle/operators/accuracy_op.cu +++ b/paddle/operators/accuracy_op.cu @@ -56,7 +56,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* inference = ctx.Input("Out"); auto* indices = ctx.Input("Indices"); auto* label = ctx.Input("Label"); diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index 55d0736a4c..3d17725ab4 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -53,7 +53,7 @@ class BatchNormKernel public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); const float momentum = ctx.Attr("momentum"); const bool is_test = ctx.Attr("is_test"); @@ -179,7 +179,7 @@ class BatchNormGradKernel public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); double epsilon = static_cast(ctx.Attr("epsilon")); const std::string data_layout_str = ctx.Attr("data_layout"); const DataLayout data_layout = diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc index 3da0a9001a..79e020b755 100644 --- a/paddle/operators/conv_cudnn_op.cu.cc +++ b/paddle/operators/conv_cudnn_op.cu.cc @@ -36,7 +36,7 @@ class CudnnConvOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); @@ -130,7 +130,7 @@ class CudnnConvOpKernel : public framework::OpKernel { handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, algo, &workspace_size_in_bytes)); // Allocate on GPU memory - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv forward --------------------- T alpha = 1.0f, beta = 0.0f; @@ -151,7 +151,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); @@ -277,7 +277,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace --------------------- // Already on GPU void* cudnn_workspace = nullptr; - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- T alpha = 1.0f, beta = 0.0f; diff --git a/paddle/operators/conv_transpose_cudnn_op.cu.cc b/paddle/operators/conv_transpose_cudnn_op.cu.cc index f0297f6c40..b3663209ff 100644 --- a/paddle/operators/conv_transpose_cudnn_op.cu.cc +++ b/paddle/operators/conv_transpose_cudnn_op.cu.cc @@ -35,7 +35,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* input = ctx.Input("Input"); auto* filter = ctx.Input("Filter"); auto* output = ctx.Output("Output"); @@ -100,7 +100,7 @@ class CudnnConvTransposeOpKernel : public framework::OpKernel { cudnn_output_desc, algo, &workspace_size_in_bytes)); // Allocate on GPU memory - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv transpose forward --------------------- @@ -120,7 +120,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto input = ctx.Input("Input"); auto filter = ctx.Input("Filter"); auto output_grad = ctx.Input(framework::GradVarName("Output")); @@ -201,7 +201,7 @@ class CudnnConvTransposeGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv workspace --------------------- // Already on GPU void* cudnn_workspace = nullptr; - platform::GPUPlace gpu = boost::get(ctx.GetPlace()); + platform::CUDAPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- // FIXME(typhoonzero): template type T may not be the same as cudnn call. diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/operators/detail/strided_memcpy.h index 068c82f399..b81bb8ba7e 100644 --- a/paddle/operators/detail/strided_memcpy.h +++ b/paddle/operators/detail/strided_memcpy.h @@ -35,7 +35,7 @@ struct StridedMemcpyFunctor { memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head); } else { #ifdef PADDLE_WITH_CUDA - auto& gpu_place = boost::get(place); + auto& gpu_place = boost::get(place); auto& cuda_ctx = reinterpret_cast(dev_ctx); memory::Copy(gpu_place, dst, gpu_place, src, sizeof(T) * dst_dim.head, diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index 694584e79c..19c6715ec8 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -219,8 +219,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { // operators runs on GPU device. auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { - dst->mutable_data(platform::GPUPlace()); - framework::CopyFrom(src, platform::GPUPlace(), ctx, dst); + dst->mutable_data(platform::CUDAPlace()); + framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -433,8 +433,8 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor* src, Tensor* dst) { if (src && dst) { - dst->mutable_data(platform::GPUPlace()); - framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst); + dst->mutable_data(platform::CUDAPlace()); + framework::CopyFrom(*src, platform::CUDAPlace(), ctx, dst); } }; copyTensor(ctx, emission_grad_src, emission_grad_dst); diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu index 9431030a53..a3ab1a7297 100644 --- a/paddle/operators/lookup_table_op.cu +++ b/paddle/operators/lookup_table_op.cu @@ -101,7 +101,7 @@ class LookupTableGradCUDAKernel : public framework::OpKernel { // copy GPU memory to CPU pinned memory framework::Vector new_rows; new_rows.resize(ids_dim[0]); - auto gpu_place = boost::get(context.GetPlace()); + auto gpu_place = boost::get(context.GetPlace()); memory::Copy(platform::CPUPlace(), new_rows.data(), gpu_place, ids_data, ids_dim[0] * sizeof(int64_t), stream); diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu index 291f2c295e..4b164d964c 100644 --- a/paddle/operators/lstm_unit_op.cu +++ b/paddle/operators/lstm_unit_op.cu @@ -98,7 +98,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* x_tensor = ctx.Input("X"); auto* c_prev_tensor = ctx.Input("C_prev"); @@ -129,7 +129,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto x_tensor = ctx.Input("X"); auto c_prev_tensor = ctx.Input("C_prev"); diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 256f3bc9bd..26c038e435 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -159,6 +159,7 @@ void testIm2col() { TEST(math, im2col) { testIm2col(); #ifdef PADDLE_WITH_CUDA - testIm2col(); + testIm2col(); #endif } diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 7852bb53a9..0a818bc5d4 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -105,7 +105,7 @@ void matmul( PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && platform::is_gpu_place(matrix_b.place()) && platform::is_gpu_place(matrix_out->place()), - "Matrix must all be in GPUPlace"); + "Matrix must all be in CUDAPlace"); int M = dim_out[0]; int N = dim_out[1]; @@ -134,7 +134,7 @@ void matmul( PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && platform::is_gpu_place(matrix_b.place()) && platform::is_gpu_place(matrix_out->place()), - "Matrix must all be in GPUPlace"); + "Matrix must all be in CUDAPlace"); int M = dim_out[0]; int N = dim_out[1]; @@ -266,7 +266,7 @@ struct TensorSetConstantGPU { }; template <> -void set_constant_with_place( +void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { framework::VisitDataType(framework::ToDataType(tensor->type()), @@ -277,7 +277,7 @@ template <> void set_constant_with_place( const platform::DeviceContext& context, framework::Tensor* tensor, float value) { - set_constant_with_place(context, tensor, value); + set_constant_with_place(context, tensor, value); } template struct RowwiseAdd; diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu index 32e96d9487..4325a79664 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -13,7 +13,7 @@ TEST(math_function, notrans_mul_trans) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); @@ -47,7 +47,7 @@ TEST(math_function, trans_mul_notrans) { float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); @@ -96,7 +96,7 @@ TEST(math_function, gemm_notrans_cublas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); @@ -151,7 +151,7 @@ TEST(math_function, gemm_trans_cublas) { float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); @@ -189,7 +189,7 @@ void GemvTest(int m, int n, bool trans) { T* data_b = vec_b.mutable_data({trans ? m : n}, *cpu_place); T* data_c = vec_c.mutable_data({trans ? n : m}, *cpu_place); - auto* gpu_place = new paddle::platform::GPUPlace(0); + auto* gpu_place = new paddle::platform::CUDAPlace(0); paddle::framework::Tensor g_mat_a; paddle::framework::Tensor g_vec_b; paddle::framework::Tensor g_vec_c; diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index c44577e00a..9fddd97a36 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -58,15 +58,15 @@ struct SelectedRowsAdd { PADDLE_ENFORCE(platform::is_gpu_place(out_place)); memory::Copy( - boost::get(out_place), out_data, - boost::get(in1_place), in1_data, + boost::get(out_place), out_data, + boost::get(in1_place), in1_data, in1_value.numel() * sizeof(T), reinterpret_cast(context).stream()); auto* in2_data = in2_value.data(); - memory::Copy(boost::get(out_place), + memory::Copy(boost::get(out_place), out_data + in1_value.numel(), - boost::get(in2_place), in2_data, + boost::get(in2_place), in2_data, in2_value.numel() * sizeof(T), context.stream()); } }; @@ -160,9 +160,9 @@ struct SelectedRowsAddTo { auto* in1_data = in1_value.data(); auto* in2_data = in2_value->data(); - memory::Copy(boost::get(in2_place), + memory::Copy(boost::get(in2_place), in2_data + input2_offset, - boost::get(in1_place), in1_data, + boost::get(in1_place), in1_data, in1_value.numel() * sizeof(T), context.stream()); } }; diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 777caf5635..0a2e36f68a 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -21,7 +21,7 @@ TEST(selected_rows_functor, gpu_add) { using namespace paddle::platform; using namespace paddle::operators::math; - GPUPlace gpu_place(0); + CUDAPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); SetConstant functor; @@ -119,7 +119,7 @@ TEST(selected_rows_functor, gpu_add_to) { using namespace paddle::platform; using namespace paddle::operators::math; - GPUPlace gpu_place(0); + CUDAPlace gpu_place(0); CPUPlace cpu_place; CUDADeviceContext ctx(gpu_place); SetConstant functor; diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index f46db3c567..3794f0e52d 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -122,6 +122,6 @@ TEST(math, vol2col) { testVol2col(); #ifdef PADDLE_WITH_CUDA testVol2col(); + paddle::platform::CUDAPlace>(); #endif // PADDLE_WITH_CUDA } diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 47986e9ff8..57e6880b4e 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -36,7 +36,7 @@ class MultiplexGPUKernel : public framework::OpKernel { CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - platform::GPUPlace place = boost::get(ctx.GetPlace()); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE(k, 0, "index must be nonnegative."); @@ -73,7 +73,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); - platform::GPUPlace place = boost::get(ctx.GetPlace()); + platform::CUDAPlace place = boost::get(ctx.GetPlace()); for (auto i = 0; i < rows; i++) { size_t k = static_cast(index[i]); if (d_ins[k]) { diff --git a/paddle/operators/nccl_op.cu.cc b/paddle/operators/nccl_op.cu.cc index 6ca6db7253..1b986a1365 100644 --- a/paddle/operators/nccl_op.cu.cc +++ b/paddle/operators/nccl_op.cu.cc @@ -67,7 +67,7 @@ class NCCLAllReduceKernel : public framework::OpKernel { auto stream = ctx.cuda_device_context().stream(); // device id - int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); for (size_t i = 0; i < ins.size(); ++i) { @@ -120,7 +120,7 @@ class NCCLReduceKernel : public framework::OpKernel { ctx.device_context()) .stream(); // device id - int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); auto ins_names = ctx.Inputs("X"); @@ -164,7 +164,7 @@ class NCCLBcastKernel : public framework::OpKernel { ctx.device_context()) .stream(); // device id - int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); + int gpu_id = boost::get(ctx.GetPlace()).GetDeviceId(); int idx = comm->GetCommId(gpu_id); if (idx == root) { diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index b6e4ccb73f..361bfa8d75 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -52,7 +52,7 @@ class NCCLTester : public ::testing::Test { virtual void SetUp() override { paddle::platform::CPUPlace cpu_place; for (size_t i = 0; i < gpu_list.size(); ++i) { - p::GPUPlace place(i); + p::CUDAPlace place(i); dev_ctxs.emplace_back(new p::CUDADeviceContext(place)); } @@ -87,7 +87,7 @@ class NCCLTester : public ::testing::Test { std::unique_lock lk(mu); const f::OpDesc *op1 = &op_desc; - p::GPUPlace place(gpu_id); + p::CUDAPlace place(gpu_id); auto &ctx = dev_ctxs.at(gpu_id); auto *send_tensor = scope->Var("st")->GetMutable(); @@ -171,7 +171,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { for (size_t i = 0; i < dev_scopes.size(); ++i) { p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[i]); + p::CUDAPlace gpu_place(gpu_list[i]); auto &recv_tensor = dev_scopes[i]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -180,7 +180,7 @@ TEST_F(NCCLTester, ncclAllReduceOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[i]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list[i]), rt, recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[i])->stream()); @@ -219,7 +219,7 @@ TEST_F(NCCLTester, ncclReduceOp) { float result = std::accumulate(gpu_list.begin(), gpu_list.end(), 0); p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[kRoot]); + p::CUDAPlace gpu_place(gpu_list[kRoot]); auto &recv_tensor = dev_scopes[kRoot]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -229,7 +229,7 @@ TEST_F(NCCLTester, ncclReduceOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[kRoot]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list[kRoot]), rt, recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[kRoot])->stream()); @@ -268,7 +268,7 @@ TEST_F(NCCLTester, ncclBcastOp) { float result = kRoot; p::CPUPlace cpu_place; - p::GPUPlace gpu_place(gpu_list[idx]); + p::CUDAPlace gpu_place(gpu_list[idx]); auto &recv_tensor = dev_scopes[idx]->FindVar("rt")->Get(); auto *rt = recv_tensor.data(); @@ -277,7 +277,7 @@ TEST_F(NCCLTester, ncclBcastOp) { auto *ct = result_tensor->mutable_data(cpu_place); paddle::memory::Copy( - cpu_place, ct, p::GPUPlace(gpu_list[idx]), rt, + cpu_place, ct, p::CUDAPlace(gpu_list[idx]), rt, recv_tensor.numel() * sizeof(float), static_cast(dev_ctxs[idx])->stream()); @@ -300,7 +300,7 @@ int main(int argc, char **argv) { places.emplace_back(paddle::platform::CPUPlace()); int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::GPUPlace(i)); + places.emplace_back(paddle::platform::CUDAPlace(i)); gpu_list.emplace_back(i); } diff --git a/paddle/operators/pool_cudnn_op.cu.cc b/paddle/operators/pool_cudnn_op.cu.cc index fc2b37bd0f..2d0001ba11 100644 --- a/paddle/operators/pool_cudnn_op.cu.cc +++ b/paddle/operators/pool_cudnn_op.cu.cc @@ -29,7 +29,7 @@ class PoolCudnnOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); const Tensor *input = ctx.Input("X"); Tensor *output = ctx.Output("Out"); @@ -90,7 +90,7 @@ class PoolCudnnGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); const Tensor *input = ctx.Input("X"); const Tensor *output = ctx.Input("Out"); diff --git a/paddle/operators/reshape_op.cu b/paddle/operators/reshape_op.cu index b7329238c0..a5dcd2ec96 100644 --- a/paddle/operators/reshape_op.cu +++ b/paddle/operators/reshape_op.cu @@ -16,7 +16,7 @@ REGISTER_OP_CUDA_KERNEL( reshape, - paddle::operators::ReshapeKernel); + paddle::operators::ReshapeKernel); REGISTER_OP_CUDA_KERNEL( reshape_grad, - paddle::operators::ReshapeGradKernel); + paddle::operators::ReshapeGradKernel); diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc index 230cc1ab0b..d47fd98d06 100644 --- a/paddle/operators/strided_memcpy_test.cc +++ b/paddle/operators/strided_memcpy_test.cc @@ -82,7 +82,7 @@ TEST(StridedMemcpy, GPUCrop) { }; // clang-format on - platform::GPUPlace gpu0(0); + platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); @@ -121,7 +121,7 @@ TEST(StridedMemcpy, GPUConcat) { }; // clang-format on - platform::GPUPlace gpu0(0); + platform::CUDAPlace gpu0(0); platform::CPUPlace cpu; platform::CUDADeviceContext ctx(gpu0); diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu index 453bd07267..0a70ad87e6 100644 --- a/paddle/operators/top_k_op.cu +++ b/paddle/operators/top_k_op.cu @@ -283,7 +283,7 @@ class TopkOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), - "It must use GPUPlace."); + "It must use CUDAPlace."); auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); auto* indices = ctx.Output("Indices"); diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index a28e9de716..8ee0f18e64 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -58,10 +58,10 @@ DeviceContextPool::DeviceContextPool( #ifdef PADDLE_WITH_CUDA device_contexts_.emplace(places[i], new platform::CUDADeviceContext( - boost::get(places[i]))); + boost::get(places[i]))); #else PADDLE_THROW( - "'GPUPlace' is not supported, Please re-compile with WITH_GPU " + "'CUDAPlace' is not supported, Please re-compile with WITH_GPU " "option"); #endif } @@ -91,7 +91,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } ~EigenCudaStreamDevice() override {} - void Reinitialize(const cudaStream_t* cuda_stream, GPUPlace place) { + void Reinitialize(const cudaStream_t* cuda_stream, CUDAPlace place) { stream_ = cuda_stream; place_ = place; device_prop_ = &Eigen::m_deviceProperties[place.device]; @@ -130,14 +130,14 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } private: - GPUPlace place_; + CUDAPlace place_; const cudaStream_t* stream_; // not owned; const cudaDeviceProp* device_prop_; // not owned; mutable void* scratch_; mutable unsigned int* semaphore_; }; -CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) { +CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { SetDeviceId(place_.device); PADDLE_ENFORCE(cudaStreamCreate(&stream_)); eigen_stream_.reset(new EigenCudaStreamDevice()); diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 9b958f7c92..877a66363a 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -58,7 +58,7 @@ class EigenCudaStreamDevice; class CUDADeviceContext : public DeviceContext { public: - explicit CUDADeviceContext(GPUPlace place); + explicit CUDADeviceContext(CUDAPlace place); virtual ~CUDADeviceContext(); /*! \brief Wait for all operations completion in the stream. */ @@ -80,7 +80,7 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream() const; private: - GPUPlace place_; + CUDAPlace place_; std::unique_ptr eigen_device_; std::unique_ptr eigen_stream_; @@ -143,7 +143,7 @@ class DeviceContextPool { size_t operator()(const platform::Place& place) const { int pre_hash = place.which() + (1 << LEFT_SHIFT); if (platform::is_gpu_place(place)) { - pre_hash += boost::get(place).GetDeviceId(); + pre_hash += boost::get(place).GetDeviceId(); } return hash_(pre_hash); } diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu index f046c79e0a..186824c019 100644 --- a/paddle/platform/device_context_test.cu +++ b/paddle/platform/device_context_test.cu @@ -20,11 +20,11 @@ limitations under the License. */ TEST(Device, Init) { using paddle::platform::DeviceContext; using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); delete device_context; @@ -33,11 +33,11 @@ TEST(Device, Init) { TEST(Device, CUDADeviceContext) { using paddle::platform::CUDADeviceContext; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; i++) { - CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i)); + CUDADeviceContext* device_context = new CUDADeviceContext(CUDAPlace(i)); Eigen::GpuDevice* gpu_device = device_context->eigen_device(); ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); @@ -70,7 +70,7 @@ TEST(Device, DeviceContextPool) { using paddle::platform::CUDADeviceContext; using paddle::platform::Place; using paddle::platform::CPUPlace; - using paddle::platform::GPUPlace; + using paddle::platform::CUDAPlace; DeviceContextPool& pool = DeviceContextPool::Get(); auto cpu_dev_ctx1 = pool.Borrow(CPUPlace()); @@ -80,14 +80,14 @@ TEST(Device, DeviceContextPool) { std::vector gpu_places; int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - gpu_places.emplace_back(GPUPlace(i)); + gpu_places.emplace_back(CUDAPlace(i)); } auto dev_ctxs = pool.Borrow(gpu_places); for (size_t i = 0; i < dev_ctxs.size(); ++i) { auto* dev_ctx = static_cast(dev_ctxs[i]); - // check same as GPUPlace(i) - GPUPlace place = boost::get(dev_ctx->GetPlace()); + // check same as CUDAPlace(i) + CUDAPlace place = boost::get(dev_ctx->GetPlace()); EXPECT_EQ(place.GetDeviceId(), static_cast(i)); } } @@ -106,7 +106,7 @@ int main(int argc, char** argv) { places.emplace_back(paddle::platform::CPUPlace()); int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::GPUPlace(i)); + places.emplace_back(paddle::platform::CUDAPlace(i)); } VLOG(0) << " DeviceCount " << count; diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu index 6750c8da7d..f57c329402 100644 --- a/paddle/platform/nccl_test.cu +++ b/paddle/platform/nccl_test.cu @@ -50,7 +50,7 @@ struct PerThreadData { T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); } - PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) { + PerThreadData(int gpu_id, size_t size) : dev_ctx(CUDAPlace(gpu_id)) { send_buff.resize(size); for (size_t i = 0; i < size; ++i) { send_buff[i] = static_cast(i); @@ -140,7 +140,7 @@ int main(int argc, char** argv) { places.emplace_back(paddle::platform::CPUPlace()); int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - places.emplace_back(paddle::platform::GPUPlace(i)); + places.emplace_back(paddle::platform::CUDAPlace(i)); } VLOG(0) << " DeviceCount " << count; diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 25fe8d21b4..4d23cfd886 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -24,7 +24,9 @@ class PlacePrinter : public boost::static_visitor<> { explicit PlacePrinter(std::ostream &os) : os_(os) {} void operator()(const CPUPlace &) { os_ << "CPUPlace"; } void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; } - void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } + void operator()(const CUDAPlace &p) { + os_ << "CUDAPlace(" << p.device << ")"; + } private: std::ostream &os_; @@ -37,12 +39,12 @@ static Place the_default_place; void set_place(const Place &place) { the_default_place = place; } const Place &get_place() { return the_default_place; } -const GPUPlace default_gpu() { return GPUPlace(0); } +const CUDAPlace default_gpu() { return CUDAPlace(0); } const CPUPlace default_cpu() { return CPUPlace(); } const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); } bool is_gpu_place(const Place &p) { - return boost::apply_visitor(IsGPUPlace(), p); + return boost::apply_visitor(IsCUDAPlace(), p); } bool is_cpu_place(const Place &p) { return !is_gpu_place(p) && !is_mkldnn_place(p); diff --git a/paddle/platform/place.h b/paddle/platform/place.h index daeafbbcd7..4eab1a3964 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -39,43 +39,45 @@ struct MKLDNNPlace { inline bool operator!=(const MKLDNNPlace &) const { return false; } }; -struct GPUPlace { - GPUPlace() : GPUPlace(0) {} - explicit GPUPlace(int d) : device(d) {} +struct CUDAPlace { + CUDAPlace() : CUDAPlace(0) {} + explicit CUDAPlace(int d) : device(d) {} inline int GetDeviceId() const { return device; } // needed for variant equality comparison - inline bool operator==(const GPUPlace &o) const { return device == o.device; } - inline bool operator!=(const GPUPlace &o) const { return !(*this == o); } + inline bool operator==(const CUDAPlace &o) const { + return device == o.device; + } + inline bool operator!=(const CUDAPlace &o) const { return !(*this == o); } int device; }; -struct CUDNNPlace : public GPUPlace { - CUDNNPlace() : GPUPlace() {} - explicit CUDNNPlace(int d) : GPUPlace(d) {} +struct CUDNNPlace : public CUDAPlace { + CUDNNPlace() : CUDAPlace() {} + explicit CUDNNPlace(int d) : CUDAPlace(d) {} }; -struct IsGPUPlace : public boost::static_visitor { +struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const MKLDNNPlace &) const { return false; } - bool operator()(const GPUPlace &gpu) const { return true; } + bool operator()(const CUDAPlace &gpu) const { return true; } bool operator()(const CUDNNPlace &) const { return true; } }; struct IsMKLDNNPlace : public boost::static_visitor { bool operator()(const MKLDNNPlace &) const { return true; } bool operator()(const CPUPlace &) const { return false; } - bool operator()(const GPUPlace &) const { return false; } + bool operator()(const CUDAPlace &) const { return false; } bool operator()(const CUDNNPlace &) const { return false; } }; -typedef boost::variant Place; +typedef boost::variant Place; void set_place(const Place &); const Place &get_place(); -const GPUPlace default_gpu(); +const CUDAPlace default_gpu(); const CPUPlace default_cpu(); const MKLDNNPlace default_mkldnn(); diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc index c536b59ed8..21f7d9f213 100644 --- a/paddle/platform/place_test.cc +++ b/paddle/platform/place_test.cc @@ -4,7 +4,7 @@ TEST(Place, Equality) { paddle::platform::CPUPlace cpu; - paddle::platform::GPUPlace g0(0), g1(1), gg0(0); + paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); paddle::platform::CUDNNPlace d0(0), d1(1), dd0(0); EXPECT_EQ(cpu, cpu); @@ -41,8 +41,8 @@ TEST(Place, Default) { TEST(Place, Print) { { std::stringstream ss; - ss << paddle::platform::GPUPlace(1); - EXPECT_EQ("GPUPlace(1)", ss.str()); + ss << paddle::platform::CUDAPlace(1); + EXPECT_EQ("CUDAPlace(1)", ss.str()); } { std::stringstream ss; diff --git a/paddle/platform/transform_test.cu b/paddle/platform/transform_test.cu index 464096111e..8e2483aa84 100644 --- a/paddle/platform/transform_test.cu +++ b/paddle/platform/transform_test.cu @@ -49,7 +49,7 @@ TEST(Transform, CPUUnary) { TEST(Transform, GPUUnary) { using namespace paddle::platform; using namespace paddle::memory; - GPUPlace gpu0(0); + CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4}; float* gpu_buf = static_cast(Alloc(gpu0, sizeof(float) * 4)); @@ -80,7 +80,7 @@ TEST(Transform, GPUBinary) { using namespace paddle::platform; using namespace paddle::memory; int buf[4] = {1, 2, 3, 4}; - GPUPlace gpu0(0); + CUDAPlace gpu0(0); CUDADeviceContext ctx(gpu0); int* gpu_buf = static_cast(Alloc(gpu0, sizeof(buf))); Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf), ctx.stream()); diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index de6b24f70d..668a48e816 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -79,7 +79,7 @@ PYBIND11_PLUGIN(core) { self.Resize(make_ddim(dim)); }) .def("alloc_float", - [](Tensor &self, paddle::platform::GPUPlace &place) { + [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) .def("alloc_float", @@ -91,7 +91,7 @@ PYBIND11_PLUGIN(core) { self.mutable_data(place); }) .def("alloc_int", - [](Tensor &self, paddle::platform::GPUPlace &place) { + [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); }) .def("set", PyCPUTensorSetFromArray) @@ -310,10 +310,10 @@ All parameter, weight, gradient are variables in Paddle. return new paddle::platform::CPUDeviceContext(); }) .def_static("create", - [](paddle::platform::GPUPlace& place) + [](paddle::platform::CUDAPlace& place) -> paddle::platform::DeviceContext* { #ifndef PADDLE_WITH_CUDA - PADDLE_THROW("GPUPlace is not supported in CPU device."); + PADDLE_THROW("CUDAPlace is not supported in CPU device."); #else return new paddle::platform::CUDADeviceContext(place); #endif @@ -323,9 +323,9 @@ All parameter, weight, gradient are variables in Paddle. #ifdef PADDLE_WITH_CUDA py::class_(m, "Communicator").def(py::init<>()); #endif - py::class_(m, "GPUPlace") + py::class_(m, "CUDAPlace") .def(py::init()) - .def("__str__", string::to_string); + .def("__str__", string::to_string); py::class_(m, "CPUPlace") .def(py::init<>()) @@ -338,7 +338,7 @@ All parameter, weight, gradient are variables in Paddle. self = cpu_place; }) .def("set_place", - [](platform::Place &self, const platform::GPUPlace &gpu_place) { + [](platform::Place &self, const platform::CUDAPlace &gpu_place) { self = gpu_place; }); @@ -363,7 +363,7 @@ All parameter, weight, gradient are variables in Paddle. const platform::CPUPlace &place) { self.Run(scope, place); }) .def("run", [](OperatorBase &self, const Scope &scope, - const platform::GPUPlace &place) { self.Run(scope, place); }) + const platform::CUDAPlace &place) { self.Run(scope, place); }) .def("type", [](const OperatorBase &op) -> std::string { return op.Type(); }) .def("outputs", diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h index 413fd9b046..7b8c29ff84 100644 --- a/paddle/pybind/tensor_py.h +++ b/paddle/pybind/tensor_py.h @@ -71,7 +71,7 @@ struct CastToPyBufferImpl { dst_ptr, src_ptr, sizeof(CUR_TYPE) * tensor.numel(), cudaMemcpyDeviceToHost, dev_ctx->stream()); #else - PADDLE_THROW("'GPUPlace' is not supported in CPU only device."); + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); #endif } else if (paddle::platform::is_cpu_place(tensor.place())) { dst_tensor = tensor; @@ -127,7 +127,7 @@ template void PyCUDATensorSetFromArray( framework::Tensor &self, py::array_t array, - paddle::platform::GPUPlace &place) { + paddle::platform::CUDAPlace &place) { std::vector dims; dims.reserve(array.ndim()); for (size_t i = 0; i < array.ndim(); ++i) { diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 7ba1bf095a..108ff335bf 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -36,7 +36,7 @@ int main(int argc, char** argv) { paddle::memory::Used(paddle::platform::CPUPlace()); std::vector devs = {"CPU"}; #ifdef PADDLE_WITH_CUDA - paddle::memory::Used(paddle::platform::GPUPlace(0)); + paddle::memory::Used(paddle::platform::CUDAPlace(0)); devs.push_back("GPU:0"); #endif paddle::framework::InitDevices(devs); diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 051b9094aa..c72b573069 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py @@ -15,14 +15,14 @@ import backward import regularizer from param_attr import ParamAttr from data_feeder import DataFeeder -from core import LoDTensor, CPUPlace, GPUPlace +from core import LoDTensor, CPUPlace, CUDAPlace from distribute_transpiler import DistributeTranspiler import clip Tensor = LoDTensor __all__ = framework.__all__ + executor.__all__ + [ 'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward', - 'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr' + 'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr' 'DataFeeder', 'clip', 'DistributeTranspiler' ] diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index cdd576294f..2c91afb363 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -47,7 +47,7 @@ class Executor(object): act_places.append(p) # TODO(dzhwinter) : consider that our fluid tests all written in - # GPUPlace(gpu_id), this will be changed in the future + # CUDAPlace(gpu_id), this will be changed in the future if core.is_compile_gpu(): core.init_devices(["CPU", "GPU:0"]) else: diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index b0c11ba341..e3cc2a8937 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -142,7 +142,7 @@ def main(): opts = sgd_optimizer.minimize(cost) if USE_GPU: - place = core.GPUPlace(0) + place = core.CUDAPlace(0) else: place = core.CPUPlace() diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index 087283bfde..8dbfbd547a 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -316,7 +316,7 @@ class OpTest(unittest.TestCase): def check_output(self, atol=1e-5): places = [core.CPUPlace()] if core.is_compile_gpu() and core.op_support_gpu(self.op_type): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) for place in places: self.check_output_with_place(place, atol) @@ -379,7 +379,7 @@ class OpTest(unittest.TestCase): "Gradient Check On %s" % str(cpu_place)) if core.is_compile_gpu() and self.op.support_gpu(): - gpu_place = core.GPUPlace(0) + gpu_place = core.CUDAPlace(0) gpu_analytic_grads = self._get_gradient(inputs_to_check, gpu_place, output_names, no_grad_set) diff --git a/python/paddle/v2/fluid/tests/test_adagrad_op.py b/python/paddle/v2/fluid/tests/test_adagrad_op.py index 1ff3932164..7b2d02fbf4 100644 --- a/python/paddle/v2/fluid/tests/test_adagrad_op.py +++ b/python/paddle/v2/fluid/tests/test_adagrad_op.py @@ -167,7 +167,7 @@ class TestSparseAdagradOp(unittest.TestCase): def test_sparse_adagrad(self): places = [core.CPUPlace()] if core.is_compile_gpu(): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) for place in places: self.check_with_place(place) diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index dfc047e1f0..abbd48d2b8 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -304,7 +304,7 @@ class TestBatchNormOp(OpTest): self.__assert_close(saved_variance_tensor, saved_variance, "saved_variance") self.__assert_close(mean_out_tensor, mean_out, "mean_out") - if isinstance(place, core.GPUPlace): + if isinstance(place, core.CUDAPlace): atol = 5e-2 else: atol = 1e-4 @@ -339,7 +339,7 @@ class TestBatchNormOp(OpTest): places = [core.CPUPlace()] if core.is_compile_gpu() and core.op_support_gpu("batch_norm"): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) core.init_devices(["CPU", "GPU:0"]) else: diff --git a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py index 4afe0c6a6d..6f6a60ccb3 100644 --- a/python/paddle/v2/fluid/tests/test_gaussian_random_op.py +++ b/python/paddle/v2/fluid/tests/test_gaussian_random_op.py @@ -20,7 +20,7 @@ class TestGaussianRandomOp(unittest.TestCase): def test_gpu(self): if core.is_compile_gpu(): - self.gaussian_random_test(place=fluid.GPUPlace(0)) + self.gaussian_random_test(place=fluid.CUDAPlace(0)) def gaussian_random_test(self, place): diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py index d01e257449..e3f3ac58ef 100644 --- a/python/paddle/v2/fluid/tests/test_profiler.py +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -15,7 +15,7 @@ class TestProfiler(unittest.TestCase): data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) - place = fluid.GPUPlace(0) + place = fluid.CUDAPlace(0) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) diff --git a/python/paddle/v2/fluid/tests/test_sgd_op.py b/python/paddle/v2/fluid/tests/test_sgd_op.py index 9c345792be..14d41e172a 100644 --- a/python/paddle/v2/fluid/tests/test_sgd_op.py +++ b/python/paddle/v2/fluid/tests/test_sgd_op.py @@ -78,7 +78,7 @@ class TestSparseSGDOp(unittest.TestCase): def test_sparse_sgd(self): places = [core.CPUPlace()] if core.is_compile_gpu(): - places.append(core.GPUPlace(0)) + places.append(core.CUDAPlace(0)) for place in places: self.check_with_place(place) diff --git a/python/paddle/v2/fluid/tests/test_uniform_random_op.py b/python/paddle/v2/fluid/tests/test_uniform_random_op.py index d6872c8ba3..dbe4d6bcd0 100644 --- a/python/paddle/v2/fluid/tests/test_uniform_random_op.py +++ b/python/paddle/v2/fluid/tests/test_uniform_random_op.py @@ -23,7 +23,7 @@ class TestUniformRandomOp(unittest.TestCase): def test_gpu(self): if core.is_compile_gpu(): - self.uniform_random_test(place=core.GPUPlace(0)) + self.uniform_random_test(place=core.CUDAPlace(0)) def uniform_random_test(self, place): program = fluid.Program() From cd357aa0f079c277020bb502c2e39d615e934210 Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Mon, 25 Dec 2017 13:33:13 +0800 Subject: [PATCH 12/24] Set RelWithDebInfo flags --- CMakeLists.txt | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b309ff37e5..5df83499d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,8 +16,6 @@ cmake_minimum_required(VERSION 3.0) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) -SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") -SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") include(system) @@ -201,6 +199,10 @@ if(WITH_GOLANG) endif(WITH_GOLANG) set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") + +SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) From d142a7338f58f528159993030e3ebf69dd3bc94d Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 25 Dec 2017 14:10:13 +0800 Subject: [PATCH 13/24] Fix/copyfrom context (#6954) * "fix CopyFrom parameters" * "fix unused Place argument" * "fixed based on comment" --- paddle/framework/tensor_util.h | 55 ++++++++++++++++++++++++++++ paddle/framework/tensor_util_test.cc | 11 +++--- 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index 5b474e4aef..ebfb0e5538 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -82,6 +82,28 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, #endif } +/** + * @brief CopyFrom support CPU <-> CPU + */ +inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, + Tensor* dst) { + src.check_memory_size(); + dst->Resize(src.dims()); + + auto src_place = src.place(); + auto src_ptr = src.data(); + + auto dst_ptr = dst->mutable_data(dst_place, src.type()); + + auto size = src.numel() * SizeOfType(src.type()); + + PADDLE_ENFORCE(platform::is_cpu_place(src_place) && + platform::is_cpu_place(dst_place)); + + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); +} + /** * @brief Copy the content of an external vector to a tensor. * @@ -115,6 +137,21 @@ inline void CopyFromVector(const std::vector& src, #endif } +/** + * @brief CopyFromVector CPU vector -> CPU Tensor + */ +template +inline void CopyFromVector(const std::vector& src, Tensor* dst) { + platform::CPUPlace dst_place = platform::CPUPlace(); + auto src_ptr = static_cast(src.data()); + platform::CPUPlace src_place; + dst->Resize({static_cast(src.size())}); + auto dst_ptr = static_cast(dst->mutable_data(dst_place)); + auto size = src.size() * sizeof(T); + + memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); +} + /** * @brief Copy the content of a tensor to a vector * @@ -148,5 +185,23 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx, #endif } +/** + * @brief CopyToVector CPUTensor <-> CPU Vector + */ +template +inline void CopyToVector(const Tensor& src, std::vector* dst) { + auto src_ptr = static_cast(src.data()); + auto size = src.numel() * sizeof(T); + + platform::CPUPlace dst_place; + dst->resize(src.numel()); + auto dst_ptr = static_cast(dst->data()); + + PADDLE_ENFORCE(platform::is_cpu_place(src.place())); + + memory::Copy(dst_place, dst_ptr, boost::get(src.place()), + src_ptr, size); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index 3afb98a4a5..6fc243aaf6 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -17,6 +17,7 @@ namespace paddle { namespace framework { + TEST(CopyFrom, Tensor) { Tensor src_tensor; Tensor dst_tensor; @@ -29,7 +30,7 @@ TEST(CopyFrom, Tensor) { memcpy(src_ptr, arr, 9 * sizeof(int)); auto cpu_place = new platform::CPUPlace(); - CopyFrom(src_tensor, *cpu_place, cpu_ctx, &dst_tensor); + CopyFrom(src_tensor, *cpu_place, &dst_tensor); const int* dst_ptr = dst_tensor.data(); ASSERT_NE(src_ptr, dst_ptr); @@ -104,8 +105,7 @@ TEST(CopyFromVector, Tensor) { // Copy to CPU Tensor cpu_tensor.Resize(make_ddim({3, 3})); auto cpu_place = new paddle::platform::CPUPlace(); - CPUDeviceContext cpu_ctx(*cpu_place); - CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + CopyFromVector(src_vec, &cpu_tensor); // Compare Tensors const int* cpu_ptr = cpu_tensor.data(); @@ -117,7 +117,7 @@ TEST(CopyFromVector, Tensor) { src_vec.erase(src_vec.begin(), src_vec.begin() + 5); cpu_tensor.Resize(make_ddim({2, 2})); - CopyFromVector(src_vec, cpu_ctx, &cpu_tensor); + CopyFromVector(src_vec, &cpu_tensor); cpu_ptr = cpu_tensor.data(); src_ptr = src_vec.data(); ASSERT_NE(src_ptr, cpu_ptr); @@ -198,9 +198,8 @@ TEST(CopyToVector, Tensor) { } CPUPlace place; - CPUDeviceContext cpu_ctx(place); std::vector dst; - CopyToVector(src, cpu_ctx, &dst); + CopyToVector(src, &dst); for (int i = 0; i < 3 * 3; ++i) { EXPECT_EQ(src_ptr[i], dst[i]); From 55af7a77a40a7090de8b8ea653fdbca179e9f8c1 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 25 Dec 2017 14:29:58 +0800 Subject: [PATCH 14/24] fix typo in comments of sequence_pool_op --- paddle/operators/sequence_pool_op.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index 0eb675caad..db601a8b7d 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -49,7 +49,7 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker { .AsIntermediate(); AddAttr( "pooltype", - "(int, default AVERAGE) the pooling pooltype of SequencePoolOp.") + "(string, default 'AVERAGE') the pooling pooltype of SequencePoolOp.") .SetDefault("AVERAGE") .InEnum({"AVERAGE", "SUM", "SQRT", "LAST", "FIRST", "MAX"}); AddComment(R"DOC( From 1fdf8853d420e884db8a71b47a707889e494d4df Mon Sep 17 00:00:00 2001 From: Yang Yu Date: Mon, 25 Dec 2017 14:35:28 +0800 Subject: [PATCH 15/24] Optimize adam_op --- paddle/operators/adam_op.h | 32 +++++++------- paddle/platform/for_range.h | 85 +++++++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+), 16 deletions(-) create mode 100644 paddle/platform/for_range.h diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h index 887258530c..c4e2c8bb88 100644 --- a/paddle/operators/adam_op.h +++ b/paddle/operators/adam_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include // for sqrt in CPU and CUDA #include "paddle/framework/op_registry.h" #include "paddle/operators/detail/safe_ref.h" -#include "paddle/platform/transform.h" +#include "paddle/platform/for_range.h" namespace paddle { namespace operators { @@ -36,10 +36,12 @@ struct AdamFunctor { const T* lr_; const T* grad_; const T* param_; + T* param_out_; AdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, const T* beta2_pow, const T* mom1, T* mom1_out, const T* mom2, - T* mom2_out, const T* lr, const T* grad, const T* param) + T* mom2_out, const T* lr, const T* grad, const T* param, + T* param_out) : beta1_(beta1), beta2_(beta2), epsilon_(epsilon), @@ -51,11 +53,10 @@ struct AdamFunctor { moment2_out_(mom2_out), lr_(lr), grad_(grad), - param_(param) {} + param_(param), + param_out_(param_out) {} - // From param[i] --> param_out[i]; - inline HOSTDEVICE T operator()(const T& p) const { - size_t i = &p - param_; + inline HOSTDEVICE void operator()(size_t i) const { // Merge all memory access together. T g = grad_[i]; T mom1 = moment1_[i]; @@ -63,17 +64,18 @@ struct AdamFunctor { T lr = *lr_; T beta1_pow = *beta1_pow_; T beta2_pow = *beta2_pow_; + T p = param_[i]; // Calculation - lr = lr * sqrt(1 - beta2_pow) / (1 - beta1_pow); + lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); mom1 = beta1_ * mom1 + (1 - beta1_) * g; mom2 = beta2_ * mom2 + (1 - beta2_) * g * g; - T new_p = p - lr * (mom1 / (sqrt(mom2) + epsilon_)); + p -= lr * (mom1 / (sqrt(mom2) + epsilon_)); // Write back to global memory moment1_out_[i] = mom1; moment2_out_[i] = mom2; - return new_p; + param_out_[i] = p; } }; @@ -113,13 +115,11 @@ class AdamOpKernel : public framework::OpKernel { mom2.template data(), mom2_out.template mutable_data(ctx.GetPlace()), lr.template data(), grad.template data(), - param.template data()); - - const T* in_ptr = param.template data(); - T* out_ptr = param_out.template mutable_data(ctx.GetPlace()); - platform::Transform trans; - trans(static_cast(ctx.device_context()), in_ptr, - in_ptr + param_out.numel(), out_ptr, functor); + param.template data(), + param_out.template mutable_data(ctx.GetPlace())); + platform::ForRange for_range( + static_cast(ctx.device_context()), param.numel()); + for_range(functor); } }; diff --git a/paddle/platform/for_range.h b/paddle/platform/for_range.h new file mode 100644 index 0000000000..6ba6b01076 --- /dev/null +++ b/paddle/platform/for_range.h @@ -0,0 +1,85 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace platform { + +template +struct ForRange { + ForRange(const DeviceContext& dev_ctx, size_t limit); + + template + void operator()(Function func) const; +}; + +template <> +struct ForRange { + ForRange(const CPUDeviceContext& dev_ctx, size_t limit) : limit_(limit) {} + + template + void operator()(Function func) const { + for (size_t i = 0; i < limit_; ++i) { + func(i); + } + } + + size_t limit_; +}; + +#ifdef __NVCC__ +template +__global__ static void ForRangeElemwiseOpGridIsOne(Function func) { + size_t idx = static_cast(threadIdx.x); + func(idx); +} + +template +__global__ static void ForRangeElemwiseOp(Function func, int limit) { + size_t idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); + if (idx < limit) { + func(idx); + } +} + +template <> +struct ForRange { + ForRange(const CUDADeviceContext& dev_ctx, size_t limit) + : dev_ctx_(dev_ctx), limit_(static_cast(limit)) {} + + template + inline void operator()(Function func) const { + constexpr size_t num_threads = 1024; + int block_size = limit_ <= num_threads ? limit_ : num_threads; + int grid_size = (limit_ + num_threads - 1) / num_threads; + + if (grid_size == 1) { + ForRangeElemwiseOpGridIsOne<<<1, block_size, 0, dev_ctx_.stream()>>>( + func); + } else { + ForRangeElemwiseOp<<>>( + func, limit_); + } + } + + const CUDADeviceContext& dev_ctx_; + int limit_; +}; + +#endif + +} // namespace platform +} // namespace paddle From efd3726929d0f0421aefb76f3163bfbaddae7d9e Mon Sep 17 00:00:00 2001 From: QI JUN Date: Mon, 25 Dec 2017 14:51:28 +0800 Subject: [PATCH 16/24] remove unused place (#6972) * remove unused place * fix ci --- paddle/operators/math/math_function.cc | 8 -------- paddle/operators/math/math_function.cu | 7 ------- paddle/platform/device_context.cc | 8 +++----- paddle/platform/device_context.h | 6 +----- paddle/platform/device_context_test.cu | 5 ++--- paddle/platform/place.cc | 9 +-------- paddle/platform/place.h | 26 +------------------------- paddle/platform/place_test.cc | 13 +------------ 8 files changed, 9 insertions(+), 73 deletions(-) diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc index a05810d778..2b35e4532a 100644 --- a/paddle/operators/math/math_function.cc +++ b/paddle/operators/math/math_function.cc @@ -277,14 +277,6 @@ void set_constant_with_place( TensorSetConstantCPU(tensor, value)); } -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - framework::VisitDataType(framework::ToDataType(tensor->type()), - TensorSetConstantCPU(tensor, value)); -} - struct TensorSetConstantWithPlace : public boost::static_visitor { TensorSetConstantWithPlace(const platform::DeviceContext& context, framework::Tensor* tensor, float value) diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu index 0a818bc5d4..927838a094 100644 --- a/paddle/operators/math/math_function.cu +++ b/paddle/operators/math/math_function.cu @@ -273,13 +273,6 @@ void set_constant_with_place( TensorSetConstantGPU(context, tensor, value)); } -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - set_constant_with_place(context, tensor, value); -} - template struct RowwiseAdd; template struct RowwiseAdd; template struct ColwiseSum; diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc index 8ee0f18e64..e450ef32a4 100644 --- a/paddle/platform/device_context.cc +++ b/paddle/platform/device_context.cc @@ -178,20 +178,18 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } cudaStream_t CUDADeviceContext::stream() const { return stream_; } -CUDNNDeviceContext::CUDNNDeviceContext(CUDNNPlace place) - : CUDADeviceContext(place), place_(place) { +CUDNNDeviceContext::CUDNNDeviceContext(CUDAPlace place) + : CUDADeviceContext(place) { PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream())); } CUDNNDeviceContext::~CUDNNDeviceContext() { - SetDeviceId(place_.device); + SetDeviceId(boost::get(GetPlace()).device); Wait(); PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); } -Place CUDNNDeviceContext::GetPlace() const { return CUDNNPlace(); } - cudnnHandle_t CUDNNDeviceContext::cudnn_handle() const { return cudnn_handle_; } #endif diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index 877a66363a..8ba12e1657 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -92,18 +92,14 @@ class CUDADeviceContext : public DeviceContext { class CUDNNDeviceContext : public CUDADeviceContext { public: - explicit CUDNNDeviceContext(CUDNNPlace place); + explicit CUDNNDeviceContext(CUDAPlace place); virtual ~CUDNNDeviceContext(); - /*! \brief Return place in the device context. */ - Place GetPlace() const final; - /*! \brief Return cudnn handle in the device context. */ cudnnHandle_t cudnn_handle() const; private: cudnnHandle_t cudnn_handle_; - CUDNNPlace place_; }; #endif diff --git a/paddle/platform/device_context_test.cu b/paddle/platform/device_context_test.cu index 186824c019..91011bf71c 100644 --- a/paddle/platform/device_context_test.cu +++ b/paddle/platform/device_context_test.cu @@ -51,12 +51,11 @@ TEST(Device, CUDADeviceContext) { TEST(Device, CUDNNDeviceContext) { using paddle::platform::CUDNNDeviceContext; - using paddle::platform::CUDNNPlace; + using paddle::platform::CUDAPlace; if (paddle::platform::dynload::HasCUDNN()) { int count = paddle::platform::GetCUDADeviceCount(); for (int i = 0; i < count; ++i) { - CUDNNDeviceContext* device_context = - new CUDNNDeviceContext(CUDNNPlace(i)); + CUDNNDeviceContext* device_context = new CUDNNDeviceContext(CUDAPlace(i)); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); ASSERT_NE(nullptr, device_context->stream()); diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 4d23cfd886..b571eb7016 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -23,7 +23,6 @@ class PlacePrinter : public boost::static_visitor<> { public: explicit PlacePrinter(std::ostream &os) : os_(os) {} void operator()(const CPUPlace &) { os_ << "CPUPlace"; } - void operator()(const MKLDNNPlace &) { os_ << "MKLDNNPlace"; } void operator()(const CUDAPlace &p) { os_ << "CUDAPlace(" << p.device << ")"; } @@ -41,18 +40,12 @@ const Place &get_place() { return the_default_place; } const CUDAPlace default_gpu() { return CUDAPlace(0); } const CPUPlace default_cpu() { return CPUPlace(); } -const MKLDNNPlace default_mkldnn() { return MKLDNNPlace(); } bool is_gpu_place(const Place &p) { return boost::apply_visitor(IsCUDAPlace(), p); } -bool is_cpu_place(const Place &p) { - return !is_gpu_place(p) && !is_mkldnn_place(p); -} -bool is_mkldnn_place(const Place &p) { - return boost::apply_visitor(IsMKLDNNPlace(), p); -} +bool is_cpu_place(const Place &p) { return !is_gpu_place(p); } bool places_are_same_class(const Place &p1, const Place &p2) { return p1.which() == p2.which(); diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 4eab1a3964..d25eaa689f 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -31,14 +31,6 @@ struct CPUPlace { inline bool operator!=(const CPUPlace &) const { return false; } }; -struct MKLDNNPlace { - MKLDNNPlace() {} - - // needed for variant equality comparison - inline bool operator==(const MKLDNNPlace &) const { return true; } - inline bool operator!=(const MKLDNNPlace &) const { return false; } -}; - struct CUDAPlace { CUDAPlace() : CUDAPlace(0) {} explicit CUDAPlace(int d) : device(d) {} @@ -53,37 +45,21 @@ struct CUDAPlace { int device; }; -struct CUDNNPlace : public CUDAPlace { - CUDNNPlace() : CUDAPlace() {} - explicit CUDNNPlace(int d) : CUDAPlace(d) {} -}; - struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } - bool operator()(const MKLDNNPlace &) const { return false; } bool operator()(const CUDAPlace &gpu) const { return true; } - bool operator()(const CUDNNPlace &) const { return true; } -}; - -struct IsMKLDNNPlace : public boost::static_visitor { - bool operator()(const MKLDNNPlace &) const { return true; } - bool operator()(const CPUPlace &) const { return false; } - bool operator()(const CUDAPlace &) const { return false; } - bool operator()(const CUDNNPlace &) const { return false; } }; -typedef boost::variant Place; +typedef boost::variant Place; void set_place(const Place &); const Place &get_place(); const CUDAPlace default_gpu(); const CPUPlace default_cpu(); -const MKLDNNPlace default_mkldnn(); bool is_gpu_place(const Place &); bool is_cpu_place(const Place &); -bool is_mkldnn_place(const Place &); bool places_are_same_class(const Place &, const Place &); std::ostream &operator<<(std::ostream &, const Place &); diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc index 21f7d9f213..4f1eba01df 100644 --- a/paddle/platform/place_test.cc +++ b/paddle/platform/place_test.cc @@ -5,37 +5,26 @@ TEST(Place, Equality) { paddle::platform::CPUPlace cpu; paddle::platform::CUDAPlace g0(0), g1(1), gg0(0); - paddle::platform::CUDNNPlace d0(0), d1(1), dd0(0); EXPECT_EQ(cpu, cpu); EXPECT_EQ(g0, g0); EXPECT_EQ(g1, g1); EXPECT_EQ(g0, gg0); - EXPECT_EQ(d0, dd0); EXPECT_NE(g0, g1); - EXPECT_NE(d0, d1); EXPECT_TRUE(paddle::platform::places_are_same_class(g0, gg0)); EXPECT_FALSE(paddle::platform::places_are_same_class(g0, cpu)); - - EXPECT_TRUE(paddle::platform::is_gpu_place(d0)); - EXPECT_FALSE(paddle::platform::places_are_same_class(g0, d0)); } TEST(Place, Default) { EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::get_place())); EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu())); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu())); - EXPECT_TRUE( - paddle::platform::is_mkldnn_place(paddle::platform::default_mkldnn())); + EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place())); paddle::platform::set_place(paddle::platform::CPUPlace()); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place())); - - paddle::platform::set_place(paddle::platform::MKLDNNPlace()); - EXPECT_FALSE(paddle::platform::is_cpu_place(paddle::platform::get_place())); - EXPECT_TRUE(paddle::platform::is_mkldnn_place(paddle::platform::get_place())); } TEST(Place, Print) { From af0c4c45a3343189b3811883c9d5ae2757961008 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Mon, 25 Dec 2017 15:30:03 +0800 Subject: [PATCH 17/24] Impl kernel hint (#6883) * init kernel hint * fix typo * rm unused code * add include in op_kernel.h * restore op_kernel since it will be moved to op_kernel_type * change force_cpu to use_cpu * fix compilation --- paddle/framework/operator.cc | 17 +++++++++++++---- paddle/framework/operator.h | 9 ++++++++- paddle/framework/operator_test.cc | 2 +- paddle/operators/accuracy_op.cc | 2 +- paddle/operators/auc_op.cc | 2 +- paddle/operators/batch_norm_op.cc | 2 +- paddle/operators/chunk_eval_op.cc | 2 +- paddle/operators/compare_op.cc | 4 ++-- paddle/operators/crf_decoding_op.cc | 8 +++++++- paddle/operators/cross_entropy_op.cc | 4 ++-- .../fill_constant_batch_size_like_op.cc | 2 +- paddle/operators/gather_op.cc | 4 ++-- paddle/operators/gaussian_random_op.cc | 2 +- paddle/operators/linear_chain_crf_op.cc | 4 ++-- paddle/operators/lod_reset_op.cc | 4 ++-- paddle/operators/logical_op.cc | 4 ++-- paddle/operators/lookup_table_op.cc | 4 ++-- paddle/operators/lstm_op.cc | 4 ++-- paddle/operators/multiplex_op.cc | 4 ++-- paddle/operators/nce_op.cc | 4 ++-- paddle/operators/pool_with_index_op.cc | 4 ++-- paddle/operators/positive_negative_pair_op.cc | 2 +- paddle/operators/precision_recall_op.cc | 2 +- paddle/operators/roi_pool_op.cc | 4 ++-- paddle/operators/scatter_op.cc | 4 ++-- paddle/operators/sequence_pool_op.cc | 2 +- paddle/operators/sequence_slice_op.cc | 4 ++-- .../operators/softmax_with_cross_entropy_op.cc | 4 ++-- paddle/operators/sum_op.cc | 2 +- paddle/operators/uniform_random_op.cc | 2 +- paddle/operators/unpool_op.cc | 4 ++-- paddle/pybind/const_value.cc | 5 +++++ python/paddle/v2/fluid/framework.py | 4 ++++ 33 files changed, 81 insertions(+), 50 deletions(-) diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc index f147cc5a6e..66840a2e03 100644 --- a/paddle/framework/operator.cc +++ b/paddle/framework/operator.cc @@ -402,19 +402,28 @@ void OperatorWithKernel::Run(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; ExecutionContext ctx(*this, scope, *dev_ctx); - auto kernel_key = GetKernelType(ctx); - auto kernel_iter = kernels.find(kernel_key); + auto actual_kernel_key = GetActualKernelType(ctx); + auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key); + auto kernel_iter = kernels.find(expected_kernel_key); if (kernel_iter == kernels.end()) { - PADDLE_THROW("The operator %s does not support %s", type_, kernel_key); + PADDLE_THROW("The operator %s does not support %s", type_, + expected_kernel_key); } kernel_iter->second->Compute(ctx); } -OpKernelType OperatorWithKernel::GetKernelType( + +OpKernelType OperatorWithKernel::GetActualKernelType( const ExecutionContext& ctx) const { return OpKernelType(IndicateDataType(ctx), ctx.GetPlace()); } + +OpKernelType OperatorWithKernel::GetExpectedKernelType( + const OpKernelType& actual_kernel_type) const { + return actual_kernel_type; +} + proto::DataType OperatorWithKernel::IndicateDataType( const ExecutionContext& ctx) const { auto& scope = ctx.scope(); diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h index b592eea1b9..55eed57e66 100644 --- a/paddle/framework/operator.h +++ b/paddle/framework/operator.h @@ -52,6 +52,11 @@ constexpr char kGradVarSuffix[] = "@GRAD"; /// Variables with this suffix are supposed to be filled up with zeros. constexpr char kZeroVarSuffix[] = "@ZERO"; +// define some kernel hint +const std::string kUseCPU = "use_cpu"; +const std::string kUseCUDNN = "use_cudnn"; +const std::string kUseMKLDNN = "use_mkldnn"; + inline std::string GradVarName(const std::string& var_name) { return var_name + kGradVarSuffix; } @@ -373,7 +378,9 @@ class OperatorWithKernel : public OperatorBase { } protected: - virtual OpKernelType GetKernelType(const ExecutionContext& ctx) const; + virtual OpKernelType GetActualKernelType(const ExecutionContext& ctx) const; + virtual OpKernelType GetExpectedKernelType( + const OpKernelType& actual_kernel_type) const; private: // indicate kernel DataType by input data. Defaultly all input data must be diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc index fbca45b59d..4d38a7ada9 100644 --- a/paddle/framework/operator_test.cc +++ b/paddle/framework/operator_test.cc @@ -114,7 +114,7 @@ class OpWithKernelTest : public OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override {} - OpKernelType GetKernelType(const ExecutionContext& ctx) const override { + OpKernelType GetActualKernelType(const ExecutionContext& ctx) const override { return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); } }; diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc index b8ed93f4eb..d7baa6e905 100644 --- a/paddle/operators/accuracy_op.cc +++ b/paddle/operators/accuracy_op.cc @@ -53,7 +53,7 @@ class AccuracyOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), diff --git a/paddle/operators/auc_op.cc b/paddle/operators/auc_op.cc index 811c487089..c16bc11931 100644 --- a/paddle/operators/auc_op.cc +++ b/paddle/operators/auc_op.cc @@ -39,7 +39,7 @@ class AucOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Out")->type()), diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index 1c14acbe11..49cb0fa4d9 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -304,7 +304,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { const auto *var = ctx.InputVar(framework::GradVarName("Y")); if (var == nullptr) { diff --git a/paddle/operators/chunk_eval_op.cc b/paddle/operators/chunk_eval_op.cc index f1f274a7af..a040404266 100644 --- a/paddle/operators/chunk_eval_op.cc +++ b/paddle/operators/chunk_eval_op.cc @@ -55,7 +55,7 @@ class ChunkEvalOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType(framework::proto::DataType::FP32, ctx.device_context()); diff --git a/paddle/operators/compare_op.cc b/paddle/operators/compare_op.cc index 1148172f3a..10bf3d4bbc 100644 --- a/paddle/operators/compare_op.cc +++ b/paddle/operators/compare_op.cc @@ -66,9 +66,9 @@ class CompareOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx); + framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx); // CompareOp kernel's device type is decided by input tensor place kt.place_ = ctx.Input("X")->place(); return kt; diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc index 27d0871f82..024e1d061a 100644 --- a/paddle/operators/crf_decoding_op.cc +++ b/paddle/operators/crf_decoding_op.cc @@ -120,12 +120,18 @@ class CRFDecodingOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Emission")->type()), ctx.device_context()); } + + framework::OpKernelType GetExpectedKernelType( + const framework::OpKernelType& actual_kernel_type) const override { + return framework::OpKernelType(actual_kernel_type.data_type_, + platform::CPUPlace()); + } }; } // namespace operators } // namespace paddle diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 1ab7c0a06f..a9c5c7046f 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -51,7 +51,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of cross_entropy // is determined by its input "X". - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -101,7 +101,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of cross_entropy // is determined by its input "X". - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/fill_constant_batch_size_like_op.cc b/paddle/operators/fill_constant_batch_size_like_op.cc index 7a7e280e78..852ecdfe45 100644 --- a/paddle/operators/fill_constant_batch_size_like_op.cc +++ b/paddle/operators/fill_constant_batch_size_like_op.cc @@ -49,7 +49,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc index 47af222314..45e9d8df70 100644 --- a/paddle/operators/gather_op.cc +++ b/paddle/operators/gather_op.cc @@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc index 5eab1d5f4e..da4d281081 100644 --- a/paddle/operators/gaussian_random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -57,7 +57,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), diff --git a/paddle/operators/linear_chain_crf_op.cc b/paddle/operators/linear_chain_crf_op.cc index ad15e8ebd2..666207ea07 100644 --- a/paddle/operators/linear_chain_crf_op.cc +++ b/paddle/operators/linear_chain_crf_op.cc @@ -183,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of computation kernel of linear_chain_crf // is determined by its input "Emission". - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Emission")->type()), @@ -242,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { protected: // Explicitly set that the data type of output of the linear_chain_crf_grad // operator is determined by its input: gradients of LogLikelihood. - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( diff --git a/paddle/operators/lod_reset_op.cc b/paddle/operators/lod_reset_op.cc index ccb87258c6..f33874bd7b 100644 --- a/paddle/operators/lod_reset_op.cc +++ b/paddle/operators/lod_reset_op.cc @@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -97,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/logical_op.cc b/paddle/operators/logical_op.cc index 2bd6c6efae..ee8e4dd2ad 100644 --- a/paddle/operators/logical_op.cc +++ b/paddle/operators/logical_op.cc @@ -99,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { - framework::OpKernelType kt = OperatorWithKernel::GetKernelType(ctx); + framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx); // LogicalOp kernel's device type is decided by input tensor place kt.place_ = ctx.Input("X")->place(); return kt; diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc index 0a9defa8c5..73b7464929 100644 --- a/paddle/operators/lookup_table_op.cc +++ b/paddle/operators/lookup_table_op.cc @@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("W")->type()), @@ -98,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("W")->type()), diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc index f82156170e..b8fcec0f29 100644 --- a/paddle/operators/lstm_op.cc +++ b/paddle/operators/lstm_op.cc @@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), @@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc index f524de60db..d25e4c269c 100644 --- a/paddle/operators/multiplex_op.cc +++ b/paddle/operators/multiplex_op.cc @@ -51,7 +51,7 @@ class MultiplexOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.MultiInput("X")[0]->type()), @@ -102,7 +102,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.MultiInput("X")[0]->type()), diff --git a/paddle/operators/nce_op.cc b/paddle/operators/nce_op.cc index 6dd457f7a2..d39ca87d53 100644 --- a/paddle/operators/nce_op.cc +++ b/paddle/operators/nce_op.cc @@ -63,7 +63,7 @@ class NCEOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), @@ -166,7 +166,7 @@ class NCEOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index 980e9dc08b..76c5123527 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -69,7 +69,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -90,7 +90,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/positive_negative_pair_op.cc b/paddle/operators/positive_negative_pair_op.cc index c607c93a15..a6b23c995b 100644 --- a/paddle/operators/positive_negative_pair_op.cc +++ b/paddle/operators/positive_negative_pair_op.cc @@ -85,7 +85,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Score")->type()), diff --git a/paddle/operators/precision_recall_op.cc b/paddle/operators/precision_recall_op.cc index 21dcd28c67..c5753147ef 100644 --- a/paddle/operators/precision_recall_op.cc +++ b/paddle/operators/precision_recall_op.cc @@ -80,7 +80,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("MaxProbs")->type()), diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc index 85b6a8e151..ef1804d976 100644 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -68,7 +68,7 @@ class ROIPoolOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -89,7 +89,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc index 173c958255..806dccc6ca 100644 --- a/paddle/operators/scatter_op.cc +++ b/paddle/operators/scatter_op.cc @@ -49,7 +49,7 @@ class ScatterOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Ref")->type()), @@ -68,7 +68,7 @@ class ScatterGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Ref")->type()), diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc index 0eb675caad..47f5bd891f 100644 --- a/paddle/operators/sequence_pool_op.cc +++ b/paddle/operators/sequence_pool_op.cc @@ -107,7 +107,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc index 309ee1f3a8..98bd885490 100644 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -48,7 +48,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -69,7 +69,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc index d9911a6901..13266d394d 100644 --- a/paddle/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/operators/softmax_with_cross_entropy_op.cc @@ -118,7 +118,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Logits")->type()), @@ -159,7 +159,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType( diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc index 891839bf9c..b86e826642 100644 --- a/paddle/operators/sum_op.cc +++ b/paddle/operators/sum_op.cc @@ -53,7 +53,7 @@ class SumOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { auto x_vars = ctx.MultiInputVar("X"); if (x_vars[0]->IsType()) { diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc index 3c705cb339..e985e491e9 100644 --- a/paddle/operators/uniform_random_op.cc +++ b/paddle/operators/uniform_random_op.cc @@ -63,7 +63,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { } protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc index 1b682d5c72..aeed9679b2 100644 --- a/paddle/operators/unpool_op.cc +++ b/paddle/operators/unpool_op.cc @@ -71,7 +71,7 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { class UnpoolOp : public framework::OperatorWithKernel { protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), @@ -110,7 +110,7 @@ class UnpoolOp : public framework::OperatorWithKernel { class UnpoolOpGrad : public framework::OperatorWithKernel { protected: - framework::OpKernelType GetKernelType( + framework::OpKernelType GetActualKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), diff --git a/paddle/pybind/const_value.cc b/paddle/pybind/const_value.cc index b13ad42ea2..761635aa5e 100644 --- a/paddle/pybind/const_value.cc +++ b/paddle/pybind/const_value.cc @@ -23,6 +23,11 @@ void BindConstValue(pybind11::module& m) { m.def("kTempVarName", [] { return framework::kTempVarName; }); m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); + + // for kernel_hint key + m.def("kUseCPU", [] { return framework::kUseCPU; }); + m.def("kUseCUDNN", [] { return framework::kUseCUDNN; }); + m.def("kUseMKLDNN", [] { return framework::kUseMKLDNN; }); } } // namespace pybind diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 7b65fe80ae..add854306e 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -17,6 +17,10 @@ TEMP_VAR_NAME = core.kTempVarName() GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() +USE_CPU = core.kUseCPU() +USE_CUDNN = core.kUseMKLDNN() +USE_MKLDNN = core.kUseMKLDNN() + def grad_var_name(var_name): """ From ea5d6eae9f8391aaea52d6891d83b60e1fabf1fd Mon Sep 17 00:00:00 2001 From: QI JUN Date: Mon, 25 Dec 2017 15:53:50 +0800 Subject: [PATCH 18/24] update support new device docs (#6963) * update docs * follow comments * fix typo --- doc/design/support_new_device.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md index fd23dc211a..f54b2b3694 100644 --- a/doc/design/support_new_device.md +++ b/doc/design/support_new_device.md @@ -25,13 +25,14 @@ There are mainly three parts that we have to consider while integrating a new de ### Place and DeviceContext +Please remind that device and computing library are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. #### Place -Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent different devices and computing libraries. There are inheritance relationships between different kinds of `Place`. +Fluid uses class [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L55) to represent the device memory where data is located. If we add another device, we have to add corresponding `DevicePlace`. ``` - | CPUPlace --> MKLDNNPlace -Place --| CUDAPlace --> CUDNNPlace + | CPUPlace +Place --| CUDAPlace | FPGAPlace ``` @@ -43,7 +44,7 @@ typedef boost::variant Place; #### DeviceContext -Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different hardwares, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`. +Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L30) to manage the resources in different libraries, such as CUDA stream in `CDUADeviceContext`. There are also inheritance relationships between different kinds of `DeviceContext`. ``` @@ -106,7 +107,7 @@ template size_t Used(Place place); ``` -To implementing these interfaces, we have to implement MemoryAllocator for different Devices +To implement these interfaces, we have to implement MemoryAllocator for different Devices. #### Tensor @@ -243,6 +244,7 @@ REGISTER_OP_CUDA_KERNEL( Generally, we will impelement OpKernel for all Device/Library of an Operator. We can easily train a Convolutional Neural Network in GPU. However, some OpKernel is not sutibale on a specific Device. For example, crf operator can only run on CPU, whereas most other operators can run at GPU. To achieve high performance in such circumstance, we have to switch between different Device/Library. -We will discuss how to implement an efficient OpKernel switch policy. +For more details, please refer to following docs: -- TBD +- operator kernel type [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md) +- switch kernel [doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md) From fbd1d1cf26fc0c0d242685c29fc89213c8151798 Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 25 Dec 2017 16:36:59 +0800 Subject: [PATCH 19/24] refine pipe_reader --- python/paddle/v2/reader/decorator.py | 121 +++++++++++---------------- 1 file changed, 50 insertions(+), 71 deletions(-) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 27c82c95f7..8262d992ea 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -334,93 +334,72 @@ def _buf2lines(buf, line_break="\n"): return lines[:-1], lines[-1] -def pipe_reader(left_cmd, - parser, - bufsize=8192, - file_type="plain", - cut_lines=True, - line_break="\n"): +class PipeReader: """ - pipe_reader read data by stream from a command, take it's - stdout into a pipe buffer and redirect it to the parser to - parse, then yield data as your desired format. + PipeReader read data by stream from a command, take it's + stdout into a pipe buffer and redirect it to the parser to + parse, then yield data as your desired format. - You can using standard linux command or call another program - to read data, from HDFS, Ceph, URL, AWS S3 etc: + You can using standard linux command or call another program + to read data, from HDFS, Ceph, URL, AWS S3 etc: - cmd = "hadoop fs -cat /path/to/some/file" - cmd = "cat sample_file.tar.gz" - cmd = "curl http://someurl" - cmd = "python print_s3_bucket.py" + .. code-block:: python + cmd = "hadoop fs -cat /path/to/some/file" + cmd = "cat sample_file.tar.gz" + cmd = "curl http://someurl" + cmd = "python print_s3_bucket.py" - A sample parser: + An example: + + .. code-block:: python - def sample_parser(lines): - # parse each line as one sample data, - # return a list of samples as batches. - ret = [] - for l in lines: - ret.append(l.split(" ")[1:5]) - return ret - - :param left_cmd: command to excute to get stdout from. - :type left_cmd: string - :param parser: parser function to parse lines of data. - if cut_lines is True, parser will receive list - of lines. - if cut_lines is False, parser will receive a - raw buffer each time. - parser should return a list of parsed values. - :type parser: callable - :param bufsize: the buffer size used for the stdout pipe. - :type bufsize: int - :param file_type: can be plain/gzip, stream buffer data type. - :type file_type: string - :param cut_lines: whether to pass lines instead of raw buffer - to the parser - :type cut_lines: bool - :param line_break: line break of the file, like \n or \r - :type line_break: string - - :return: the reader generator. - :rtype: callable + def example_reader(): + for f in myfiles: + pr = PipeReader("cat %s"%f) + for l in pr.get_line(): + sample = l.split(" ") + yield sample """ - if not isinstance(left_cmd, str): - raise TypeError("left_cmd must be a string") - if not callable(parser): - raise TypeError("parser must be a callable object") - - # TODO(typhoonzero): add a thread to read stderr - - # Always init a decompress object is better than - # create in the loop. - dec = zlib.decompressobj( - 32 + zlib.MAX_WBITS) # offset 32 to skip the header - def reader(): - process = subprocess.Popen( - left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + def __init__(self, command, bufsize=8192, file_type="plain"): + if not isinstance(command, str): + raise TypeError("left_cmd must be a string") + if file_type == "gzip": + self.dec = zlib.decompressobj( + 32 + zlib.MAX_WBITS) # offset 32 to skip the header + self.file_type = file_type + self.bufsize = bufsize + self.process = subprocess.Popen( + command.split(" "), bufsize=bufsize, stdout=subprocess.PIPE) + + def get_line(self, cut_lines=True, line_break="\n"): + """ + :param cut_lines: cut buffer to lines + :type cut_lines: bool + :param line_break: line break of the file, like \n or \r + :type line_break: string + + :return: one line or a buffer of bytes + :rtype: string + """ remained = "" while True: - buff = process.stdout.read(bufsize) + buff = self.process.stdout.read(self.bufsize) if buff: - if file_type == "gzip": - decomp_buff = dec.decompress(buff) - elif file_type == "plain": + if self.file_type == "gzip": + decomp_buff = self.dec.decompress(buff) + elif self.file_type == "plain": decomp_buff = buff else: - raise TypeError("file_type %s is not allowed" % file_type) + raise TypeError("file_type %s is not allowed" % + self.file_type) if cut_lines: lines, remained = _buf2lines(''.join( [remained, decomp_buff]), line_break) - parsed_list = parser(lines) - for ret in parsed_list: - yield ret + for line in lines: + yield line else: - for ret in parser(decomp_buff): - yield ret + yield decomp_buff else: break - - return reader From 9b67688b0d37e2d96d2261d8cdc42dc4b9473d1d Mon Sep 17 00:00:00 2001 From: typhoonzero Date: Mon, 25 Dec 2017 17:22:36 +0800 Subject: [PATCH 20/24] follow comments --- python/paddle/v2/reader/decorator.py | 2 +- .../paddle/v2/reader/tests/decorator_test.py | 24 +++++++++---------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index 8262d992ea..44a6e34463 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -14,7 +14,7 @@ __all__ = [ 'map_readers', 'buffered', 'compose', 'chain', 'shuffle', - 'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader' + 'ComposeNotAligned', 'firstn', 'xmap_readers', 'PipeReader' ] from threading import Thread diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py index 06e14796da..4ba71969df 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -147,8 +147,11 @@ class TestXmap(unittest.TestCase): class TestPipeReader(unittest.TestCase): def test_pipe_reader(self): - def simple_parser(lines): - return lines + def example_reader(myfiles): + for f in myfiles: + pr = paddle.v2.reader.PipeReader("cat %s" % f, bufsize=128) + for l in pr.get_line(): + yield l import tempfile @@ -159,17 +162,12 @@ class TestPipeReader(unittest.TestCase): for r in records: f.write('%s\n' % r) - cmd = "cat %s" % temp.name - reader = paddle.v2.reader.pipe_reader( - cmd, simple_parser, bufsize=128) - for i in xrange(4): - result = [] - for r in reader(): - result.append(r) - - for idx, e in enumerate(records): - print e, result[idx] - self.assertEqual(e, result[idx]) + result = [] + for r in example_reader([temp.name]): + result.append(r) + + for idx, e in enumerate(records): + self.assertEqual(e, result[idx]) finally: # delete the temporary file temp.close() From e9ee6fe8c361652cd405b13e7b81372dc73768ed Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 25 Dec 2017 20:26:40 +0800 Subject: [PATCH 21/24] update alexnet training data --- benchmark/IntelOptimizedPaddle.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md index 94a79c3c8e..6cc9598947 100644 --- a/benchmark/IntelOptimizedPaddle.md +++ b/benchmark/IntelOptimizedPaddle.md @@ -60,9 +60,9 @@ Input image size - 3 * 224 * 224, Time: images/second | BatchSize | 64 | 128 | 256 | |--------------|--------| ------ | -------| -| OpenBLAS | 0.85 | 1.03 | 1.17 | -| MKLML | 71.26 | 106.94 | 155.18 | -| MKL-DNN     | 362.66 | 497.66 | 610.73 | +| OpenBLAS | 2.13 | 2.45 | 2.68 | +| MKLML | 66.37 | 105.60 | 144.04 | +| MKL-DNN | 399.00 | 498.94 | 626.53 | chart TBD From 7777c811b1a9b5dfd86cb97fa347f4a1e6880314 Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 25 Dec 2017 20:44:26 +0800 Subject: [PATCH 22/24] "add data layout" (#6955) * "add data layout" * "need kernel registry support" * "fix data layout" * "reorder include headers" * "change enum to enum class" * "fix CI" --- paddle/framework/data_layout.h | 9 +++++---- paddle/framework/library_type.h | 8 ++++---- paddle/framework/op_kernel_type.h | 1 + paddle/framework/tensor.h | 19 ++++++++++++++++++- paddle/framework/tensor_impl.h | 1 + paddle/framework/tensor_test.cc | 9 +++++++++ paddle/framework/tensor_util.h | 2 ++ paddle/framework/tensor_util_test.cc | 9 ++++++++- paddle/pybind/pybind.cc | 4 ++++ 9 files changed, 52 insertions(+), 10 deletions(-) diff --git a/paddle/framework/data_layout.h b/paddle/framework/data_layout.h index 7d7a444cf0..4a8669c3a4 100644 --- a/paddle/framework/data_layout.h +++ b/paddle/framework/data_layout.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/platform/enforce.h" #include #include "paddle/platform/enforce.h" @@ -20,7 +21,7 @@ limitations under the License. */ namespace paddle { namespace framework { -enum DataLayout { +enum class DataLayout { kNHWC = 0, kNCHW = 1, kAnyLayout = 2, @@ -38,11 +39,11 @@ inline DataLayout StringToDataLayout(const std::string& str) { inline std::string DataLayoutToString(const DataLayout& data_layout) { switch (data_layout) { - case kNHWC: + case DataLayout::kNHWC: return "NHWC"; - case kNCHW: + case DataLayout::kNCHW: return "NCHW"; - case kAnyLayout: + case DataLayout::kAnyLayout: return "ANY_LAYOUT"; default: PADDLE_THROW("unknown DataLayou %d", data_layout); diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h index aa66cf00f3..6baae6c2bb 100644 --- a/paddle/framework/library_type.h +++ b/paddle/framework/library_type.h @@ -20,15 +20,15 @@ namespace framework { // For more details about the design of LibraryType, Please refer to // https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md#library -enum LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 }; +enum class LibraryType { kPlain = 0, kMKLDNN = 1, kCUDNN = 2 }; inline std::string LibraryTypeToString(const LibraryType& library_type) { switch (library_type) { - case kPlain: + case LibraryType::kPlain: return "PLAIN"; - case kMKLDNN: + case LibraryType::kMKLDNN: return "MKLDNN"; - case kCUDNN: + case LibraryType::kCUDNN: return "CUDNN"; default: PADDLE_THROW("unknown LibraryType %d", library_type); diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h index e9c45b958c..97b542e345 100644 --- a/paddle/framework/op_kernel_type.h +++ b/paddle/framework/op_kernel_type.h @@ -40,6 +40,7 @@ struct OpKernelType { // place, data_type, library_type kinds less than 2^8 constexpr static int LEFT_SHIFT = 8; + proto::DataType data_type_; DataLayout data_layout_; platform::Place place_; diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 6a0c5133c9..b9f6884f7c 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -20,12 +20,12 @@ limitations under the License. */ #include #include +#include "paddle/framework/data_layout.h" #include "paddle/framework/ddim.h" #include "paddle/memory/memory.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" -#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { @@ -115,6 +115,10 @@ class Tensor { inline void check_memory_size() const; + inline DataLayout layout() const { return layout_; } + + inline void set_layout(const DataLayout layout) { layout_ = layout; } + private: friend class LoDTensor; @@ -173,6 +177,19 @@ class Tensor { DDim dims_; + /** + * @brief the layout of memory block, default is NCHW. + * + * @note the memory allocation order, describe how weight/data is stored + * For example, in 4-D Tensor(rank=4), there are three commonly + * used layout. They are + * NCHW, NHWC, CHWN. + * N,C,H,W for respectively the batch size, the number of + * feature maps, the height. + */ + + DataLayout layout_ = DataLayout::kNHWC; + /** * @brief A PlaceHolder may be shared by more than one tensor. * diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 3d93b7808b..6c6f298edc 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -165,6 +165,7 @@ inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { size_t base = numel() / dims_[0]; Tensor dst; dst.holder_ = holder_; + dst.set_layout(layout_); DDim dst_dims = dims_; dst_dims[0] = end_idx - begin_idx; dst.Resize(dst_dims); diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index f347981f2e..ca76a9fcb9 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -200,3 +200,12 @@ TEST(Tensor, ReshapeToMatrix) { ASSERT_EQ(res.dims()[0], 2 * 3); ASSERT_EQ(res.dims()[1], 4 * 9); } + +TEST(Tensor, Layout) { + using namespace paddle::framework; + using namespace paddle::platform; + Tensor src; + ASSERT_EQ(src.layout(), DataLayout::kNHWC); + src.set_layout(DataLayout::kAnyLayout); + ASSERT_EQ(src.layout(), DataLayout::kAnyLayout); +} diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h index ebfb0e5538..692f5f1af7 100644 --- a/paddle/framework/tensor_util.h +++ b/paddle/framework/tensor_util.h @@ -33,6 +33,7 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, src.check_memory_size(); dst->Resize(src.dims()); + dst->set_layout(src.layout()); auto src_place = src.place(); auto src_ptr = src.data(); @@ -89,6 +90,7 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, Tensor* dst) { src.check_memory_size(); dst->Resize(src.dims()); + dst->set_layout(src.layout()); auto src_place = src.place(); auto src_ptr = src.data(); diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc index 6fc243aaf6..f388c19f28 100644 --- a/paddle/framework/tensor_util_test.cc +++ b/paddle/framework/tensor_util_test.cc @@ -28,6 +28,7 @@ TEST(CopyFrom, Tensor) { int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; memcpy(src_ptr, arr, 9 * sizeof(int)); + src_tensor.set_layout(DataLayout::kAnyLayout); auto cpu_place = new platform::CPUPlace(); CopyFrom(src_tensor, *cpu_place, &dst_tensor); @@ -38,14 +39,18 @@ TEST(CopyFrom, Tensor) { EXPECT_EQ(src_ptr[i], dst_ptr[i]); } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + Tensor slice_tensor = src_tensor.Slice(1, 2); - CopyFrom(slice_tensor, *cpu_place, cpu_ctx, &dst_tensor); + CopyFrom(slice_tensor, *cpu_place, &dst_tensor); const int* slice_ptr = slice_tensor.data(); dst_ptr = dst_tensor.data(); ASSERT_NE(dst_ptr, slice_ptr); for (size_t i = 0; i < 3; ++i) { EXPECT_EQ(dst_ptr[i], slice_ptr[i]); } + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); + #ifdef PADDLE_WITH_CUDA { Tensor src_tensor; @@ -91,6 +96,8 @@ TEST(CopyFrom, Tensor) { for (size_t i = 0; i < 3; ++i) { EXPECT_EQ(dst_ptr[i], slice_ptr[i]); } + + EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout()); } #endif } diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 668a48e816..07e38476e6 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -78,6 +78,10 @@ PYBIND11_PLUGIN(core) { [](Tensor &self, const std::vector &dim) { self.Resize(make_ddim(dim)); }) + .def("set_layout", + [](Tensor &self, const std::string &layout) { + self.set_layout(StringToDataLayout(layout)); + }) .def("alloc_float", [](Tensor &self, paddle::platform::CUDAPlace &place) { self.mutable_data(place); From 097d71a14f509133cd73885ef79b14f5aa024580 Mon Sep 17 00:00:00 2001 From: Luo Tao Date: Mon, 25 Dec 2017 21:08:34 +0800 Subject: [PATCH 23/24] Add "download mklml failed" into FAQ --- doc/faq/build_and_install/index_cn.rst | 28 ++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/doc/faq/build_and_install/index_cn.rst b/doc/faq/build_and_install/index_cn.rst index a2bdeead78..ed8a0c7e87 100644 --- a/doc/faq/build_and_install/index_cn.rst +++ b/doc/faq/build_and_install/index_cn.rst @@ -109,3 +109,31 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率,因此错误的使用二 解决办法是: * 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包,使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面,单元测试会引用site-packages里面的python包,而不是源码目录里 :code:`/python` 目录下的python包。同时,即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用,因为python的搜索路径是优先已经安装的python包。 + +8. 下载MKLML库失败 +------------------ + +.. code-block:: bash + + make[2]: *** [third_party/mklml/src/extern_mklml-stamp/extern_mklml-download] 错误 4 + make[1]: *** [CMakeFiles/extern_mklml.dir/all] 错误 2 + make[1]: *** 正在等待未完成的任务.... + +原因:网速或SSL链接原因,导致MKLML库下载不成功。 + +解决办法是:手动下载并安装,具体步骤如下。 + +.. code-block:: bash + + // 1. 进入对应的目录 + cd build/third_party/mklml/src/extern_mklml + + // 2. 查看包的大小, 正常情况下是75M,如果小于75M,即下载失败: + du -sh mklml_lnx_2018.0.1.20171007.tgz + + // 3. 手动下载且解压缩,并手动生成download成功标签: + wget --no-check-certificate https://github.com/01org/mkl-dnn/releases/download/v0.11/mklml_lnx_2018.0.1.20171007.tgz -c -O mklml_lnx_2018.0.1.20171007.tgz + tar zxf mklml_lnx_2018.0.1.20171007.tgz + touch ../extern_mklml-stamp/extern_mklml-download + + // 4. 接着编译即可 From 80dafdf594b28318a5c7904af718db815be015ff Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Tue, 26 Dec 2017 09:26:22 +0800 Subject: [PATCH 24/24] "fix threadpool style" (#7017) * "fix threadpool style" * "remove header" --- paddle/framework/CMakeLists.txt | 3 ++- paddle/framework/threadpool.cc | 24 ++++++++++++++++++++++++ paddle/framework/threadpool.h | 11 +++-------- paddle/framework/threadpool_test.cc | 6 ++---- 4 files changed, 31 insertions(+), 13 deletions(-) create mode 100644 paddle/framework/threadpool.cc diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 25a0db2768..c2a57a95ee 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -59,7 +59,8 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) -cc_test(threadpool_test SRCS threadpool_test.cc) +cc_library(threadpool SRCS threadpool.cc) +cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool) cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece) cc_test(init_test SRCS init_test.cc DEPS init) diff --git a/paddle/framework/threadpool.cc b/paddle/framework/threadpool.cc new file mode 100644 index 0000000000..2b9be0646c --- /dev/null +++ b/paddle/framework/threadpool.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/threadpool.h" + +namespace paddle { +namespace framework { + +std::unique_ptr ThreadPool::threadpool(nullptr); +std::once_flag ThreadPool::init_flag; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h index 9a1ece3ae8..5f6b2d458f 100644 --- a/paddle/framework/threadpool.h +++ b/paddle/framework/threadpool.h @@ -13,15 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + #include -#include #include -#include #include #include #include -#include "paddle/platform/call_once.h" #include "paddle/platform/enforce.h" namespace paddle { @@ -81,10 +79,9 @@ class ThreadPool { } private: - ThreadPool& operator=(const ThreadPool&) = delete; - ThreadPool(const ThreadPool&) = delete; + DISABLE_COPY_AND_ASSIGN(ThreadPool); - ThreadPool(int num_threads) + explicit ThreadPool(int num_threads) : num_threads_(num_threads), available_(num_threads), running_(true) { threads_.resize(num_threads); for (auto& thread : threads_) { @@ -155,7 +152,5 @@ class ThreadPool { std::condition_variable completed_; }; -std::unique_ptr ThreadPool::threadpool(nullptr); -std::once_flag ThreadPool::init_flag; } // namespace framework } // namespace paddle diff --git a/paddle/framework/threadpool_test.cc b/paddle/framework/threadpool_test.cc index 78c762608e..012d92a5ed 100644 --- a/paddle/framework/threadpool_test.cc +++ b/paddle/framework/threadpool_test.cc @@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "threadpool.h" #include #include -#include -#include -#include + +#include "threadpool.h" namespace framework = paddle::framework;