Merge pull request #4177 from reyoung/feature/transform_ctx

Feature/transform ctx
8 years ago · 24cb1ce002
parent c4e783e505 eeaccc8351
commit 24cb1ce002
3 changed files with 29 additions and 17 deletions
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@ -24,4 +24,4 @@ cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
 nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)

 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
-nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place)
+nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
--- a/paddle/platform/transform.h
+++ b/paddle/platform/transform.h
@ -14,6 +14,7 @@

 #pragma once

+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/hostdevice.h"
 #include "paddle/platform/place.h"
@ -21,6 +22,7 @@
 #include <algorithm>
 #include <type_traits>
 #ifdef __NVCC__
+#include <thrust/execution_policy.h>
 #include <thrust/transform.h>
 #include "paddle/platform/details/device_ptr_cast.h"
 #endif
@ -28,34 +30,39 @@
 namespace paddle {
 namespace platform {
 // Transform on host or device. It provides the same API in std library.
-template <typename Place, typename InputIter, typename OutputIter,
-          typename UnaryOperation>
-void Transform(Place place, InputIter first, InputIter last, OutputIter result,
-               UnaryOperation op) {
+template <typename InputIter, typename OutputIter, typename UnaryOperation>
+void Transform(const DeviceContext& context, InputIter first, InputIter last,
+               OutputIter result, UnaryOperation op) {
+  auto place = context.GetPlace();
  if (is_cpu_place(place)) {
    std::transform(first, last, result, op);
  } else {
 #ifdef __NVCC__
+    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
    using namespace details;
-    thrust::transform(DevPtrCast(first), DevPtrCast(last), DevPtrCast(result),
-                      op);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first),
+                      DevPtrCast(last), DevPtrCast(result), op);
 #else
    PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif
  }
 }

-template <typename Place, typename InputIter1, typename InputIter2,
-          typename OutputIter, typename BinaryOperation>
-void Transform(Place place, InputIter1 first1, InputIter1 last1,
-               InputIter2 first2, OutputIter result, BinaryOperation op) {
+template <typename InputIter1, typename InputIter2, typename OutputIter,
+          typename BinaryOperation>
+void Transform(const DeviceContext& context, InputIter1 first1,
+               InputIter1 last1, InputIter2 first2, OutputIter result,
+               BinaryOperation op) {
+  auto place = context.GetPlace();
  if (is_cpu_place(place)) {
    std::transform(first1, last1, first2, result, op);
  } else {
 #ifdef __NVCC__
+    auto& ctx = reinterpret_cast<const CUDADeviceContext&>(context);
    using namespace details;
-    thrust::transform(DevPtrCast(first1), DevPtrCast(last1), DevPtrCast(first2),
-                      DevPtrCast(result), op);
+    thrust::transform(thrust::cuda::par.on(ctx.stream()), DevPtrCast(first1),
+                      DevPtrCast(last1), DevPtrCast(first2), DevPtrCast(result),
+                      op);
 #else
    PADDLE_THROW("Do not invoke `Transform<GPUPlace>` in .cc file");
 #endif
--- a/paddle/platform/transform_test.cu
+++ b/paddle/platform/transform_test.cu
@ -36,8 +36,9 @@ class Multiply {

 TEST(Transform, CPUUnary) {
  using namespace paddle::platform;
+  CPUDeviceContext ctx;
  float buf[4] = {0.1, 0.2, 0.3, 0.4};
-  Transform(CPUPlace(), buf, buf + 4, buf, Scale<float>(10));
+  Transform(ctx, buf, buf + 4, buf, Scale<float>(10));
  for (int i = 0; i < 4; ++i) {
    ASSERT_NEAR(buf[i], static_cast<float>(i + 1), 1e-5);
  }
@ -47,10 +48,12 @@ TEST(Transform, GPUUnary) {
  using namespace paddle::platform;
  using namespace paddle::memory;
  GPUPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
  float cpu_buf[4] = {0.1, 0.2, 0.3, 0.4};
  float* gpu_buf = static_cast<float*>(Alloc(gpu0, sizeof(float) * 4));
  Copy(gpu0, gpu_buf, CPUPlace(), cpu_buf, sizeof(cpu_buf));
-  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, Scale<float>(10));
+  ctx.Wait();
  Copy(CPUPlace(), cpu_buf, gpu0, gpu_buf, sizeof(cpu_buf));
  Free(gpu0, gpu_buf);
  for (int i = 0; i < 4; ++i) {
@ -62,7 +65,7 @@ TEST(Transform, CPUBinary) {
  using namespace paddle::platform;
  using namespace paddle::memory;
  int buf[4] = {1, 2, 3, 4};
-  Transform(CPUPlace(), buf, buf + 4, buf, buf, Multiply<int>());
+  Transform(CPUDeviceContext(), buf, buf + 4, buf, buf, Multiply<int>());
  for (int i = 0; i < 4; ++i) {
    ASSERT_EQ((i + 1) * (i + 1), buf[i]);
  }
@ -73,9 +76,11 @@ TEST(Transform, GPUBinary) {
  using namespace paddle::memory;
  int buf[4] = {1, 2, 3, 4};
  GPUPlace gpu0(0);
+  CUDADeviceContext ctx(gpu0);
  int* gpu_buf = static_cast<int*>(Alloc(gpu0, sizeof(buf)));
  Copy(gpu0, gpu_buf, CPUPlace(), buf, sizeof(buf));
-  Transform(gpu0, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  Transform(ctx, gpu_buf, gpu_buf + 4, gpu_buf, gpu_buf, Multiply<int>());
+  ctx.Wait();
  Copy(CPUPlace(), buf, gpu0, gpu_buf, sizeof(buf));
  Free(gpu0, gpu_buf);
  for (int i = 0; i < 4; ++i) {