From ba168bd2d23f763f1b4c6357943da01890fc6421 Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Wed, 22 Aug 2018 12:14:26 +0000 Subject: [PATCH] modify API.spec --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/stack_op.h | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 9250cde1b2..c03df86e0f 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -162,6 +162,7 @@ paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None)) +paddle.fluid.layers.stack ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=(0,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) diff --git a/paddle/fluid/operators/stack_op.h b/paddle/fluid/operators/stack_op.h index b139f48d87..c777d5feae 100644 --- a/paddle/fluid/operators/stack_op.h +++ b/paddle/fluid/operators/stack_op.h @@ -154,17 +154,22 @@ class StackKernel : public framework::OpKernel<T> { if (std::is_same<platform::CPUDeviceContext, DeviceContext>::value || n > kMaxThreshold) { #ifdef __NVCC__ + VLOG(10) << "Stack more than " << kMaxThreshold + << " tensors on GPU may be slow."; thrust::device_vector<const T *> device_x_vec(x_datas); auto x_data_arr = device_x_vec.data().get(); #else auto x_data_arr = x_datas.data(); #endif StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); +#ifdef __NVCC__ + // Wait() must be called because device_x_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif } #ifdef __NVCC__ else { // NOLINT - VLOG(10) << "Stack more than " << kMaxThreshold - << " tensors on GPU may be slow."; framework::Array<const T *, kMaxThreshold> x_data_arr; for (int i = 0; i < n; ++i) x_data_arr[i] = x_datas[i]; StackFunctorForRange(dev_ctx, x_data_arr, y_data, total_num, n, post); @@ -243,6 +248,8 @@ class StackGradKernel : public framework::OpKernel<T> { if (std::is_same<platform::CPUDeviceContext, DeviceContext>::value || n > kMaxThreshold) { #ifdef __NVCC__ + VLOG(10) << "Stack more than " << kMaxThreshold + << " tensors on GPU may be slow."; thrust::device_vector<T *> device_dx_vec(dx_datas); auto dx_data_arr = device_dx_vec.data().get(); #else @@ -250,11 +257,14 @@ #endif StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n, post); +#ifdef __NVCC__ + // Wait() must be called because device_dx_vec may be destructed before + // kernel ends + dev_ctx.Wait(); +#endif } #ifdef __NVCC__ else { // NOLINT - VLOG(10) << "Stack more than " << kMaxThreshold - << " tensors on GPU may be slow."; framework::Array<T *, kMaxThreshold> dx_data_arr; for (int i = 0; i < n; ++i) dx_data_arr[i] = dx_datas[i]; StackGradFunctorForRange(dev_ctx, dx_data_arr, dy_data, total_num, n,