Remove template parameter for Tensor methods (#4937)

* Remove template parameter for Tensor methods

* Also check that the stored type is correct in data()
* Simplify holder_

* Fix accuracy_op

* Register Code
Branch: revert-4814-Add_sequence_project_op
Author: Yu Yang (committed by GitHub, 8 years ago)
parent 43702a89d5
commit c532b96741
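Taken together, the change moves the element type from every call site into the tensor's allocation: mutable_data<T>() records a std::type_index in the holder, data<T>() verifies it, and Slice/CopyFrom/ShareDataWith recover the element size from it instead of from a template argument. A minimal standalone sketch of that idea (MiniTensor and every name below are illustrative, not Paddle code):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <typeindex>
#include <vector>

// Toy type-erased buffer: the element type is remembered as a std::type_index
// instead of being repeated as a template parameter of every accessor, which
// is the pattern this commit introduces for Tensor.
class MiniTensor {
 public:
  template <typename T>
  T* mutable_data(std::size_t n) {
    numel_ = n;
    type_ = typeid(T);                 // record the element type once
    buf_.assign(n * sizeof(T), 0);
    return reinterpret_cast<T*>(buf_.data());
  }

  template <typename T>
  const T* data() const {
    // The check added by this commit: typed access must match the stored type.
    assert(std::type_index(typeid(T)) == type_ && "Tensor holds the wrong type");
    return reinterpret_cast<const T*>(buf_.data());
  }

  // No template parameter: the byte count follows from the stored type.
  void CopyFrom(const MiniTensor& src) {
    numel_ = src.numel_;
    type_ = src.type_;
    buf_ = src.buf_;                   // byte-wise copy is type-agnostic
  }

  std::type_index type() const { return type_; }
  std::size_t numel() const { return numel_; }

 private:
  std::vector<uint8_t> buf_;
  std::type_index type_{typeid(void)};
  std::size_t numel_{0};
};

int main() {
  MiniTensor src, dst;
  float* p = src.mutable_data<float>(4);
  p[0] = 1.5f;
  dst.CopyFrom(src);                   // no CopyFrom<float>(...) needed
  assert(dst.data<float>()[0] == 1.5f);
  return 0;
}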

@ -26,6 +26,8 @@ inline DataType ToDataType(std::type_index type) {
return DataType::FP64;
} else if (typeid(int).hash_code() == type.hash_code()) {
return DataType::INT32;
} else if (typeid(int64_t).hash_code() == type.hash_code()) {
return DataType::INT64;
} else {
PADDLE_THROW("Not supported");
}
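The only functional addition here is the int64_t branch. As a reminder of the pattern, a condensed standalone stand-in (only the enum members visible in this hunk; the real DataType enum has more):

#include <cstdint>
#include <stdexcept>
#include <typeindex>

enum class DataType { FP64, INT32, INT64 };

// Condensed stand-in for the mapping above: compare the runtime
// std::type_index against typeid(...) and translate it to an enum value.
inline DataType ToDataType(std::type_index type) {
  if (type == typeid(double)) return DataType::FP64;
  if (type == typeid(int)) return DataType::INT32;
  if (type == typeid(int64_t)) return DataType::INT64;
  throw std::runtime_error("Not supported");
}

int main() { return ToDataType(typeid(int64_t)) == DataType::INT64 ? 0 : 1; }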

@ -34,7 +34,7 @@ void SetFeedVariable(const LoDTensor& input, const std::string& var_name,
feed_inputs.resize(index + 1);
}
// shared data with input tensor
feed_inputs[index].ShareDataWith<T>(input);
feed_inputs[index].ShareDataWith(input);
// set lod
feed_inputs[index].set_lod(input.lod());
}

@ -60,6 +60,10 @@ class Tensor {
template <typename T>
inline T* mutable_data(platform::Place place);
inline void* mutable_data(platform::Place place, std::type_index type);
inline void* mutable_data(platform::Place place);
/**
* @brief Return a pointer to mutable memory block.
*
@ -81,7 +85,6 @@ class Tensor {
inline Tensor& Resize(const DDim& dims);
/*! The internal of two tensors share the same memory block. */
template <typename T>
inline Tensor& ShareDataWith(const Tensor& src);
/**
@ -96,26 +99,9 @@ class Tensor {
// TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
// Remove `CopyFrom` and `CopyFromVector` from Tensor interface
// and make them global functions
template <typename T>
inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
const platform::DeviceContext& ctx);
// FIXME(yuyang18): CopyFrom should without template T, use the replace
// `CopyFrom` with `CopyFromTensor`
inline void CopyFromTensor(const Tensor& src,
const platform::Place& dst_place,
const platform::DeviceContext& ctx) {
// NOLINTNEXTLINES_8 cpplint.py will recognize below lines as functions.
// That is a bug of cpplint.py. Just ignore lint these lines.
if (src.type() == std::type_index(typeid(double))) {
CopyFrom<double>(src, dst_place, ctx);
} else if (src.type() == std::type_index(typeid(float))) {
CopyFrom<float>(src, dst_place, ctx);
} else if (src.type() == std::type_index(typeid(int))) {
CopyFrom<int>(src, dst_place, ctx);
}
}
/**
* @brief Copy the content of an external vector to a tensor.
*
@ -135,7 +121,6 @@ class Tensor {
* @param[in] begin_idx The begin index of the slice.
* @param[in] end_idx The end index of the slice.
*/
template <typename T>
inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
platform::Place place() const {
@ -146,7 +131,6 @@ class Tensor {
std::type_index type() const { return holder_->type(); }
private:
template <typename T>
inline void check_memory_size() const;
private:
@ -155,20 +139,22 @@ class Tensor {
* parameter of Variable.
*/
struct Placeholder {
virtual ~Placeholder() {}
virtual ~Placeholder() = default;
virtual void* ptr() const = 0;
virtual size_t size() const = 0;
virtual std::type_index type() const = 0;
virtual platform::Place place() const = 0;
virtual void set_type(std::type_index type) = 0;
};
template <typename T, typename Place>
template <typename Place>
struct PlaceholderImpl : public Placeholder {
PlaceholderImpl(Place place, size_t size)
: ptr_(static_cast<T*>(memory::Alloc(place, size)),
memory::PODDeleter<T, Place>(place)),
PlaceholderImpl(Place place, size_t size, std::type_index type)
: ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
memory::PODDeleter<uint8_t, Place>(place)),
place_(place),
size_(size) {
size_(size),
type_(type) {
PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
(is_cpu_place(place_) ? "CPU" : "GPU"));
}
@ -176,16 +162,20 @@ class Tensor {
virtual size_t size() const { return size_; }
virtual platform::Place place() const { return place_; }
virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
virtual std::type_index type() const { return std::type_index(typeid(T)); }
virtual std::type_index type() const { return type_; }
virtual void set_type(std::type_index type) { type_ = type; }
/*! the pointer of memory block. */
std::unique_ptr<T, memory::PODDeleter<T, Place>> ptr_;
std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
/*! the place of memory block. */
platform::Place place_;
/*! the size of memory block. */
size_t size_;
/* the current type of memory */
std::type_index type_;
};
/*! holds the memory block if allocated. */
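With the type recorded in the holder, PlaceholderImpl no longer needs an element-type parameter: one instantiation per Place owns raw uint8_t storage plus a std::type_index. A minimal sketch of that holder pattern, with std::malloc/std::free and FreeDeleter standing in for memory::Alloc and PODDeleter:

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <typeindex>

struct FreeDeleter {
  void operator()(uint8_t* p) const { std::free(p); }
};

// Type-erased holder interface, mirroring the shape of Placeholder above.
struct Placeholder {
  virtual ~Placeholder() = default;
  virtual void* ptr() const = 0;
  virtual size_t size() const = 0;
  virtual std::type_index type() const = 0;
};

struct BytesPlaceholder : Placeholder {
  BytesPlaceholder(size_t size, std::type_index type)
      : ptr_(static_cast<uint8_t*>(std::malloc(size))),
        size_(size),
        type_(type) {}

  void* ptr() const override { return ptr_.get(); }
  size_t size() const override { return size_; }
  std::type_index type() const override { return type_; }

  std::unique_ptr<uint8_t, FreeDeleter> ptr_;  // raw bytes, no element type
  size_t size_;
  std::type_index type_;  // the element type travels alongside the bytes
};

int main() {
  std::unique_ptr<Placeholder> holder(
      new BytesPlaceholder(16 * sizeof(float), typeid(float)));
  std::cout << holder->type().name() << ", " << holder->size() << " bytes\n";
  return 0;
}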

@ -106,8 +106,8 @@ void TensorArray::Write(size_t index, const LoDTensor& value) {
values_[index].Resize(value.dims());
values_[index].mutable_data<value_type>(platform::CPUPlace());
values_[index].CopyFrom<value_type>(value, platform::CPUPlace(),
platform::CPUDeviceContext());
values_[index].CopyFrom(value, platform::CPUPlace(),
platform::CPUDeviceContext());
}
void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
@ -116,7 +116,7 @@ void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
values_.resize(index + 1);
}
values_[index].ShareDataWith<value_type>(value);
values_[index].ShareDataWith(value);
}
LoDTensor TensorArray::Pack(size_t level, const std::vector<DySeqMeta>& meta,
@ -163,9 +163,9 @@ LoDTensor TensorArray::Stack() const {
result.mutable_data<value_type>(platform::CPUPlace());
for (size_t idx = 0; idx < size(); idx++) {
result.Slice<value_type>(idx, idx + 1)
.CopyFrom<value_type>(Read(idx), platform::CPUPlace(),
platform::CPUDeviceContext());
result.Slice(idx, idx + 1)
.CopyFrom(Read(idx), platform::CPUPlace(),
platform::CPUDeviceContext());
}
return result;
}
@ -191,13 +191,12 @@ void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
auto& value = values_[elem];
if (data_shared) {
// share memory
value.ShareDataWith<value_type>(source.Slice<value_type>(elem, elem + 1));
value.ShareDataWith(source.Slice(elem, elem + 1));
} else {
// copy
value.Resize(value_dims);
value.CopyFrom<value_type>(source.Slice<value_type>(elem, elem + 1),
platform::CPUPlace(),
platform::CPUDeviceContext());
value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(),
platform::CPUDeviceContext());
}
}
}
@ -242,11 +241,10 @@ LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
for (size_t i = 0; i < indice.size(); i++) {
auto index = indice[i];
auto target = result.Slice<value_type>(i, i + 1);
auto slice = source->Slice<value_type>(index, index + 1);
auto target = result.Slice(i, i + 1);
auto slice = source->Slice(index, index + 1);
target.CopyFrom<value_type>(slice, platform::CPUPlace(),
platform::CPUDeviceContext());
target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext());
}
return result;
@ -277,10 +275,10 @@ LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
// target is result[index]
auto index = seq_meta.begin + batch_id;
if (index >= seq_meta.end) break;
auto source_ = source[batch_id].Slice<float>(seq_id, seq_id + 1);
auto target = result.Slice<float>(index, index + 1);
target.CopyFrom<float>(source_, platform::CPUPlace(),
platform::CPUDeviceContext());
auto source_ = source[batch_id].Slice(seq_id, seq_id + 1);
auto target = result.Slice(index, index + 1);
target.CopyFrom(source_, platform::CPUPlace(),
platform::CPUDeviceContext());
}
}

@ -91,7 +91,7 @@ class TensorArrayPackTester : public ::testing::Test {
size_t begin = level[i];
size_t end = level[i + 1];
for (size_t j = begin; j < end; j++) {
auto record = source.Slice<int>(j, j + 1);
auto record = source.Slice(j, j + 1);
for (int dim = 0; dim < 128; dim++) {
record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
}

@ -19,12 +19,50 @@ limitations under the License. */
namespace paddle {
namespace framework {
template <typename... T>
struct SizeOfTypeFunctor;
template <typename T>
struct SizeOfTypeFunctor<T> {
size_t operator()(std::type_index type) const {
if (typeid(T).hash_code() == type.hash_code()) {
return sizeof(T);
} else {
return 0UL;
}
}
};
template <>
struct SizeOfTypeFunctor<> {
size_t operator()(std::type_index type) const { return 0UL; }
};
template <typename HEAD, typename... TAIL>
struct SizeOfTypeFunctor<HEAD, TAIL...> {
size_t operator()(std::type_index type) const {
SizeOfTypeFunctor<HEAD> head;
size_t head_size = head(type);
if (head_size != 0) {
return head_size;
}
SizeOfTypeFunctor<TAIL...> tail;
return tail(type);
}
};
static inline size_t SizeOfType(std::type_index type) {
SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor;
size_t size = functor(type);
PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
return size;
}
inline void Tensor::check_memory_size() const {
PADDLE_ENFORCE_NOT_NULL(
holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE_GE(
holder_->size(), numel() * sizeof(T) + offset_,
holder_->size(), numel() * SizeOfType(type()) + offset_,
"Tensor's dims_ is out of bound. Call Tensor::mutable_data "
"first to re-allocate memory.\n"
"or maybe the required data-type mismatches the data already stored.");
@ -32,14 +70,23 @@ inline void Tensor::check_memory_size() const {
template <typename T>
inline const T* Tensor::data() const {
check_memory_size<T>();
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code(),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
template <typename T>
inline T* Tensor::data() {
check_memory_size<T>();
check_memory_size();
PADDLE_ENFORCE(std::is_same<T, void>::value ||
holder_->type().hash_code() == typeid(T).hash_code(),
"Tensor holds the wrong type, it holds %s",
this->holder_->type().name());
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
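Note the std::is_same<T, void> escape hatch: data<void>() skips the type check, which is what the non-templated CopyFrom below relies on to obtain a raw source pointer. A reduced illustration (access_allowed and held are illustrative names, not Paddle APIs):

#include <cassert>
#include <type_traits>
#include <typeindex>

// Typed access must match the recorded element type, but T = void is always
// allowed so untyped code paths can still get a raw pointer to the bytes.
template <typename T>
bool access_allowed(std::type_index held) {
  return std::is_same<T, void>::value ||
         held.hash_code() == typeid(T).hash_code();
}

int main() {
  std::type_index held = typeid(float);  // stands in for holder_->type()
  assert(access_allowed<float>(held));   // matching type: OK
  assert(!access_allowed<double>(held)); // mismatched type: rejected
  assert(access_allowed<void>(held));    // void: untyped escape hatch
  return 0;
}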
@ -54,51 +101,62 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
template <typename T>
inline T* Tensor::mutable_data(platform::Place place) {
static_assert(std::is_pod<T>::value, "T must be POD");
return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
}
inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
if (holder_ != nullptr) {
holder_->set_type(type);
}
PADDLE_ENFORCE_GT(numel(), 0,
"Tensor's numel must be larger than zero to call "
"Tensor::mutable_data. Call Tensor::set_dim first.");
int64_t size = numel() * SizeOfType(type);
/* some versions of boost::variant don't have operator!= */
int64_t size = numel() * sizeof(T);
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {
if (platform::is_cpu_place(place)) {
holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size));
holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
boost::get<platform::CPUPlace>(place), size, type));
} else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
}
#else
holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), size));
holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
boost::get<platform::GPUPlace>(place), size, type));
}
#endif
offset_ = 0;
}
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
inline void* Tensor::mutable_data(platform::Place place) {
PADDLE_ENFORCE(this->holder_ != nullptr,
"Cannot invoke mutable data if current hold nothing");
return mutable_data(place, holder_->type());
}
template <typename T>
inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
src.check_memory_size<T>();
src.check_memory_size();
*this = src;
return *this;
}
template <typename T>
inline void Tensor::CopyFrom(const Tensor& src,
const platform::Place& dst_place,
const platform::DeviceContext& ctx) {
src.check_memory_size<T>();
src.check_memory_size();
Resize(src.dims());
auto src_place = src.holder_->place();
auto src_ptr = static_cast<const void*>(src.data<T>());
auto src_ptr = src.data<void>();
auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
auto dst_ptr = mutable_data(dst_place, src.type());
auto size = src.numel() * sizeof(T);
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
@ -165,9 +223,8 @@ inline void Tensor::CopyFromVector(const std::vector<T>& src,
#endif
}
template <typename T>
inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
check_memory_size<T>();
check_memory_size();
PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero.");
PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound.");
PADDLE_ENFORCE_LT(begin_idx, end_idx,
@ -182,7 +239,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx;
dst.Resize(dst_dims);
dst.offset_ = offset_ + begin_idx * base * sizeof(T);
dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
return dst;
}
}
@ -196,10 +253,9 @@ inline const DDim& Tensor::dims() const { return dims_; }
inline int64_t Tensor::numel() const { return product(dims_); }
template <typename T>
inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
Tensor res;
res.ShareDataWith<T>(src);
res.ShareDataWith(src);
res.Resize(flatten_to_2d(src.dims(), num_col_dims));
return res;
}
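ReshapeToMatrix now shares data untyped and only reshapes; the 2-D shape comes from flatten_to_2d, which, as far as I can tell, folds the first num_col_dims axes into rows and the remaining axes into columns. A hedged sketch of that folding, checked against the ReshapeToMatrix test later in this diff:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <utility>
#include <vector>

// Assumption about flatten_to_2d's semantics, not its source: fold the first
// num_col_dims dimensions into the row count and the rest into the column
// count. The test below expects {2, 3, 4, 9} with num_col_dims == 2 to become
// a (2*3) x (4*9) matrix, which this reproduces.
std::pair<int64_t, int64_t> flatten_to_2d(const std::vector<int64_t>& dims,
                                          int num_col_dims) {
  int64_t rows = std::accumulate(dims.begin(), dims.begin() + num_col_dims,
                                 int64_t{1}, std::multiplies<int64_t>());
  int64_t cols = std::accumulate(dims.begin() + num_col_dims, dims.end(),
                                 int64_t{1}, std::multiplies<int64_t>());
  return {rows, cols};
}

int main() {
  auto shape = flatten_to_2d({2, 3, 4, 9}, 2);
  assert(shape.first == 2 * 3 && shape.second == 4 * 9);
  return 0;
}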

@ -108,7 +108,7 @@ TEST(Tensor, ShareDataWith) {
// Try to share data form uninitialized tensor
bool caught = false;
try {
dst_tensor.ShareDataWith<float>(src_tensor);
dst_tensor.ShareDataWith(src_tensor);
} catch (paddle::platform::EnforceNotMet err) {
caught = true;
std::string msg =
@ -122,7 +122,7 @@ TEST(Tensor, ShareDataWith) {
ASSERT_TRUE(caught);
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
dst_tensor.ShareDataWith<int>(src_tensor);
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
@ -131,7 +131,7 @@ TEST(Tensor, ShareDataWith) {
Tensor src_tensor;
Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
dst_tensor.ShareDataWith<int>(src_tensor);
dst_tensor.ShareDataWith(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
@ -143,7 +143,7 @@ TEST(Tensor, Slice) {
{
Tensor src_tensor;
src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
Tensor slice_tensor = src_tensor.Slice(1, 3);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 3);
EXPECT_EQ(slice_dims[0], 2);
@ -167,7 +167,7 @@ TEST(Tensor, Slice) {
{
Tensor src_tensor;
src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
Tensor slice_tensor = src_tensor.Slice(2, 6);
DDim slice_dims = slice_tensor.dims();
ASSERT_EQ(arity(slice_dims), 2);
EXPECT_EQ(slice_dims[0], 4);
@ -202,7 +202,7 @@ TEST(Tensor, CopyFrom) {
memcpy(src_ptr, arr, 9 * sizeof(int));
auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(src_tensor, *cpu_place, cpu_ctx);
dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx);
const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr);
@ -210,8 +210,8 @@ TEST(Tensor, CopyFrom) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place, cpu_ctx);
Tensor slice_tensor = src_tensor.Slice(1, 2);
dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx);
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr);
@ -233,11 +233,11 @@ TEST(Tensor, CopyFrom) {
// CPU Tensor to GPU Tensor
auto gpu_place = new paddle::platform::GPUPlace(0);
CUDADeviceContext gpu_ctx(*gpu_place);
gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place, gpu_ctx);
gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx);
// GPU Tensor to CPU Tensor
auto cpu_place = new paddle::platform::CPUPlace();
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors
gpu_ctx.Wait();
@ -247,13 +247,13 @@ TEST(Tensor, CopyFrom) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]);
}
Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor
gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place, gpu_ctx);
gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx);
// GPU Tensor to CPU Tensor
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Slice Tensors
gpu_ctx.Wait();
@ -320,7 +320,7 @@ TEST(Tensor, CopyFromVector) {
CUDADeviceContext gpu_ctx(*gpu_place);
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
// Copy from GPU to CPU tensor for comparison
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors
gpu_ctx.Wait();
@ -340,7 +340,7 @@ TEST(Tensor, CopyFromVector) {
cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
gpu_tensor.Resize(make_ddim({2, 2}));
gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place, gpu_ctx);
dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
// Sync before Compare Tensors
gpu_ctx.Wait();
@ -368,7 +368,7 @@ TEST(Tensor, ReshapeToMatrix) {
for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
src_ptr[i] = i;
}
Tensor res = ReshapeToMatrix<int>(src, 2);
Tensor res = ReshapeToMatrix(src, 2);
ASSERT_EQ(res.dims()[0], 2 * 3);
ASSERT_EQ(res.dims()[1], 4 * 9);
}

@ -69,5 +69,8 @@ information, or not. But the output only shares the LoD with input `Inference`.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker);
REGISTER_OP_CPU_KERNEL(accuracy,
ops::AccuracyKernel<paddle::platform::CPUPlace, float>);
REGISTER_OP_CPU_KERNEL(
accuracy, ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
ops::AccuracyKernel<paddle::platform::CPUPlace, int>,
ops::AccuracyKernel<paddle::platform::CPUPlace, double>,
ops::AccuracyKernel<paddle::platform::CPUPlace, int64_t>);
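Registering one AccuracyKernel instantiation per element type lets the framework dispatch on the tensor's recorded type at run time. A generic sketch of such a type-keyed dispatch table (a toy registry, not the Paddle REGISTER_OP_CPU_KERNEL machinery):

#include <cassert>
#include <cstdint>
#include <functional>
#include <typeindex>
#include <unordered_map>

// Toy kernel registry: one entry per element type, keyed by std::type_index,
// mirroring the idea of listing several AccuracyKernel<CPUPlace, T> above.
using Kernel = std::function<void(const void* data, int n)>;

std::unordered_map<std::type_index, Kernel>& registry() {
  static std::unordered_map<std::type_index, Kernel> r;
  return r;
}

template <typename T>
void register_kernel(Kernel k) {
  registry()[typeid(T)] = std::move(k);
}

void dispatch(std::type_index type, const void* data, int n) {
  auto it = registry().find(type);
  assert(it != registry().end() && "no kernel registered for this type");
  it->second(data, n);
}

int main() {
  register_kernel<float>([](const void*, int) { /* float accuracy kernel */ });
  register_kernel<int64_t>([](const void*, int) { /* int64 accuracy kernel */ });
  float xs[3] = {0.0f, 1.0f, 2.0f};
  dispatch(typeid(float), xs, 3);  // routed to the float kernel
  return 0;
}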

@ -21,9 +21,9 @@ namespace paddle {
namespace operators {
using platform::PADDLE_CUDA_NUM_THREADS;
template <int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
const int* labeldata, float* accuracy) {
template <typename T, int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D, const T* Xdata,
const T* labeldata, float* accuracy) {
int count = 0;
__shared__ int total[BlockSize];
@ -57,8 +57,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
auto* accuracy = ctx.Output<Tensor>("Accuracy");
// FIXME(typhoonzero): only support indices currently
// if add support for output values, how to detect the data type?
const int* inference_data = inference->data<int>();
const int* label_data = label->data<int>();
const T* inference_data = inference->data<T>();
const T* label_data = label->data<T>();
float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
size_t num_samples = inference->dims()[0];
@ -69,7 +69,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
return;
}
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
AccuracyCudaKernel<T, PADDLE_CUDA_NUM_THREADS><<<
1, PADDLE_CUDA_NUM_THREADS, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
@ -81,5 +81,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_GPU_KERNEL(accuracy,
paddle::operators::AccuracyOpCUDAKernel<float>);
REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>,
paddle::operators::AccuracyOpCUDAKernel<int>,
paddle::operators::AccuracyOpCUDAKernel<int64_t>);

@ -108,17 +108,17 @@ class GemmConv2DKernel : public framework::OpKernel<T> {
int in_step = input_channels / groups;
int out_step = output_channels / groups;
for (int i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice<T>(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0], strides[1],
paddings[0], paddings[1]);
// gemm
Tensor out_slice = out_batch.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), filter_slice, false,
col_matrix, false, T(1.0), &out_slice, T(0.0));
}
@ -198,22 +198,20 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
Tensor in_grad_batch =
input_grad->Slice<T>(i, i + 1).Resize(input_shape);
output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// gemm
Tensor out_grad_slice =
out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor filter_slice =
filter.Slice<T>(g * out_step, (g + 1) * out_step);
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), filter_slice, true,
out_grad_slice, false, T(1.0), &col_matrix,
T(0.0));
// col2im
Tensor in_grad_slice =
in_grad_batch.Slice<T>(g * in_step, (g + 1) * in_step);
in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
col2im(context.device_context(), in_grad_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
}
@ -229,19 +227,19 @@ class GemmConvGrad2DKernel : public framework::OpKernel<T> {
for (int i = 0; i < batch_size; i++) {
Tensor out_grad_batch =
output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
for (int g = 0; g < groups; g++) {
// im2col
Tensor out_grad_slice =
out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
im2col(context.device_context(), in_slice, col, strides[0],
strides[1], paddings[0], paddings[1]);
// gemm
Tensor filter_grad_slice =
filter_grad_.Slice<T>(g * out_step, (g + 1) * out_step);
filter_grad_.Slice(g * out_step, (g + 1) * out_step);
math::matmul<Place, T>(context.device_context(), out_grad_slice,
false, col_matrix, true, T(1.0),
&filter_grad_slice, T(1.0));

@ -48,12 +48,11 @@ inline void ReorderBootState(const DySeqMetaBatch& metas,
const LoDTensor& boot_state, LoDTensor* tensor,
const platform::Place& dst_place) {
for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
auto slice = tensor->Slice<T>(seq_id, seq_id + 1);
auto slice = tensor->Slice(seq_id, seq_id + 1);
auto boot_slice =
boot_state.Slice<T>(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
// TODO(superjom) pass in device context as an argument
slice.template CopyFrom<T>(boot_slice, dst_place,
platform::CPUDeviceContext());
slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext());
}
}
@ -138,7 +137,7 @@ void DynamicRecurrentOp::WriteStepInputs() const {
if (var == nullptr) {
var = step_scope.Var(item.first);
}
var->GetMutable<LoDTensor>()->ShareDataWith<value_type>(tensor);
var->GetMutable<LoDTensor>()->ShareDataWith(tensor);
}
}
}
@ -206,7 +205,7 @@ void DynamicRecurrentOp::ConcatOutputs() const {
for (auto& item : step_outputs_) {
auto tensor = item.second.Pack(level, some_meta, some_lod);
auto* output = cache_.outlinks[item.first]->GetMutable<LoDTensor>();
const_cast<LoDTensor*>(output)->ShareDataWith<value_type>(tensor);
const_cast<LoDTensor*>(output)->ShareDataWith(tensor);
}
}
@ -260,8 +259,8 @@ void DynamicRecurrentOp::LinkState(const rnn::MemoryAttr& memory,
}
// shink and share from previous state
auto shrinked_pre_state = pre_state->Slice<value_type>(0, num_instances);
state_pre.ShareDataWith<value_type>(shrinked_pre_state);
auto shrinked_pre_state = pre_state->Slice(0, num_instances);
state_pre.ShareDataWith(shrinked_pre_state);
}
void DynamicRecurrentOp::ArgCache::Init(

@ -47,7 +47,7 @@ class FeedOp : public framework::OperatorBase {
auto &feed_list = feed_var->Get<framework::FeedFetchList>();
auto &feed_item = feed_list.at(static_cast<size_t>(col));
auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
out_item->CopyFromTensor(feed_item, dev_ctx.GetPlace(), dev_ctx);
out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx);
out_item->set_lod(feed_item.lod());
}
};

@ -51,7 +51,7 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs?
dst_item.CopyFromTensor(src_item, platform::CPUPlace(), dev_ctx);
dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx);
VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
}

@ -64,7 +64,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom<float>(input_tmp, *place, *context);
input.CopyFrom(input_tmp, *place, *context);
}
output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place);
@ -85,8 +85,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>();
} else {
output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace(),
*context);
output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context);
out_cfo_ptr = output_tmp.data<float>();
}
EXPECT_EQ(out_cfo_ptr[0], 0);
@ -102,8 +101,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>();
} else {
output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace(),
*context);
output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context);
out_ocf_ptr = output_tmp.data<float>();
}
EXPECT_EQ(out_ocf_ptr[0], 0);

@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) {
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
input1_gpu.CopyFrom(input1, *gpu_place, context);
input2_gpu.CopyFrom(input1, *gpu_place, context);
out_gpu.mutable_data<float>({2, 2}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
out.CopyFrom<float>(out_gpu, *cpu_place, context);
out.CopyFrom(out_gpu, *cpu_place, context);
float* out_ptr = out.data<float>();
context.Wait();
@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) {
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
input2_gpu.CopyFrom<float>(input1, *gpu_place, context);
input1_gpu.CopyFrom(input1, *gpu_place, context);
input2_gpu.CopyFrom(input1, *gpu_place, context);
out_gpu.mutable_data<float>({3, 3}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
out.CopyFrom<float>(out_gpu, *cpu_place, context);
out.CopyFrom(out_gpu, *cpu_place, context);
float* out_ptr = out.data<float>();
context.Wait();
@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) {
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
input1_gpu.CopyFrom(input1, *gpu_place, context);
input2_gpu.CopyFrom(input2, *gpu_place, context);
input3_gpu.CopyFrom(input3, *gpu_place, context);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) {
paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
input3.CopyFrom(input3_gpu, *cpu_place, context);
// numpy code:
// a = np.arange(6).reshape(2, 3)
@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) {
auto* gpu_place = new paddle::platform::GPUPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
input1_gpu.CopyFrom<float>(input1, *gpu_place, context);
input2_gpu.CopyFrom<float>(input2, *gpu_place, context);
input3_gpu.CopyFrom<float>(input3, *gpu_place, context);
input1_gpu.CopyFrom(input1, *gpu_place, context);
input2_gpu.CopyFrom(input2, *gpu_place, context);
input3_gpu.CopyFrom(input3, *gpu_place, context);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) {
paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
input3.CopyFrom<float>(input3_gpu, *cpu_place, context);
input3.CopyFrom(input3_gpu, *cpu_place, context);
context.Wait();
EXPECT_EQ(input3_ptr[0], 0);

@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) {
EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu;
out_cpu.CopyFrom<float>(*out_value, cpu_place, ctx);
out_cpu.CopyFrom(*out_value, cpu_place, ctx);
ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>();
@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) {
add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
Tensor tensor2_cpu;
tensor2_cpu.CopyFrom<float>(*tensor2, cpu_place, ctx);
tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx);
ctx.Wait();
auto* tensor2_cpu_data = tensor2_cpu.data<float>();

@ -78,7 +78,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom<float>(input_tmp, *place, *context);
input.CopyFrom(input_tmp, *place, *context);
}
output.mutable_data<float>({1, filter_size, filter_size, filter_size,
output_depth, output_height, output_width},
@ -93,7 +93,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output.data<float>();
} else {
output_tmp.CopyFrom<float>(output, paddle::platform::CPUPlace(), *context);
output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context);
out_cfo_ptr = output_tmp.data<float>();
}
@ -107,7 +107,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp;
} else {
input.CopyFrom<float>(input_tmp, *place, *context);
input.CopyFrom(input_tmp, *place, *context);
}
paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
@ -118,7 +118,7 @@ void testVol2col() {
if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>();
} else {
input_tmp.CopyFrom<float>(input, paddle::platform::CPUPlace(), *context);
input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
in_ptr = input_tmp.data<float>();
}

@ -46,7 +46,7 @@ class MatMulKernel : public framework::OpKernel<T> {
template <typename T>
inline Tensor Reshape(const Tensor& input, const DDim& dims) {
Tensor output;
output.ShareDataWith<T>(input);
output.ShareDataWith(input);
output.Resize(dims);
return output;
}
@ -56,7 +56,7 @@ inline Tensor Reshape(const Tensor& input, const DDim& dims) {
template <typename T>
Tensor CombineBatchAndM(const Tensor& input) {
Tensor output;
output.ShareDataWith<T>(input);
output.ShareDataWith(input);
auto in_dims = input.dims();
if (in_dims.size() == 3) {
std::vector<int64_t> out_dims = {in_dims[0] * in_dims[1], in_dims[2]};
@ -80,7 +80,7 @@ Tensor CombineBatchAndN(const framework::ExecutionContext& context,
std::vector<int64_t> out_dims = {in_dims[1], in_dims[0] * in_dims[2]};
output.Resize(make_ddim(out_dims));
} else {
output.ShareDataWith<T>(input);
output.ShareDataWith(input);
}
return output;
}

@ -36,12 +36,12 @@ class MulKernel : public framework::OpKernel<T> {
Tensor* z = context.Output<Tensor>("Out");
const Tensor x_matrix =
x->dims().size() > 2
? framework::ReshapeToMatrix<T>(
? framework::ReshapeToMatrix(
*x, context.template Attr<int>("x_num_col_dims"))
: *x;
const Tensor y_matrix =
y->dims().size() > 2
? framework::ReshapeToMatrix<T>(
? framework::ReshapeToMatrix(
*y, context.template Attr<int>("y_num_col_dims"))
: *y;
@ -59,30 +59,30 @@ class MulGradKernel : public framework::OpKernel<T> {
int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
const Tensor* x = ctx.Input<Tensor>("X");
const Tensor* y = ctx.Input<Tensor>("Y");
const Tensor x_matrix =
x->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*x, x_num_col_dims)
: *x;
const Tensor y_matrix =
y->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*y, y_num_col_dims)
: *y;
const Tensor x_matrix = x->dims().size() > 2
? framework::ReshapeToMatrix(*x, x_num_col_dims)
: *x;
const Tensor y_matrix = y->dims().size() > 2
? framework::ReshapeToMatrix(*y, y_num_col_dims)
: *y;
const Tensor* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
if (dx) {
dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
*dx, x_num_col_dims)
: *dx;
Tensor dx_matrix = dx->dims().size() > 2
? framework::ReshapeToMatrix(*dx, x_num_col_dims)
: *dx;
// dx = dout * y'. dx: M x K, dout : M x N, y : K x N
math::matmul<Place, T>(ctx.device_context(), *dout, false, y_matrix, true,
1, &dx_matrix, 0);
}
if (dy) {
dy->mutable_data<T>(ctx.GetPlace());
Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
*dy, y_num_col_dims)
: *dy;
Tensor dy_matrix = dy->dims().size() > 2
? framework::ReshapeToMatrix(*dy, y_num_col_dims)
: *dy;
// dy = x' * dout. dy K x N, dout : M x N, x : M x K
math::matmul<Place, T>(ctx.device_context(), x_matrix, true, *dout, false,
1, &dy_matrix, 0);

@ -33,8 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace(),
ctx.device_context());
index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
auto* index = index_t_cpu.data<int32_t>();
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
@ -71,8 +70,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
auto cols = ins[0]->numel() / rows;
// copy index to cpu
Tensor index_t_cpu;
index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace(),
ctx.device_context());
index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
auto* index = index_t_cpu.data<int32_t>();
auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(

@ -95,7 +95,7 @@ void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>();
pre_mem->Resize(boot_mem->dims());
PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
pre_mem->ShareDataWith<float>(*boot_mem);
pre_mem->ShareDataWith(*boot_mem);
}
}
@ -171,7 +171,7 @@ void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
auto* boot_mem_grad =
step_scope->Var(attr.boot_var)->GetMutable<LoDTensor>();
boot_mem_grad->Resize(mem_grad->dims());
boot_mem_grad->ShareDataWith<float>(*mem_grad);
boot_mem_grad->ShareDataWith(*mem_grad);
}
}

@ -33,7 +33,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
std::transform(shape.begin(), shape.end(), shape_int64.begin(),
[](int a) { return static_cast<int64_t>(a); });
auto out_dims = framework::make_ddim(shape_int64);
out->CopyFrom<T>(*in, ctx.GetPlace(), ctx.device_context());
out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context());
out->Resize(out_dims);
}
};
@ -47,7 +47,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
d_x->mutable_data<T>(ctx.GetPlace());
auto in_dims = d_x->dims();
d_x->CopyFrom<T>(*d_out, ctx.GetPlace(), ctx.device_context());
d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context());
d_x->Resize(in_dims);
}
};

@ -43,7 +43,7 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
step_scopes[j]->Var(inlinks[i])->GetMutable<Tensor>();
// The input of operators of each step is Tensor here.
// Maybe need to modify Slice function.
*step_input = input->Slice<float>(j, j + 1);
*step_input = input->Slice(j, j + 1);
step_input->Resize(step_dims);
}
}
@ -71,8 +71,8 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
// TODO(luotao02) data type and platform::DeviceContext() should set
// correctly
(output->Slice<float>(j, j + 1))
.CopyFrom<float>(*step_output, platform::CPUPlace(), ctx);
(output->Slice(j, j + 1))
.CopyFrom(*step_output, platform::CPUPlace(), ctx);
}
}
}
@ -95,7 +95,7 @@ void LinkMemories(const std::vector<Scope*>& scopes,
auto* mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
mem->Resize(linked_mem->dims());
mem->ShareDataWith<float>(*linked_mem);
mem->ShareDataWith(*linked_mem);
}
}

@ -30,7 +30,7 @@ class ScatterOpCUDAKernel : public framework::OpKernel<T> {
auto *Updates = ctx.Input<Tensor>("Updates");
auto *Out = ctx.Output<Tensor>("Out");
Out->ShareDataWith<T>(*Ref);
Out->ShareDataWith(*Ref);
GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
}
@ -48,7 +48,7 @@ class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
// In place gradient: dRef = dO
dRef->ShareDataWith<T>(*dOut);
dRef->ShareDataWith(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates = dO[Index]
GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);

@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
auto *Out = ctx.Output<Tensor>("Out");
// In place output: Out = Ref, Out[Index] += Updates
Out->ShareDataWith<T>(*Ref);
Out->ShareDataWith(*Ref);
// Apply ScatterUpdate: Out[index] += Updates[:]
ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
}
@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
// In place gradient: dRef = dO
dRef->ShareDataWith<T>(*dOut);
dRef->ShareDataWith(*dOut);
dUpdates->mutable_data<T>(ctx.GetPlace());
// Gradient by Gather: dUpdates += dO[Index]
CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);

Some files were not shown because too many files have changed in this diff.