Merge pull request #8275 from reyoung/feature/rewrite_vector

Rewrite mixed_vector.h
Yu Yang authored 8 years ago, committed by GitHub
commit 06f10942a8

.gitignore (vendored)

@@ -1,3 +1,9 @@
+paddle/operators/check_t.save
+paddle/operators/check_tensor.ls
+paddle/operators/tensor.save
+python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
+python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
+python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
 *.DS_Store
 build/
 build_doc/

@@ -181,7 +181,8 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
 elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
   list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
 elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
-  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
+  # nvcc 9 does not support -Os. Use Release flags instead
+  list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
 endif()

 mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)

@@ -46,29 +46,7 @@ namespace framework {
  * 0 2 4 7
  * 0 2 5 7 10 12 15 20
  */
-struct LoD : public std::vector<Vector<size_t>> {
-  using std::vector<Vector<size_t>>::vector;
-
-  platform::Place place() const {
-    if (this->size() == 0) {
-      // Not Initialze Yet.
-      return platform::CPUPlace();
-    } else {
-      return this->front().place();
-    }
-  }
-
-  void CopyFromCUDA() {
-    for (auto it = this->begin(); it != this->end(); ++it) {
-      it->CopyFromCUDA();
-    }
-  }
-
-  void CopyToPeer(platform::Place place) {
-    for (auto it = this->begin(); it != this->end(); ++it) {
-      it->CopyToPeer(place);
-    }
-  }
-};
+using LoD = std::vector<Vector<size_t>>;

 std::ostream& operator<<(std::ostream& os, const LoD& lod);
 std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
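Because LoD is now a plain alias for std::vector<Vector<size_t>>, the LoD-wide place()/CopyFromCUDA()/CopyToPeer() helpers disappear and device handling lives entirely inside each Vector<size_t> level. A minimal sketch of a call site after this change, using only accessors exercised elsewhere in this PR (everything else is illustrative):

// Sketch only: LoD is an ordinary std::vector of framework::Vector levels.
paddle::framework::LoD lod;
lod.push_back(std::vector<size_t>({0, 2, 4, 7}));

paddle::platform::CUDAPlace gpu(0);
// Device access is per level, as the kernels below do; there is no
// explicit LoD-wide copy step any more.
const size_t* level0_on_gpu = lod[0].CUDAData(gpu);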

@@ -20,6 +20,7 @@
 #include "paddle/platform/assert.h"
 #include <gtest/gtest.h>
+#include <paddle/platform/place.h>

 __global__ void test(size_t* a, int size) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
@@ -36,10 +37,9 @@ TEST(LoD, data) {
   lod.push_back(std::vector<size_t>({0, 1, 6, 8, 10, 11}));

   auto& v = lod[0];
-  test<<<1, 1>>>(v.cuda_data(), v.size());
+  paddle::platform::CUDAPlace gpu(0);
+  test<<<1, 1>>>(v.CUDAMutableData(gpu), v.size());
   cudaDeviceSynchronize();
-  v.CopyFromCUDA();

   for (size_t i = 0; i < v.size(); ++i) {
     EXPECT_EQ(v[i], i * 2);
   }
@@ -63,9 +63,8 @@ TEST(LoDTensor, LoDInGPU) {
   auto lod = lod_tensor.lod();

-  test<<<1, 8>>>(lod[0].cuda_data(), lod[0].size());
+  test<<<1, 8>>>(lod[0].CUDAMutableData(place), lod[0].size());
   cudaDeviceSynchronize();
-  lod.CopyFromCUDA();

   for (size_t i = 0; i < src_lod[0].size(); ++i) {
     EXPECT_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);

File diff suppressed because it is too large.
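The suppressed diff is the rewritten paddle/framework/mixed_vector.h itself. Its public surface is not shown here, but the call sites in the rest of this PR imply roughly the following shape; treat this as an inferred sketch, not the actual header:

// Inferred from usage in this PR -- a sketch only, not the real declaration.
// platform::Place comes from paddle/platform/place.h.
namespace paddle {
namespace framework {

template <typename T>
class Vector {
 public:
  // std::vector-like CPU interface used by the tests and operators:
  // push_back, size, operator[], begin/end, resize, and
  // Extend(first, last) to append a range.

  // Device-aware accessors this PR switches the operators to:
  const T* CUDAData(platform::Place place) const;  // read-only device pointer
  T* CUDAMutableData(platform::Place place);       // writable device pointer
  T* MutableData(platform::Place place);           // writable pointer on place
  T* data();                                       // CPU pointer
};

}  // namespace framework
}  // namespace paddle

The old cuda_data()/mutable_data(place)/CopyFromCUDA() entry points are gone; the replacements take the target place explicitly, which is exactly what every operator change below switches to.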

@@ -11,62 +11,83 @@
  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  See the License for the specific language governing permissions and
  limitations under the License. */
+#include <cuda.h>
 #include <cuda_runtime.h>
-#include "gtest/gtest.h"
-#include "paddle/framework/init.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
 #include "paddle/framework/mixed_vector.h"
+#include "paddle/platform/gpu_info.h"
-
-using namespace paddle::framework;
-using namespace paddle::platform;
-using namespace paddle::memory;

 template <typename T>
-__global__ void test(T* data, int size) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
-       i += blockDim.x * gridDim.x) {
-    data[i] *= 2;
+using vec = paddle::framework::Vector<T>;
+
+TEST(mixed_vector, CPU_VECTOR) {
+  vec<int> tmp;
+  for (int i = 0; i < 10; ++i) {
+    tmp.push_back(i);
+  }
+  ASSERT_EQ(tmp.size(), 10);
+  vec<int> tmp2;
+  tmp2 = tmp;
+  ASSERT_EQ(tmp2.size(), 10);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(tmp2[i], i);
+    ASSERT_EQ(tmp2[i], tmp[i]);
+  }
+  int cnt = 0;
+  for (auto& t : tmp2) {
+    ASSERT_EQ(t, cnt);
+    ++cnt;
   }
 }

-TEST(Vector, Normal) {
-  // fill the device context pool.
-  InitDevices();
+static __global__ void multiply_10(int* ptr) {
+  for (int i = 0; i < 10; ++i) {
+    ptr[i] *= 10;
+  }
+}
+
+cudaStream_t GetCUDAStream(paddle::platform::CUDAPlace place) {
+  return reinterpret_cast<const paddle::platform::CUDADeviceContext*>(
+             paddle::platform::DeviceContextPool::Instance().Get(place))
+      ->stream();
+}

-  Vector<size_t> vec({1, 2, 3});
-  size_t* ptr = vec.data();
-  for (size_t i = 0; i < vec.size(); ++i) {
-    EXPECT_EQ(vec[i], *(ptr + i));
+TEST(mixed_vector, GPU_VECTOR) {
+  vec<int> tmp;
+  for (int i = 0; i < 10; ++i) {
+    tmp.push_back(i);
   }
+  ASSERT_EQ(tmp.size(), 10);
+  paddle::platform::CUDAPlace gpu(0);

-  vec.clear();
-  vec.CopyFromCUDA();
+  multiply_10<<<1, 1, 0, GetCUDAStream(gpu)>>>(tmp.MutableData(gpu));

-  std::vector<size_t> v = {1, 2, 3};
-  for (size_t i = 0; i < v.size(); ++i) {
-    EXPECT_EQ(v[i], vec[i]);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(tmp[i], i * 10);
   }
 }

-TEST(Vector, MultipleCopy) {
-  InitDevices();
-  Vector<size_t> vec({1, 2, 3});
-  CUDAPlace place(0);
-  vec.mutable_data(place);
-  auto vec2 = Vector<size_t>(vec);
-  {
-    const size_t* ptr = vec2.data(CPUPlace());
-    for (size_t i = 0; i < vec2.size(); ++i) {
-      EXPECT_EQ(*(ptr + i), vec[i]);
-    }
+TEST(mixed_vector, MultiGPU) {
+  if (paddle::platform::GetCUDADeviceCount() < 2) {
+    LOG(WARNING) << "Skip mixed_vector.MultiGPU since there are not multiple "
+                    "GPUs in your machine.";
+    return;
   }
-  test<size_t><<<3, 3>>>(vec2.mutable_data(place), vec2.size());
-  vec2.CopyFromCUDA();
-  {
-    const size_t* ptr = vec2.data(CPUPlace());
-    for (size_t i = 0; i < vec2.size(); ++i) {
-      EXPECT_EQ(*(ptr + i), vec[i] * 2);
-    }
+
+  vec<int> tmp;
+  for (int i = 0; i < 10; ++i) {
+    tmp.push_back(i);
   }
+  ASSERT_EQ(tmp.size(), 10);
+  paddle::platform::CUDAPlace gpu0(0);
+  paddle::platform::SetDeviceId(0);
+  multiply_10<<<1, 1, 0, GetCUDAStream(gpu0)>>>(tmp.MutableData(gpu0));
+  paddle::platform::CUDAPlace gpu1(1);
+  auto* gpu1_ptr = tmp.MutableData(gpu1);
+  paddle::platform::SetDeviceId(1);
+  multiply_10<<<1, 1, 0, GetCUDAStream(gpu1)>>>(gpu1_ptr);
+  for (int i = 0; i < 10; ++i) {
+    ASSERT_EQ(tmp[i], i * 100);
+  }
 }
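Worth noting: the MultiGPU test mutates the same Vector on two different devices and then reads the result back through operator[] on the host, expecting i * 100. That only works if MutableData(place) hands out a pointer on the requested place after syncing from wherever the latest copy lives, and host reads sync back again; this implicit synchronization is the contract the operator changes below rely on when they replace cuda_data() with CUDAData()/CUDAMutableData().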

@@ -120,6 +120,7 @@ class Tensor {
     return holder_->type();
   }

+  // memory size returns the holding memory size in byte.
   size_t memory_size() const;

   inline void check_memory_size() const;

@@ -52,7 +52,7 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };

 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool> functor;
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t, bool, size_t> functor;
   size_t size = functor(type);
   PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
   return size;
@@ -61,15 +61,15 @@ static inline size_t SizeOfType(std::type_index type) {
 inline void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_GE(
-      holder_->size(), memory_size() + offset_,
+  PADDLE_ENFORCE_LE(
+      numel() * SizeOfType(type()), memory_size(),
       "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
       "first to re-allocate memory.\n"
       "or maybe the required data-type mismatches the data already stored.");
 }

 inline size_t Tensor::memory_size() const {
-  return holder_ == nullptr ? 0UL : numel() * SizeOfType(type());
+  return holder_ == nullptr ? 0UL : holder_->size() - offset_;
 }

 template <typename T>
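The new check is the old inequality rearranged: numel() * SizeOfType(type()) <= holder_->size() - offset_ is exactly holder_->size() >= memory_size() + offset_ under the old definition of memory_size(). What changes is what memory_size() reports: the bytes actually held past offset_, rather than the bytes implied by dims_. As an illustrative example (numbers invented), a 400-byte holder with offset_ = 16 and 96 float elements gives memory_size() = 384 and a requirement of 96 * 4 = 384 bytes, which passes.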

@@ -101,9 +101,9 @@ struct SparseAdagradFunctor<platform::CUDADeviceContext, T> {
     SparseAdagradFunctorKernel<
         T, 256><<<grid2, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(grad_merge_data, merge_rows.cuda_data(), lr,
-                                   param_data, moment_data, grad_width,
-                                   epsilon);
+                      .stream()>>>(
+        grad_merge_data, merge_rows.CUDAMutableData(context.GetPlace()), lr,
+        param_data, moment_data, grad_width, epsilon);
   }
 };

@@ -201,7 +201,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
       const T* grad_data = grad_tensor.template data<T>();
       int64_t* rows = nullptr;
       if (platform::is_gpu_place(ctx.GetPlace())) {
-        rows = grad_merge.mutable_rows()->cuda_data();
+        rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
       } else {
         rows = grad_merge.mutable_rows()->data();
       }

@@ -69,8 +69,9 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
     auto stream = ctx.cuda_device_context().stream();
     MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
-        num_tokens, tokens, num_seq, input_lod[level].cuda_data(), blank,
-        merge_repeated, dev_out_lod0_ptr, output_data);
+        num_tokens, tokens, num_seq,
+        input_lod[level].CUDAMutableData(ctx.GetPlace()), blank, merge_repeated,
+        dev_out_lod0_ptr, output_data);

     // set output lod
     std::vector<size_t> host_out_lod0(dev_out_lod0.begin(), dev_out_lod0.end());

@@ -125,7 +125,9 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     new_rows.resize(ids_dim[0]);
     auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());

-    memory::Copy(platform::CPUPlace(), new_rows.cuda_data(), gpu_place,
+    // TODO(yuyang18): Strange code here.
+    memory::Copy(platform::CPUPlace(),
+                 new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
                  ids_data, ids_dim[0] * sizeof(int64_t), stream);

     d_table->set_rows(new_rows);

@@ -128,7 +128,7 @@ struct SelectedRowsAddTo<platform::CPUDeviceContext, T> {
     auto* in2_value = input2->mutable_value();

     // concat rows
-    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+    in2_rows.Extend(in1_rows.begin(), in1_rows.end());

     auto in1_place = input1.place();
     PADDLE_ENFORCE(platform::is_cpu_place(in1_place));

@@ -126,7 +126,8 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
     dim3 grid(1, in1_rows.size());
     SelectedRowsAddTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.cuda_data(), out_data, in1_row_numel);
+        in1_data, in1_rows.CUDAData(context.GetPlace()), out_data,
+        in1_row_numel);

     auto out_eigen = framework::EigenVector<T>::Flatten(*output);
     auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
@@ -153,7 +154,9 @@ struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
     auto* in2_value = input2->mutable_value();

     // concat rows
-    in2_rows.insert(in2_rows.end(), in1_rows.begin(), in1_rows.end());
+    if (in1_rows.size()) {
+      in2_rows.Extend(in1_rows.begin(), in1_rows.end());
+    }

     auto in1_place = input1.place();
     PADDLE_ENFORCE(platform::is_gpu_place(in1_place));
@@ -216,7 +219,8 @@ struct SelectedRowsAddToTensor<platform::CUDADeviceContext, T> {
     dim3 grid(1, in1_rows.size());
     SelectedRowsAddToTensorKernel<
         T, block_size><<<grid, threads, 0, context.stream()>>>(
-        in1_data, in1_rows.cuda_data(), in2_data, in1_row_numel);
+        in1_data, in1_rows.CUDAData(context.GetPlace()), in2_data,
+        in1_row_numel);
   }
 };
@@ -283,8 +287,9 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
     MergeAddKernel<
         T, 256><<<grid1, threads, 0,
                   reinterpret_cast<const platform::CUDADeviceContext&>(context)
-                      .stream()>>>(input_data, input_rows.cuda_data(), out_data,
-                                   out.mutable_rows()->cuda_data(),
-                                   out.rows().size(), input_width);
+                      .stream()>>>(
+        input_data, input_rows.CUDAData(context.GetPlace()), out_data,
+        out.mutable_rows()->CUDAMutableData(context.GetPlace()),
+        out.rows().size(), input_width);
     return out;
   }
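Appending one SelectedRows row list to another now goes through Vector::Extend instead of std::vector::insert, and the CUDA path skips the call entirely when the source is empty. A small sketch of the pattern (values and construction are illustrative; the rows are framework::Vector<int64_t> as in SelectedRows::rows()):

// Sketch of the row-concatenation idiom used above.
paddle::framework::Vector<int64_t> in2_rows(std::vector<int64_t>{0, 3, 5});
paddle::framework::Vector<int64_t> in1_rows(std::vector<int64_t>{7, 9});

if (in1_rows.size()) {  // mirrors the guard added on the CUDA path
  in2_rows.Extend(in1_rows.begin(), in1_rows.end());
}
// in2_rows now holds {0, 3, 5, 7, 9}.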

@@ -45,7 +45,6 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& src,
                   framework::Vector<size_t> index_lod, framework::Tensor& dst,
                   bool is_src_index) {
-    size_t* index = index_lod.cuda_data();
     auto src_dims = src.dims();
     auto dst_dims = dst.dims();
     PADDLE_ENFORCE_EQ(src_dims.size(), 2,
@@ -63,7 +62,8 @@ class CopyMatrixRowsFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(8, 1);
     auto stream = context.stream();
     CopyMatrixRowsKernel<T, 128, 8, 8><<<grid, threads, 0, stream>>>(
-        src_data, dst_data, index, height, width, is_src_index);
+        src_data, dst_data, index_lod.CUDAData(context.GetPlace()), height,
+        width, is_src_index);
   }
 };

@@ -121,12 +121,12 @@ class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
           padding_data, const_cast<T*>(seq_data),
-          abs_offset_lod[level].cuda_data(), sequence_width,
+          abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
           max_sequence_length, num_sequences);
     } else {
       SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
           padding_data, const_cast<T*>(seq_data),
-          abs_offset_lod[level].cuda_data(), sequence_width,
+          abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
           max_sequence_length, num_sequences);
     }
   }
@@ -196,12 +196,12 @@ class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
     if (norm_by_times) {
       SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
           const_cast<T*>(padding_data), seq_data,
-          abs_offset_lod[level].cuda_data(), sequence_width,
+          abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
           max_sequence_length, num_sequences);
     } else {
       SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
           const_cast<T*>(padding_data), seq_data,
-          abs_offset_lod[level].cuda_data(), sequence_width,
+          abs_offset_lod[level].CUDAData(context.GetPlace()), sequence_width,
           max_sequence_length, num_sequences);
     }
   }

@@ -73,7 +73,8 @@ class MaxSeqPoolFunctor<platform::CUDADeviceContext, T> {
     dim3 grid(num_seq, 1);
     auto stream = context.stream();
     KeMaxSequencePool<T><<<grid, threads, 0, stream>>>(
-        in_data, starts.cuda_data(), out_data, max_index, num_seq, dim);
+        in_data, starts.CUDAData(context.GetPlace()), out_data, max_index,
+        num_seq, dim);
   }
 };

@@ -46,7 +46,8 @@ class ScaleLoDTensorFunctor<platform::CUDADeviceContext, T> {
     SequenceScaleKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         num_seq, PADDLE_CUDA_NUM_THREADS, 0, context.stream()>>>(
-        seq_data, abs_offset_lod[level].cuda_data(), scales, seq_width);
+        seq_data, abs_offset_lod[level].CUDAMutableData(context.GetPlace()),
+        scales, seq_width);
   }
 };

@@ -79,9 +79,6 @@ inline void CopyOrShare(const framework::Variable &src,
       dst->GetMutable<LoDTensor>()->set_lod(src.Get<LoDTensor>().lod());
     } else {
       Copy(src.Get<LoDTensor>(), dst_place, dst->GetMutable<LoDTensor>());
-      framework::LoD lod(src.Get<LoDTensor>().lod());
-      lod.CopyToPeer(dst_place);
-      dst->GetMutable<LoDTensor>()->set_lod(lod);
     }
   } else if (src.IsType<SelectedRows>()) {
     auto &src_sr = src.Get<SelectedRows>();
@@ -92,9 +89,6 @@ inline void CopyOrShare(const framework::Variable &src,
       dst_sr->set_rows(src_sr.rows());
     } else {
       Copy(src_sr.value(), dst_place, dst_sr->mutable_value());
-      framework::Vector<int64_t> lod(src_sr.rows());
-      lod.CopyToPeer(dst_place);
-      dst_sr->set_rows(lod);
     }
   } else {
     PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name());
@@ -152,9 +146,6 @@ class ParallelDoOp : public framework::OperatorBase {
       auto *sub_scope = sub_scopes[i];
       auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
       framework::Copy(src, place, dst);
-      framework::LoD lod(src.lod());
-      lod.CopyToPeer(place);
-      dst->set_lod(lod);
     }
   }
   WaitOnPlaces(places);

@@ -307,7 +307,7 @@ class RowConvKernel<platform::CUDADeviceContext, T>
     int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.cuda_data();
+    size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());
     auto stream = context.cuda_device_context().stream();

     if (future_context <= 32) {
@@ -345,7 +345,7 @@ class RowConvGradKernel<platform::CUDADeviceContext, T>
     int input_dim = X->dims()[1];
     int num_sequence = batch_indices.size() - 1;
     int future_context = Filter->dims()[0];
-    size_t *idx = batch_indices.cuda_data();
+    size_t *idx = batch_indices.CUDAMutableData(context.GetPlace());

     auto &device_ctx = context.cuda_device_context();
     math::SetConstant<platform::CUDADeviceContext, T> zero;

@@ -87,8 +87,7 @@ class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
     // Copy LoD to GPU
     auto lod0 = lod[0];
     auto lod_len = lod0.size();
-    thrust::device_vector<size_t> dev_in_lod = lod0;
-    size_t* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
+    const size_t* dev_in_lod_ptr = lod0.CUDAData(ctx.GetPlace());

     // Calc output LoD
     thrust::device_vector<size_t> dev_out_lod(lod_len);

@@ -102,8 +102,8 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
       dim3 grid(1, in_rows.size());
       SparseSGDFunctorKernel<
           T, 256><<<grid, threads, 0, ctx.cuda_device_context().stream()>>>(
-          in_data, in_rows.cuda_data(), learning_rate->data<T>(), out_data,
-          in_row_numel);
+          in_data, in_rows.CUDAData(ctx.GetPlace()), learning_rate->data<T>(),
+          out_data, in_row_numel);

     } else {
       PADDLE_THROW("Unsupported Variable Type of Grad");

@@ -137,8 +137,8 @@ class TargetAssignKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(gt_lod.data()[i], gt_label_lod.data()[i]);
     }

-    size_t* gt_lod_data = gt_lod.data(ctx.GetPlace());
-    size_t* neg_lod_data = neg_lod.data(ctx.GetPlace());
+    size_t* gt_lod_data = gt_lod.MutableData(ctx.GetPlace());
+    size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());

     TargetAssignFunctor<T> functor(box_data, label_data, match_idx_data,
                                    gt_lod_data, background_label, num,

@@ -20,6 +20,7 @@ limitations under the License. */
 #include "paddle/memory/memory.h"

 int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
   std::vector<char*> new_argv;
   std::string gflags_env;
   for (int i = 0; i < argc; ++i) {
@@ -35,7 +36,6 @@ int main(int argc, char** argv) {
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
   google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
-  testing::InitGoogleTest(&argc, argv);
   paddle::memory::Used(paddle::platform::CPUPlace());

 #ifdef PADDLE_WITH_CUDA
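Calling testing::InitGoogleTest before building new_argv presumably lets gtest strip its own --gtest_* flags out of argc/argv first, so google::ParseCommandLineFlags only ever sees flags gflags actually knows about.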
