bugfix lod cpu performance (#12297)

distributed_test
Yan Chunwei 7 years ago committed by GitHub
parent b41f8b9d42
commit 02cf54d331
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -26,6 +26,7 @@
namespace paddle {
namespace framework {
#if defined(PADDLE_WITH_CUDA)
// Vector<T> implements the std::vector interface, and can get Data or
// MutableData from any place. The data will be synced implicitly inside.
template <typename T>
@ -379,5 +380,53 @@ class Vector {
size_t size_;
};
} // namespace framework
#else // PADDLE_WITH_CUDA
template <typename T>
class CPUVector : public std::vector<T, std::allocator<T>> {
public:
CPUVector() : std::vector<T>() {}
CPUVector(size_t count, const T &value = T())
: std::vector<T>(count, value) {}
CPUVector(std::initializer_list<T> init) : std::vector<T>(init) {}
CPUVector(const std::vector<T> &other) : std::vector<T>(other) {}
explicit CPUVector(const CPUVector<T> &other) : std::vector<T>(other) {}
CPUVector(CPUVector<T> &&other) : std::vector<T>(std::move(other)) {}
CPUVector(std::vector<T> &&other) : std::vector<T>(std::move(other)) {}
CPUVector &operator=(const CPUVector &other) {
this->assign(other.begin(), other.end());
return *this;
}
CPUVector &operator=(const std::vector<T> &other) {
this->assign(other.begin(), other.end());
return *this;
}
friend std::ostream &operator<<(std::ostream &os, const CPUVector<T> &other) {
std::stringstream ss;
for (auto v : other) {
os << v << " ";
}
return os;
}
void resize(size_t size) { this->resize(size); }
T &operator[](size_t id) { return this->at(id); }
const T &operator[](size_t id) const { return this->at(id); }
template <typename D>
void Extend(const D &begin, const D &end) {
this->reserve(this->size() + size_t(end - begin));
this->insert(this->end(), begin, end);
}
};
template <typename T>
using Vector = CPUVector<T>;
#endif // PADDLE_WITH_CUDA
}; // namespace framework
} // namespace paddle

@ -293,11 +293,18 @@ class AdamOpKernel : public framework::OpKernel<T> {
auto& grad_tensor = grad_merge.value();
const T* grad_data = grad_tensor.template data<T>();
int64_t* rows = nullptr;
// When compiled without CUDA, the CUDAMutableData() interface should not be
// provided.
#if defined(PADDLE_WITH_CUDA)
if (platform::is_gpu_place(ctx.GetPlace())) {
rows = grad_merge.mutable_rows()->CUDAMutableData(ctx.GetPlace());
} else {
#endif
rows = grad_merge.mutable_rows()->data();
#if defined(PADDLE_WITH_CUDA)
}
#endif
auto row_numel = grad_tensor.numel() / grad_merge.rows().size();
SparseAdamFunctor<T> functor(

@ -106,7 +106,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
int64_t k = x->dims()[2];
auto x_lod = x->lod().back();
#if defined(PADDLE_WITH_CUDA)
size_t* x_lod_data = x_lod.MutableData(ctx.GetPlace());
#else
size_t* x_lod_data = x_lod.data();
#endif
TargetAssignFunctor<T, WT> functor(x_data, match_idx_data, x_lod_data,
mismatch_value, n, m, p, k, out_data,
@ -121,7 +125,11 @@ class TargetAssignKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(neg_indices->lod().size(), 1UL);
const int* neg_idx_data = neg_indices->data<int>();
auto neg_lod = neg_indices->lod().back();
#if defined(PADDLE_WITH_CUDA)
size_t* neg_lod_data = neg_lod.MutableData(ctx.GetPlace());
#else
size_t* neg_lod_data = neg_lod.data();
#endif
NegTargetAssignFunctor<DeviceContext, T, WT> neg_trg_functor;
neg_trg_functor(device_ctx, neg_idx_data, neg_lod_data, n, m, k,
mismatch_value, out_data, out_wt_data);

@ -78,7 +78,7 @@ class LoDTensor2BatchFunctor {
auto lods = lod_tensor.lod();
PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
auto lod = lods[0];
const auto& lod = lods[0];
std::vector<SeqInfo> seq_info;
for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {

Loading…
Cancel
Save