improve GPU performance of transpose, test=develop (#25862)

revert-24895-update_cub
Zhang Ting 5 years ago committed by GitHub
parent 6773fcc1ba
commit 6486fe8a94
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@@ -21,6 +21,8 @@ namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
using framework::To32BitIndex;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context, void SetConstant<DeviceContext, T>::operator()(const DeviceContext& context,
framework::Tensor* tensor, framework::Tensor* tensor,
@@ -40,7 +42,15 @@ void Transpose<DeviceContext, T, Rank>::operator()(
auto eigen_in = framework::EigenTensor<T, Rank>::From(in); auto eigen_in = framework::EigenTensor<T, Rank>::From(in);
auto eigen_out = framework::EigenTensor<T, Rank>::From(*out); auto eigen_out = framework::EigenTensor<T, Rank>::From(*out);
auto* dev = context.eigen_device(); auto* dev = context.eigen_device();
- eigen_out.device(*dev) = eigen_in.shuffle(permute);
+ // use 32bit index to speed up computation
bool use_32bit_index = eigen_out.size() < Eigen::NumTraits<int>::highest();
bool is_gpu_place = platform::is_gpu_place(context.GetPlace());
if (use_32bit_index && is_gpu_place) {
To32BitIndex(eigen_out).device(*dev) =
To32BitIndex(eigen_in).shuffle(permute);
} else {
eigen_out.device(*dev) = eigen_in.shuffle(permute);
}
} }
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>

Loading…
Cancel
Save