@@ -142,8 +142,13 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
                             ->stream();
   auto buf_value = memory::AllocShared(place, values.size() * sizeof(float*));
   float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
+            hipMemcpyHostToDevice);
+#else
   cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
              cudaMemcpyHostToDevice);
+#endif
 #define EMBEDX_CASE(i, ...)                                                  \
   case i: {                                                                  \
     constexpr size_t EmbedxDim = i;                                          \
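For context on what this first hunk guards: gpu_values is a device-side table of float* entries, filled by copying the host vector values with hipMemcpy or cudaMemcpy depending on the build. A minimal self-contained sketch of that pattern, outside the patch, with illustrative names (CopyPointerTable) and error checking omitted:

    // Sketch only -- not part of the patch. Copies a host-side table of device
    // pointers onto the GPU so a kernel can index it.
    #include <vector>
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    #else
    #include <cuda_runtime.h>
    #endif

    float** CopyPointerTable(const std::vector<float*>& values) {
      float** gpu_values = nullptr;
      const size_t bytes = values.size() * sizeof(float*);
    #ifdef PADDLE_WITH_HIP
      hipMalloc(reinterpret_cast<void**>(&gpu_values), bytes);
      hipMemcpy(gpu_values, values.data(), bytes, hipMemcpyHostToDevice);
    #else
      cudaMalloc(reinterpret_cast<void**>(&gpu_values), bytes);
      cudaMemcpy(gpu_values, values.data(), bytes, cudaMemcpyHostToDevice);
    #endif
      return gpu_values;  // entries still point at device buffers owned by the caller
    }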
@@ -155,6 +160,19 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
     }                                                                        \
   } break
 
+#ifdef PADDLE_WITH_HIP
+#define EXPAND_EMBED_PULL_CASE(i, ...)                                       \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    hipLaunchKernelGGL(                                                      \
+        PullCopy<EmbedxDim, ExpandDim>, dim3((total_length + 512 - 1) / 512),\
+        dim3(512), 0, stream, gpu_values,                                    \
+        reinterpret_cast<boxps::FeatureValueGpu<EmbedxDim, ExpandDim>*>(     \
+            total_values_gpu),                                               \
+        gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,      \
+        gpu_keys);                                                           \
+  } break
+#else
 #define EXPAND_EMBED_PULL_CASE(i, ...)                                       \
   case i: {                                                                  \
     constexpr size_t ExpandDim = i;                                          \
@@ -166,6 +184,7 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
         gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,      \
         gpu_keys);                                                           \
   } break
+#endif
 
   switch (hidden_size - 3) {
     EMBEDX_CASE(8, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_CASE(8);
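The EMBEDX_CASE / EXPAND_EMBED_PULL_CASE macros above generate a nested switch that turns the runtime pair (hidden_size - 3, expand_embed_dim) into compile-time template arguments before the kernel launch. A rough, standalone illustration of that expansion, where Dispatch and Run are illustrative stand-ins rather than names from the patch:

    // Sketch only -- shows the shape of the macro expansion, not the actual
    // generated code.
    #include <cstdio>

    template <size_t EmbedxDim, size_t ExpandDim>
    void Run() {  // stands in for the PullCopy<EmbedxDim, ExpandDim> launch
      std::printf("EmbedxDim=%zu ExpandDim=%zu\n", EmbedxDim, ExpandDim);
    }

    void Dispatch(int hidden_size, int expand_embed_dim) {
      switch (hidden_size - 3) {
        case 8: {
          constexpr size_t EmbedxDim = 8;
          switch (expand_embed_dim) {
            case 0: Run<EmbedxDim, 0>(); break;
            case 8: Run<EmbedxDim, 8>(); break;
            default: break;  // unsupported expand_embed_dim
          }
        } break;
        default: break;  // unsupported hidden_size
      }
    }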
@@ -187,9 +206,16 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
                             platform::DeviceContextPool::Instance().Get(
                                 BOOST_GET_CONST(platform::CUDAPlace, place)))
                             ->stream();
+#ifdef PADDLE_WITH_HIP
+  hipLaunchKernelGGL(CopyKeysKernel, dim3((total_len + 512 - 1) / 512),
+                     dim3(512), 0, stream, origin_keys, total_keys, gpu_len,
+                     slot_num, total_len);
+  hipStreamSynchronize(stream);
+#else
   CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>(
       origin_keys, total_keys, gpu_len, slot_num, total_len);
   cudaStreamSynchronize(stream);
+#endif
 }
 
 void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
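The CopyKeys hunk shows the launch-syntax difference this patch handles throughout: under HIP the triple-chevron launch is replaced by hipLaunchKernelGGL with dim3-wrapped grid and block sizes, followed by a stream synchronize. A minimal standalone sketch of the same pattern, with illustrative names (FillKernel, LaunchFill, gpuStream_t):

    // Sketch only -- not part of the patch.
    #ifdef PADDLE_WITH_HIP
    #include <hip/hip_runtime.h>
    using gpuStream_t = hipStream_t;
    #else
    #include <cuda_runtime.h>
    using gpuStream_t = cudaStream_t;
    #endif

    __global__ void FillKernel(float* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = 1.0f;
    }

    void LaunchFill(float* out, int n, gpuStream_t stream) {
      const int block = 512;
      const int grid = (n + block - 1) / block;
    #ifdef PADDLE_WITH_HIP
      hipLaunchKernelGGL(FillKernel, dim3(grid), dim3(block), 0, stream, out, n);
      hipStreamSynchronize(stream);
    #else
      FillKernel<<<grid, block, 0, stream>>>(out, n);
      cudaStreamSynchronize(stream);
    #endif
    }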
@@ -217,12 +243,21 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
   int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
   int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
 
+#ifdef PADDLE_WITH_HIP
+  hipMemcpy(gpu_values, grad_values.data(), grad_values.size() * sizeof(float*),
+            hipMemcpyHostToDevice);
+  hipMemcpy(gpu_len, slot_lengths_lod.data(),
+            slot_lengths.size() * sizeof(int64_t), hipMemcpyHostToDevice);
+  hipMemcpy(d_slot_vector, slot_vector_.data(),
+            slot_lengths_lod.size() * sizeof(int), hipMemcpyHostToDevice);
+#else
   cudaMemcpy(gpu_values, grad_values.data(),
              grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice);
   cudaMemcpy(gpu_len, slot_lengths_lod.data(),
              slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
   cudaMemcpy(d_slot_vector, slot_vector_.data(),
              slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);
+#endif
 
 #define EMBEDX_CASE(i, ...)                                                  \
   case i: {                                                                  \
@@ -235,6 +270,18 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
     }                                                                        \
   } break
 
+#ifdef PADDLE_WITH_HIP
+#define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    hipLaunchKernelGGL(PushCopy<EmbedxDim, ExpandDim>,                       \
+        dim3((total_length + 512 - 1) / 512), dim3(512), 0, stream,          \
+        reinterpret_cast<boxps::FeaturePushValueGpu<EmbedxDim, ExpandDim>*>( \
+            total_grad_values_gpu),                                          \
+        gpu_values, gpu_len, hidden_size, expand_embed_dim,                  \
+        slot_lengths.size(), total_length, batch_size, d_slot_vector);       \
+  } break
+#else
 #define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
   case i: {                                                                  \
     constexpr size_t ExpandDim = i;                                          \
@@ -245,6 +292,7 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
         gpu_values, gpu_len, hidden_size, expand_embed_dim,                  \
         slot_lengths.size(), total_length, batch_size, d_slot_vector);       \
   } break
+#endif
 
   switch (hidden_size - 3) {
     EMBEDX_CASE(8, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_CASE(8);