Paddlebox about box_wrapper (#22497)

Refine PaddleBox Framework, Main functions: * Add MetricMsg util class, which can calculate metrics like AUC, bucket_error, COPC. * Replace FeedPass with new interface: BeginFeedPass & EndFeedPass * Refactor Pull/Push Sparse Function in box_wrapper. * Use CUDA Kernel to copy keys and copy feasign between tensor and boxps struct. * Cache copied keys in pull sparse in order to reuse it in push period.
5 years ago · 1a7962be97
parent 9e29d3ebed
commit 1a7962be97
4 changed files with 689 additions and 177 deletions
--- a/paddle/fluid/framework/fleet/box_wrapper.cc
+++ b/paddle/fluid/framework/fleet/box_wrapper.cc
--- a/paddle/fluid/framework/fleet/box_wrapper.cu
+++ b/paddle/fluid/framework/fleet/box_wrapper.cu
@ -0,0 +1,183 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifdef PADDLE_WITH_BOX_PS
+#include <algorithm>
+#include <ctime>
+#include <memory>
+#include <numeric>
+#include "paddle/fluid/framework/fleet/box_wrapper.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/platform/gpu_info.h"
+
+namespace paddle {
+namespace framework {
+// Grid-stride loop over [0, n): every thread starts at its global thread index
+// and advances by the total number of threads in the grid.
+//
+// The index and the stride are computed in 64-bit. With a 32-bit index the
+// increment could wrap to a negative value for very large n, which would keep
+// the loop running with an out-of-range index.
+#define CUDA_KERNEL_LOOP(i, n)                                               \
+  for (int64_t i = static_cast<int64_t>(blockIdx.x) * blockDim.x +           \
+                   threadIdx.x;                                              \
+       i < (n); i += static_cast<int64_t>(blockDim.x) * gridDim.x)
+
+// Scatters pulled feature values from the flat buffer `src` into the per-slot
+// output buffers `dest`.
+//
+// For every flat index i in [0, total_len) the kernel binary-searches `len`
+// to find the slot that contains i and the row inside that slot.
+// NOTE(review): `len` is treated as non-decreasing per-slot end offsets
+// (presumably an inclusive prefix sum of slot lengths) -- confirm with caller.
+//
+// Each row is written starting at row * hidden as:
+//   [0] show, [1] clk, [2] embed_w, [3..10] embedx[1..8]
+// The first three entries are zeroed when the key is 0; the embedx entries are
+// zeroed when the key is 0 or the source embedding_size is 0.
+__global__ void PullCopy(float** dest, const boxps::FeatureValueGpu* src,
+                         const int64_t* len, int hidden, int slot_num,
+                         int total_len, uint64_t** keys) {
+  CUDA_KERNEL_LOOP(i, total_len) {
+    // Narrow [first, last] down to the slot that owns flat index i.
+    int first = 0;
+    int last = slot_num - 1;
+    while (first < last) {
+      const int mid = (first + last) / 2;
+      if (i < len[mid]) {
+        last = mid;
+      } else {
+        first = mid + 1;
+      }
+    }
+    const int slot = first;
+    const int row = i - (slot ? len[slot - 1] : 0);
+
+    float* out = dest[slot] + row * hidden;
+    const boxps::FeatureValueGpu* in = src + i;
+    const bool zero_key = (keys[slot][row] == 0);
+
+    if (zero_key) {
+      out[0] = 0;
+      out[1] = 0;
+      out[2] = 0;
+    } else {
+      out[0] = in->show;
+      out[1] = in->clk;
+      out[2] = in->embed_w;
+    }
+
+    const bool no_embedx = (in->embedding_size == 0) || zero_key;
+    for (int j = 0; j < 8; j++) {
+      if (no_embedx) {
+        out[3 + j] = 0;
+      } else {
+        out[3 + j] = in->embedx[1 + j];
+      }
+    }
+  }
+}
+
+// Gathers the keys of all slots into one contiguous array.
+//
+// For every flat index i in [0, total_len) the kernel binary-searches `len`
+// for the slot that contains i, then copies that slot's key at the matching
+// row into dest_total_keys[i].
+// NOTE(review): `len` is treated as non-decreasing per-slot end offsets
+// (presumably an inclusive prefix sum of slot lengths) -- confirm with caller.
+__global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys,
+                               const int64_t* len, int slot_num,
+                               int total_len) {
+  CUDA_KERNEL_LOOP(i, total_len) {
+    // Narrow [first, last] down to the slot that owns flat index i.
+    int first = 0;
+    int last = slot_num - 1;
+    while (first < last) {
+      const int mid = (first + last) / 2;
+      if (i < len[mid]) {
+        last = mid;
+      } else {
+        first = mid + 1;
+      }
+    }
+    const int slot = first;
+    const int row = i - (slot ? len[slot - 1] : 0);
+    const uint64_t* slot_keys = src_keys[slot];
+    dest_total_keys[i] = slot_keys[row];
+  }
+}
+
+// Packs per-slot gradient rows from `src` into the flat push buffer `dest`.
+//
+// For every flat index i in [0, total_len) the kernel binary-searches `len`
+// for the slot that contains i and the row inside that slot, then fills
+// dest[i] from the row starting at row * hidden:
+//   slot     <- slot_vector[slot]
+//   show     <- row[0]
+//   clk      <- row[1]
+//   embed_g  <- row[2]      * -1 * bs
+//   embedx_g <- row[3..10]  * -1 * bs
+// NOTE(review): `len` is treated as non-decreasing per-slot end offsets
+// (presumably an inclusive prefix sum of slot lengths) -- confirm with caller.
+// NOTE(review): `-1.` is a double literal, so the scaling is evaluated in
+// double precision before being stored -- confirm this is intended.
+__global__ void PushCopy(boxps::FeaturePushValueGpu* dest, float** src,
+                         int64_t* len, int hidden, int slot_num, int total_len,
+                         int bs, int* slot_vector) {
+  CUDA_KERNEL_LOOP(i, total_len) {
+    // Narrow [first, last] down to the slot that owns flat index i.
+    int first = 0;
+    int last = slot_num - 1;
+    while (first < last) {
+      const int mid = (first + last) / 2;
+      if (i < len[mid]) {
+        last = mid;
+      } else {
+        first = mid + 1;
+      }
+    }
+    const int slot = first;
+    const int row = i - (slot ? len[slot - 1] : 0);
+
+    boxps::FeaturePushValueGpu* out = dest + i;
+    const float* in = src[slot] + row * hidden;
+
+    out->slot = slot_vector[slot];
+    out->show = in[0];
+    out->clk = in[1];
+    out->embed_g = in[2] * -1. * bs;
+    for (int j = 0; j < 8; j++) {
+      out->embedx_g[j] = in[3 + j] * -1. * bs;
+    }
+  }
+}
+
+// Copies pulled feature values from the flat device buffer
+// `total_values_gpu` into the per-slot output buffers listed in `values`.
+//
+// The host array of output pointers is uploaded to a temporary device buffer,
+// PullCopy is launched on the CUDA stream of `place`, and the stream is
+// synchronized before returning.
+//
+//   place            - must hold a platform::CUDAPlace.
+//   gpu_keys         - per-slot key pointers, forwarded to PullCopy.
+//   values           - host vector of per-slot output pointers.
+//   total_values_gpu - flat source buffer read by PullCopy.
+//   gpu_len          - per-slot offsets, forwarded to PullCopy.
+//   slot_num         - number of slots.
+//   hidden_size      - row stride (in floats) of each output buffer.
+//   total_length     - total number of rows across all slots.
+//
+// An empty request (total_length <= 0) returns immediately: launching a kernel
+// with a grid dimension of zero is an invalid configuration and would leave a
+// CUDA error pending for later runtime calls.
+void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
+                             uint64_t** gpu_keys,
+                             const std::vector<float*>& values,
+                             const boxps::FeatureValueGpu* total_values_gpu,
+                             const int64_t* gpu_len, const int slot_num,
+                             const int hidden_size,
+                             const int64_t total_length) {
+  if (total_length <= 0) {
+    return;
+  }
+  const int kThreadsPerBlock = 512;
+  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+                    platform::DeviceContextPool::Instance().Get(
+                        boost::get<platform::CUDAPlace>(place)))
+                    ->stream();
+
+  // Kernels cannot dereference the host vector, so stage the pointer table in
+  // device memory first.
+  const size_t ptr_bytes = values.size() * sizeof(float*);
+  auto buf_value = memory::AllocShared(place, ptr_bytes);
+  float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
+  cudaMemcpy(gpu_values, values.data(), ptr_bytes, cudaMemcpyHostToDevice);
+
+  PullCopy<<<(total_length + kThreadsPerBlock - 1) / kThreadsPerBlock,
+             kThreadsPerBlock, 0, stream>>>(
+      gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num,
+      total_length, gpu_keys);
+  // Block until the copy is finished so buf_value can be released safely.
+  cudaStreamSynchronize(stream);
+}
+
+// Gathers the keys of all slots into the contiguous device array `total_keys`
+// by launching CopyKeysKernel on the CUDA stream of `place`, then waits for
+// the stream to finish.
+//
+//   place       - must hold a platform::CUDAPlace.
+//   origin_keys - per-slot key pointers, forwarded to the kernel.
+//   total_keys  - destination array, written at indices [0, total_len).
+//   gpu_len     - per-slot offsets, forwarded to the kernel.
+//   slot_num    - number of slots.
+//   total_len   - total number of keys across all slots.
+//
+// An empty request (total_len <= 0) returns immediately: launching a kernel
+// with a grid dimension of zero is an invalid configuration and would leave a
+// CUDA error pending for later runtime calls.
+void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
+                          uint64_t** origin_keys, uint64_t* total_keys,
+                          const int64_t* gpu_len, int slot_num, int total_len) {
+  if (total_len <= 0) {
+    return;
+  }
+  const int kThreadsPerBlock = 512;
+  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+                    platform::DeviceContextPool::Instance().Get(
+                        boost::get<platform::CUDAPlace>(place)))
+                    ->stream();
+  CopyKeysKernel<<<(total_len + kThreadsPerBlock - 1) / kThreadsPerBlock,
+                   kThreadsPerBlock, 0, stream>>>(
+      origin_keys, total_keys, gpu_len, slot_num, total_len);
+  cudaStreamSynchronize(stream);
+}
+
+// Packs per-slot gradient buffers into the flat device push buffer
+// `total_grad_values_gpu`.
+//
+// The per-slot lengths are turned into inclusive prefix sums on the host. The
+// gradient pointer table, the prefix sums and the slot ids are uploaded to
+// temporary device buffers. PushCopy is then launched on the CUDA stream of
+// `place` and the stream is synchronized before returning.
+//
+//   place                 - must hold a platform::CUDAPlace.
+//   grad_values           - host vector of per-slot gradient pointers.
+//   total_grad_values_gpu - destination buffer filled by PushCopy.
+//   slot_lengths          - number of rows in each slot.
+//   hidden_size           - row stride (in floats) of each gradient buffer.
+//   total_length          - total number of rows across all slots.
+//   batch_size            - forwarded to PushCopy as its scaling factor `bs`.
+//
+// An empty request (total_length <= 0) returns immediately: launching a kernel
+// with a grid dimension of zero is an invalid configuration and would leave a
+// CUDA error pending for later runtime calls.
+void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
+                             const std::vector<const float*>& grad_values,
+                             boxps::FeaturePushValueGpu* total_grad_values_gpu,
+                             const std::vector<int64_t>& slot_lengths,
+                             const int hidden_size, const int64_t total_length,
+                             const int batch_size) {
+  if (total_length <= 0) {
+    return;
+  }
+  const int kThreadsPerBlock = 512;
+  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
+                    platform::DeviceContextPool::Instance().Get(
+                        boost::get<platform::CUDAPlace>(place)))
+                    ->stream();
+
+  // Inclusive prefix sum: entry k holds the end offset of slot k in the flat
+  // buffer, which is what the kernel binary-searches.
+  std::vector<int64_t> slot_lengths_lod(slot_lengths.size());
+  std::partial_sum(slot_lengths.begin(), slot_lengths.end(),
+                   slot_lengths_lod.begin());
+
+  const size_t slot_count = slot_lengths.size();
+  auto buf_grad_value =
+      memory::AllocShared(place, grad_values.size() * sizeof(float*));
+  auto buf_length = memory::AllocShared(place, slot_count * sizeof(int64_t));
+  auto buf_slot_vector = memory::AllocShared(place, slot_count * sizeof(int));
+
+  float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
+  int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
+  int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());
+
+  cudaMemcpy(gpu_values, grad_values.data(),
+             grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice);
+  cudaMemcpy(gpu_len, slot_lengths_lod.data(), slot_count * sizeof(int64_t),
+             cudaMemcpyHostToDevice);
+  // NOTE(review): reads slot_count ints from slot_vector_; assumes the member
+  // holds at least one entry per slot -- confirm where slot_vector_ is filled.
+  cudaMemcpy(d_slot_vector, slot_vector_.data(), slot_count * sizeof(int),
+             cudaMemcpyHostToDevice);
+
+  PushCopy<<<(total_length + kThreadsPerBlock - 1) / kThreadsPerBlock,
+             kThreadsPerBlock, 0, stream>>>(
+      total_grad_values_gpu, gpu_values, gpu_len, hidden_size,
+      slot_lengths.size(), total_length, batch_size, d_slot_vector);
+  // Block until the kernel is finished so the staging buffers can be released.
+  cudaStreamSynchronize(stream);
+}
+}  // end namespace framework
+}  // end namespace paddle
+#endif
--- a/paddle/fluid/framework/fleet/box_wrapper.h
+++ b/paddle/fluid/framework/fleet/box_wrapper.h
--- a/paddle/fluid/operators/pull_box_sparse_op.h
+++ b/paddle/fluid/operators/pull_box_sparse_op.h
@ -26,7 +26,6 @@ template <typename T>
 static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) {
  auto inputs = ctx.MultiInput<framework::Tensor>("Ids");
  auto outputs = ctx.MultiOutput<framework::Tensor>("Out");
-  auto hidden_size = ctx.Attr<int>("size");
  const auto slot_size = inputs.size();
  std::vector<const uint64_t *> all_keys(slot_size);
  // BoxPS only supports float now
@ -41,33 +40,49 @@ static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) {
    auto *output = outputs[i]->mutable_data<T>(ctx.GetPlace());
    all_values[i] = output;
  }
+#ifdef PADDLE_WITH_BOX_PS
+  auto hidden_size = ctx.Attr<int>("size");
  auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
  box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths,
                      hidden_size);
+#endif
 }

 template <typename T>
 static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) {
-  auto inputs = ctx.MultiInput<framework::Tensor>("Ids");
+  auto inputs = ctx.MultiInput<framework::LoDTensor>("Ids");  // read as LoDTensor so slot->lod() can be queried below
  auto d_output =
      ctx.MultiInput<framework::Tensor>(framework::GradVarName("Out"));
-  auto hidden_size = ctx.Attr<int>("size");
  const auto slot_size = inputs.size();
  std::vector<const uint64_t *> all_keys(slot_size);
  std::vector<const float *> all_grad_values(slot_size);
  std::vector<int64_t> slot_lengths(slot_size);
+  int batch_size = -1;  // -1 means "not set yet"; taken from the first slot, then checked against the others
  for (size_t i = 0; i < slot_size; i++) {
    const auto *slot = inputs[i];
    const uint64_t *single_slot_keys =
        reinterpret_cast<const uint64_t *>(slot->data<int64_t>());
    all_keys[i] = single_slot_keys;
    slot_lengths[i] = slot->numel();
+    int cur_batch_size =
+        slot->lod().size() ? slot->lod()[0].size() - 1 : slot->dims()[0];  // with LoD: level-0 entry count minus one; otherwise the first dimension
+    if (batch_size == -1) {
+      batch_size = cur_batch_size;
+    } else {
+      PADDLE_ENFORCE_EQ(batch_size, cur_batch_size,
+                        platform::errors::PreconditionNotMet(
+                            "The batch size of all input slots should be same, "
+                            "please cheack"));
+    }
    const float *grad_value = d_output[i]->data<float>();
    all_grad_values[i] = grad_value;
  }
+#ifdef PADDLE_WITH_BOX_PS
+  auto hidden_size = ctx.Attr<int>("size");  // only read when BoxPS support is compiled in
  auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
  box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values,
-                          slot_lengths, hidden_size);
+                          slot_lengths, hidden_size, batch_size);
+#endif
 }

 using LoDTensor = framework::LoDTensor;