You can not select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							265 lines
						
					
					
						
							10 KiB
						
					
					
				
			
		
		
	
	
							265 lines
						
					
					
						
							10 KiB
						
					
					
				| /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 | |
| 
 | |
| Licensed under the Apache License, Version 2.0 (the "License");
 | |
| you may not use this file except in compliance with the License.
 | |
| You may obtain a copy of the License at
 | |
| 
 | |
|     http://www.apache.org/licenses/LICENSE-2.0
 | |
| 
 | |
| Unless required by applicable law or agreed to in writing, software
 | |
| distributed under the License is distributed on an "AS IS" BASIS,
 | |
| WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| See the License for the specific language governing permissions and
 | |
| limitations under the License. */
 | |
| 
 | |
| #pragma once
 | |
| 
 | |
| #include <string>
 | |
| #include <vector>
 | |
| #include "paddle/fluid/framework/eigen.h"
 | |
| #include "paddle/fluid/framework/op_registry.h"
 | |
| #include "paddle/fluid/framework/tensor_util.h"
 | |
| #include "paddle/fluid/operators/math/math_function.h"
 | |
| #include "paddle/fluid/operators/math/sample_prob.h"
 | |
| #include "paddle/fluid/operators/math/softmax.h"
 | |
| #include "paddle/fluid/operators/sample_logits_op.h"
 | |
| 
 | |
| namespace paddle {
 | |
| namespace operators {
 | |
| 
 | |
// Row-wise gather, comparable to numpy.take_along_axis(array, index, axis=1):
//
//   p_value[r * idx_slice_size + c] =
//       p_array[r * array_slice_size + p_index[r * idx_slice_size + c]]
//
// Parameters:
//   size             - total number of elements in p_index / p_value.
//   batch_size       - not read by the kernel; kept for interface
//                      compatibility with existing launch sites.
//   array_slice_size - row length of p_array.
//   idx_slice_size   - row length of p_index and p_value.
//   p_array          - source values.
//   p_index          - per-row column indices into p_array.
//   p_value          - destination, same layout as p_index.
//
// One element is processed per loop iteration and every thread strides by the
// total number of launched threads, so any 1-D launch configuration covers
// all `size` elements. Index values are not range-checked here; callers are
// assumed to supply indices in [0, array_slice_size).
template <typename T>
__global__ void GPUTakeAlongD1(size_t size, const int batch_size,
                               const int array_slice_size,
                               const int idx_slice_size, const T* p_array,
                               const int64_t* p_index, T* p_value) {
  // 64-bit counters: the flat element index and the source offset can both
  // exceed the range of a 32-bit int for large batch * num_classes.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  const size_t row_len = static_cast<size_t>(idx_slice_size);

  for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < size; idx += stride) {
    const int64_t row = static_cast<int64_t>(idx / row_len);
    const int64_t col = p_index[idx];
    p_value[idx] = p_array[row * array_slice_size + col];
  }
}
 | |
| 
 | |
// Row-wise scatter-add, comparable to numpy.put_along_axis(array, index,
// value, axis=1) except that duplicate indices within a row accumulate:
//
//   p_array[r * array_slice_size + p_index[r * idx_slice_size + j]] +=
//       p_value[r * idx_slice_size + j]
//
// Parameters:
//   size             - number of rows to process (the launch site passes the
//                      batch size).
//   batch_size       - not read by the kernel; kept for interface
//                      compatibility with existing launch sites.
//   array_slice_size - row length of p_array.
//   idx_slice_size   - row length of p_index and p_value.
//   p_array          - destination, accumulated in place.
//   p_index          - per-row column indices into p_array.
//   p_value          - values to add, same layout as p_index.
//
// Each loop iteration owns one whole row and walks its indices sequentially,
// so the plain `+=` only ever targets the row owned by that iteration. This
// relies on indices lying in [0, array_slice_size), which is not checked here.
template <typename T>
__global__ void GPUPutAlongD1(size_t size, const int batch_size,
                              const int array_slice_size,
                              const int idx_slice_size, T* p_array,
                              const int64_t* p_index, const T* p_value) {
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;

  for (size_t row = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       row < size; row += stride) {
    // Row offsets are computed in 64 bits; row * array_slice_size overflows a
    // 32-bit int once batch * num_classes exceeds 2^31.
    const int64_t r = static_cast<int64_t>(row);
    T* array_row = p_array + r * array_slice_size;
    const int64_t* index_row = p_index + r * idx_slice_size;
    const T* value_row = p_value + r * idx_slice_size;

    for (int j = 0; j < idx_slice_size; ++j) {
      array_row[index_row[j]] += value_row[j];
    }
  }
}
 | |
| 
 | |
// Fills p_array with the repeating pattern 0, 1, ..., num_true - 1:
//
//   p_array[idx] = idx % num_true   for idx in [0, size)
//
// Parameters:
//   size     - number of elements to write.
//   num_true - period of the pattern; must be positive.
//   p_array  - destination buffer of at least `size` elements.
//
// The template parameter T is not used by the body; it is kept so existing
// launch sites that spell GPUSetLabel<T> continue to compile.
template <typename T>
__global__ void GPUSetLabel(size_t size, const int num_true, int64_t* p_array) {
  // Unsigned 64-bit counter: avoids comparing a signed int against size_t and
  // avoids signed overflow when advancing by the stride.
  const size_t stride = static_cast<size_t>(blockDim.x) * gridDim.x;
  const size_t period = static_cast<size_t>(num_true);

  for (size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < size; idx += stride) {
    p_array[idx] = static_cast<int64_t>(idx % period);
  }
}
 | |
| 
 | |
// Suppresses "accidental hits": within each row, any entry at column
// >= num_true whose index equals one of that row's first num_true indices has
// 1e20 subtracted from its value, pushing it to a very large negative number.
//
// Parameters:
//   size           - total number of elements in p_index / p_value.
//   num_true       - number of leading columns per row that are compared
//                    against (these columns themselves are never modified).
//   idx_slice_size - row length of p_index and p_value.
//   p_index        - class indices, row-major.
//   p_value        - values to adjust in place, same layout as p_index.
//
// One element is examined per loop iteration, and each thread strides by the
// total number of launched threads.
template <typename T>
__global__ void gpu_compute_remove_accidental_hits(const int size,
                                                   const int num_true,
                                                   const int idx_slice_size,
                                                   const int64_t* p_index,
                                                   T* p_value) {
  // Convert the penalty once so float instantiations stay in single
  // precision instead of promoting every update to double.
  const T penalty = static_cast<T>(1e20);

  // 64-bit counter so that `idx += stride` cannot overflow near INT_MAX.
  const int64_t stride = static_cast<int64_t>(blockDim.x) * gridDim.x;

  for (int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < size; idx += stride) {
    // The first num_true columns of every row are left untouched.
    if (idx % idx_slice_size < num_true) continue;

    const int64_t row_begin = (idx / idx_slice_size) * idx_slice_size;
    const int64_t candidate = p_index[idx];
    for (int j = 0; j < num_true; ++j) {
      if (p_index[row_begin + j] == candidate) {
        p_value[idx] -= penalty;
        break;  // penalize at most once per element
      }
    }
  }
}
 | |
| 
 | |
// CUDA forward kernel of the sample_logits op.
//
// Inputs : Logits, Labels, and (when use_customized_samples is set)
//          CustomizedSamples / CustomizedProbabilities.
// Outputs: Samples, Probabilities, SampledLogits, SampledLabels.
//
// Steps performed by Compute():
//   1. Zero SampledLogits and fill SampledLabels with 0..num_true-1 per row.
//   2. Either verify that the customized sample tensors alias the outputs, or
//      draw samples and their probabilities with math::GPUSampleWithProb.
//   3. Gather Logits at the sampled indices into SampledLogits.
//   4. Optionally push accidental hits to a very large negative value.
//   5. Subtract log(Probabilities) from SampledLogits.
//
// All kernels are enqueued on the stream of the CUDA device context.
template <typename T>
class SampleLogitsCUDAKernel : public framework::OpKernel<T> {
 public:
  using Tensor = framework::Tensor;
  void Compute(const framework::ExecutionContext& context) const override {
    // get necessary inputs
    const Tensor* logits = context.Input<Tensor>("Logits");
    const Tensor* labels = context.Input<Tensor>("Labels");
    VLOG(3) << "Enter SampleLogitsCUDAKernel";

    // get necessary outputs
    Tensor* samples = context.Output<Tensor>("Samples");
    Tensor* probabilities = context.Output<Tensor>("Probabilities");
    Tensor* sampled_logits = context.Output<Tensor>("SampledLogits");
    Tensor* sampled_labels = context.Output<Tensor>("SampledLabels");

    // shapes
    const auto batch_size = logits->dims()[0];
    const auto num_classes = logits->dims()[1];
    const auto labels_dim = labels->dims();
    const auto num_true = labels_dim[1];
    const auto samples_dim = samples->dims();

    // attrs
    const auto num_samples = context.Attr<int>("num_samples");
    const bool use_customized_samples =
        context.Attr<bool>("use_customized_samples");
    const bool uniq = context.Attr<bool>("uniq");
    const bool remove_accidental_hits =
        context.Attr<bool>("remove_accidental_hits");

    // device context and the stream every kernel below is enqueued on
    auto& dev_ctx = context.cuda_device_context();
    auto stream = dev_ctx.stream();

    // allocate SampledLogits and clear it
    sampled_logits->mutable_data<T>(samples_dim, context.GetPlace());
    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
    set_zero(dev_ctx, sampled_logits, static_cast<T>(0));

    // SampledLabels[r, c] = c
    auto sampled_labels_data =
        sampled_labels->mutable_data<int64_t>(labels_dim, context.GetPlace());
    const int threads = 512;
    size_t size = batch_size * num_true;
    int grid = (size + threads - 1) / threads;
    // A launch with zero blocks is an invalid configuration, so empty
    // problems skip the launch entirely.
    if (size > 0) {
      GPUSetLabel<T><<<grid, threads, 0, stream>>>(size, num_true,
                                                   sampled_labels_data);
    }

    if (use_customized_samples) {
      // The caller-provided tensors must be the very same objects as the
      // outputs; nothing is copied here.
      const Tensor* customized_samples =
          context.Input<Tensor>("CustomizedSamples");
      const Tensor* customized_probabilities =
          context.Input<Tensor>("CustomizedProbabilities");
      PADDLE_ENFORCE_EQ(customized_samples, samples,
                        platform::errors::InvalidArgument(
                            "CustomizedSamples must be the same Tensor with "
                            "Samples when use_customized_samples = True"));
      PADDLE_ENFORCE_EQ(
          customized_probabilities, probabilities,
          platform::errors::InvalidArgument(
              "CustomizedProbabilities must be the same Tensor with "
              "Probabilities when use_customized_samples = True"));
    } else {
      samples->mutable_data<int64_t>(context.GetPlace());
      probabilities->mutable_data<T>(samples_dim, context.GetPlace());
      // draw samples together with their probabilities
      const auto seed = context.Attr<int>("seed");
      auto sampler_with_prob = math::GPUSampleWithProb<T>();
      sampler_with_prob(dev_ctx, seed, num_classes, uniq, num_samples, labels,
                        samples, probabilities);
    }

    // gather sampled logits and remove accidental hits if needed
    const T* p_array = logits->data<T>();
    const int64_t* p_index = samples->data<int64_t>();
    T* p_value = sampled_logits->data<T>();

    // src slice size (row length of Logits)
    const auto array_slice_size = logits->dims()[1];
    // index slice size (row length of Samples / SampledLogits)
    const auto idx_slice_size = samples->dims()[1];

    size = batch_size * idx_slice_size;
    grid = (size + threads - 1) / threads;
    if (size > 0) {
      GPUTakeAlongD1<T><<<grid, threads, 0, stream>>>(
          size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
          p_value);

      if (remove_accidental_hits) {
        // Use the actual element count of Samples / SampledLogits. The kernel
        // indexes both buffers with idx < size and derives rows from
        // idx_slice_size, so the count must match the tensor width rather
        // than being recomputed from num_true + num_samples.
        gpu_compute_remove_accidental_hits<T><<<grid, threads, 0, stream>>>(
            size, num_true, idx_slice_size, p_index, p_value);
      }
    }

    // subtracted sampled logits with logQ(y|x)
    auto probs = EigenMatrix<T>::From(*probabilities);
    auto smp_logits = EigenMatrix<T>::From(*sampled_logits);
    smp_logits.device(*dev_ctx.eigen_device()) =
        (smp_logits - probs.log().unaryExpr(TolerableValue<T>()))
            .unaryExpr(TolerableValue<T>());
  }
};
 | |
| 
 | |
// CUDA backward kernel of the sample_logits op.
//
// Inputs : Samples, grad(SampledLogits).
// Output : grad(Logits).
//
// grad(Logits) is zero-filled and then every entry of grad(SampledLogits) is
// added back at the class index recorded in Samples for the same row
// (duplicate indices within a row accumulate).
template <typename T>
class SampleLogitsGradCUDAKernel : public framework::OpKernel<T> {
 public:
  using Tensor = framework::Tensor;
  void Compute(const framework::ExecutionContext& context) const override {
    auto logits_grad = context.Output<Tensor>(framework::GradVarName("Logits"));
    const Tensor* samples = context.Input<Tensor>("Samples");
    const Tensor* sampled_logits_grad =
        context.Input<Tensor>(framework::GradVarName("SampledLogits"));
    logits_grad->mutable_data<T>(context.GetPlace());

    // Start from an all-zero gradient; the scatter below only accumulates.
    auto& dev_ctx = context.cuda_device_context();
    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
    set_zero(dev_ctx, logits_grad, static_cast<T>(0));

    // scatter it back to logit_grad
    const auto batch_size = samples->dims()[0];
    // src slice size (row length of grad(Logits))
    const auto array_slice_size = logits_grad->dims()[1];
    // index slice size (row length of Samples)
    const auto idx_slice_size = samples->dims()[1];

    T* p_array = logits_grad->data<T>();
    const int64_t* p_index = samples->data<int64_t>();
    const T* p_value = sampled_logits_grad->data<T>();

    // One loop iteration of the kernel handles one whole row.
    const size_t size = batch_size;
    // A launch with zero blocks is an invalid configuration; an empty batch
    // has nothing to scatter.
    if (size == 0) return;

    const int threads = 128;
    const int grid = (size + threads - 1) / threads;

    GPUPutAlongD1<T><<<grid, threads, 0, dev_ctx.stream()>>>(
        size, batch_size, array_slice_size, idx_slice_size, p_array, p_index,
        p_value);
  }
};
 | |
| 
 | |
| }  // namespace operators
 | |
| }  // namespace paddle
 | |
// Short alias used by the kernel registrations below.
namespace ops = paddle::operators;

// Forward kernels of sample_logits, instantiated for float and double.
REGISTER_OP_CUDA_KERNEL(
    sample_logits,
    ops::SampleLogitsCUDAKernel<float>,
    ops::SampleLogitsCUDAKernel<double>);

// Backward kernels of sample_logits, instantiated for float and double.
REGISTER_OP_CUDA_KERNEL(
    sample_logits_grad,
    ops::SampleLogitsGradCUDAKernel<float>,
    ops::SampleLogitsGradCUDAKernel<double>);
 |