@@ -24,7 +24,8 @@ using platform::PADDLE_CUDA_NUM_THREADS;
 
 template <int BlockSize>
 __global__ void AccuracyCudaKernel(const int N, const int D,
                                    const int64_t* Xdata,
-                                   const int64_t* labeldata, float* accuracy) {
+                                   const int64_t* labeldata, int* correct_data,
+                                   float* accuracy) {
   int count = 0;
   __shared__ int total[BlockSize];
@@ -43,6 +44,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
   // reduce the count with init value 0, and output accuracy.
   int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
   if (threadIdx.x == 0) {
+    *correct_data = result;
     *accuracy = static_cast<float>(result) / static_cast<float>(N);
   }
 }
@@ -56,31 +58,48 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
     auto* inference = ctx.Input<Tensor>("Out");
     auto* indices = ctx.Input<Tensor>("Indices");
     auto* label = ctx.Input<Tensor>("Label");
+
     auto* accuracy = ctx.Output<Tensor>("Accuracy");
+    auto* correct = ctx.Output<Tensor>("Correct");
+    auto* total = ctx.Output<Tensor>("Total");
     // FIXME(typhoonzero): only support indices currently
     // if add support for output values, how to detect the data type?
     const int64_t* indices_data = indices->data<int64_t>();
     const int64_t* label_data = label->data<int64_t>();
+
+    int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
+    int* total_data = total->mutable_data<int>(ctx.GetPlace());
     float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
 
-    size_t num_samples = inference->dims()[0];
+    int num_samples = static_cast<int>(inference->dims()[0]);
     size_t infer_width = inference->dims()[1];
     PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
+    // cudaMemset((void**)&correct_data, 0, sizeof(float));
 
     if (num_samples == 0) {
       return;
     }
+    cudaMemcpy(total_data, &num_samples, sizeof(int), cudaMemcpyHostToDevice);
 
     AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
         1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>(
-        num_samples, infer_width, indices_data, label_data, accuracy_data);
+        num_samples, infer_width, indices_data, label_data, correct_data,
+        accuracy_data);
+
+    int d_num_samples, d_num_correct;
+    float d_accuracy;
+    cudaMemcpy(&d_num_correct, correct_data, sizeof(int),
+               cudaMemcpyDeviceToHost);
+    cudaMemcpy(&d_num_samples, total_data, sizeof(int), cudaMemcpyDeviceToHost);
+    cudaMemcpy(&d_accuracy, accuracy_data, sizeof(float),
+               cudaMemcpyDeviceToHost);
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
 
-// FIXME(typhoonzero): types of T is for infernece data.
-// label data is always int
+// FIXME(typhoonzero): types of T is for inference data.
+// label data is always int64
 REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
                        paddle::operators::AccuracyOpCUDAKernel<double>);
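
For reference, the three outputs the op now produces (Correct, Total, Accuracy) follow the usual top-k accuracy semantics: a sample counts as correct when its label appears among its D predicted indices. The host-side sketch below is illustrative only, not part of the patch; the toy sizes and variable names are made up, and it simply mirrors the count that the CUDA kernel accumulates per thread before the thrust::reduce step.

// Illustrative host-side reference (not part of the patch); assumes the usual
// top-k accuracy semantics of the op.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const int N = 3, D = 2;  // toy sizes: 3 samples, top-2 indices per sample
  std::vector<int64_t> indices = {4, 1,   // sample 0
                                  2, 7,   // sample 1
                                  0, 5};  // sample 2
  std::vector<int64_t> labels = {1, 3, 5};

  int correct = 0;
  for (int i = 0; i < N; ++i) {
    for (int j = 0; j < D; ++j) {
      if (indices[i * D + j] == labels[i]) {  // label found among top-k
        ++correct;
        break;
      }
    }
  }
  int total = N;
  float accuracy = static_cast<float>(correct) / static_cast<float>(total);
  // Corresponds to the op's Correct, Total and Accuracy outputs.
  std::printf("correct=%d total=%d accuracy=%f\n", correct, total, accuracy);
  return 0;
}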