Enhance float16 type support (#12181)

* cherry picked
* cherry picked platform
* add comment
* fix ci

bugfix/anakin-compile
parent
19ef4bab4c
commit
39ac9e39c2
@@ -0,0 +1,118 @@
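The commit adds a new 118-line CUDA test file that exercises the CudaAtomic* primitives for float, double, int, and float16: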
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gtest/gtest.h>
#include <bitset>
#include <iostream>
#include <random>

// Defining PADDLE_CUDA_FP16 before including float16.h enables its CUDA
// (__half-based) half-precision code paths.
#define PADDLE_CUDA_FP16
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"

using paddle::platform::PADDLE_CUDA_NUM_THREADS;
using paddle::platform::float16;

// Generates a grid-stride-loop kernel that atomically applies the
// CudaAtomic##op primitive element-wise: data_b[i] = data_b[i] op data_a[i].
#define CUDA_ATOMIC_KERNEL(op, T)                                       \
  __global__ void op##Kernel(const T* data_a, T* data_b, size_t num) {  \
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;        \
         i += blockDim.x * gridDim.x) {                                 \
      paddle::platform::CudaAtomic##op(&data_b[i], data_a[i]);          \
    }                                                                    \
  }

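// Expansion sketch: CUDA_ATOMIC_KERNEL(Add, float16) produces roughly
//
//   __global__ void AddKernel(const float16* data_a, float16* data_b,
//                             size_t num) {
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
//          i += blockDim.x * gridDim.x) {
//       paddle::platform::CudaAtomicAdd(&data_b[i], data_a[i]);
//     }
//   }
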
// Host-side reference functors used to compute the expected results.
template <typename T>
struct AddFunctor {
  T operator()(const T& a, const T& b) { return a + b; }
};

template <typename T>
struct SubFunctor {
  T operator()(const T& a, const T& b) { return a - b; }
};

// NOTE(dzhwinter): float16 addition has small underflow/overflow errors
// (half's machine epsilon is 2^-10, roughly 1e-3), so we use EXPECT_NEAR
// to check the result.
#define ARITHMETIC_KERNEL_LAUNCH(op, T)                                   \
  void Test##T##op(size_t num) {                                          \
    T *in1, *in2, *out;                                                   \
    T *d_in1, *d_in2;                                                     \
    size_t size = sizeof(T) * num;                                        \
    cudaMalloc(reinterpret_cast<void**>(&d_in1), size);                   \
    cudaMalloc(reinterpret_cast<void**>(&d_in2), size);                   \
    in1 = reinterpret_cast<T*>(malloc(size));                             \
    in2 = reinterpret_cast<T*>(malloc(size));                             \
    out = reinterpret_cast<T*>(malloc(size));                             \
    std::minstd_rand engine;                                              \
    std::uniform_real_distribution<double> dist(0.0, 1.0);                \
    for (size_t i = 0; i < num; ++i) {                                    \
      in1[i] = static_cast<T>(dist(engine));                              \
      in2[i] = static_cast<T>(dist(engine));                              \
    }                                                                     \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);                 \
    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);                 \
    op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);        \
    cudaDeviceSynchronize();                                              \
    cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);                 \
    cudaDeviceSynchronize();                                              \
    for (size_t i = 0; i < num; ++i) {                                    \
      EXPECT_NEAR(static_cast<float>(out[i]),                             \
                  static_cast<float>(op##Functor<T>()(in1[i], in2[i])),   \
                  0.001);                                                 \
    }                                                                     \
    free(in1);                                                            \
    free(in2);                                                            \
    free(out);                                                            \
    cudaFree(d_in1);                                                      \
    cudaFree(d_in2);                                                      \
  }
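// The launcher runs a single block of PADDLE_CUDA_NUM_THREADS threads; the
// grid-stride loop in CUDA_ATOMIC_KERNEL covers num > blockDim.x. A more
// defensive variant (a sketch, for illustration only) would also assert that
// each CUDA call succeeded, e.g.:
//
//   ASSERT_EQ(cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice),
//             cudaSuccess);
//   op##Kernel<<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
//   ASSERT_EQ(cudaGetLastError(), cudaSuccess);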
CUDA_ATOMIC_KERNEL(Add, float);
CUDA_ATOMIC_KERNEL(Add, double);
CUDA_ATOMIC_KERNEL(Add, float16);

ARITHMETIC_KERNEL_LAUNCH(Add, float);
ARITHMETIC_KERNEL_LAUNCH(Add, double);
ARITHMETIC_KERNEL_LAUNCH(Add, float16);

// USE_CUDA_ATOMIC(Sub, int) defines CudaAtomicSub for int, which the header
// does not provide by default, before generating the Sub kernel and launcher.
namespace paddle {
namespace platform {
USE_CUDA_ATOMIC(Sub, int);
}  // namespace platform
}  // namespace paddle
CUDA_ATOMIC_KERNEL(Sub, int);
ARITHMETIC_KERNEL_LAUNCH(Sub, int);

// cuda primitives
TEST(CudaAtomic, Add) {
  TestfloatAdd(static_cast<size_t>(10));
  TestfloatAdd(static_cast<size_t>(1024 * 1024));
  TestdoubleAdd(static_cast<size_t>(10));
  TestdoubleAdd(static_cast<size_t>(1024 * 1024));
}

TEST(CudaAtomic, Sub) {
  TestintSub(static_cast<size_t>(10));
  TestintSub(static_cast<size_t>(1024 * 1024));
}

TEST(CudaAtomic, float16) {
  using paddle::platform::float16;
  Testfloat16Add(static_cast<size_t>(1));
  Testfloat16Add(static_cast<size_t>(2));
  Testfloat16Add(static_cast<size_t>(3));

  Testfloat16Add(static_cast<size_t>(10));
  Testfloat16Add(static_cast<size_t>(1024 * 1024));
}
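The file registers three gtest cases: CudaAtomic.Add, CudaAtomic.Sub, and CudaAtomic.float16. Assuming it is wired into the platform CMakeLists as a CUDA test target (the hunk does not show the build change), the cases can be run on the resulting binary with a filter such as --gtest_filter=CudaAtomic.*.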