@@ -9,42 +9,211 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>

#include "paddle/math/float16.h"

#include "paddle/utils/Logging.h"

#define ARITHMETIC_KERNEL(op_type, sign)                          \
  __global__ void op_type(                                        \
      const float16* in1, const float16* in2, float16* out) {     \
    out[0] = in1[0] sign in2[0];                                  \
  }

#define COMPOUND_KERNEL(op_type, sign)                          \
  __global__ void op_type(float16* in1, const float16* in2) {   \
    in1[0] sign in2[0];                                         \
  }

#define COMPARISON_KERNEL(op_type, sign)                                        \
  __global__ void op_type(const float16* in1, const float16* in2, bool* out) {  \
    out[0] = in1[0] sign in2[0];                                                \
  }
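
// For instance, ARITHMETIC_KERNEL(Add, +) expands to a kernel equivalent to
//   __global__ void Add(const float16* in1, const float16* in2, float16* out) {
//     out[0] = in1[0] + in2[0];
//   }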

#define ARITHMETIC_KERNEL_LAUNCH(op_type)                     \
  void Test##op_type(float v_in1, float v_in2, float v_out) { \
    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
    float16 *in1, *in2, *out;                                 \
    float16 *d_in1, *d_in2, *d_out;                           \
    int size = sizeof(float16);                               \
    cudaMalloc((void**)&d_in1, size);                         \
    cudaMalloc((void**)&d_in2, size);                         \
    cudaMalloc((void**)&d_out, size);                         \
    in1 = (float16*)malloc(size);                             \
    in2 = (float16*)malloc(size);                             \
    out = (float16*)malloc(size);                             \
    in1[0] = float16(v_in1);                                  \
    in2[0] = float16(v_in2);                                  \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                   \
    cudaMemcpy(out, d_out, size, cudaMemcpyDeviceToHost);     \
    EXPECT_EQ(float(out[0]), v_out);                          \
    free(in1);                                                \
    free(in2);                                                \
    free(out);                                                \
    cudaFree(d_in1);                                          \
    cudaFree(d_in2);                                          \
    cudaFree(d_out);                                          \
  }
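
// Correspondingly, ARITHMETIC_KERNEL_LAUNCH(Add) defines a host helper
// TestAdd(float v_in1, float v_in2, float v_out) that round-trips a single
// float16 value through the Add kernel and compares the result to v_out.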

#define COMPOUND_KERNEL_LAUNCH(op_type)                       \
  void Test##op_type(float v_in1, float v_in2, float v_out) { \
    LOG(INFO) << "Test " << #op_type << " on GPU!";           \
    float16 *in1, *in2;                                       \
    float16 *d_in1, *d_in2;                                   \
    int size = sizeof(float16);                               \
    cudaMalloc((void**)&d_in1, size);                         \
    cudaMalloc((void**)&d_in2, size);                         \
    in1 = (float16*)malloc(size);                             \
    in2 = (float16*)malloc(size);                             \
    in1[0] = float16(v_in1);                                  \
    in2[0] = float16(v_in2);                                  \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);     \
    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);     \
    op_type<<<1, 1>>>(d_in1, d_in2);                          \
    cudaMemcpy(in1, d_in1, size, cudaMemcpyDeviceToHost);     \
    EXPECT_EQ(float(in1[0]), v_out);                          \
    free(in1);                                                \
    free(in2);                                                \
    cudaFree(d_in1);                                          \
    cudaFree(d_in2);                                          \
  }

#define COMPARISON_KERNEL_LAUNCH(op_type)                    \
  void Test##op_type(float v_in1, float v_in2, bool v_out) { \
    LOG(INFO) << "Test " << #op_type << " on GPU!";          \
    float16 *in1, *in2;                                      \
    float16 *d_in1, *d_in2;                                  \
    bool *out, *d_out;                                       \
    int size = sizeof(float16);                              \
    cudaMalloc((void**)&d_in1, size);                        \
    cudaMalloc((void**)&d_in2, size);                        \
    cudaMalloc((void**)&d_out, 1);                           \
    in1 = (float16*)malloc(size);                            \
    in2 = (float16*)malloc(size);                            \
    out = (bool*)malloc(1);                                  \
    in1[0] = float16(v_in1);                                 \
    in2[0] = float16(v_in2);                                 \
    cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);    \
    cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);    \
    op_type<<<1, 1>>>(d_in1, d_in2, d_out);                  \
    cudaMemcpy(out, d_out, 1, cudaMemcpyDeviceToHost);       \
    EXPECT_EQ(out[0], v_out);                                \
    free(in1);                                               \
    free(in2);                                               \
    free(out);                                               \
    cudaFree(d_in1);                                         \
    cudaFree(d_in2);                                         \
    cudaFree(d_out);                                         \
  }
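
// Note: the launch helpers rely on the blocking cudaMemcpy after each
// <<<1, 1>>> launch to synchronize with the device before the result is read
// on the host; no explicit cudaDeviceSynchronize or CUDA error checking is
// performed here.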

#ifdef PADDLE_CUDA_FP16
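// Everything below is compiled only when PADDLE_CUDA_FP16 is defined
// (expected to be provided by paddle/math/float16.h when CUDA half support is
// available); otherwise the device-side tests are compiled out entirely.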
namespace paddle {

ARITHMETIC_KERNEL(Add, +)
ARITHMETIC_KERNEL(Sub, -)
ARITHMETIC_KERNEL(Mul, *)
ARITHMETIC_KERNEL(Div, /)

ARITHMETIC_KERNEL_LAUNCH(Add)
ARITHMETIC_KERNEL_LAUNCH(Sub)
ARITHMETIC_KERNEL_LAUNCH(Mul)
ARITHMETIC_KERNEL_LAUNCH(Div)

// Negative sign kernel
__global__ void Neg(float16* in) { in[0] = -in[0]; }

void TestNeg(float v_in, float v_out) {
  LOG(INFO) << "Test Neg on GPU!";
  float16 *in, *d_in;
  int size = sizeof(float16);
  cudaMalloc((void**)&d_in, size);
  in = (float16*)malloc(size);
  in[0] = float16(v_in);
  cudaMemcpy(d_in, in, size, cudaMemcpyHostToDevice);
  Neg<<<1, 1>>>(d_in);
  cudaMemcpy(in, d_in, size, cudaMemcpyDeviceToHost);
  EXPECT_EQ(float(in[0]), v_out);
  free(in);
  cudaFree(d_in);
}

COMPOUND_KERNEL(AddAssign, +=)
COMPOUND_KERNEL(SubAssign, -=)
COMPOUND_KERNEL(MulAssign, *=)
COMPOUND_KERNEL(DivAssign, /=)

COMPOUND_KERNEL_LAUNCH(AddAssign)
COMPOUND_KERNEL_LAUNCH(SubAssign)
COMPOUND_KERNEL_LAUNCH(MulAssign)
COMPOUND_KERNEL_LAUNCH(DivAssign)

COMPARISON_KERNEL(Equal, ==)
COMPARISON_KERNEL(NotEqual, !=)
COMPARISON_KERNEL(Less, <)
COMPARISON_KERNEL(LessEqual, <=)
COMPARISON_KERNEL(Greater, >)
COMPARISON_KERNEL(GreaterEqual, >=)

COMPARISON_KERNEL_LAUNCH(Equal)
COMPARISON_KERNEL_LAUNCH(NotEqual)
COMPARISON_KERNEL_LAUNCH(Less)
COMPARISON_KERNEL_LAUNCH(LessEqual)
COMPARISON_KERNEL_LAUNCH(Greater)
COMPARISON_KERNEL_LAUNCH(GreaterEqual)
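
// Each Test* helper defined above takes (lhs, rhs, expected); for example,
// TestAdd(1, 2, 3) checks that float16(1) + float16(2) computed on the device
// reads back as 3.0f.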

TEST(float16, arithmetic_on_gpu) {
  TestAdd(1, 2, 3);
  TestSub(2, 1, 1);
  TestMul(2, 3, 6);
  TestDiv(6, 2, 3);
  TestNeg(1, -1);
}

TEST(float16, compound_on_gpu) {
  TestAddAssign(1, 2, 3);
  TestSubAssign(2, 1, 1);
  TestMulAssign(2, 3, 6);
  TestDivAssign(6, 2, 3);
}

TEST(float16, comparison_on_gpu) {
  TestEqual(1, 1, true);
  TestEqual(1, 2, false);
  TestNotEqual(2, 3, true);
  TestNotEqual(2, 2, false);
  TestLess(3, 4, true);
  TestLess(3, 3, false);
  TestLessEqual(3, 3, true);
  TestLessEqual(3, 2, false);
  TestGreater(4, 3, true);
  TestGreater(4, 4, false);
  TestGreaterEqual(4, 4, true);
  TestGreaterEqual(4, 5, false);
}
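
// The expected values below are IEEE 754 binary16 bit patterns: 0x3c00 is 1.0,
// 0x3800 is 0.5, 0x3555 is the half closest to 0.33333, 0x0000/0x8000 are
// +0.0/-0.0, 0x7bff is the largest finite value (65504), and 65536 overflows
// to +infinity, 0x7c00.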

TEST(float16, conversion_on_gpu) {
  // Explicit conversion to and from cuda half
  EXPECT_EQ(float16(half(float16(1.0f))).x, 0x3c00);
  EXPECT_EQ(float16(half(float16(0.5f))).x, 0x3800);
  EXPECT_EQ(float16(half(float16(0.33333f))).x, 0x3555);
  EXPECT_EQ(float16(half(float16(0.0f))).x, 0x0000);
  EXPECT_EQ(float16(half(float16(-0.0f))).x, 0x8000);
  EXPECT_EQ(float16(half(float16(65504.0f))).x, 0x7bff);
  EXPECT_EQ(float16(half(float16(65536.0f))).x, 0x7c00);

  // Implicit conversion to and from cuda half
  half tmp = float16(1.0f);
  float16 val = tmp;
  EXPECT_EQ(val.x, 0x3c00);

  // Assignment operator
  float16 v_assign;
  v_assign = tmp;
  EXPECT_EQ(v_assign.x, 0x3c00);
}

}  // namespace paddle
#endif  // PADDLE_CUDA_FP16