fix floating-point overflow problem of tanh (#355)

8 years ago · a07da94939
parent 56b23d1838
commit a07da94939
10 changed files with 119 additions and 14 deletions
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@ -209,6 +209,15 @@ typedef struct {
 #define HL_FLOAT_MIN        2.2250738585072014e-308
 #endif
 /**
 * The maximum input value passed to exp; larger inputs are clamped to this
 * value to avoid floating-point overflow.
 *
 * Currently only used by the tanh function.
 */
 #define EXP_MAX_INPUT       40.0
 /**
 * @brief DIVUP(x, y) is similar to ceil(x / y).
 * @note  For CUDA, DIVUP will be used to specify
--- a/paddle/cuda/src/hl_avx_functions.cc
+++ b/paddle/cuda/src/hl_avx_functions.cc
@ -38,7 +38,9 @@ namespace hppl {
  }
  __m256 tanh(const __m256 a) {
    __m256 max = _mm256_set1_ps(EXP_MAX_INPUT);
    __m256 tmp = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
    tmp = _mm256_min_ps(tmp, max);
    tmp = exp(tmp);
    return _mm256_sub_ps(
        _mm256_div_ps(_mm256_set1_ps(2.0f),
--- a/paddle/cuda/src/hl_cpu_functions.cc
+++ b/paddle/cuda/src/hl_cpu_functions.cc
@ -30,7 +30,9 @@ namespace hppl {
  }
  // Computes tanh(a) as 2 / (1 + exp(-2a)) - 1.
  // The exponent (-2a) is capped at EXP_MAX_INPUT before it is passed to exp.
  real tanh(const real a) {
-    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;
+    real tmp = -2.0 * a;
    // Values above EXP_MAX_INPUT are replaced by EXP_MAX_INPUT.
    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
    return (2.0 / (1.0 + exp(tmp))) - 1.0;
  }
  real linear(const real a) {
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@ -995,7 +995,7 @@ TEST(Layer, LstmLayer) {
  TestConfig config;
  config.layerConfig.set_type("lstmemory");
  config.layerConfig.set_size(4);
-  config.layerConfig.set_active_type("sigmoid");
+  config.layerConfig.set_active_type("tanh");
  config.layerConfig.set_active_state_type("sigmoid");
  config.layerConfig.set_active_gate_type("sigmoid");
  config.biasSize = 28;
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@ -369,7 +369,7 @@ TEST(Layer, LstmLayer) {
  LayerConfig layerConfig;
  layerConfig.set_type("lstmemory");
  layerConfig.set_active_type("relu");
-  layerConfig.set_active_state_type("sigmoid");
+  layerConfig.set_active_state_type("tanh");
  layerConfig.set_active_gate_type("sigmoid");
  layerConfig.add_inputs();
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@ -625,7 +625,10 @@ void BaseMatrixT<T>::squareDerivative(BaseMatrixT& b) {
  applyBinary(binary::SquareDerivative<T>(), b);
 }
-DEFINE_MATRIX_BINARY_OP(Tanh, b = 2.0 / (1.0 + exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(Tanh,
    // b = tanh(a) = 2 / (1 + exp(-2a)) - 1. The exponent is capped at
    // EXP_MAX_INPUT before std::exp is called.
    T tmp = -2.0 * a;
    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
    b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<>
 void BaseMatrixT<real>::tanh(BaseMatrixT& b) {
  applyBinary(binary::Tanh<real>(), b);
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@ -200,7 +200,10 @@ void vLog1p(const int n, const T* a, T* r) {
    binary::vLog1p<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
-DEFINE_MATRIX_BINARY_OP(vTanh, b = 2.0 / (1.0 + std::exp(-2 * a)) - 1.0);
+DEFINE_MATRIX_BINARY_OP(vTanh,
    // b = tanh(a) = 2 / (1 + exp(-2a)) - 1. The exponent is capped at
    // EXP_MAX_INPUT before std::exp is called.
    T tmp = -2.0 * a;
    tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
    b = 2.0 / (1.0 + std::exp(tmp)) - 1.0);
 template<class T>
 void vTanh(const int n, const T* a, T* r) {
  hl_cpu_apply_binary_op<T, binary::vTanh<T>, 0, 0>(
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@ -3471,9 +3471,7 @@ void CpuMatrix::tanh(Matrix& output) {
  size_t dim = getWidth();
  CHECK_EQ(output.getHeight(), numSamples);
  CHECK_EQ(output.getWidth(), dim);
  errno = 0;
  vTanh(numSamples * dim, getData(), output.getData());
  CHECK_EQ(errno, 0) << "vTanh error";
 }
 void CpuMatrix::tanhDerivative(Matrix& output) {
@ -3495,10 +3493,8 @@ void CpuMatrix::softrelu(Matrix& output) {
      out[j] = x;
    }
  }
  errno = 0;
  vExp(numSamples * dim, output.getData(), output.getData());
  vLog1p(numSamples * dim, output.getData(), output.getData());
  CHECK_EQ(errno, 0) << "vExp+vLog1p error";
 }
 void CpuMatrix::softreluDerivative(Matrix& output) {
@ -3513,9 +3509,7 @@ void CpuMatrix::softreluDerivative(Matrix& output) {
  MatrixPtr tmpMat = Matrix::create(numSamples, dim);
  real* tmp = tmpMat->getData();
  errno = 0;
  vExp(size, output.getData(), tmpMat->getData());
  CHECK_EQ(errno, 0) << "vExp error";
  for (size_t i = 0; i < size; ++i) {
    grad[i] *= (1.0 - 1.0 / tmp[i]);
@ -3538,10 +3532,7 @@ void CpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
    out[i] = p2 * in[i];
  }
  // out = tanh(out)
  errno = 0;
  vTanh(numSamples * dim, out, out);
  CHECK_EQ(errno, 0) << "vTanh error";
  // out = p1 * out
  for (size_t i = 0; i < numSamples * dim; ++i) {
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@ -13,3 +13,4 @@ add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
 add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
 add_simple_unittest(test_FPException)
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
@ -0,0 +1,94 @@
 /* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 /**
 * This test covers floating-point exceptions raised during computation.
 * Paddle catches the FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW exceptions.
 *
 * Some exceptions occur in an intermediate step of a formula
 * and can be circumvented with a small trick.
 * For example, tanh is calculated as
 *   b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
 *
 * If the result of (-2 * a) is too large,
 * an FE_OVERFLOW exception occurs when calculating exp.
 * The result of tanh itself has no overflow problem, however,
 * so we can limit the input of exp to keep it from computing an excessive value.
 *
 */
 #include <fenv.h>
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/Excepts.h"
 using namespace paddle;     // NOLINT
 /**
 * Writes `value` into one rand()-selected column of every row of `matrix`.
 *
 * A CpuMatrix element is assigned directly; a GpuMatrix element is written
 * through hl_memcpy. Any other dynamic type triggers LOG(FATAL).
 */
 void SetTensorValue(Matrix& matrix, real value) {
  const int rows = matrix.getHeight();
  const int cols = matrix.getWidth();
  const int rowStride = matrix.getStride();
  real* base = matrix.getData();
  for (int row = 0; row < rows; ++row) {
    const int col = rand() % cols;  // NOLINT
    const int offset = row * rowStride + col;
    if (typeid(matrix) == typeid(CpuMatrix)) {
      base[offset] = value;
      continue;
    }
    if (typeid(matrix) == typeid(GpuMatrix)) {
      hl_memcpy(&base[offset], &value, sizeof(real));
      continue;
    }
    LOG(FATAL) << "should not reach here";
  }
 }
 /**
 * Applies tanh to a 10x10 matrix of type MatType in which one element per
 * row has been overwritten with `illegal` (see SetTensorValue).
 *
 * Both the input and the output matrix are filled by randomizeUniform()
 * before the call. No result is checked here.
 */
 template<typename MatType>
 void testTanh(real illegal) {
  MatrixPtr input = std::make_shared<MatType>(10, 10);
  MatrixPtr output = std::make_shared<MatType>(10, 10);
  input->randomizeUniform();
  output->randomizeUniform();
  SetTensorValue(*input, illegal);
  input->tanh(*output);
 }
 /**
 * Applies sigmoid to a 10x10 matrix of type MatType in which one element per
 * row has been overwritten with `illegal` (see SetTensorValue).
 *
 * Both the input and the output matrix are filled by randomizeUniform()
 * before the call. No result is checked here.
 */
 template<typename MatType>
 void testSigmoid(real illegal) {
  MatrixPtr input = std::make_shared<MatType>(10, 10);
  MatrixPtr output = std::make_shared<MatType>(10, 10);
  input->randomizeUniform();
  output->randomizeUniform();
  SetTensorValue(*input, illegal);
  input->sigmoid(*output);
 }
 /**
 * Runs the CpuMatrix tanh and sigmoid helpers with inputs of -90 and 90.
 */
 TEST(fp, overflow) {
  const double extremes[] = {-90.0, 90.0};
  for (const double illegal : extremes) {
    LOG(INFO) << " illegal=" << illegal;
    testTanh<CpuMatrix>(illegal);
    testSigmoid<CpuMatrix>(illegal);
  }
 }
 /**
 * Test entry point: initializes gtest and Paddle, enables the
 * FE_INVALID, FE_DIVBYZERO and FE_OVERFLOW floating-point exceptions,
 * then runs all tests.
 */
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
  // Exceptions are enabled only after initialization, right before the tests.
  const int enabledExcepts = FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW;
  feenableexcept(enabledExcepts);
  const int status = RUN_ALL_TESTS();
  return status;
 }