Merge pull request #564 from luotao1/clang

clang format .cc .h .cpp .c and .hpp file
Yu Yang 8 years ago committed by GitHub
commit 5c0eb23d1c
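A formatting sweep of this size is normally generated by the tool itself rather than edited by hand. As a minimal sketch (assuming clang-format is installed and that the repository's own .clang-format file, which is not part of this diff, supplies the style options), a pass over the affected extensions could be run from the repository root like this:

    find paddle \( -name '*.cc' -o -name '*.cpp' -o -name '*.c' -o -name '*.h' -o -name '*.hpp' \) -print0 | xargs -0 clang-format -i

The hunks below are layout-only: argument wrapping, pointer spacing, macro continuations, and brace placement change, while the code itself stays the same.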

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -112,7 +111,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx,
}
void Arguments::setSlotSubSequenceStartPositions(
-size_t idx, IVector *vec) throw(RangeError) {
+size_t idx, IVector* vec) throw(RangeError) {
auto& a = m->getArg(idx);
auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
a.subSequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/trainer/Trainer.h"
@@ -44,8 +43,7 @@ TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
return retv;
}
-TrainerConfig* TrainerConfig::createFromProtoString(
-const std::string& str) {
+TrainerConfig* TrainerConfig::createFromProtoString(const std::string& str) {
auto retv = new TrainerConfig();
paddle::TrainerConfig trainerConfigProto;
auto conf = std::make_shared<paddle::TrainerConfigHelper>(trainerConfigProto);

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
@@ -27,7 +26,8 @@ GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
GradientMachine::~GradientMachine() { delete m; }
GradientMachine* GradientMachine::createFromPaddleModelPtr(
-const void* confPtr, GradientMatchineCreateMode mode,
+const void* confPtr,
+GradientMatchineCreateMode mode,
const std::vector<int>& types) {
auto& conf = *(const paddle::ModelConfig*)(confPtr);
std::vector<ParameterType> realTypes;
@@ -44,7 +44,8 @@ GradientMachine* GradientMachine::createFromPaddleModelPtr(
}
GradientMachine* GradientMachine::createByConfigProtoStr(
-const std::string& protoStr, GradientMatchineCreateMode mode,
+const std::string& protoStr,
+GradientMatchineCreateMode mode,
const std::vector<int>& types) {
paddle::ModelConfig conf;
conf.ParseFromString(protoStr);
@@ -56,13 +57,15 @@ GradientMachine* GradientMachine::createByConfigProtoStr(
}
GradientMachine* GradientMachine::createByModelConfig(
-ModelConfig* conf, GradientMatchineCreateMode mode,
+ModelConfig* conf,
+GradientMatchineCreateMode mode,
const std::vector<int>& types) {
auto confPtr = &conf->m->conf->getModelConfig();
return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
}
-void GradientMachine::forward(const Arguments& inArgs, Arguments* outArgs,
+void GradientMachine::forward(const Arguments& inArgs,
+Arguments* outArgs,
PassType passType) {
auto& in =
m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
@@ -99,7 +102,8 @@ void GradientMachine::backward(const UpdateCallback& callback) {
}
void GradientMachine::forwardBackward(const Arguments& inArgs,
-Arguments* outArgs, PassType passType,
+Arguments* outArgs,
+PassType passType,
const UpdateCallback& callback) {
auto& in =
m->cast<std::vector<paddle::Argument>>(inArgs.getInternalArgumentsPtr());
@@ -129,7 +133,7 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
void GradientMachine::randParameters() { m->machine->randParameters(); }
Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
throw(UnsupportError) {
auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(m->machine);
if (nn) {
auto mat = nn->getLayerOutput(layerName);
@@ -140,8 +144,11 @@ Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
}
SequenceGenerator* GradientMachine::asSequenceGenerator(
-const std::vector<std::string>& dict, size_t begin_id, size_t end_id,
-size_t max_length, size_t beam_size) {
+const std::vector<std::string>& dict,
+size_t begin_id,
+size_t end_id,
+size_t max_length,
+size_t beam_size) {
SequenceGenerator* r =
SequenceGenerator::createByGradientMachineSharedPtr(&m->machine);
r->setDict(dict);

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include "PaddleAPI.h"
@@ -23,7 +22,8 @@ limitations under the License. */
template <typename T1, typename T2>
void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
dest->resize(src.size());
-std::transform(src.begin(), src.end(), dest->begin(), [](T1 t){
-return static_cast<T2>(t);
-});
+std::transform(src.begin(),
+src.end(),
+dest->begin(),
+[](T1 t) { return static_cast<T2>(t); });
}

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Matrix.h"
#include "paddle/math/SparseMatrix.h"
@@ -44,17 +43,21 @@ Matrix* Matrix::createZero(size_t height, size_t width, bool useGpu) {
return m;
}
-Matrix* Matrix::createDense(const std::vector<float>& data, size_t height,
-size_t width, bool useGpu) {
+Matrix* Matrix::createDense(const std::vector<float>& data,
+size_t height,
+size_t width,
+bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::create(height, width, useGpu);
m->m->mat->copyFrom(data.data(), data.size());
return m;
}
-Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
-bool copy, bool useGpu)
-throw (UnsupportError) {
+Matrix* Matrix::createDenseFromNumpy(float* data,
+int dim1,
+int dim2,
+bool copy,
+bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// Gpu mode only supports copy=True
if (!copy) {
@@ -66,7 +69,9 @@ Matrix* Matrix::createDenseFromNumpy(float* data, int dim1, int dim2,
}
}
-Matrix* Matrix::createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+Matrix* Matrix::createCpuDenseFromNumpy(float* data,
+int dim1,
+int dim2,
bool copy) {
auto m = new Matrix();
if (copy) {
@@ -85,12 +90,20 @@ Matrix* Matrix::createGpuDenseFromNumpy(float* data, int dim1, int dim2) {
return m;
}
-Matrix* Matrix::createSparse(size_t height, size_t width, size_t nnz,
-bool isNonVal, bool isTrans, bool useGpu) {
+Matrix* Matrix::createSparse(size_t height,
+size_t width,
+size_t nnz,
+bool isNonVal,
+bool isTrans,
+bool useGpu) {
auto m = new Matrix();
m->m->mat = paddle::Matrix::createSparseMatrix(
-height, width, nnz, isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
-isTrans, useGpu);
+height,
+width,
+nnz,
+isNonVal ? paddle::NO_VALUE : paddle::FLOAT_VALUE,
+isTrans,
+useGpu);
return m;
}
@@ -221,7 +234,8 @@ FloatArray Matrix::getData() const {
}
void Matrix::sparseCopyFrom(
-const std::vector<int>& rows, const std::vector<int>& cols,
+const std::vector<int>& rows,
+const std::vector<int>& cols,
const std::vector<float>& vals) throw(UnsupportError) {
auto cpuSparseMat =
std::dynamic_pointer_cast<paddle::CpuSparseMatrix>(m->mat);
@@ -240,7 +254,8 @@ void Matrix::sparseCopyFrom(
void* Matrix::getSharedPtr() const { return &m->mat; }
-void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
+void Matrix::toNumpyMatInplace(float** view_data,
+int* dim1,
int* dim2) throw(UnsupportError) {
auto cpuMat = std::dynamic_pointer_cast<paddle::CpuMatrix>(m->mat);
if (cpuMat) {
@@ -251,7 +266,8 @@ void Matrix::toNumpyMatInplace(float** view_data, int* dim1,
throw UnsupportError();
}
}
-void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
+void Matrix::copyToNumpyMat(float** view_m_data,
+int* dim1,
int* dim2) throw(UnsupportError) {
static_assert(sizeof(paddle::real) == sizeof(float),
"Currently PaddleAPI only support for single "
@@ -269,8 +285,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
} else if (auto gpuMat = dynamic_cast<paddle::GpuMatrix*>(m->mat.get())) {
auto src = gpuMat->getData();
auto dest = *view_m_data;
-hl_memcpy_device2host(dest, src,
-sizeof(paddle::real) * (*dim1) * (*dim2));
+hl_memcpy_device2host(
+dest, src, sizeof(paddle::real) * (*dim1) * (*dim2));
} else {
LOG(WARNING) << "Unexpected Situation";
throw UnsupportError();
@@ -278,7 +294,8 @@ void Matrix::copyToNumpyMat(float** view_m_data, int* dim1,
}
}
-void Matrix::copyFromNumpyMat(float* data, int dim1,
+void Matrix::copyFromNumpyMat(float* data,
+int dim1,
int dim2) throw(UnsupportError, RangeError) {
if (isSparse()) {
throw UnsupportError();

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#pragma once
#include <stddef.h>
@@ -61,8 +60,8 @@ class RangeError {};
/// Not support Error, such as access GPU memory directly, etc.
class UnsupportError : public std::runtime_error {
public:
-UnsupportError() : std::runtime_error(" ") {};
-UnsupportError(const std::string& message) : std::runtime_error(message) {};
+UnsupportError() : std::runtime_error(" "){};
+UnsupportError(const std::string& message) : std::runtime_error(message){};
};
/// This type will map to python's list of float.
@@ -112,7 +111,8 @@ public:
/**
* Create A Matrix with height,width, which is filled by zero.
*/
-static Matrix* createZero(size_t height, size_t width,
+static Matrix* createZero(size_t height,
+size_t width,
bool useGpu = isUsingGpu());
/**
@@ -124,8 +124,11 @@ public:
*
* @note the default sparse type is SPARSE_CSR.
*/
-static Matrix* createSparse(size_t height, size_t width, size_t nnz,
-bool isNonVal = true, bool trans = false,
+static Matrix* createSparse(size_t height,
+size_t width,
+size_t nnz,
+bool isNonVal = true,
+bool trans = false,
bool useGpu = isUsingGpu());
/**
@@ -134,13 +137,17 @@ public:
* @param data list of float should be passed in python.
* @note the value will be copy into a new matrix.
*/
-static Matrix* createDense(const std::vector<float>& data, size_t height,
-size_t width, bool useGpu = isUsingGpu());
+static Matrix* createDense(const std::vector<float>& data,
+size_t height,
+size_t width,
+bool useGpu = isUsingGpu());
-static Matrix* createDenseFromNumpy(float* data, int dim1, int dim2,
-bool copy = true,
-bool useGpu = isUsingGpu())
-throw (UnsupportError);
+static Matrix* createDenseFromNumpy(
+float* data,
+int dim1,
+int dim2,
+bool copy = true,
+bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Dense Matrix from numpy matrix, dtype=float32
@@ -151,7 +158,9 @@ public:
* @param copy true if copy into a new matrix, false will create
* matrix inplace.
*/
-static Matrix* createCpuDenseFromNumpy(float* data, int dim1, int dim2,
+static Matrix* createCpuDenseFromNumpy(float* data,
+int dim1,
+int dim2,
bool copy = false);
/// Create Gpu Dense Matrix from numpy matrix, dtype=float32
@@ -171,11 +180,13 @@ public:
* numpy_mat = m.toNumpyMat()
* @endcode
*/
-void toNumpyMatInplace(float** view_data, int* dim1,
+void toNumpyMatInplace(float** view_data,
+int* dim1,
int* dim2) throw(UnsupportError);
/// Copy To numpy mat.
-void copyToNumpyMat(float** view_m_data, int* dim1,
+void copyToNumpyMat(float** view_m_data,
+int* dim1,
int* dim2) throw(UnsupportError);
/// Copy From Numpy Mat
@@ -248,15 +259,18 @@ public:
static Vector* create(const std::vector<float>& data,
bool useGpu = isUsingGpu());
-static Vector* createVectorFromNumpy(float* data, int dim, bool copy = true,
-bool useGpu = isUsingGpu())
-throw (UnsupportError);
+static Vector* createVectorFromNumpy(
+float* data,
+int dim,
+bool copy = true,
+bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu Vector from numpy array, which dtype=float32
*
* If copy is false, it will create vector inplace.
*/
-static Vector* createCpuVectorFromNumpy(float* data, int dim,
+static Vector* createCpuVectorFromNumpy(float* data,
+int dim,
bool copy = false);
/// Create Gpu Vector from numpy array, which dtype=float32
@@ -312,16 +326,19 @@ public:
static IVector* create(const std::vector<int>& data,
bool useGpu = isUsingGpu());
-static IVector* createVectorFromNumpy(int* data, int dim, bool copy = true,
-bool useGpu = isUsingGpu())
-throw (UnsupportError);
+static IVector* createVectorFromNumpy(
+int* data,
+int dim,
+bool copy = true,
+bool useGpu = isUsingGpu()) throw(UnsupportError);
/**
* Create Cpu IVector from numpy array, which dtype=int32
*
* If copy is false, it will create vector inplace
*/
-static IVector* createCpuVectorFromNumpy(int* data, int dim,
+static IVector* createCpuVectorFromNumpy(int* data,
+int dim,
bool copy = false);
/**
* Create Gpu IVector from numpy array, which dtype=int32
@@ -605,7 +622,8 @@ class ParameterTraverseCallback {
public:
~ParameterTraverseCallback();
-void apply(const std::vector<Vector*>& vecs, const ParameterConfig& config,
+void apply(const std::vector<Vector*>& vecs,
+const ParameterConfig& config,
size_t sparseId);
private:
@@ -638,7 +656,8 @@ public:
void finishBatch();
-void update(const std::vector<Vector*>& vecs, const ParameterConfig& conf,
+void update(const std::vector<Vector*>& vecs,
+const ParameterConfig& conf,
size_t sparseId = NO_SPARSE_ID);
std::vector<int> getParameterTypes() const;
@@ -678,7 +697,8 @@ public:
* model config by TrainerConfig
*/
static GradientMachine* createByModelConfig(
-ModelConfig* conf, GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
+ModelConfig* conf,
+GradientMatchineCreateMode mode = CREATE_MODE_NORMAL,
const std::vector<int>& parameterTypes = defaultParamTypes);
/**
@@ -701,7 +721,8 @@ public:
/**
* Combine forward/backward
*/
-void forwardBackward(const Arguments& inArgs, Arguments* outArgs,
+void forwardBackward(const Arguments& inArgs,
+Arguments* outArgs,
PassType passType,
const UpdateCallback& callback = UpdateCallback());
@@ -722,14 +743,17 @@ public:
*/
SequenceGenerator* asSequenceGenerator(
const std::vector<std::string>& dict = std::vector<std::string>(),
-size_t begin_id = 0UL, size_t end_id = 0UL, size_t max_length = 100UL,
+size_t begin_id = 0UL,
+size_t end_id = 0UL,
+size_t max_length = 100UL,
size_t beam_size = -1UL);
private:
GradientMachinePrivate* m;
static GradientMachine* createFromPaddleModelPtr(
-const void* confPtr, GradientMatchineCreateMode mode,
+const void* confPtr,
+GradientMatchineCreateMode mode,
const std::vector<int>& types);
// Not to use c++ 11 init-list, so we use static var as function default arg.
@@ -751,8 +775,8 @@ public:
/// Create A Trainer By TrainerConfig. using paddle command line.
static Trainer* createByCommandLine() throw(IOError);
-static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm)
-throw(IOError);
+static Trainer* create(TrainerConfig* optConfig,
+GradientMachine* gm) throw(IOError);
/// Start training
void startTrain();

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/parameter/Parameter.h"

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/parameter/ParameterOptimizer.h"
@@ -32,17 +31,21 @@ struct ParameterTraverseCallbackPrivate {
const paddle::ParameterOptimizer::TraverseCallback& callback)
: callback(callback) {}
-void apply(const std::vector<Vector*>& vecs, const ParameterConfig& conf,
+void apply(const std::vector<Vector*>& vecs,
+const ParameterConfig& conf,
size_t sparseId) {
std::vector<paddle::VectorPtr> real_vecs;
real_vecs.resize(vecs.size());
-std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
-if (v) {
-return *(paddle::VectorPtr*)(v->getSharedPtr());
-} else {
-return paddle::VectorPtr();
-}
-});
+std::transform(vecs.begin(),
+vecs.end(),
+real_vecs.begin(),
+[](Vector* v) {
+if (v) {
+return *(paddle::VectorPtr*)(v->getSharedPtr());
+} else {
+return paddle::VectorPtr();
+}
+});
paddle::ParameterConfig& real_conf =
*(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
@@ -86,10 +89,12 @@ void ParameterOptimizer::startBatch(size_t numSamplesProcessed) {
void ParameterOptimizer::finishBatch() { m->optimizer->finishBatch(); }
void ParameterOptimizer::update(const std::vector<Vector*>& vecs,
-const ParameterConfig& conf, size_t sparseId) {
-ParameterTraverseCallbackPrivate invoker([&](
-const paddle::VectorPtr _vecs[], const paddle::ParameterConfig& config,
-size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
+const ParameterConfig& conf,
+size_t sparseId) {
+ParameterTraverseCallbackPrivate invoker(
+[&](const paddle::VectorPtr _vecs[],
+const paddle::ParameterConfig& config,
+size_t sid = -1UL) { m->optimizer->update(_vecs, config, sid); });
invoker.apply(vecs, conf, sparseId);
}
@@ -116,8 +121,9 @@ void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
ParameterTraverseCallback* ParameterOptimizer::needSpecialTraversal(
const ParameterConfig& config) const {
-auto& param_config = *(paddle::ParameterConfig*)const_cast<ParameterConfig&>(
-config).getRawPtr();
+auto& param_config =
+*(paddle::ParameterConfig*)const_cast<ParameterConfig&>(config)
+.getRawPtr();
auto callback = m->optimizer->needSpecialTraversal(param_config);
if (callback) {
auto retCallback = new ParameterTraverseCallback();

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h"
#include "paddle/parameter/Argument.h"
@@ -42,8 +41,10 @@ struct Path {
// position
static void findNBest(paddle::GradientMachine* gradMachine,
std::vector<paddle::Argument>& inArgs,
-std::vector<Path>& finalPaths, size_t bos_id,
-size_t eos_id, size_t max_length) {
+std::vector<Path>& finalPaths,
+size_t bos_id,
+size_t eos_id,
+size_t max_length) {
std::vector<Path> paths;
Path emptyPath;
paths.push_back(emptyPath);
@@ -166,7 +167,8 @@ public:
if (id < getSize()) {
Path& p = (*path_)[id];
std::ostringstream sout;
-std::transform(p.ids.begin(), p.ids.end(),
+std::transform(p.ids.begin(),
+p.ids.end(),
std::ostream_iterator<std::string>(sout, split ? " " : ""),
[&](int id) { return (*dict_)[id]; });
return sout.str();

@@ -64,12 +64,11 @@ Trainer* Trainer::createByCommandLine() throw(IOError) {
Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
: m(new TrainerPrivate()) {
-m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr);
+m->init(config->m->conf, /* testing= */ false, gm ? gm->m->machine : nullptr);
}
-Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm)
-throw(IOError)
-{
+Trainer* Trainer::create(TrainerConfig* config,
+GradientMachine* gm) throw(IOError) {
auto retv = new Trainer(config, gm);
if (retv->m->getConfig().IsInitialized()) {
return retv;
@@ -134,15 +133,17 @@ void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
Matrix* Trainer::getLayerOutput(const std::string& layerName) {
auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
this->m->getGradientMachine());
CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
auto m = nn->getLayerOutput(layerName);
return Matrix::createByPaddleMatrixPtr(&m);
}
-void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); }
+void Trainer::forwardOneBatch(size_t batchSize) {
+m->forwardOneBatch(batchSize);
+}
bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
CHECK(dataProvider_) << "data_provider is not specified";
paddle::DataBatch dataBatch;
int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
@@ -156,7 +157,6 @@ bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
void TrainerPrivate::forwardOneDataBatch(
const std::vector<paddle::Argument>& inArgs) {
std::vector<paddle::Argument>& outArgs = forwardOutput_;
-
if (config_->getOptConfig().use_sparse_remote_updater()) {

@@ -37,13 +37,15 @@ FloatArray::FloatArray(const float* b, const size_t l)
IntArray::IntArray(const int* b, const size_t l, bool f)
: buf(b), length(l), needFree(f) {}
-IntWithFloatArray::IntWithFloatArray(const float* v, const int* i, size_t l,
+IntWithFloatArray::IntWithFloatArray(const float* v,
+const int* i,
+size_t l,
bool f)
: valBuf(v), idxBuf(i), length(l), needFree(f) {}
-bool isUsingGpu() {return FLAGS_use_gpu;}
-void setUseGpu(bool useGpu) {FLAGS_use_gpu = useGpu;}
+bool isUsingGpu() { return FLAGS_use_gpu; }
+void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
bool isGpuVersion() {
#ifdef PADDLE_ONLY_CPU

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#include "PaddleAPI.h"
#include "paddle/math/Vector.h"
@@ -39,8 +38,10 @@ IVector* IVector::create(const std::vector<int>& data, bool useGpu) {
return v;
}
-IVector* IVector::createVectorFromNumpy(int* data, int dim, bool copy,
-bool useGpu) throw (UnsupportError){
+IVector* IVector::createVectorFromNumpy(int* data,
+int dim,
+bool copy,
+bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=true is supported
if (!copy) {
@@ -137,8 +138,8 @@ void IVector::copyToNumpyArray(int** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast<paddle::CpuIVector*>(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(int) * (*dim1));
} else if (auto gpuVec = dynamic_cast<paddle::GpuIVector*>(m->vec.get())) {
-hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
-sizeof(int) * (*dim1));
+hl_memcpy_device2host(
+*view_m_data, gpuVec->getData(), sizeof(int) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}
@@ -201,8 +202,10 @@ Vector* Vector::createByPaddleVectorPtr(void* ptr) {
}
}
-Vector* Vector::createVectorFromNumpy(float* data, int dim, bool copy,
-bool useGpu) throw (UnsupportError){
+Vector* Vector::createVectorFromNumpy(float* data,
+int dim,
+bool copy,
+bool useGpu) throw(UnsupportError) {
if (useGpu) {
/// if use gpu only copy=True is supported
if (!copy) {
@@ -251,8 +254,8 @@ void Vector::copyToNumpyArray(float** view_m_data, int* dim1) {
if (auto cpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
std::memcpy(*view_m_data, cpuVec->getData(), sizeof(float) * (*dim1));
} else if (auto gpuVec = dynamic_cast<paddle::CpuVector*>(m->vec.get())) {
-hl_memcpy_device2host(*view_m_data, gpuVec->getData(),
-sizeof(float) * (*dim1));
+hl_memcpy_device2host(
+*view_m_data, gpuVec->getData(), sizeof(float) * (*dim1));
} else {
LOG(INFO) << "Unexpected situation";
}

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_ACTIVATION_FUNCTIONS_H_
#define HL_ACTIVATION_FUNCTIONS_H_
@@ -21,11 +20,8 @@ limitations under the License. */
/**
* Active functions: sigmoid, relu, tanh and linear.
*/
-#define HPPL_ACTIVE_FUNCTION {hppl::sigmoid, \
-hppl::relu, \
-hppl::tanh, \
-hppl::linear \
-}
+#define HPPL_ACTIVE_FUNCTION \
+{ hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
namespace hppl {
@@ -42,18 +38,18 @@ public:
#ifdef __NVCC__
namespace gpu {
static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#else
namespace cpu {
static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#ifdef __AVX__
namespace avx {
static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
}
#endif

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AGGREGATE_H_
#define HL_AGGREGATE_H_

@@ -12,22 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_AVX_FUNCTIONS_H_
#define HL_AVX_FUNCTIONS_H_
#include <immintrin.h>
namespace hppl {
__m256 relu(const __m256 a);
__m256 sigmoid(const __m256 a);
__m256 tanh(const __m256 a);
__m256 linear(const __m256 a);
__m256 relu(const __m256 a, const __m256 b);
__m256 sigmoid(const __m256 a, const __m256 b);
__m256 tanh(const __m256 a, const __m256 b);
__m256 linear(const __m256 a, const __m256 b);
} // namespace hppl
#endif // HL_AVX_FUNCTIONS_H_

@@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
-
#ifndef HL_BASE_H_
#define HL_BASE_H_
@@ -33,36 +31,36 @@ limitations under the License. */
* HPPL_STREAM_DEFAULT is HPPL default stream.
*/
typedef enum {
HPPL_STREAM_DEFAULT = 0, /* Thread Default Stream*/
HPPL_STREAM_1 = 1,
HPPL_STREAM_2 = 2,
HPPL_STREAM_3 = 3,
HPPL_STREAM_4 = 4,
HPPL_THREAD_STREAM_1 = 5,
HPPL_THREAD_STREAM_2 = 6,
HPPL_THREAD_STREAM_3 = 7,
HPPL_THREAD_STREAM_4 = 8,
HPPL_STREAM_END
} hl_stream_t;
/**
* @brief HPPL activation mode.
*/
typedef enum {
HL_ACTIVATION_SIGMOID = 0,
HL_ACTIVATION_RELU = 1,
HL_ACTIVATION_TANH = 2,
HL_ACTIVATION_LINEAR = 3,
HL_ACTIVATION_END
} hl_activation_mode_t;
/**
* @brief Transpose type.
*/
typedef enum {
HPPL_OP_N = 0, /* transpose */
HPPL_OP_T = 1, /* non transpose */
HPPL_OP_END
} hl_trans_op_t;
/**
@@ -148,23 +146,21 @@ typedef struct {
* @brief Sparse matrix value type.
*/
typedef enum {
HL_NO_VALUE = 0, /* matrix values only 0 or 1 */
HL_FLOAT_VALUE = 1,
HL_VALUE_END
} hl_matrix_value_t;
/**
* @brief HPPL matrix format.
*/
typedef enum {
HL_SPARSE_CSR = 0,
HL_SPARSE_CSC = 1,
HL_SPARSE_END
} hl_matrix_format_t;
-typedef struct _hl_matrix_s * hl_matrix_s;
+typedef struct _hl_matrix_s *hl_matrix_s;
/**
* @brief HPPL sparse matrix.
@@ -177,12 +173,12 @@ typedef struct _hl_matrix_s * hl_matrix_s;
* @param nnz nonzero values of sparse matrix.
*/
typedef struct {
hl_matrix_s matrix;
hl_matrix_format_t format;
hl_matrix_value_t type;
int rows;
int cols;
size_t nnz;
} _hl_sparse_matrix_s, *hl_sparse_matrix_s;
#ifndef PADDLE_TYPE_DOUBLE
@@ -195,7 +191,7 @@ typedef struct {
*
* HL_FLOAT_MIN: 1.17549435e-38F
*/
#define HL_FLOAT_MAX 3.40282347e+38F
/**
* if real == double
*
@@ -203,20 +199,18 @@ typedef struct {
*
* HL_FLOAT_MIN: 2.2250738585072014e-308
*/
#define HL_FLOAT_MIN 1.17549435e-38F
#else
#define HL_FLOAT_MAX 1.7976931348623157e+308
#define HL_FLOAT_MIN 2.2250738585072014e-308
#endif
/**
* The maximum input value for exp, used to avoid overflow problem.
*
* Currently only used for tanh function.
*/
#define EXP_MAX_INPUT 40.0
/**
* @brief DIVUP(x, y) is similar to ceil(x / y).
@@ -224,7 +218,7 @@ typedef struct {
* the size of blockDim.
*/
#ifndef DIVUP
-#define DIVUP(x, y) (((x) + (y) - 1) / (y))
+#define DIVUP(x, y) (((x) + (y)-1) / (y))
#endif
#ifdef __NVCC__
@@ -233,7 +227,7 @@ typedef struct {
#include "hl_cuda.h"
#include "cuda_runtime.h"
extern __thread bool g_sync_flag;
extern __thread cudaStream_t default_stream;
#define STREAM_DEFAULT default_stream
@@ -241,16 +235,15 @@ extern __thread cudaStream_t default_stream;
* @brief Check cuda kernel execution.
* @param msg error string
*/
-#define CHECK_SYNC(msg) \
-if (true == g_sync_flag) { \
-hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
-cudaError_t err \
-= (cudaError_t)hl_get_device_last_error(); \
-CHECK_EQ(cudaSuccess, err) << "[" << msg << "] " \
-<< "CUDA error: " \
-<< hl_get_device_error_string((size_t)err); \
+#define CHECK_SYNC(msg) \
+if (true == g_sync_flag) { \
+hl_stream_synchronize(HPPL_STREAM_DEFAULT); \
+cudaError_t err = (cudaError_t)hl_get_device_last_error(); \
+CHECK_EQ(cudaSuccess, err) \
+<< "[" << msg << "] " \
+<< "CUDA error: " << hl_get_device_error_string((size_t)err); \
}
#endif /* __NVCC__ */
#endif /* HL_BASE_H_ */

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_BATCH_TRANSPOSE_H_
#define HL_BATCH_TRANSPOSE_H_
@@ -31,10 +30,7 @@ limitations under the License. */
* order. Each batch has height * width data, which are
* arranged in height-first (or row-first) manner.
*/
-extern void batchTranspose(const real* input,
-real* output,
-int width,
-int height,
-int batchSize);
+extern void batchTranspose(
+const real* input, real* output, int width, int height, int batchSize);
#endif // HL_BATCH_TRANSPOSE_H_

File diff suppressed because it is too large.

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_H_
#define HL_CUDA_H_
@@ -22,8 +21,7 @@ limitations under the License. */
/**
* @brief HPPL event.
*/
-typedef struct _hl_event_st * hl_event_t;
+typedef struct _hl_event_st *hl_event_t;
/**
* @brief return cuda runtime api version.
@@ -42,7 +40,7 @@ extern void hl_start();
* if device is NULL, will start all GPU.
* @param[in] number number of devices.
*/
-extern void hl_specify_devices_start(int* device, int number);
+extern void hl_specify_devices_start(int *device, int number);
/**
* @brief Queries if a device may directly access a peer device's memory.
@@ -126,7 +124,7 @@ extern int hl_get_device();
*
* @return dest_d pointer to device memory.
*/
-extern void* hl_malloc_device(size_t size);
+extern void *hl_malloc_device(size_t size);
/**
* @brief Free device memory.
@@ -143,7 +141,7 @@ extern void hl_free_mem_device(void *dest_d);
*
* @return dest_h pointer to host memory.
*/
-extern void* hl_malloc_host(size_t size);
+extern void *hl_malloc_host(size_t size);
/**
* @brief Free host page-lock memory.
@@ -228,9 +226,9 @@ extern void hl_srand(unsigned int seed);
* @param[in] stream stream id.
*/
extern void hl_memcpy_async(void *dst,
void *src,
size_t size,
hl_stream_t stream);
/**
* @brief Waits for stream tasks to complete.
@@ -261,8 +259,7 @@ extern void hl_destroy_event(hl_event_t event);
*
* @return time Time between start and end in ms.
*/
-extern float hl_event_elapsed_time(hl_event_t start,
-hl_event_t end);
+extern float hl_event_elapsed_time(hl_event_t start, hl_event_t end);
/**
* @brief Records an event.
@@ -300,7 +297,7 @@ extern void hl_set_device_flags_block();
/**
* @brief Returns the last error string from a cuda runtime call.
*/
-extern const char* hl_get_device_error_string();
+extern const char *hl_get_device_error_string();
/**
* @brief Returns the last error string from a cuda runtime call.
@@ -309,7 +306,7 @@ extern const char* hl_get_device_error_string();
*
* @see hl_get_device_last_error()
*/
-extern const char* hl_get_device_error_string(size_t err);
+extern const char *hl_get_device_error_string(size_t err);
/**
* @brief Returns the last error number.

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUBLAS_H_
#define HL_CUDA_CUBLAS_H_
@@ -29,12 +28,8 @@ limitations under the License. */
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_transpose(real *A_d,
-real *C_d,
-int dimM,
-int dimN,
-int lda,
-int ldc);
+extern void hl_matrix_transpose(
+real *A_d, real *C_d, int dimM, int dimN, int lda, int ldc);
/*
* @brief Matrix transpose, while lda = dimN, ldc = dimM.
@@ -45,10 +40,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] dimN matrix width.
*
*/
-extern void hl_matrix_transpose(real *A_d,
-real *C_d,
-int dimM,
-int dimN);
+extern void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN);
/*
* @brief Matrix inverse
@@ -60,11 +52,7 @@ extern void hl_matrix_transpose(real *A_d,
* @param[in] ldc the first dimension of C_d
*
*/
-extern void hl_matrix_inverse(real *A_d,
-real *C_d,
-int dimN,
-int lda,
-int ldc);
+extern void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -84,12 +72,19 @@ extern void hl_matrix_inverse(real *A_d,
* @param[in] ldc the first dimension of C_d.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
-real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+hl_trans_op_t transa,
+real *B_d,
+hl_trans_op_t transb,
real *C_d,
-int dimM, int dimN, int dimK,
-real alpha, real beta,
-int lda, int ldb, int ldc);
+int dimM,
+int dimN,
+int dimK,
+real alpha,
+real beta,
+int lda,
+int ldb,
+int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
@@ -106,11 +101,16 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
-real *B_d, hl_trans_op_t transb,
+extern void hl_matrix_mul(real *A_d,
+hl_trans_op_t transa,
+real *B_d,
+hl_trans_op_t transb,
real *C_d,
-int dimM, int dimN, int dimK,
-real alpha, real beta);
+int dimM,
+int dimN,
+int dimK,
+real alpha,
+real beta);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -132,11 +132,17 @@ extern void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
-real *B_d, real *C_d,
-int dimM, int dimN,
-real alpha, real beta,
-int lda, int incb, int incc);
+extern void hl_matrix_mul_vector(real *A_d,
+hl_trans_op_t trans,
+real *B_d,
+real *C_d,
+int dimM,
+int dimN,
+real alpha,
+real beta,
+int lda,
+int incb,
+int incc);
/**
* @brief This function performs the matrix-vector multiplication.
@@ -154,9 +160,13 @@ extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
* @param[in] beta scalar used for multiplication.
*
*/
-extern void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
-real *B_d, real *C_d,
-int dimM, int dimN,
-real alpha, real beta);
+extern void hl_matrix_mul_vector(real *A_d,
+hl_trans_op_t trans,
+real *B_d,
+real *C_d,
+int dimM,
+int dimN,
+real alpha,
+real beta);
#endif /* HL_CUDA_CUBLAS_H_ */

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_CUDA_CUDNN_H_
#define HL_CUDA_CUDNN_H_
@@ -22,7 +21,7 @@ limitations under the License. */
* hppl pooling mode
*/
typedef enum {
HL_POOLING_MAX = 0,
// average includes padded values
HL_POOLING_AVERAGE = 1,
// average does not include padded values
@@ -324,17 +323,16 @@ extern void hl_convolution_forward_add_bias(hl_tensor_descriptor bias,
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdFilterAlgo backward filter algorithm.
*/
-extern void hl_convolution_backward_filter(
-hl_tensor_descriptor input,
-real* input_data,
-hl_tensor_descriptor output,
-real* output_grad_data,
-hl_filter_descriptor filter,
-real* filter_grad_data,
-hl_convolution_descriptor conv,
-void* gpuWorkSpace,
-size_t sizeInBytes,
-int convBwdFilterAlgo);
+extern void hl_convolution_backward_filter(hl_tensor_descriptor input,
+real* input_data,
+hl_tensor_descriptor output,
+real* output_grad_data,
+hl_filter_descriptor filter,
+real* filter_grad_data,
+hl_convolution_descriptor conv,
+void* gpuWorkSpace,
+size_t sizeInBytes,
+int convBwdFilterAlgo);
/**
* @brief convolution backward data(calculate input image grad data).
@@ -350,17 +348,16 @@ extern void hl_convolution_backward_filter(
* @param[in] sizeInBytes gpu workspace size (bytes).
* @param[in] convBwdDataAlgo backward data algorithm.
*/
-extern void hl_convolution_backward_data(
-hl_tensor_descriptor input,
-real* input_data_grad,
-hl_tensor_descriptor output,
-real* output_grad_data,
-hl_filter_descriptor filter,
-real* filter_data,
-hl_convolution_descriptor conv,
-void* gpuWorkSpace,
-size_t sizeInBytes,
-int convBwdDataAlgo);
+extern void hl_convolution_backward_data(hl_tensor_descriptor input,
+real* input_data_grad,
+hl_tensor_descriptor output,
+real* output_grad_data,
+hl_filter_descriptor filter,
+real* filter_data,
+hl_convolution_descriptor conv,
+void* gpuWorkSpace,
+size_t sizeInBytes,
+int convBwdDataAlgo);
/**
* @brief convolution backward bias(calculate bias grad data).
@@ -383,8 +380,8 @@ extern void hl_convolution_backward_bias(hl_tensor_descriptor bias,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_forward(real *input,
-real *output,
+extern void hl_softmax_forward(real* input,
+real* output,
int height,
int width);
@@ -396,8 +393,8 @@ extern void hl_softmax_forward(real *input,
* @param[in] height matrix height.
* @param[in] width matrix width.
*/
-extern void hl_softmax_backward(real *output_value,
-real *output_grad,
+extern void hl_softmax_backward(real* output_value,
+real* output_grad,
int height,
int width);
@@ -426,18 +423,18 @@ extern void hl_softmax_backward(real *output_value,
*
*/
extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
-real *input,
+real* input,
hl_tensor_descriptor outputDesc,
-real *output,
+real* output,
hl_tensor_descriptor bnParamDesc,
-real *scale,
-real *bias,
+real* scale,
+real* bias,
double factor,
-real *runningMean,
-real *runningInvVar,
+real* runningMean,
+real* runningInvVar,
double epsilon,
-real *savedMean,
-real *savedVar);
+real* savedMean,
+real* savedVar);
/**
* @brief cudnn batch norm forward.
@@ -463,14 +460,14 @@ extern void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
-real *input,
+real* input,
hl_tensor_descriptor outputDesc,
-real *output,
+real* output,
hl_tensor_descriptor bnParamDesc,
-real *scale,
-real *bias,
-real *estimatedMean,
-real *estimatedVar,
+real* scale,
+real* bias,
+real* estimatedMean,
+real* estimatedVar,
double epsilon);
/**
@@ -483,7 +480,8 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
* @param[in] inGradDesc input tensor descriptor desc.
* @param[in] inGrad input data.
* @param[in] dBnParamDesc tensor descriptor desc.
-* bnScale, bnBias, running mean/var, save_mean/var.
+* bnScale, bnBias, running mean/var,
+* save_mean/var.
* @param[in] scale batch normalization scale parameter (in original
* paper scale is referred to as gamma).
* @param[in] scaleGrad batch normalization scale parameter (in original
@@ -497,17 +495,17 @@ extern void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
*
*/
extern void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
-real *input,
+real* input,
hl_tensor_descriptor outGradDesc,
-real *outGrad,
+real* outGrad,
hl_tensor_descriptor inGradDesc,
-real *inGrad,
+real* inGrad,
hl_tensor_descriptor dBnParamDesc,
-real *scale,
-real *scaleGrad,
-real *biasGrad,
+real* scale,
+real* scaleGrad,
+real* biasGrad,
double epsilon,
-real *savedMean,
-real *savedInvVar);
+real* savedMean,
+real* savedInvVar);
#endif // HL_CUDA_CUDNN_H_

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_DSO_LOADER_H_
#define HL_DSO_LOADER_H_

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_FUNCTIONS_H_
#define HL_FUNCTIONS_H_
@@ -21,30 +20,30 @@ limitations under the License. */
/**
* sigmoid threshold maximum
*/
#define SIGMOID_THRESHOLD_MIN -40.0
/**
* sigmoid threshold minimum
*/
#define SIGMOID_THRESHOLD_MAX 13.0
#ifndef __NVCC__
namespace hppl {
/*
* forward activation
*/
real relu(const real a);
real sigmoid(const real a);
real tanh(const real a);
real linear(const real a);
/*
* backward activation
*/
real relu(const real a, const real b);
real sigmoid(const real a, const real b);
real tanh(const real a, const real b);
real linear(const real a, const real b);
} // namespace hppl
#ifdef __AVX__

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_GPU_H_
#define HL_GPU_H_

@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
-
#ifndef HL_LSTM_H_
#define HL_LSTM_H_

Some files were not shown because too many files have changed in this diff.
