Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_prelu

release/0.11.0 · commit b7ebaf71bb · xzl, 7 years ago

.gitignore

@ -28,4 +28,3 @@ cmake_install.cmake
paddle/.timestamp
python/paddlepaddle.egg-info/
paddle/pybind/pybind.h
python/paddle/v2/framework/tests/tmp/*

@ -98,7 +98,7 @@ IF(NOT ${CBLAS_FOUND})
ENDIF()
INSTALL(CODE "execute_process(
COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib
destination ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
)"
)
INSTALL(CODE "MESSAGE(STATUS \"Installing: \"

@ -0,0 +1,58 @@
## Evaluator Design
### The Problem
During training or serving, we provide evaluation functions to measure model performance, e.g., accuracy and precision. In the operator-based framework design, the data goes through the network pipeline batch by batch. As a result, inside an operator we can only calculate the metrics of one mini-batch. We need to provide a mechanism to calculate the metrics over every N passes/batches that the user specifies.
### Evaluator Design
Currently, every operation is expressed in the graph. We divide the evaluation process into three steps.
1. Initialize the metric state and add it into the block.
2. Calculate the statistics of the metric state in every mini-batch. A single operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy of one mini-batch per run.
3. Merge the mini-batch statistics to form the evaluation result over multiple mini-batches. For distributed or multi-GPU training, also aggregate the values from different devices (see the merge sketch below).
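
For accuracy, for instance, the merge step reduces to a ratio of the accumulated counts. A minimal sketch, with illustrative variable names:

```python
# Illustrative merge of per-batch accuracy statistics.
# correct_counts[i] and total_counts[i] are the counts collected from batch i.
pass_accuracy = float(sum(correct_counts)) / float(sum(total_counts))
```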
### Implementation
This design is shown in the Python API.
Each metric operator needs to calculate its metric statistics and return the batch-aware states; the Python side is responsible for accumulating the states over each pass.
```python
class Evaluator(object):
    """
    Evaluator base class.
    """

    def __init__(self, name, **kwargs):
        """
        Different evaluators may have different metric states. E.g., Accuracy
        needs two variables, the total and correct sample counts; AUC needs four
        variables, `true_positives`, `true_negatives`, `false_positives` and
        `false_negatives`. So every evaluator should create the variables it
        needs and append them to main_program.

        The initialization of an Evaluator is responsible for creating the
        metric states and appending them to the main_program.
        """
        pass

    def _update_ops(self, input, label, **kwargs):
        """
        Add the mini-batch metric calculation operators to the main_program.
        Add increment operators to accumulate the metric states.
        """

    def reset(self, executor, reset_program=None):
        """
        Reset the metric states at the beginning of each pass or of each
        user-specified number of batches.
        Execute the reset_program to reset the states.
        """

    def eval(self, executor, eval_program=None):
        """
        Merge the mini-batch statistics to form the evaluation result for
        multiple mini-batches.
        Execute the eval_program and return the result.
        """
        return eval_result
```
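
A minimal usage sketch of this API in a training loop; the `Accuracy` subclass, executor, reader, and feeder names here are illustrative assumptions, not part of the design:

```python
# Hypothetical training loop showing the intended reset/update/eval cycle.
accuracy = Accuracy(input=predict, label=label)  # creates states, adds _update_ops

for pass_id in range(num_passes):
    accuracy.reset(exe)                          # clear the accumulated states
    for data in train_reader():
        # Running main_program also runs the per-batch metric operators that
        # _update_ops appended, so the states accumulate automatically.
        exe.run(main_program, feed=feeder.feed(data))
    pass_acc = accuracy.eval(exe)                # merge mini-batch statistics
    print("pass %d, accuracy %f" % (pass_id, pass_acc))
```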

@ -121,6 +121,7 @@ paddle_error paddle_matrix_get_shape(paddle_matrix mat,
paddle_matrix paddle_matrix_create_sparse(
uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
#ifndef PADDLE_MOBILE_INFERENCE
auto ptr = new paddle::capi::CMatrix();
ptr->mat = paddle::Matrix::createSparseMatrix(
height,
@ -131,6 +132,9 @@ paddle_matrix paddle_matrix_create_sparse(
false,
useGpu);
return ptr;
#else
return nullptr;
#endif
}
paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
@ -140,6 +144,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
uint64_t colSize,
float* valueArray,
uint64_t valueSize) {
#ifndef PADDLE_MOBILE_INFERENCE
if (mat == nullptr) return kPD_NULLPTR;
auto ptr = cast(mat);
if (rowArray == nullptr || colArray == nullptr ||
@ -160,4 +165,7 @@ paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
} else {
return kPD_NOT_SUPPORTED;
}
#else
return kPD_NOT_SUPPORTED;
#endif
}

@ -48,6 +48,7 @@ PD_API paddle_matrix paddle_matrix_create(uint64_t height,
* @param isBinary is binary (either 1 or 0 in matrix) or not.
* @param useGpu is using GPU or not.
* @return paddle_matrix.
* @note Mobile inference does not support this interface.
*/
PD_API paddle_matrix paddle_matrix_create_sparse(
uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
@ -129,6 +130,7 @@ PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
* NULL if the matrix is binary.
* @param [in] valueSize length of value array. Zero if the matrix is binary.
* @return paddle_error
* @note Mobile inference does not support this interface.
*/
PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
int* rowArray,

@ -27,7 +27,9 @@ if(WITH_GPU)
set_source_files_properties(${CUDA_CXX_SOURCES}
PROPERTIES COMPILE_FLAGS "-D__NVCC__")
else()
if (NOT MOBILE_INFERENCE)
set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
endif()
endif()
set(CUDA_CU_SOURCES

@ -38,9 +38,9 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
add_dependencies(framework_py_proto framework_py_proto_init)
add_custom_command(TARGET framework_py_proto POST_BUILD
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto/
COMMENT "Copy generated python proto into directory paddle/v2/framework/proto."
COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto
COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto/
COMMENT "Copy generated python proto into directory paddle/v2/fluid/proto."
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
cc_library(backward SRCS backward.cc DEPS net_op)

@ -85,9 +85,49 @@ if(MOBILE_INFERENCE)
gradientmachines/GradientMachineMode.cpp
gradientmachines/MultiGradientMachine.cpp)
# Remove useless layers
# Remove layers that are only used in training
list(REMOVE_ITEM GSERVER_SOURCES
layers/RecurrentLayerGroup.cpp)
layers/RecurrentLayerGroup.cpp
layers/CostLayer.cpp
layers/MultiBoxLossLayer.cpp
layers/WarpCTCLayer.cpp
layers/CTCLayer.cpp
layers/LinearChainCTC.cpp
layers/PrintLayer.cpp)
list(REMOVE_ITEM GSERVER_SOURCES
layers/OuterProdLayer.cpp
layers/SumToOneNormLayer.cpp
layers/ConvShiftLayer.cpp
layers/InterpolationLayer.cpp
layers/AgentLayer.cpp
layers/DotMulOperator.cpp
layers/GruStepLayer.cpp
layers/LstmStepLayer.cpp
layers/ConvexCombinationLayer.cpp
layers/Conv3DLayer.cpp
layers/DeConv3DLayer.cpp
layers/CropLayer.cpp
layers/CrossEntropyOverBeam.cpp
layers/DataNormLayer.cpp
layers/FeatureMapExpandLayer.cpp
layers/HierarchicalSigmoidLayer.cpp
layers/MultinomialSampler.cpp
layers/NCELayer.cpp
layers/KmaxSeqScoreLayer.cpp
layers/MDLstmLayer.cpp
layers/MultiplexLayer.cpp
layers/PadLayer.cpp
layers/Pool3DLayer.cpp
layers/ResizeLayer.cpp
layers/RotateLayer.cpp
layers/RowConvLayer.cpp
layers/RowL2NormLayer.cpp
layers/SamplingIdLayer.cpp
layers/ScaleShiftLayer.cpp
layers/SelectiveFullyConnectedLayer.cpp
layers/SpatialPyramidPoolLayer.cpp
layers/BilinearInterpLayer.cpp
layers/ClipLayer.cpp)
endif()
if(WITH_GPU)

@ -16,7 +16,6 @@ limitations under the License. */
#include "NeuralNetwork.h"
#include "hl_gpu.h"
#include "paddle/gserver/layers/AgentLayer.h"
#include "paddle/utils/CustomStackTrace.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
@ -28,6 +27,7 @@ limitations under the License. */
#ifndef PADDLE_MOBILE_INFERENCE
#include "MultiNetwork.h"
#include "RecurrentGradientMachine.h"
#include "paddle/gserver/layers/AgentLayer.h"
#endif
namespace paddle {
@ -192,9 +192,11 @@ void NeuralNetwork::init(const ModelConfig& config,
void NeuralNetwork::connect(LayerPtr agentLayer,
LayerPtr realLayer,
int height) {
#ifndef PADDLE_MOBILE_INFERENCE
AgentLayer* agent = dynamic_cast<AgentLayer*>(agentLayer.get());
CHECK_NOTNULL(agent);
agent->setRealLayer(realLayer, height);
#endif
}
void NeuralNetwork::connect(std::string agentLayerName,

@ -98,6 +98,7 @@ ClassRegistrar<Layer, LayerConfig> Layer::registrar_;
LayerPtr Layer::create(const LayerConfig& config) {
std::string type = config.type();
#ifndef PADDLE_MOBILE_INFERENCE
// NOTE: As the following types contain the illegal character '-',
// they cannot be registered via REGISTER_LAYER.
// Besides, to fit with old training models,
@ -106,7 +107,6 @@ LayerPtr Layer::create(const LayerConfig& config) {
return LayerPtr(new MultiClassCrossEntropy(config));
else if (type == "rank-cost")
return LayerPtr(new RankingCost(config));
#ifndef PADDLE_MOBILE_INFERENCE
else if (type == "auc-validation")
return LayerPtr(new AucValidation(config));
else if (type == "pnpair-validation")

@ -100,8 +100,9 @@ void ROIPoolLayer::forward(PassType passType) {
size_t roiEndH = round(bottomROIs[4] * spatialScale_);
CHECK_GE(roiBatchIdx, 0UL);
CHECK_LT(roiBatchIdx, batchSize);
size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL);
size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL);
size_t roiHeight =
std::max(roiEndH - roiStartH + 1, static_cast<size_t>(1));
size_t roiWidth = std::max(roiEndW - roiStartW + 1, static_cast<size_t>(1));
real binSizeH =
static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
real binSizeW =
@ -114,10 +115,14 @@ void ROIPoolLayer::forward(PassType passType) {
size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
hstart = std::min(std::max(hstart + roiStartH, 0UL), height_);
wstart = std::min(std::max(wstart + roiStartW, 0UL), width_);
hend = std::min(std::max(hend + roiStartH, 0UL), height_);
wend = std::min(std::max(wend + roiStartW, 0UL), width_);
hstart = std::min(
std::max(hstart + roiStartH, static_cast<size_t>(0)), height_);
wstart = std::min(
std::max(wstart + roiStartW, static_cast<size_t>(0)), width_);
hend = std::min(std::max(hend + roiStartH, static_cast<size_t>(0)),
height_);
wend = std::min(std::max(wend + roiStartW, static_cast<size_t>(0)),
width_);
bool isEmpty = (hend <= hstart) || (wend <= wstart);
size_t poolIndex = ph * pooledWidth_ + pw;

@ -1,9 +1,12 @@
# gserver package unittests
add_simple_unittest(test_LinearChainCRF)
add_simple_unittest(test_MultinomialSampler)
add_simple_unittest(test_RecurrentLayer)
if(NOT MOBILE_INFERENCE)
add_simple_unittest(test_MultinomialSampler)
endif()
function(gserver_test TARGET)
add_unittest_without_exec(${TARGET}
${TARGET}.cpp
@ -49,7 +52,7 @@ if(WITH_PYTHON)
endif()
############### test_WarpCTCLayer #######################
if(NOT WITH_DOUBLE)
if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
add_unittest_without_exec(test_WarpCTCLayer
test_WarpCTCLayer.cpp)

@ -1902,5 +1902,52 @@ void BaseMatrixT<real>::sumOfProducts(BaseMatrixT& b,
}
template class BaseMatrixT<real>;
#ifndef PADDLE_MOBILE_INFERENCE
template class BaseMatrixT<int>;
#else
template <>
void BaseMatrixT<int>::zero() {
applyUnary(unary::Zero<int>());
}
template <>
void BaseMatrixT<int>::assign(int p) {
applyUnary(unary::Assign<int>(p));
}
template <>
void BaseMatrixT<int>::isEqualTo(BaseMatrixT& b, int value) {
applyBinary(binary::IsEqual<int>(value), b);
}
template <>
void BaseMatrixT<int>::neg() {
applyUnary(unary::Neg<int>());
}
template <>
void BaseMatrixT<int>::abs2() {
applyUnary(unary::Abs<int>());
}
template <>
void BaseMatrixT<int>::add(int p) {
applyUnary(unary::Add<int>(p));
}
template <>
void BaseMatrixT<int>::add(int p1, int p2) {
applyUnary(unary::Add2<int>(p1, p2));
}
template <>
void BaseMatrixT<int>::applyL1(int learningRate, int decayRate) {
applyUnary(unary::ApplyL1<int>(learningRate * decayRate));
}
#endif
} // namespace paddle

@ -25,6 +25,19 @@ else()
message(STATUS "Compile with MKLDNNMatrix")
endif()
if(MOBILE_INFERENCE)
list(REMOVE_ITEM MATH_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
# Remove sparse
list(REMOVE_ITEM MATH_HEADERS
${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h
${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.h
${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.h)
list(REMOVE_ITEM MATH_SOURCES
${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.cpp
${CMAKE_CURRENT_SOURCE_DIR}/SparseMatrix.cpp
${CMAKE_CURRENT_SOURCE_DIR}/SparseRowMatrix.cpp)
endif()
set(MATH_SOURCES
"${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
"${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"

@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_INFERENCE
#include <cstddef>
#include "Matrix.h"
@ -309,3 +312,57 @@ private:
using Matrix::subMatrix;
};
} // namespace paddle
#else
#include "Matrix.h"
namespace paddle {
class CpuSparseMatrix : public Matrix {
public:
CpuSparseMatrix(size_t height,
size_t width,
size_t nnz, /* used to allocate space */
SparseValueType valueType = FLOAT_VALUE,
SparseFormat format = SPARSE_CSR,
bool trans = false)
: Matrix(NULL, height, width, trans, false) {}
CpuSparseMatrix(real* data,
int* rows,
int* cols,
size_t height,
size_t width,
size_t nnz,
SparseValueType valueType,
SparseFormat format,
bool trans)
: Matrix(NULL, height, width, trans, false) {}
real* getValue() const { return nullptr; }
size_t getColStartIdx(size_t i) const { return 0; }
size_t getRowStartIdx(size_t i) const { return 0; }
size_t getColNum(size_t i) const { return 0; }
int* getRowCols(size_t i) const { return nullptr; }
CpuSparseMatrixPtr getTmpSparseMatrix(size_t height, size_t width) {
return nullptr;
}
void resize(size_t newHeight,
size_t newWidth,
size_t newNnz, /* used to allocate space */
SparseValueType valueType,
SparseFormat format) {}
void resize(size_t newHeight, size_t newWidth) {}
MatrixPtr getTranspose() { return nullptr; }
void setRow(size_t row,
size_t colNum,
const unsigned int* cols,
const real* values) {}
};
} // namespace paddle
#endif

@ -451,6 +451,7 @@ void GpuMatrix::addSharedBias(Matrix& b, real scale) {
}
void GpuMatrix::collectBias(Matrix& a, real scale) {
#ifdef PADDLE_WITH_CUDA
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth());
GpuSparseMatrix* sMatPtr = dynamic_cast<GpuSparseMatrix*>(&a);
@ -461,6 +462,7 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
hl_sparse_matrix_s A_d = sMatPtr->sMatrix_.get();
hl_sparse_matrix_column_sum(data, A_d, sMatPtr->getHeight(), width_, scale);
}
#endif
}
void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
@ -552,6 +554,7 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
const GpuMatrix& b,
real scaleAB,
real scaleT) {
#ifdef PADDLE_WITH_CUDA
CHECK(isContiguous());
CHECK(b.isContiguous());
CHECK(b.useGpu_ == true) << "Matrix type are not equal";
@ -578,12 +581,14 @@ void GpuMatrix::mul(const GpuSparseMatrix& a,
b.height_,
scaleAB,
scaleT);
#endif
}
void GpuMatrix::mul(const GpuMatrix& a,
const GpuSparseMatrix& b,
real scaleAB,
real scaleT) {
#ifdef PADDLE_WITH_CUDA
CHECK(isContiguous());
CHECK(a.isContiguous());
CHECK(a.useGpu_ == true) << "Matrix type are not equal";
@ -622,6 +627,7 @@ void GpuMatrix::mul(const GpuMatrix& a,
scaleAB,
scaleT);
}
#endif
}
/* this = a*b */
@ -1557,6 +1563,7 @@ void GpuMatrix::bilinearBackward(const Matrix& out,
}
void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
#ifdef PADDLE_WITH_CUDA
GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
@ -1572,9 +1579,11 @@ void GpuMatrix::multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) {
hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
hl_matrix_multi_binary_cross_entropy(
output_d, entropy_d, mat_d, height_, outputPtr->width_);
#endif
}
void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
#ifdef PADDLE_WITH_CUDA
GpuMatrix* outputPtr = dynamic_cast<GpuMatrix*>(&output);
auto labelPtr = dynamic_cast<GpuSparseMatrix*>(&label);
@ -1590,6 +1599,7 @@ void GpuMatrix::multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) {
hl_sparse_matrix_s mat_d = labelPtr->sMatrix_.get();
hl_matrix_multi_binary_cross_entropy_bp(
output_d, grad_d, mat_d, height_, width_);
#endif
}
void GpuMatrix::vol2Col(real* dataSrc,
@ -3255,6 +3265,7 @@ template void CpuMatrix::mul<CpuMatrix, CacheRowCpuMatrix>(CpuSparseMatrix* a,
real scaleAB,
real scaleT);
#ifndef PADDLE_MOBILE_INFERENCE
void SharedCpuMatrix::mul(CpuSparseMatrix* a,
CpuMatrix* b,
real scaleAB,
@ -3383,6 +3394,7 @@ void SharedCpuMatrix::initBlock(int blockNum) {
}
}
#endif
/* Add a (column) vector b to matrix a, column by column */
void CpuMatrix::addColumnVector(const Matrix& b) {
BaseMatrix::addColVector(const_cast<Matrix&>(b));

@ -2070,6 +2070,7 @@ public:
class SharedCpuMatrix : public CpuMatrix {
public:
#ifndef PADDLE_MOBILE_INFERENCE
/* blockNum is number of partitions of the matrix */
SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false)
: CpuMatrix(height, width, trans) {
@ -2115,6 +2116,7 @@ private:
ThreadLocal<CpuMatrixPtr> localBuf_;
ThreadLocal<std::vector<int>> localBufRows_;
ThreadLocal<std::vector<int>> blockSeq_;
#endif
};
typedef struct { unsigned int col; } sparse_non_value_t;

@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_INFERENCE
#include <cstddef>
#include "CpuSparseMatrix.h"
#include "Matrix.h"
@ -237,3 +240,47 @@ private:
};
} // namespace paddle
#else
#include "CpuSparseMatrix.h"
namespace paddle {
class GpuSparseMatrix : public Matrix {
public:
GpuSparseMatrix(size_t height,
size_t width,
size_t nnz, /* used to allocate space */
SparseValueType valueType = FLOAT_VALUE,
SparseFormat format_ = SPARSE_CSR,
bool trans = false)
: Matrix(NULL, height, width, trans, false) {}
GpuSparseMatrix(real* value,
int* rows,
int* cols,
size_t height,
size_t width,
size_t nnz,
SparseValueType valueType,
SparseFormat format,
bool trans)
: Matrix(NULL, height, width, trans, true) {}
void resize(size_t newHeight,
size_t newWidth,
size_t newNnz, /* used to allocate space */
SparseValueType valueType,
SparseFormat format) {}
void resize(size_t newHeight, size_t newWidth) {}
MatrixPtr getTranspose() { return nullptr; }
void setRow(size_t row,
size_t colNum,
const unsigned int* cols,
const real* values) {}
};
} // namespace paddle
#endif

@ -14,6 +14,8 @@ limitations under the License. */
#pragma once
#ifndef PADDLE_MOBILE_INFERENCE
#include <gflags/gflags.h>
#include <string.h>
#include <algorithm>
@ -313,3 +315,27 @@ private:
};
} // namespace paddle
#else
namespace paddle {
class SparseRowCpuMatrix : public CpuMatrix {
public:
void reserveStore() {}
void clearIndices() {}
};
class SparsePrefetchRowCpuMatrix : public SparseRowCpuMatrix {
public:
void setupIndices() {}
void addRows(MatrixPtr input) {}
void addRows(IVectorPtr ids) {}
};
class SparseAutoGrowRowCpuMatrix : public SparseRowCpuMatrix {};
class CacheRowCpuMatrix : public SparseAutoGrowRowCpuMatrix {};
class SparseRowIdsCpuMatrix : public CpuMatrix {};
} // namespace paddle
#endif

@ -3,8 +3,10 @@
add_simple_unittest(test_ExecViaCpu)
add_simple_unittest(test_SIMDFunctions)
add_simple_unittest(test_TrainingAlgorithm)
add_simple_unittest(test_SparseMatrix)
add_simple_unittest(test_RowBuffer)
if(NOT MOBILE_INFERENCE)
add_simple_unittest(test_SparseMatrix)
endif()
# TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
add_unittest(test_matrixCompare

@ -30,6 +30,10 @@ class AccuracyOp : public framework::OperatorWithKernel {
"Input (Label) of accuracy op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Accuracy"),
"Output (Accuracy) of AccuracyOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Correct"),
"Output (Correct) of AccuracyOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Total"),
"Output (Total) of AccuracyOp should not be null.");
auto inference_dim = ctx->GetInputDim("Out");
auto label_dim = ctx->GetInputDim("Label");
@ -43,6 +47,8 @@ class AccuracyOp : public framework::OperatorWithKernel {
" the same as label.");
ctx->SetOutputDim("Accuracy", {1});
ctx->SetOutputDim("Correct", {1});
ctx->SetOutputDim("Total", {1});
ctx->ShareLoD("Out", /*->*/ "Accuracy");
}
@ -66,6 +72,8 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", "Label of the training data");
// TODO(typhoonzero): AddInput("Weight", ...
AddOutput("Accuracy", "The accuracy of current batch");
AddOutput("Correct", "The correct samples count of current batch");
AddOutput("Total", "The samples count of current batch");
AddComment(R"DOC(
Accuracy Operator.

@ -24,7 +24,8 @@ using platform::PADDLE_CUDA_NUM_THREADS;
template <int BlockSize>
__global__ void AccuracyCudaKernel(const int N, const int D,
const int64_t* Xdata,
const int64_t* labeldata, float* accuracy) {
const int64_t* labeldata, int* correct_data,
float* accuracy) {
int count = 0;
__shared__ int total[BlockSize];
@ -43,6 +44,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D,
// reduce the count with init value 0, and output accuracy.
int result = thrust::reduce(thrust::device, total, total + BlockSize, 0);
if (threadIdx.x == 0) {
*correct_data = result;
*accuracy = static_cast<float>(result) / static_cast<float>(N);
}
}
@ -56,31 +58,48 @@ class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
auto* inference = ctx.Input<Tensor>("Out");
auto* indices = ctx.Input<Tensor>("Indices");
auto* label = ctx.Input<Tensor>("Label");
auto* accuracy = ctx.Output<Tensor>("Accuracy");
auto* correct = ctx.Output<Tensor>("Correct");
auto* total = ctx.Output<Tensor>("Total");
// FIXME(typhoonzero): only support indices currently
// if add support for output values, how to detect the data type?
const int64_t* indices_data = indices->data<int64_t>();
const int64_t* label_data = label->data<int64_t>();
int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
int* total_data = total->mutable_data<int>(ctx.GetPlace());
float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
size_t num_samples = inference->dims()[0];
int num_samples = static_cast<int>(inference->dims()[0]);
size_t infer_width = inference->dims()[1];
PADDLE_ENFORCE(cudaMemset(accuracy_data, 0, sizeof(float)));
// cudaMemset((void**)&correct_data, 0, sizeof(float));
if (num_samples == 0) {
return;
}
cudaMemcpy(total_data, &num_samples, sizeof(int), cudaMemcpyHostToDevice);
AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
1, PADDLE_CUDA_NUM_THREADS, 0, ctx.cuda_device_context().stream()>>>(
num_samples, infer_width, indices_data, label_data, accuracy_data);
num_samples, infer_width, indices_data, label_data, correct_data,
accuracy_data);
int d_num_samples, d_num_correct;
float d_accuracy;
cudaMemcpy(&d_num_correct, correct_data, sizeof(int),
cudaMemcpyDeviceToHost);
cudaMemcpy(&d_num_samples, total_data, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(&d_accuracy, accuracy_data, sizeof(float),
cudaMemcpyDeviceToHost);
}
};
} // namespace operators
} // namespace paddle
// FIXME(typhoonzero): types of T is for infernece data.
// label data is always int
// FIXME(typhoonzero): types of T is for inference data.
// label data is always int64
REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
paddle::operators::AccuracyOpCUDAKernel<double>);

@ -29,7 +29,11 @@ class AccuracyKernel : public framework::OpKernel<T> {
auto* indices = ctx.Input<Tensor>("Indices");
auto* label = ctx.Input<Tensor>("Label");
auto* accuracy = ctx.Output<Tensor>("Accuracy");
auto* correct = ctx.Output<Tensor>("Correct");
auto* total = ctx.Output<Tensor>("Total");
int* correct_data = correct->mutable_data<int>(ctx.GetPlace());
int* total_data = total->mutable_data<int>(ctx.GetPlace());
float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
const int64_t* indices_data = indices->data<int64_t>();
@ -55,7 +59,8 @@ class AccuracyKernel : public framework::OpKernel<T> {
}
}
// FIXME(typhoonzero): we don't accumulate the accuracy for now.
*correct_data = num_correct;
*total_data = num_samples;
*accuracy_data =
static_cast<float>(num_correct) / static_cast<float>(num_samples);
}
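
With the new "Correct" and "Total" outputs, the Python side can accumulate pass-level accuracy across mini-batches, matching the Evaluator design above. A sketch only; the executor, reader, feeder and fetched-variable names are assumptions, not part of this patch:

```python
# Illustrative accumulation of the per-batch Correct/Total outputs.
pass_correct, pass_total = 0, 0
for data in train_reader():
    batch_correct, batch_total = exe.run(
        main_program,
        feed=feeder.feed(data),
        fetch_list=[correct_var, total_var])  # the op's "Correct" and "Total"
    pass_correct += int(batch_correct)
    pass_total += int(batch_total)

pass_accuracy = float(pass_correct) / float(pass_total)
```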

@ -94,5 +94,13 @@ class CompareOp : public framework::OperatorWithKernel {
REGISTER_LOGICAL_OP(less_than, "Out = X < Y");
REGISTER_LOGICAL_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_OP(less_equal, "Out = X <= Y");
REGISTER_LOGICAL_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor);
REGISTER_LOGICAL_OP(greater_than, "Out = X > Y");
REGISTER_LOGICAL_KERNEL(greater_than, CPU,
paddle::operators::GreaterThanFunctor);
REGISTER_LOGICAL_OP(greater_equal, "Out = X >= Y");
REGISTER_LOGICAL_KERNEL(greater_equal, CPU,
paddle::operators::GreaterEqualFunctor);
REGISTER_LOGICAL_OP(equal, "Out = X == Y");
REGISTER_LOGICAL_KERNEL(equal, CPU, paddle::operators::EqualFunctor);

@ -15,4 +15,9 @@
#include "paddle/operators/compare_op.h"
REGISTER_LOGICAL_KERNEL(less_than, GPU, paddle::operators::LessThanFunctor);
REGISTER_LOGICAL_KERNEL(less_equal, GPU, paddle::operators::LessEqualFunctor);
REGISTER_LOGICAL_KERNEL(greater_than, GPU,
paddle::operators::GreaterThanFunctor);
REGISTER_LOGICAL_KERNEL(greater_equal, GPU,
paddle::operators::GreaterEqualFunctor);
REGISTER_LOGICAL_KERNEL(equal, GPU, paddle::operators::EqualFunctor);
