Resolve conflicts

8 years ago · 437c98d502
parent 8c9716f7d4 ecbff689fb
commit 437c98d502
43 changed files with 1606 additions and 775 deletions
--- a/demo/mnist/api_train.py
+++ b/demo/mnist/api_train.py
@ -13,15 +13,7 @@ import numpy as np
 import random
 from mnist_util import read_from_mnist
 from paddle.trainer_config_helpers import *
-
+import paddle.v2
 def optimizer_config():
    settings(
        learning_rate=1e-4,
        learning_method=AdamOptimizer(),
        batch_size=1000,
        model_average=ModelAverage(average_window=0.5),
        regularization=L2Regularization(rate=0.5))
 def network_config():
@ -75,19 +67,23 @@ def input_order_converter(generator):
 def main():
    api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
-    # get enable_types for each optimizer.
+    optimizer = paddle.v2.optimizer.Adam(
-    # enable_types = [value, gradient, momentum, etc]
+        learning_rate=1e-4,
-    # For each optimizer(SGD, Adam), GradientMachine should enable different
+        batch_size=1000,
-    # buffers.
+        model_average=ModelAverage(average_window=0.5),
-    opt_config_proto = parse_optimizer_config(optimizer_config)
+        regularization=L2Regularization(rate=0.5))
-    opt_config = api.OptimizationConfig.createFromProto(opt_config_proto)
+
-    _temp_optimizer_ = api.ParameterOptimizer.create(opt_config)
+    # Create Local Updater. Local means not run in cluster.
-    enable_types = _temp_optimizer_.getParameterTypes()
+    # For a cluster training, here we can change to createRemoteUpdater
    # in future.
    updater = optimizer.create_local_updater()
    assert isinstance(updater, api.ParameterUpdater)
    # Create Simple Gradient Machine.
    model_config = parse_network_config(network_config)
-    m = api.GradientMachine.createFromConfigProto(
+    m = api.GradientMachine.createFromConfigProto(model_config,
-        model_config, api.CREATE_MODE_NORMAL, enable_types)
+                                                  api.CREATE_MODE_NORMAL,
                                                  optimizer.enable_types())
    # This type check is not useful. Only enable type hint in IDE.
    # Such as PyCharm
@ -96,12 +92,6 @@ def main():
    # Initialize Parameter by numpy.
    init_parameter(network=m)
    # Create Local Updater. Local means not run in cluster.
    # For a cluster training, here we can change to createRemoteUpdater
    # in future.
    updater = api.ParameterUpdater.createLocalUpdater(opt_config)
    assert isinstance(updater, api.ParameterUpdater)
    # Initialize ParameterUpdater.
    updater.init(m)
--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@ -127,11 +127,6 @@
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 <tr>
 <td class="left">allow_inefficient_sparse_update</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 <tr>
 <td class="left">start_pass</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
--- a/doc/howto/usage/cmd_parameter/arguments_en.md
+++ b/doc/howto/usage/cmd_parameter/arguments_en.md
@ -127,11 +127,6 @@ It looks like there are a lot of arguments. However, most of them are for develo
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 <tr>
 <td class="left">allow_inefficient_sparse_update</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 <tr>
 <td class="left">start_pass</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
--- a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
@ -306,10 +306,6 @@
  - 指示是否显示参数服务器上的稀疏参数分布的日志细节.
  - 类型: bool (默认: 0).
 * `--allow_inefficient_sparse_update`
  - 指示是否允许低效率的稀疏更新.
  - 类型: bool (默认: 0).
 * `--check_sparse_distribution_batches`
  - 每运行多少个批次执行一次稀疏参数分布的检查.
  - 类型: int32 (默认: 100).
--- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@ -310,10 +310,6 @@
  - show log details for sparse parameter distribution in pserver.
  - type: bool (default: 0).
 * `--allow_inefficient_sparse_update`
  - Whether to allow inefficient sparse update.
  - type: bool (default: 0).
 * `--check_sparse_distribution_batches`
  - Run the sparse parameter distribution check once every this many batches.
  - type: int32 (default: 100).
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
--- a/doc/howto/usage/k8s/src/add_security_group.png
+++ b/doc/howto/usage/k8s/src/add_security_group.png
--- a/doc/howto/usage/k8s/src/create_efs.png
+++ b/doc/howto/usage/k8s/src/create_efs.png
--- a/doc/howto/usage/k8s/src/job.yaml
+++ b/doc/howto/usage/k8s/src/job.yaml
@ -1,43 +0,0 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
  name: paddle-cluster-job
 spec:
  parallelism: 3
  completions: 3
  template:
    metadata:
      name: paddle-cluster-job
    spec:
      volumes:
      - name: jobpath
        hostPath: 
          path: /home/work/paddle_output              
      containers:
      - name: trainer
        image: registry.baidu.com/public/paddle:mypaddle
        command: ["bin/bash",  "-c", "/root/start.sh"]        
        env:
        - name: JOB_NAME
          value: paddle-cluster-job
        - name: JOB_PATH
          value: /home/jobpath     
        - name: JOB_NAMESPACE
          value: default         
        - name: TRAIN_CONFIG_DIR
          value: recommendation
        - name: CONF_PADDLE_NIC
          value: eth0  
        - name: CONF_PADDLE_PORT
          value: "7164"
        - name: CONF_PADDLE_PORTS_NUM
          value: "2"     
        - name: CONF_PADDLE_PORTS_NUM_SPARSE
          value: "2"  
        - name: CONF_PADDLE_GRADIENT_NUM
          value: "3"                                                               
        volumeMounts:
        - name: jobpath
          mountPath: /home/jobpath       
      restartPolicy: Never
--- a/doc/howto/usage/k8s/src/k8s_data/Dockerfile
+++ b/doc/howto/usage/k8s/src/k8s_data/Dockerfile
@ -0,0 +1,7 @@
 # Image whose entry point is get_data.sh, the data preparation script.
 FROM alpine
 # coreutils is installed on top of the base image; presumably for the GNU
 # `split`/`seq` behavior get_data.sh relies on -- TODO confirm.
 RUN apk update && apk upgrade && apk add coreutils
 # Ship the quick_start demo directory and the preparation script in the image.
 ADD quick_start /quick_start
 ADD get_data.sh /bin/
 RUN chmod +x /bin/get_data.sh
 # Running the container runs the preparation script.
 ENTRYPOINT ["/bin/get_data.sh"]
--- a/doc/howto/usage/k8s/src/k8s_data/README.md
+++ b/doc/howto/usage/k8s/src/k8s_data/README.md
@ -0,0 +1,6 @@
 To build the PaddlePaddle data preparation image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following commands:
 ```
 cp -r ../../../../../../demo/quick_start .
 docker build . -t prepare-data-image-name
 ```
--- a/doc/howto/usage/k8s/src/k8s_data/get_data.sh
+++ b/doc/howto/usage/k8s/src/k8s_data/get_data.sh
@ -0,0 +1,26 @@
 #!/bin/sh
 # Prepare the quick_start demo data for distributed training.
 #
 # Environment:
 #   OUT_DIR      directory that receives the quick_start demo and the data
 #   SPLIT_COUNT  number of pieces train.txt is split into
 #
 # Result: $OUT_DIR/quick_start plus $OUT_DIR/<i>/data for i in
 # 0..SPLIT_COUNT-1, where <i>/data/train.txt is the i-th piece of train.txt.
 set -e
 # Fail early with a clear message instead of running mkdir/cd on an empty path.
 out_dir=${OUT_DIR:?OUT_DIR must be set}
 split_count=${SPLIT_COUNT:?SPLIT_COUNT must be set}
 mkdir -p "$out_dir"
 cp -r /quick_start "$out_dir/"
 mkdir -p "$out_dir/0/data"
 cd "$out_dir/0/data"
 wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
 tar zxvf preprocessed_data.tar.gz
 rm preprocessed_data.tar.gz
 # Split by lines into train.00000, train.00001, ...; directory 0 keeps the
 # first piece as its train.txt.
 split -d --number="l/$split_count" -a 5 train.txt train.
 mv train.00000 train.txt
 cd "$out_dir"
 # Use shell arithmetic rather than expr: expr exits with status 1 when the
 # result is 0, which would abort the script under `set -e` for SPLIT_COUNT=1.
 end=$((split_count - 1))
 for i in $(seq 1 "$end"); do
    mkdir -p "$i/data"
    cp -r 0/data/* "$i/data"
    # Promote this directory's own piece to train.txt.
    mv "$i/data/train.$(printf %05d "$i")" "$i/data/train.txt"
 done
--- a/doc/howto/usage/k8s/src/k8s_train/Dockerfile
+++ b/doc/howto/usage/k8s/src/k8s_train/Dockerfile
@ -0,0 +1,6 @@
 # Training image: the PaddlePaddle CPU base image plus the start-up scripts.
 FROM paddledev/paddle:cpu-latest
 COPY start.sh /root/
 COPY start_paddle.py /root/
 RUN chmod +x /root/start.sh
 # Exec-form arguments are passed verbatim, so the option must be exactly "-c";
 # with a leading space bash would treat it as a script file name.
 CMD ["bash", "-c", "/root/start.sh"]
--- a/doc/howto/usage/k8s/src/k8s_train/README.md
+++ b/doc/howto/usage/k8s/src/k8s_train/README.md
@ -0,0 +1,5 @@
 To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following command:
 ```
 docker build . -t train-image-name
 ```
--- a/doc/howto/usage/k8s/src/k8s_train/start.sh
+++ b/doc/howto/usage/k8s/src/k8s_train/start.sh
@ -1,19 +1,19 @@
 #!/bin/sh
 set -eu
 jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
 cd /root
-cp -rf $jobconfig .
+cp -rf $jobconfig/* .
 cd $TRAIN_CONFIG_DIR
 python /root/start_paddle.py \
  --dot_period=10 \
-  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM \
+  --ports_num=$CONF_PADDLE_PORTS_NUM \
  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
  --log_period=50 \
  --num_passes=10 \
-  --trainer_count=4 \
+  --trainer_count=$TRAINER_COUNT \
  --saving_period=1 \
  --local=0 \
-  --config=./trainer_config.py \
+  --config=trainer_config.lr.py \
  --use_gpu=0
--- a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
+++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
@ -23,7 +23,6 @@ import argparse
 API = "/api/v1/namespaces/"
 JOBSELECTOR = "labelSelector=job-name="
 JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
 JOB_PATH_DATA = JOB_PATH + "/data"
 JOB_PATH_OUTPUT = JOB_PATH + "/output"
 JOBNAME = os.getenv("JOB_NAME")
 NAMESPACE = os.getenv("JOB_NAMESPACE")
@ -33,6 +32,8 @@ PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
 PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
 PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
 tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'
 def refine_unknown_args(cmd_args):
    '''
@ -64,6 +65,7 @@ def isPodAllRunning(podlist):
    for pod in podlist["items"]:
        if pod["status"]["phase"] == "Running":
            running += 1
    print "waiting for pods running, require:", require, "running:", running
    if require == running:
        return True
    return False
@ -79,8 +81,17 @@ def getPodList():
    pod = API + NAMESPACE + "/pods?"
    job = JOBNAME
-    return requests.get(apiserver + pod + JOBSELECTOR + job,
+    if os.path.isfile(tokenpath):
-                        verify=False).json()
+        tokenfile = open(tokenpath, mode='r')
        token = tokenfile.read()
        Bearer = "Bearer " + token
        headers = {"Authorization": Bearer}
        return requests.get(apiserver + pod + JOBSELECTOR + job,
                            headers=headers,
                            verify=False).json()
    else:
        return requests.get(apiserver + pod + JOBSELECTOR + job,
                            verify=False).json()
 def getIdMap(podlist):
@ -122,8 +133,8 @@ def startPaddle(idMap={}, train_args_dict=None):
    if not os.path.exists(JOB_PATH_OUTPUT):
        os.makedirs(JOB_PATH_OUTPUT)
    os.mkdir(logDir)
-    copyCommand = 'cp -rf ' + JOB_PATH_DATA + \
+    copyCommand = 'cp -rf ' + JOB_PATH + \
-        "/" + str(trainerId) + " ./data"
+        "/" + str(trainerId) + "/data/*" + " ./data/"
    os.system(copyCommand)
    startPserver = 'nohup paddle pserver' + \
        " --port=" + str(PADDLE_PORT) + \
@ -136,9 +147,9 @@ def startPaddle(idMap={}, train_args_dict=None):
    print startPserver
    os.system(startPserver)
    # wait until pservers completely start
-    time.sleep(10)
+    time.sleep(20)
-    startTrainer = program + args + " > " + \
+    startTrainer = program + args + " 2>&1 | tee " + \
-        logDir + "/train.log 2>&1 < /dev/null"
+        logDir + "/train.log"
    print startTrainer
    os.system(startTrainer)
@ -152,7 +163,7 @@ if __name__ == '__main__':
    podlist = getPodList()
    # need to wait until all pods are running
    while not isPodAllRunning(podlist):
-        time.sleep(10)
+        time.sleep(20)
        podlist = getPodList()
    idMap = getIdMap(podlist)
    startPaddle(idMap, train_args_dict)
--- a/doc/howto/usage/k8s/src/worker_security_group.png
+++ b/doc/howto/usage/k8s/src/worker_security_group.png
--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
@ -32,14 +32,20 @@ const SparseMatrixArg& BufferArg::sparse() const {
 SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
    : BufferArg(sparse, argType),
      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
      nnz_(sparse.getElementCnt()),
      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
      type_(static_cast<SparseDataType>(sparse.getValueType())) {
  bufferType_ = TENSOR_SPARSE;
 }
 SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
    : BufferArg(sparse, argType),
      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
      nnz_(sparse.getElementCnt()),
      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
      type_(static_cast<SparseDataType>(sparse.getValueType())) {
  bufferType_ = TENSOR_SPARSE;
 }
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@ -30,13 +30,6 @@ enum BufferType {
  TENSOR_SPARSE = 4
 };
 enum SparseDataType {
  SPARSE_NO_VALUE = 0,  // do not need value pointer, all values are 1
  SPARSE_FLOAT_VALUE = 1
 };
 enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
 class BufferArg;
 class SequenceArg;
 class SparseMatrixArg;
@ -79,19 +72,21 @@ public:
  BufferArg(ValueType valueType,
            const TensorShape& shape,
            ArgType argType = UNSPECIFIED)
-      : buf_(nullptr),
+      : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) {
-        valueType_(valueType),
+    bufferType_ = TENSOR_NORMAL;
-        shape_(shape),
+  }
        argType_(argType) {}
  BufferArg(void* buf,
            ValueType valueType,
            const TensorShape& shape,
            ArgType argType = UNSPECIFIED)
-      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {
    bufferType_ = TENSOR_NORMAL;
  }
-  BufferArg(void* buf, ValueType valueType)
+  BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {
-      : buf_(buf), valueType_(valueType) {}
+    bufferType_ = TENSOR_NORMAL;
  }
  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
      : buf_(
@ -167,8 +162,9 @@ public:
  ValueType valueType() const { return valueType_; }
  BufferType bufferType() const { return bufferType_; }
  const TensorShape& shape() const { return shape_; }
-  bool isSparse() const { return (TENSOR_SPARSE == bufferType_); }
+  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
  virtual size_t numElements() const { return shape_.getElements(); }
  const SequenceArg& sequence() const;
  const SparseMatrixArg& sparse() const;
@ -179,6 +175,7 @@ protected:
  TensorShape shape_;
  BufferType bufferType_{TENSOR_UNKNOWN};
  ArgType argType_{UNSPECIFIED};
  // TODO(tianbing), add deviceType_
  // leading dimensions. The size is dims_.size()
  // Dims lds_;
 };
@ -191,6 +188,7 @@ class SequenceIdArg : public BufferArg {
 public:
  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
    bufferType_ = TENSOR_SEQUENCE_ID;
    CHECK_EQ(shape_.ndims(), (size_t)1);
    CHECK_GT(shape_[0], 1);
    numSeqs_ = shape_[0] - 1;
@ -228,7 +226,9 @@ public:
  SequenceArg(ValueType valueType,
              const TensorShape& shape,
              ArgType argType = UNSPECIFIED)
-      : BufferArg(valueType, shape, argType), startPositions_(TensorShape()) {}
+      : BufferArg(valueType, shape, argType), startPositions_(TensorShape()) {
    bufferType_ = TENSOR_SEQUENCE_DATA;
  }
  SequenceArg(void* buf,
              ValueType valueType,
@ -269,31 +269,75 @@ public:
                  const BufferArg& row,
                  const BufferArg& col,
                  size_t nnz,
-                  SparseDataFormat format,
+                  SparseFormat format,
-                  SparseDataType type,
+                  SparseValueType type,
                  ArgType argType = UNSPECIFIED)
      : BufferArg(buf, valueType, shape, argType),
        row_(row),
        col_(col),
        nnz_(nnz),
-        format_(format),
+        format_(static_cast<SparseDataFormat>(format)),
-        type_(type) {
+        type_(static_cast<SparseDataType>(type)) {
    bufferType_ = TENSOR_SPARSE;
    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
    CHECK_EQ(shape_.ndims(), (size_t)2);
    CHECK_EQ(row_.shape().ndims(), (size_t)1);
    CHECK_EQ(col_.shape().ndims(), (size_t)1);
-    if (format == SPARSE_CSR_FORMAT) {
+    if (format_ == T_SPARSE_CSR) {
      CHECK_EQ(nnz, col.shape()[0]);
-    } else if (format == SPARSE_CSC_FORMAT) {
+    } else if (format_ == T_SPARSE_CSC) {
      CHECK_EQ(nnz, row.shape()[0]);
    }
  }
  /// Constructs a sparse argument that carries shape information only:
  /// the value buffer is null (the BufferArg(valueType, shape, argType) base
  /// constructor sets buf_ to nullptr) and the row/col index arguments are
  /// created without a buffer.
  /// Requires valueType to be float or double and shape to be 2-dimensional.
  SparseMatrixArg(ValueType valueType,
                  const TensorShape& shape,
                  size_t nnz,
                  SparseFormat format,
                  SparseValueType type,
                  ArgType argType = UNSPECIFIED)
      : BufferArg(valueType, shape, argType),
        row_(BufferArg(nullptr, VALUE_TYPE_INT32)),
        col_(BufferArg(nullptr, VALUE_TYPE_INT32)),
        nnz_(nnz),
        format_(static_cast<SparseDataFormat>(format)),
        type_(static_cast<SparseDataType>(type)) {
    bufferType_ = TENSOR_SPARSE;
    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
    CHECK_EQ(shape_.ndims(), (size_t)2);
    // row_ and col_ were initialized as placeholders above; they are replaced
    // here with shape-only arguments whose length depends on the format.
    /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr
    row_ = (format_ == T_SPARSE_CSR
                ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1})
                : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}));
    /// len of col_ :  width + 1 (CSC) or nnz (CSR), buf_ == nullptr
    col_ = (format_ == T_SPARSE_CSR
                ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})
                : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1}));
  }
  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
  /// Builds a Tensor<real, DType>::SparseMatrix from this argument's value
  /// buffer, row/col index buffers, dimensions, nnz, value type and format.
  /// Requires a non-null value buffer, a value type matching `real`, and a
  /// 2-dimensional shape.
  template <DeviceType DType>
  typename Tensor<real, DType>::SparseMatrix SparseMatrix() const {
    CHECK(buf_);
    CHECK(valueType_ == DataType<real>::value);
    // CHECK(deviceType_ == DType);
    CHECK_EQ(2, shape_.ndims());
    // NOTE(review): the trailing `false` is presumably the transpose flag --
    // confirm against the SparseMatrix constructor.
    return typename Tensor<real, DType>::SparseMatrix(
        reinterpret_cast<real*>(buf_),
        reinterpret_cast<int*>(row_.data()),
        reinterpret_cast<int*>(col_.data()),
        shape_[0],
        shape_[1],
        nnz_,
        static_cast<SparseValueType>(type_),
        static_cast<SparseFormat>(format_),
        false);
  }
  ~SparseMatrixArg() {}
  void* getRowBuf() const { return row_.data(); }
@ -302,6 +346,8 @@ public:
  size_t nnz() const { return nnz_; }
  size_t numElements() const override { return nnz_; }
  SparseDataFormat dataFormat() const { return format_; }
  SparseDataType dataType() const { return type_; }
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@ -26,6 +26,7 @@ if(WITH_TESTING)
    add_simple_unittest(FunctionTest)
    add_simple_unittest(ContextProjectionOpTest)
    add_simple_unittest(PadOpTest)
    add_simple_unittest(MulOpTest)
 endif()
 endif()
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@ -162,38 +162,64 @@ template <DeviceType Device>
 class CrossMapNormalFunc : public FunctionBase {
 public:
  void init(const FuncConfig& config) override {
    // function arguments
    size_ = config.get<size_t>("size");
    scale_ = config.get<real>("scale");
    pow_ = config.get<real>("pow");
    // number of inputs and outputs
    numInputs_ = 1;
    numOutputs_ = 2;
  }
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)1, inputs.size());
+    check(inputs, outputs);
-    CHECK_EQ((size_t)2, outputs.size());
+    // ArgType check still on here,
-
+    // not sure whether it is better to put inside the check.
    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
    CHECK(inputs[0].shape() == outputs[0].shape());
    CHECK(inputs[0].shape() == outputs[1].shape());
    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
-    size_t samples = inputs[0].shape()[0];
+    size_t batchSize = inputs[0].shape()[0];
-    size_t channels = inputs[0].shape()[1];
+    size_t maps = inputs[0].shape()[1];
-    size_t height = inputs[0].shape()[2];
+    size_t rows = inputs[0].shape()[2];
-    size_t width = inputs[0].shape()[3];
+    size_t columns = inputs[0].shape()[3];
    CrossMapNormal<Device>(outputs[0].data<real>(),
                           outputs[1].data<real>(),
                           inputs[0].data<real>(),
-                           samples,
+                           batchSize,
-                           channels,
+                           maps,
-                           height,
+                           rows,
-                           width,
+                           columns,
                           size_,
                           scale_,
                           pow_);
  }
  // Validates the argument lists: the number of inputs and outputs must equal
  // numInputs_ and numOutputs_ (set in init), the input must be 4-dimensional,
  // and both outputs must have the same shape as the input.
  // Any violation trips the corresponding CHECK.
  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
    CHECK(inputs[0].shape() == outputs[0].shape());
    CHECK(inputs[0].shape() == outputs[1].shape());
  }
  // Estimates the number of floating-point operations of this function.
  // Only the shape of inputs[0] is read; outputs is not used.
  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ((size_t)numInputs_, inputs.size());
    const TensorShape& dims = inputs[0].shape();
    // Read the four dimensions one statement at a time, in index order.
    const size_t n = dims[0];
    const size_t c = dims[1];
    const size_t h = dims[2];
    const size_t w = dims[3];
    // Approximate value: (size_ * 2 + 3) operations per input element.
    return n * c * h * w * (size_ * 2 + 3);
  }
 private:
  size_t size_;
  real scale_;
@ -236,21 +262,18 @@ template <DeviceType Device>
 class CrossMapNormalGradFunc : public FunctionBase {
 public:
  void init(const FuncConfig& config) override {
    // function arguments
    size_ = config.get<size_t>("size");
    scale_ = config.get<real>("scale");
    pow_ = config.get<real>("pow");
    // number of inputs and outputs
    numInputs_ = 4;
    numOutputs_ = 1;
  }
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)4, inputs.size());
+    check(inputs, outputs);
    CHECK_EQ((size_t)1, outputs.size());
    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
    CHECK(inputs[0].shape() == inputs[1].shape());
    CHECK(inputs[0].shape() == inputs[2].shape());
    CHECK(inputs[0].shape() == inputs[3].shape());
    CHECK(inputs[0].shape() == outputs[0].shape());
    if (outputs[0].getArgType() != ADD_TO) {
      // Currently, some algorithm implementations are ASSIGN_TO mode,
      // if need to support the ADD_TO calculation, need to clear the output.
@ -259,25 +282,52 @@ public:
      tmp.zero();
    }
-    size_t samples = inputs[0].shape()[0];
+    size_t batchSize = inputs[0].shape()[0];
-    size_t channels = inputs[0].shape()[1];
+    size_t maps = inputs[0].shape()[1];
-    size_t height = inputs[0].shape()[2];
+    size_t rows = inputs[0].shape()[2];
-    size_t width = inputs[0].shape()[3];
+    size_t columns = inputs[0].shape()[3];
    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
                               inputs[0].data<real>(),
                               inputs[1].data<real>(),
                               inputs[2].data<real>(),
                               inputs[3].data<real>(),
-                               samples,
+                               batchSize,
-                               channels,
+                               maps,
-                               height,
+                               rows,
-                               width,
+                               columns,
                               size_,
                               scale_,
                               pow_);
  }
  // Validates the argument lists: the number of inputs and outputs must equal
  // numInputs_ and numOutputs_ (set in init), inputs[0] must be 4-dimensional,
  // and the other three inputs and the output must have the same shape as
  // inputs[0]. Any violation trips the corresponding CHECK.
  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_EQ(numInputs_, inputs.size());
    CHECK_EQ(numOutputs_, outputs.size());
    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
    CHECK(inputs[0].shape() == inputs[1].shape());
    CHECK(inputs[0].shape() == inputs[2].shape());
    CHECK(inputs[0].shape() == inputs[3].shape());
    CHECK(inputs[0].shape() == outputs[0].shape());
  }
  // Estimates the number of floating-point operations of this function.
  // Only the shape of inputs[0] is read; outputs is not used.
  // NOTE(review): the check below requires more than one input although only
  // inputs[0] is read -- confirm whether CHECK_LE was intended.
  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
    CHECK_LT((size_t)1, inputs.size());
    const TensorShape& dims = inputs[0].shape();
    // Read the four dimensions one statement at a time, in index order.
    const size_t n = dims[0];
    const size_t c = dims[1];
    const size_t h = dims[2];
    const size_t w = dims[3];
    // Approximate value: (size_ * 4 + 2) operations per input element.
    return n * c * h * w * (size_ * 4 + 2);
  }
 private:
  size_t size_;
  real scale_;
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@ -153,7 +153,36 @@ public:
  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
  // Checks whether the BufferType and shape of the inputs and outputs
  // arguments of the Function are correct.
  // In general, the calc function calls this check to validate its arguments.
  // The caller can also check its own arguments before calling calc.
  virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {}
  // Calculate the number of floating-point operations of this Function.
  // The inputs and outputs arguments do not need to contain the actual data,
  // only the shape.
  // And some Functions have the same input and output shapes,
  // so you may not need to enter the complete number of arguments.
  // But entering the full arguments is always correct for this interface.
  virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) {
    return 0;
  }
  int getNumInputs() const { return numInputs_; }
  int getNumOutputs() const { return numOutputs_; }
  static ClassRegistrar<FunctionBase> funcRegistrar_;
 protected:
  // numInputs_ and numOutputs_ represent the maximum number of
  // inputs and outputs supported by the Function.
  // Some functions are optimized for input and output,
  // so when comparing the number of arguments for these functions, the
  // condition is inputs.size() <= numInputs_ or outputs.size() <= numOutputs_.
  size_t numInputs_;
  size_t numOutputs_;
 };
 #define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "Function.h"
-#include "paddle/math/Vector.h"
+#include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/math/tests/TensorCheck.h"
 #include "paddle/testing/TestUtil.h"
@ -69,7 +70,7 @@ public:
  }
  // output need only contains shape, do not contains data.
-  void addOutputs(const BufferArg& output) {
+  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
    size_t size =
        output.shape().getElements() * sizeOfValuType(output.valueType());
    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
@ -79,12 +80,40 @@ public:
        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
                                    output.valueType(),
                                    output.shape(),
-                                    ASSIGN_TO));
+                                    argType));
    gpuOutputs_.emplace_back(
        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
                                    output.valueType(),
                                    output.shape(),
-                                    ASSIGN_TO));
+                                    argType));
  }
  /// Adds a sparse output: creates CPU and GPU sparse matrices with the
  /// dimensions, nnz, value type and format of `output`, fills the CPU matrix
  /// with random values, copies it to the GPU matrix, and registers both as
  /// output arguments with the given argType.
  /// NOTE(review): cpuSparse_/gpuSparse_ are reassigned on every call;
  /// presumably an argument registered by an earlier call would then refer to
  /// released storage -- confirm only one sparse argument is added per test.
  void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
    const size_t height = output.shape()[0];
    const size_t width = output.shape()[1];
    const size_t nnz = output.nnz();
    const auto valueType = static_cast<SparseValueType>(output.dataType());
    const auto format = static_cast<SparseFormat>(output.dataFormat());
    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
        height, width, nnz, valueType, format);
    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
        height, width, nnz, valueType, format);
    /// randomize on the CPU, then mirror the same content to the GPU
    hl_stream_t stream(HPPL_STREAM_1);
    cpuSparse_->randomizeUniform();
    gpuSparse_->copyFrom(*cpuSparse_, stream);
    hl_stream_synchronize(stream);
    auto cpuArg = std::make_shared<SparseMatrixArg>(*cpuSparse_, argType);
    cpuOutputs_.emplace_back(cpuArg);
    auto gpuArg = std::make_shared<SparseMatrixArg>(*gpuSparse_, argType);
    gpuOutputs_.emplace_back(gpuArg);
  }
  void addInputs(const SequenceArg& input) {
@ -107,10 +136,36 @@ public:
    // TODO: need be implemented.
  }
  /// Adds a sparse input: creates CPU and GPU sparse matrices with the
  /// dimensions, nnz, value type and format of `input`, fills the CPU matrix
  /// with random values, copies it to the GPU matrix, and registers both as
  /// input arguments.
  /// NOTE(review): cpuSparse_/gpuSparse_ are reassigned on every call;
  /// presumably an argument registered by an earlier call would then refer to
  /// released storage -- confirm only one sparse argument is added per test.
  void addInputs(const SparseMatrixArg& input) {
    const size_t height = input.shape()[0];
    const size_t width = input.shape()[1];
    const size_t nnz = input.nnz();
    const auto valueType = static_cast<SparseValueType>(input.dataType());
    const auto format = static_cast<SparseFormat>(input.dataFormat());
    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
        height, width, nnz, valueType, format);
    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
        height, width, nnz, valueType, format);
    /// randomize on the CPU, then mirror the same content to the GPU
    hl_stream_t stream(HPPL_STREAM_1);
    cpuSparse_->randomizeUniform();
    gpuSparse_->copyFrom(*cpuSparse_, stream);
    hl_stream_synchronize(stream);
    auto cpuArg = std::make_shared<SparseMatrixArg>(*cpuSparse_);
    cpuInputs_.emplace_back(cpuArg);
    auto gpuArg = std::make_shared<SparseMatrixArg>(*gpuSparse_);
    gpuInputs_.emplace_back(gpuArg);
  }
  void run() {
    // prepare cpu/gpu arguments
    initInputs();
    initOutputs();
    // function calculate
    auto callFunction = [](FunctionBase* function,
                           std::vector<BufferArgPtr>& inputs,
@ -129,7 +184,7 @@ public:
    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
-    // check outputs and inouts
+    // check outputs
    compareOutputs();
  }
@ -140,6 +195,10 @@ public:
 protected:
  void initInputs() {
    for (size_t i = 0; i < cpuInputs_.size(); i++) {
      if (cpuInputs_[i]->isSparseArg()) {
        continue;  /// sparse matrix already init
      }
      initArg(*cpuInputs_[i]);
      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
@ -152,14 +211,32 @@ protected:
    }
  }
  void initOutputs() {
    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
      if (cpuOutputs_[i]->isSparseArg()) {
        continue;  /// sparse matrix already init
      }
      initArg(*cpuOutputs_[i]);
      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
      CpuVector cpuVector(cpuOutputs_[i]->shape().getElements(),
                          (real*)cpuOutputs_[i]->data());
      GpuVector gpuVector(gpuOutputs_[i]->shape().getElements(),
                          (real*)gpuOutputs_[i]->data());
      gpuVector.copyFrom(cpuVector);
    }
  }
  void compareOutputs() {
    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
      // TODO, Need a BufferCheck used to compare the two buffers.
-      auto cpu = cpuOutputs_[i];
+      const auto cpu = cpuOutputs_[i];
-      auto gpu = gpuOutputs_[i];
+      const auto gpu = gpuOutputs_[i];
-      CpuVector cpuVector(cpu->shape().getElements(), (real*)cpu->data());
+      CHECK_EQ(cpu->numElements(), gpu->numElements());
-      GpuVector gpuVector(cpu->shape().getElements(), (real*)gpu->data());
+      CpuVector cpuVector(cpu->numElements(), (real*)cpu->data());
-
+      GpuVector gpuVector(gpu->numElements(), (real*)gpu->data());
      autotest::TensorCheckErr(cpuVector, gpuVector);
    }
  }
@ -195,6 +272,8 @@ protected:
  std::vector<BufferArgPtr> cpuOutputs_;
  std::vector<BufferArgPtr> gpuInputs_;
  std::vector<BufferArgPtr> gpuOutputs_;
  std::shared_ptr<CpuSparseMatrix> cpuSparse_;
  std::shared_ptr<GpuSparseMatrix> gpuSparse_;
 };
 }  // namespace paddle
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
--- a/paddle/function/MulOp.h
+++ b/paddle/function/MulOp.h
@ -0,0 +1,102 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 #include "Function.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 namespace paddle {
 /// MulOp overload set: matrix multiplication declarations, one overload per
 /// supported combination of dense / sparse operands on CPU and on GPU.
 ///
 /// Only declarations live here; the definitions are not visible in this
 /// header. DType does not appear in any parameter type, so it cannot be
 /// deduced from the arguments and must be named explicitly at the call site,
 /// i.e. MulOp<...>(out, a, b, scaleAB, scaleT, aTrans, bTrans).
 ///
 /// Parameters common to every overload:
 ///   out      destination matrix, taken by non-const reference.
 ///   a, b     left and right operands, taken by const reference.
 ///   scaleAB  NOTE(review): presumably the factor applied to the product
 ///            a * b -- confirm against the definitions in MulOp.cpp.
 ///   scaleT   NOTE(review): presumably the factor applied to the existing
 ///            contents of out (which would explain the "(+)=" notation
 ///            below) -- confirm against MulOp.cpp.
 ///   aTrans   NOTE(review): presumably requests that a be used transposed
 ///            -- confirm.
 ///   bTrans   NOTE(review): presumably requests that b be used transposed
 ///            -- confirm.

 /// CPU, dense matrix (+)= dense matrix * dense matrix
 template <DeviceType DType>
 void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
           real scaleT,
           bool aTrans,
           bool bTrans);
 /// CPU, dense matrix (+)= sparse matrix * dense matrix
 /// (left operand `a` is a CpuSparseMatrix)
 template <DeviceType DType>
 void MulOp(CpuMatrix& out,
           const CpuSparseMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
           real scaleT,
           bool aTrans,
           bool bTrans);
 /// CPU, dense matrix (+)= dense matrix * sparse matrix
 /// (right operand `b` is a CpuSparseMatrix)
 template <DeviceType DType>
 void MulOp(CpuMatrix& out,
           const CpuMatrix& a,
           const CpuSparseMatrix& b,
           real scaleAB,
           real scaleT,
           bool aTrans,
           bool bTrans);
 /// CPU, sparse matrix (+)= dense matrix * dense matrix
 /// (destination `out` is a CpuSparseMatrix)
 template <DeviceType DType>
 void MulOp(CpuSparseMatrix& out,
           const CpuMatrix& a,
           const CpuMatrix& b,
           real scaleAB,
           real scaleT,
           bool aTrans,
           bool bTrans);
 /// GPU, dense matrix (+)= dense matrix * dense matrix
 template <DeviceType DType>
 void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
           real scaleT,
           bool aTrans,
           bool bTrans);
 /// GPU, dense matrix (+)= sparse matrix * dense matrix
 /// (left operand `a` is a GpuSparseMatrix)
 template <DeviceType DType>
 void MulOp(GpuMatrix& out,
           const GpuSparseMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
           real scaleT,
           bool aTrans,
           bool bTrans);
 /// GPU, dense matrix (+)= dense matrix * sparse matrix
 /// (right operand `b` is a GpuSparseMatrix)
 template <DeviceType DType>
 void MulOp(GpuMatrix& out,
           const GpuMatrix& a,
           const GpuSparseMatrix& b,
           real scaleAB,
           real scaleT,
           bool aTrans,
           bool bTrans);
 /// GPU, sparse matrix (+)= dense matrix * dense matrix
 /// (destination `out` is a GpuSparseMatrix)
 template <DeviceType DType>
 void MulOp(GpuSparseMatrix& out,
           const GpuMatrix& a,
           const GpuMatrix& b,
           real scaleAB,
           real scaleT,
           bool aTrans,
           bool bTrans);
 }  // namespace paddle
--- a/Show More
+++ b/Show More