From e3d4da2de3a442c85cecdde8fbc9407b54dba0f0 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Dec 2016 14:15:29 +0800 Subject: [PATCH 01/41] Add sum cost to Arguments --- paddle/api/Arguments.cpp | 4 ++++ paddle/api/PaddleAPI.h | 2 ++ paddle/api/test/testArguments.py | 2 ++ 3 files changed, 8 insertions(+) diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp index 0cafbd896e..41beed38a8 100644 --- a/paddle/api/Arguments.cpp +++ b/paddle/api/Arguments.cpp @@ -137,6 +137,10 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) { a.cpuSequenceDims = m->cast(vec->getSharedPtr()); } +float Arguments::sumCosts() const { + return paddle::Argument::sumCosts(m->outputs); +} + int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) { auto& a = m->getArg(idx); return a.getBatchSize(); diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 7521ff4c6c..155e3e3afe 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -454,6 +454,8 @@ public: IVector* vec) throw(RangeError); void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError); + float sumCosts() const; + private: static Arguments* createByPaddleArgumentVector(void* ptr); void* getInternalArgumentsPtr() const; diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py index 8cabecd242..a04a805d7a 100644 --- a/paddle/api/test/testArguments.py +++ b/paddle/api/test/testArguments.py @@ -22,6 +22,8 @@ class TestArguments(unittest.TestCase): args = swig_paddle.Arguments.createArguments(1) args.setSlotValue(0, m) + self.assertAlmostEqual(27.0, args.sumCosts()) + mat = args.getSlotValue(0) assert isinstance(mat, swig_paddle.Matrix) np_mat = mat.toNumpyMatInplace() From 8b833d5a8ada43ba8b049665d5c6161eeb0c5d65 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 26 Dec 2016 16:18:22 +0800 Subject: [PATCH 02/41] Add load/save method for Parameter --- paddle/api/PaddleAPI.h | 4 ++++ paddle/api/Parameter.cpp | 8 ++++++++ 
paddle/api/test/.gitignore | 6 ++++++ paddle/api/test/testGradientMachine.py | 4 ++++ 4 files changed, 22 insertions(+) create mode 100644 paddle/api/test/.gitignore diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 155e3e3afe..bc1b22e187 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -551,6 +551,10 @@ public: ParameterConfig* getConfig(); void setValueUpdated(); + bool save(const std::string& filename) const; + + bool load(const std::string& filename) const; + private: static Parameter* createFromRawPtr(void* ptr); static Parameter* createFromSharedPtr(void* ptr); diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp index 4eed00a84a..9cfa2e35f5 100644 --- a/paddle/api/Parameter.cpp +++ b/paddle/api/Parameter.cpp @@ -70,3 +70,11 @@ ParameterConfig* Parameter::getConfig() { size_t Parameter::getID() const { return m->getPtr()->getID(); } void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); } + +bool Parameter::save(const std::string& filename) const { + return m->getPtr()->save(filename); +} + +bool Parameter::load(const std::string& filename) const { + return m->getPtr()->load(filename); +} diff --git a/paddle/api/test/.gitignore b/paddle/api/test/.gitignore new file mode 100644 index 0000000000..ef37ef4167 --- /dev/null +++ b/paddle/api/test/.gitignore @@ -0,0 +1,6 @@ +___fc_layer_0__.w0 +___fc_layer_0__.wbias +_hidden1.w0 +_hidden1.wbias +_hidden2.w0 +_hidden2.wbias diff --git a/paddle/api/test/testGradientMachine.py b/paddle/api/test/testGradientMachine.py index b81eafa967..4b705f66ec 100644 --- a/paddle/api/test/testGradientMachine.py +++ b/paddle/api/test/testGradientMachine.py @@ -45,6 +45,7 @@ class TestGradientMachine(unittest.TestCase): assert isinstance(val, swig_paddle.Vector) arr = numpy.full((len(val), ), 0.1, dtype="float32") val.copyFromNumpyArray(arr) + self.assertTrue(param.save(param.getName())) param_config = param.getConfig().toProto() assert isinstance(param_config, 
paddle.proto.ParameterConfig_pb2.ParameterConfig) @@ -92,6 +93,9 @@ class TestGradientMachine(unittest.TestCase): self.assertTrue(self.isCalled) + for param in machine.getParameters(): + self.assertTrue(param.load(param.getName())) + def test_train_one_pass(self): conf_file_path = './testTrainConfig.py' trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile( From 343d9973cac9a7054a74ff66bfacebc33e214a55 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 27 Dec 2016 11:07:41 +0800 Subject: [PATCH 03/41] Simplify gitignore for api/test --- paddle/api/test/.gitignore | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/paddle/api/test/.gitignore b/paddle/api/test/.gitignore index ef37ef4167..b7948824a1 100644 --- a/paddle/api/test/.gitignore +++ b/paddle/api/test/.gitignore @@ -1,6 +1,2 @@ -___fc_layer_0__.w0 -___fc_layer_0__.wbias -_hidden1.w0 -_hidden1.wbias -_hidden2.w0 -_hidden2.wbias +*.w0 +*.wbias From 886908235102fdffd19715c109064a0e4bdb386f Mon Sep 17 00:00:00 2001 From: zhouyingfeng Date: Wed, 28 Dec 2016 17:14:54 +0800 Subject: [PATCH 04/41] add usage collection for demo Add a `paddle usage` command to support collecting demo usage. --- demo/introduction/train.sh | 1 + paddle/scripts/CMakeLists.txt | 10 +- paddle/scripts/submit_local.sh.in | 3 + paddle/scripts/tools/usage_stat/feedback.sh | 172 ++++++++++++++++++++ 4 files changed, 185 insertions(+), 1 deletion(-) create mode 100755 paddle/scripts/tools/usage_stat/feedback.sh diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh index b7bbb90ddd..a7e184300c 100755 --- a/demo/introduction/train.sh +++ b/demo/introduction/train.sh @@ -19,3 +19,4 @@ paddle train \ --save_dir=./output \ --num_passes=30 \ 2>&1 |tee 'train.log' +paddle usage -l "train.log" -e $? 
-n "linear_intro" >/dev/null 2>&1 diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt index 1bae396a18..e29faab5e8 100644 --- a/paddle/scripts/CMakeLists.txt +++ b/paddle/scripts/CMakeLists.txt @@ -2,8 +2,16 @@ configure_file(submit_local.sh.in submit_local.sh @ONLY) - install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ RENAME paddle) + +configure_file(tools/usage_stat/feedback.sh + feedback.sh + @ONLY) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/feedback.sh DESTINATION opt/paddle/bin + PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ + GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ + RENAME paddle_feedback) diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 283fd34a6d..e95b6dbea1 100644 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -122,6 +122,9 @@ case "$1" in "make_diagram") python -m paddle.utils.make_model_diagram ${@:2} ;; + "usage") + $MYDIR/../opt/paddle/bin/paddle_feedback ${@:2} + ;; "version") version ;; diff --git a/paddle/scripts/tools/usage_stat/feedback.sh b/paddle/scripts/tools/usage_stat/feedback.sh new file mode 100755 index 0000000000..76f7d44a55 --- /dev/null +++ b/paddle/scripts/tools/usage_stat/feedback.sh @@ -0,0 +1,172 @@ +#!/bin/bash + +ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code: -- "$@"` +KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US" +# paddle config home dir, same as paddle +PADDLE_CONF_HOME="$HOME/.config/paddle" +# api url, mirror url(s) will be append later +PD_URLS="http://api.paddlepaddle.org/version" + +usage() +{ + echo "Usage: `basename $0` [options]" + echo "Options:" + echo " -e, --exit-code=EXIT_CODE The train/predict process's exit code" + echo " -l, --log-file=LOG_FILE_PATH Read which log file to get the duration of process" + echo " -n, --task-name=TASK_NAME The 
name of demo or example" + echo " -u, --git-user=GITHUB_USER provide contact info, like username or email" + echo " -v, -i Verbose output and interact with user when necessary" + echo " --help display this help message" +} + +eval set -- "${ARGPARSE}" +while true; do + case "$1" in + -l|--log-file) + log_file=$2 + shift 2 + ;; + -e|--exit-code) + exit_code=$2 + shift 2 + ;; + -u|--git-user) + github_user=$2 + shift 2 + ;; + -n|--task-name) + task=$2 + shift 2 + ;; + -v|-i) + v=1 + shift + ;; + --dry-run) + dry_run=1 + shift + ;; + --) + shift + break + ;; + --help) + usage + exit 0 + ;; + *) + echo "Invalid option $1" + usage + exit 1 + ;; + esac +done + +# parse the log_file to get the time costs +if [ -s "${log_file}" ]; then + duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;} + {if(index($2,":")==3){ + t=substr($2,1,8); + sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2); + if(secsec){min_sec=sec;} + if(max_sec==0 || max_sec/dev/null` + git_url=`git config --get remote.origin.url 2>/dev/null` + if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then + # under a git url, like https://github.com/user_xxx/proj_yyy.git + if [ "${v}" = "1" ]; then echo " from github url..."; fi + github_user=`echo ${git_url} | cut -d "/" -f 4` + if [ "${github_user}" = "PaddlePaddle" ]; then + github_user= + fi + fi + if [ -n "${git_username}" -a -z "${github_user}" ]; then + if [ "${v}" = "1" ]; then echo " from global git username..."; fi + github_user=${git_username} + fi + fi +fi +# allow user to set the user name, if it's not found +if [ -z "${github_user}" -a "${v}" = "1" ]; then + read -p "Please input your github username or email, or just return to keep this feedback anonymous:" + github_user=${REPLY} + if [ -z "${github_user}" ]; then + # empty input, consider as one anonymous user + github_user="${KEEP_ANONYMOUS}" + fi +fi +if [ -n "${github_user}" -a -z "${dry_run}" ]; then + # valid user and not in dry-run mode, then save to cache + 
mkdir -p ${PADDLE_CONF_HOME} + echo "${github_user}" >${PADDLE_CONF_HOME}/github_user +fi +if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi +if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then + # anonymous user should keep the var empty. + github_user= +fi + +# read local paddle version +paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1` +if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi + +# read local system time +system_time=`date "+%Y%m%d%H%M%S"` +if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi + +# make empty job_name as default value. +if [ -z "${task}" ]; then + task="(unknown_task)" +fi +if [ "${v}" = "1" ]; then echo "task: ${task}"; fi + +# concat the curl command +params="content={\"data_type\":\"usage\",\ +\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\ +\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\ +\"duration\":${duration},\"exit_code\":\"${exit_code}\"\ +}&type=1" +curl_cmd_prefix="curl -m 5 -X POST -d ${params}\ + -b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie " + +if [ "${dry_run}" = "1" ]; then + first_url=`echo ${PD_URLS} | cut -d " " -f 1` + echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}" + exit 0 +else + for u in ${PD_URLS}; do + curl_cmd="${curl_cmd_prefix} ${u}" + if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi + ${curl_cmd} >/dev/null 2>&1 + if [ $? -eq 0 ]; then + if [ "${v}" = "1" ]; then echo "upload OK!"; fi + exit 0 + else + if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi + fi + done + if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi + exit 1 +fi + + + + From 7193cbf1199ce7bb193d06e71b234c645a631da1 Mon Sep 17 00:00:00 2001 From: zhouyingfeng Date: Wed, 28 Dec 2016 22:12:30 +0800 Subject: [PATCH 05/41] rename "feedback.sh" into "usage.sh", and fix the code style error in this file. 
--- paddle/scripts/CMakeLists.txt | 8 ++++---- paddle/scripts/submit_local.sh.in | 2 +- paddle/scripts/tools/usage_stat/{feedback.sh => usage.sh} | 4 ---- 3 files changed, 5 insertions(+), 9 deletions(-) rename paddle/scripts/tools/usage_stat/{feedback.sh => usage.sh} (99%) diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt index e29faab5e8..66a46e1883 100644 --- a/paddle/scripts/CMakeLists.txt +++ b/paddle/scripts/CMakeLists.txt @@ -7,11 +7,11 @@ install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ RENAME paddle) -configure_file(tools/usage_stat/feedback.sh - feedback.sh +configure_file(tools/usage_stat/usage.sh + usage.sh @ONLY) -install(FILES ${CMAKE_CURRENT_BINARY_DIR}/feedback.sh DESTINATION opt/paddle/bin +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/usage.sh DESTINATION opt/paddle/bin PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ - RENAME paddle_feedback) + RENAME paddle_usage) diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index e95b6dbea1..f29d32f0d9 100644 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -123,7 +123,7 @@ case "$1" in python -m paddle.utils.make_model_diagram ${@:2} ;; "usage") - $MYDIR/../opt/paddle/bin/paddle_feedback ${@:2} + $MYDIR/../opt/paddle/bin/paddle_usage ${@:2} ;; "version") version diff --git a/paddle/scripts/tools/usage_stat/feedback.sh b/paddle/scripts/tools/usage_stat/usage.sh similarity index 99% rename from paddle/scripts/tools/usage_stat/feedback.sh rename to paddle/scripts/tools/usage_stat/usage.sh index 76f7d44a55..7dbd1f5884 100755 --- a/paddle/scripts/tools/usage_stat/feedback.sh +++ b/paddle/scripts/tools/usage_stat/usage.sh @@ -166,7 +166,3 @@ else if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi exit 1 fi - - - - From f3c61cbc4cc6239958ce394980a82e511e475de5 Mon Sep 17 00:00:00 
2001 From: qiaolongfei Date: Tue, 3 Jan 2017 10:43:14 +0800 Subject: [PATCH 06/41] add pserver util and parameter server config --- demo/quick_start/cluster/cluster_train.sh | 1 + demo/quick_start/cluster/pserver.sh | 2 +- paddle/pserver/CMakeLists.txt | 6 +- paddle/pserver/PServerUtil.cpp | 101 ++++++++++++++++++++++ paddle/pserver/PServerUtil.h | 39 +++++++++ paddle/pserver/ParameterServer2Main.cpp | 59 ++----------- paddle/trainer/TrainerMain.cpp | 53 +----------- proto/CMakeLists.txt | 3 +- proto/ParameterServerConfig.proto | 43 +++++++++ 9 files changed, 199 insertions(+), 108 deletions(-) create mode 100644 paddle/pserver/PServerUtil.cpp create mode 100644 paddle/pserver/PServerUtil.h create mode 100644 proto/ParameterServerConfig.proto diff --git a/demo/quick_start/cluster/cluster_train.sh b/demo/quick_start/cluster/cluster_train.sh index aac9b89b14..a7b1f01064 100755 --- a/demo/quick_start/cluster/cluster_train.sh +++ b/demo/quick_start/cluster/cluster_train.sh @@ -25,6 +25,7 @@ log_file="$bin_dir/train.log" pushd "$home_dir" cfg=trainer_config.lr.py paddle train \ + --start_pserver=false \ --config=$cfg \ --save_dir=${model_dir} \ --trainer_count=4 \ diff --git a/demo/quick_start/cluster/pserver.sh b/demo/quick_start/cluster/pserver.sh index b187c1d9b9..4e1ffe5139 100755 --- a/demo/quick_start/cluster/pserver.sh +++ b/demo/quick_start/cluster/pserver.sh @@ -19,7 +19,7 @@ source "$bin_dir/env.sh" paddle pserver \ --nics=`get_nics` \ --port=7164 \ - --ports_num=1 \ + --ports_num=2 \ --ports_num_for_sparse=1 \ --num_gradient_servers=1 \ --comment="paddle_pserver" \ diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt index 1c1e1964b8..9bc48294f0 100644 --- a/paddle/pserver/CMakeLists.txt +++ b/paddle/pserver/CMakeLists.txt @@ -24,13 +24,15 @@ set(PSERVER_SOURCES BaseClient.cpp ParameterClient2.cpp ParameterServer2.cpp - SparseParameterDistribution.cpp) + SparseParameterDistribution.cpp + PServerUtil.cpp) set(PSERVER_HEADERS BaseClient.h 
ParameterClient2.h ParameterServer2.h - SparseParameterDistribution.h) + SparseParameterDistribution.h + PServerUtil.h) add_library(paddle_pserver STATIC ${PSERVER_SOURCES}) diff --git a/paddle/pserver/PServerUtil.cpp b/paddle/pserver/PServerUtil.cpp new file mode 100644 index 0000000000..e645697936 --- /dev/null +++ b/paddle/pserver/PServerUtil.cpp @@ -0,0 +1,101 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "PServerUtil.h" + +namespace paddle { + +ParameterServerConfig* PServerUtil::initConfig() { + ParameterServerConfig* config = new ParameterServerConfig(); + config->set_nics(FLAGS_nics); + config->set_port(FLAGS_port); + config->set_ports_num(FLAGS_ports_num); + config->set_rdma_tcp(FLAGS_rdma_tcp); + return config; +} + +PServerUtil* PServerUtil::create() { + auto& pServerConfig = *paddle::PServerUtil::initConfig(); + return PServerUtil::create(pServerConfig); +} + +PServerUtil* PServerUtil::create(const ParameterServerConfig& config) { + return new PServerUtil(config); +} + +PServerUtil::PServerUtil(const ParameterServerConfig& config) { + // round robin to load balance RDMA server ENGINE + std::vector devices; + int rdmaCpu = 0; + int onlineCpus = rdma::numCpus(); + ; + int numPorts = config.ports_num() + config.ports_num_for_sparse(); + + if (FLAGS_nics.empty()) { + pservers_.resize(numPorts); + for (int i = 0; i < numPorts; ++i) { + if (FLAGS_rdma_tcp == "rdma") { + pservers_[i].reset( + new 
ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++)); + rdmaCpu = rdmaCpu % onlineCpus; + } else { + pservers_[i].reset(new ParameterServer2(std::string(), FLAGS_port + i)); + } + CHECK(pservers_[i]->init()) << "Fail to initialize parameter server" + << FLAGS_port + i; + } + } else { + str::split(FLAGS_nics, ',', &devices); + pservers_.resize(devices.size() * numPorts); + for (int i = 0; i < numPorts; ++i) { + for (size_t j = 0; j < devices.size(); ++j) { + if (FLAGS_rdma_tcp == "rdma") { + pservers_[i * devices.size() + j].reset(new ParameterServer2( + getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++)); + rdmaCpu = rdmaCpu % onlineCpus; + } else { + pservers_[i * devices.size() + j].reset( + new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i)); + } + CHECK(pservers_[i * devices.size() + j]->init()) + << "Fail to initialize parameter server" << devices[j] + << FLAGS_port + i; + } + } + } +} + +PServerUtil::~PServerUtil() { this->join(); } + +void PServerUtil::start() { + LOG(INFO) << "pserver sizes : " << pservers_.size(); + int i = 0; + for (const auto& pserver : pservers_) { + LOG(INFO) << "pserver started : " << i; + pserver->start(); + i++; + } +} + +void PServerUtil::join() { + LOG(INFO) << "pserver sizes : " << pservers_.size(); + int i = 0; + for (const auto& pserver : pservers_) { + LOG(INFO) << "pserver join : " << i; + pserver->join(); + i++; + } +} + +} // namespace paddle diff --git a/paddle/pserver/PServerUtil.h b/paddle/pserver/PServerUtil.h new file mode 100644 index 0000000000..dd8d32a4e9 --- /dev/null +++ b/paddle/pserver/PServerUtil.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ParameterServer2.h" +#include "ParameterServerConfig.pb.h" +#include "RDMANetwork.h" +#include "paddle/utils/StringUtil.h" + +namespace paddle { + +class PServerUtil { +public: + DISABLE_COPY(PServerUtil); + static PServerUtil* create(); + static PServerUtil* create(const ParameterServerConfig& config); + explicit PServerUtil(const ParameterServerConfig& config); + ~PServerUtil(); + static ParameterServerConfig* initConfig(); + void start(); + void join(); + +private: + std::vector> pservers_; +}; + +} // namespace paddle diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp index ffc521f2c1..afba7293eb 100644 --- a/paddle/pserver/ParameterServer2Main.cpp +++ b/paddle/pserver/ParameterServer2Main.cpp @@ -13,66 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include "paddle/utils/StringUtil.h" -#include "paddle/utils/Util.h" - -#include "ParameterServer2.h" -#include "RDMANetwork.h" -#include "paddle/utils/Flags.h" +#include "PServerUtil.h" +#include "paddle/trainer/ParamUtil.h" using namespace paddle; // NOLINT int main(int argc, char** argv) { initMain(argc, argv); - std::vector devices; - std::vector> pservers; - - // round robin to loadbalance RDMA server ENGINE - int rdmaCpu = 0; - int onlineCpus = rdma::numCpus(); - int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse; - if (FLAGS_nics.empty()) { - pservers.resize(numPorts); - for (int i = 0; i < numPorts; ++i) { - if (FLAGS_rdma_tcp == "rdma") { - pservers[i].reset( - new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++)); - rdmaCpu = rdmaCpu % onlineCpus; - } else { - pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i)); - } - CHECK(pservers[i]->init()) << "Fail to initialize parameter server" - << FLAGS_port + i; - LOG(INFO) << "pserver started : " << FLAGS_port + i; - pservers[i]->start(); - } - } else { - str::split(FLAGS_nics, ',', &devices); - pservers.resize(devices.size() * numPorts); - for (int i = 0; i < numPorts; ++i) { - for (size_t j = 0; j < devices.size(); ++j) { - if (FLAGS_rdma_tcp == "rdma") { - pservers[i * devices.size() + j].reset(new ParameterServer2( - getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++)); - rdmaCpu = rdmaCpu % onlineCpus; - } else { - pservers[i * devices.size() + j].reset( - new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i)); - } - CHECK(pservers[i * devices.size() + j]->init()) - << "Fail to initialize parameter server" << devices[j] - << FLAGS_port + i; - LOG(INFO) << "pserver started : " << devices[j] << ":" - << FLAGS_port + i; - pservers[i * devices.size() + j]->start(); - } - } - } - - for (auto& pserver : pservers) { - pserver->join(); - } + std::unique_ptr pServerPtr(paddle::PServerUtil::create()); + pServerPtr->start(); + pServerPtr->join(); return 0; } diff 
--git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index 947f9cadcc..0d3e4514d6 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -13,14 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/pserver/ParameterServer2.h" +#include "paddle/pserver/PServerUtil.h" #include "paddle/utils/Excepts.h" #include "paddle/utils/PythonUtil.h" -#include "paddle/utils/StringUtil.h" #include "ParamUtil.h" #include "Trainer.h" -#include "paddle/pserver/RDMANetwork.h" DEFINE_bool(start_pserver, false, "Whether to start pserver"); DECLARE_int32(gpu_id); @@ -39,54 +37,9 @@ int main(int argc, char** argv) { initMain(argc, argv); initPython(argc, argv); - std::vector> pservers; - std::vector devices; - if (FLAGS_start_pserver) { - // round robin to loadbalance RDMA server ENGINE - int rdmaCpu = 0; - int onlineCpus = rdma::numCpus(); - int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse; - if (FLAGS_nics.empty()) { - pservers.resize(numPorts); - for (int i = 0; i < numPorts; ++i) { - if (FLAGS_rdma_tcp == "rdma") { - pservers[i].reset( - new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++)); - rdmaCpu = rdmaCpu % onlineCpus; - } else { - pservers[i].reset( - new ParameterServer2(std::string(), FLAGS_port + i)); - } - - CHECK(pservers[i]->init()) << "Fail to initialize parameter server" - << FLAGS_port + i; - LOG(INFO) << "pserver started : " << FLAGS_port + i; - pservers[i]->start(); - } - } else { - str::split(FLAGS_nics, ',', &devices); - pservers.resize(devices.size() * numPorts); - for (int i = 0; i < numPorts; ++i) { - for (size_t j = 0; j < devices.size(); ++j) { - if (FLAGS_rdma_tcp == "rdma") { - pservers[i * devices.size() + j].reset(new ParameterServer2( - getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++)); - rdmaCpu = rdmaCpu % onlineCpus; - } else { - pservers[i * devices.size() + j].reset( - new 
ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i)); - } - - CHECK(pservers[i * devices.size() + j]->init()) - << "Fail to initialize parameter server" << devices[j] - << FLAGS_port + i; - LOG(INFO) << "pserver started : " << devices[j] << ":" - << FLAGS_port + i; - pservers[i * devices.size() + j]->start(); - } - } - } + PServerUtil* pServerUtil = paddle::PServerUtil::create(); + pServerUtil->start(); } Trainer trainer; auto config = TrainerConfigHelper::createFromFlags(); diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt index 2c40070eca..e53d06e773 100644 --- a/proto/CMakeLists.txt +++ b/proto/CMakeLists.txt @@ -4,7 +4,8 @@ set(proto_filenames ModelConfig.proto ParameterConfig.proto ParameterService.proto - TrainerConfig.proto) + TrainerConfig.proto + ParameterServerConfig.proto) set(PROTO_GEN) set(PROTO_GEN_PY) diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto new file mode 100644 index 0000000000..b4fbf901c2 --- /dev/null +++ b/proto/ParameterServerConfig.proto @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +syntax = "proto2"; + +package paddle; + +message ParameterClientConfig { + required int32 trainer_id = 1; +} + +message ParameterServerConfig { + // The ports number for parameter send, + // increment based on default port number + required int32 ports_num = 1 [default = 1]; + // The ports number for parameter send, + // increment based on default (port + ports_num) + required int32 ports_num_for_sparse = 2 [default = 0]; + // network device name for pservers + required string nics = 3 [default = "xgbe0,xgbe1"]; + required string rdma_tcp = 4 [default = "tcp"]; + // Listening port for pserver + required int32 port = 5 [default = 20134]; + // number of gradient servers + required int32 num_gradient_servers = 6 [default = 1]; + // number of threads for sync op exec + required int32 pserver_num_threads = 7 [default = 1]; + // control config_.async_lagged_grad_discard_ratio() min value + required double async_lagged_ratio_min = 8 [default = 1.0]; + // if async_lagged_grad_discard_ratio is not set in trainer_config.conf + // use it as default value + required double async_lagged_ratio_default = 9 [default = 1.5]; +} \ No newline at end of file From 95f20b9472b9e754207cc489246d3f51f6fb2793 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Tue, 3 Jan 2017 14:39:44 +0800 Subject: [PATCH 07/41] add comment and refine code --- demo/quick_start/cluster/pserver.sh | 2 +- paddle/pserver/PServerUtil.cpp | 37 ++++++++++++------------- paddle/pserver/PServerUtil.h | 37 +++++++++++++++++++++++-- paddle/pserver/ParameterServer2Main.cpp | 3 +- paddle/trainer/TrainerMain.cpp | 2 +- 5 files changed, 56 insertions(+), 25 deletions(-) diff --git a/demo/quick_start/cluster/pserver.sh b/demo/quick_start/cluster/pserver.sh index 4e1ffe5139..b187c1d9b9 100755 --- a/demo/quick_start/cluster/pserver.sh +++ b/demo/quick_start/cluster/pserver.sh @@ -19,7 +19,7 @@ source "$bin_dir/env.sh" paddle pserver \ --nics=`get_nics` \ --port=7164 \ - --ports_num=2 \ + --ports_num=1 \
--ports_num_for_sparse=1 \ --num_gradient_servers=1 \ --comment="paddle_pserver" \ diff --git a/paddle/pserver/PServerUtil.cpp b/paddle/pserver/PServerUtil.cpp index e645697936..68a9174330 100644 --- a/paddle/pserver/PServerUtil.cpp +++ b/paddle/pserver/PServerUtil.cpp @@ -16,30 +16,11 @@ limitations under the License. */ namespace paddle { -ParameterServerConfig* PServerUtil::initConfig() { - ParameterServerConfig* config = new ParameterServerConfig(); - config->set_nics(FLAGS_nics); - config->set_port(FLAGS_port); - config->set_ports_num(FLAGS_ports_num); - config->set_rdma_tcp(FLAGS_rdma_tcp); - return config; -} - -PServerUtil* PServerUtil::create() { - auto& pServerConfig = *paddle::PServerUtil::initConfig(); - return PServerUtil::create(pServerConfig); -} - -PServerUtil* PServerUtil::create(const ParameterServerConfig& config) { - return new PServerUtil(config); -} - PServerUtil::PServerUtil(const ParameterServerConfig& config) { // round robin to load balance RDMA server ENGINE std::vector devices; int rdmaCpu = 0; int onlineCpus = rdma::numCpus(); - ; int numPorts = config.ports_num() + config.ports_num_for_sparse(); if (FLAGS_nics.empty()) { @@ -78,6 +59,24 @@ PServerUtil::PServerUtil(const ParameterServerConfig& config) { PServerUtil::~PServerUtil() { this->join(); } +ParameterServerConfig* PServerUtil::initConfig() { + ParameterServerConfig* config = new ParameterServerConfig(); + config->set_nics(FLAGS_nics); + config->set_port(FLAGS_port); + config->set_ports_num(FLAGS_ports_num); + config->set_rdma_tcp(FLAGS_rdma_tcp); + return config; +} + +PServerUtil* PServerUtil::createWithGflags() { + auto& pServerConfig = *paddle::PServerUtil::initConfig(); + return create(pServerConfig); +} + +PServerUtil* PServerUtil::create(const ParameterServerConfig& config) { + return new PServerUtil(config); +} + void PServerUtil::start() { LOG(INFO) << "pserver sizes : " << pservers_.size(); int i = 0; diff --git a/paddle/pserver/PServerUtil.h 
b/paddle/pserver/PServerUtil.h index dd8d32a4e9..117dde37e3 100644 --- a/paddle/pserver/PServerUtil.h +++ b/paddle/pserver/PServerUtil.h @@ -24,16 +24,47 @@ namespace paddle { class PServerUtil { public: DISABLE_COPY(PServerUtil); - static PServerUtil* create(); - static PServerUtil* create(const ParameterServerConfig& config); + + /** + * @brief Ctor, Create a PServerUtil from ParameterServerConfig. + */ explicit PServerUtil(const ParameterServerConfig& config); + + /** + * @brief Dtor. + */ ~PServerUtil(); - static ParameterServerConfig* initConfig(); + + /** + * @brief create PServerUtil from gflags, this is used for + * compatibility with the old usage of configuration by gflags. + */ + static PServerUtil* createWithGflags(); + + /** + * @brief create PServerUtil with ParameterServerConfig, remove gflags + * from ParameterServer. Init all pservers thread according to the config. + */ + static PServerUtil* create(const ParameterServerConfig& config); + + /** + * @brief start all pserver thread in this PServerUtil. + */ void start(); + + /** + * @brief join and wait for all pserver thread in this PServerUtil. + */ void join(); private: std::vector> pservers_; + + /** + * @brief create ParameterServerConfig from gflags, this is used for + * compatibility with the old usage of configuration by gflags. 
+ */ + static ParameterServerConfig* initConfig(); }; } // namespace paddle diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp index afba7293eb..8c1baea0ce 100644 --- a/paddle/pserver/ParameterServer2Main.cpp +++ b/paddle/pserver/ParameterServer2Main.cpp @@ -21,7 +21,8 @@ using namespace paddle; // NOLINT int main(int argc, char** argv) { initMain(argc, argv); - std::unique_ptr pServerPtr(paddle::PServerUtil::create()); + std::unique_ptr pServerPtr( + paddle::PServerUtil::createWithGflags()); pServerPtr->start(); pServerPtr->join(); diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index 0d3e4514d6..a690268c2c 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -38,7 +38,7 @@ int main(int argc, char** argv) { initPython(argc, argv); if (FLAGS_start_pserver) { - PServerUtil* pServerUtil = paddle::PServerUtil::create(); + PServerUtil* pServerUtil = paddle::PServerUtil::createWithGflags(); pServerUtil->start(); } Trainer trainer; From cfbb4c481e0b2a59f335ae3f34d2aa8dba39e26d Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Tue, 3 Jan 2017 15:34:50 +0800 Subject: [PATCH 08/41] use unique_ptr --- paddle/trainer/TrainerMain.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index a690268c2c..52983e46eb 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -37,9 +37,10 @@ int main(int argc, char** argv) { initMain(argc, argv); initPython(argc, argv); + std::unique_ptr pServerPtr(nullptr); if (FLAGS_start_pserver) { - PServerUtil* pServerUtil = paddle::PServerUtil::createWithGflags(); - pServerUtil->start(); + pServerPtr.reset(paddle::PServerUtil::createWithGflags()); + pServerPtr->start(); } Trainer trainer; auto config = TrainerConfigHelper::createFromFlags(); From 904eefaf8a82ea10c0a804c58a11110fa296a74a Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: 
Tue, 3 Jan 2017 15:51:07 +0800 Subject: [PATCH 09/41] add TensorShape use to represent tensor of any dimension. --- paddle/function/TensorType.h | 125 +++++++++++++++++++++++++++++ paddle/function/TensorTypeTest.cpp | 53 ++++++++++++ 2 files changed, 178 insertions(+) create mode 100644 paddle/function/TensorType.h create mode 100644 paddle/function/TensorTypeTest.cpp diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h new file mode 100644 index 0000000000..0b860f2046 --- /dev/null +++ b/paddle/function/TensorType.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +namespace paddle { + +enum ValueType { + VALUE_TYPE_INT32 = 0, + VALUE_TYPE_FLOAT = 1, + VALUE_TYPE_DOUBLE = 2, + VALUE_TYPE_BYTE = 3 +}; + +enum DeviceType { + DEVICE_TYPE_UNSPECIFIED = 0, + DEVICE_TYPE_CPU = 1, + DEVICE_TYPE_GPU = 2 +}; + +inline int sizeOfValuType(ValueType valueType) { + if (valueType == VALUE_TYPE_INT32) { + return 4; + } else if (valueType == VALUE_TYPE_FLOAT) { + return 4; + } else if (valueType == VALUE_TYPE_DOUBLE) { + return 8; + } else { + LOG(FATAL) << "Unknown type: " << valueType; + return 0; + } +} + +template +struct DataType; + +template <> +struct DataType { + static const ValueType value = VALUE_TYPE_FLOAT; +}; + +template <> +struct DataType { + static const ValueType value = VALUE_TYPE_DOUBLE; +}; + +/** + * TensorShape used to represent shape of normal tensor. + */ +class TensorShape { +public: + TensorShape() : ndims_(0), nelements_(0) { initDims(0); } + + TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); }; + + TensorShape(std::initializer_list dims) { + ndims_ = dims.size(); + initDims(ndims_); + std::copy(dims.begin(), dims.end(), dims_.begin()); + numElements(); + }; + + TensorShape(const TensorShape& t) + : ndims_(t.ndims_), nelements_(t.nelements_) { + initDims(ndims_); + std::copy(t.dims_.begin(), t.dims_.end(), dims_.begin()); + }; + + // get the size of specified dimension + size_t operator[](size_t dim) const { + CHECK_GE(dim, 0); + CHECK_LT(dim, ndims_); + return dims_[dim]; + } + + // set the size of specified dimension + void setDim(size_t dim, size_t size) { + CHECK_GE(dim, 0); + CHECK_LT(dim, ndims_); + dims_[dim] = size; + numElements(); + } + + // number of dimensions of the tensor + size_t ndims() const { return ndims_; } + + size_t getElements() const { return nelements_; } + +private: + // compute number of elements + void numElements() { + nelements_ = 1; + for (size_t n = 0; n < ndims_; n++) { + nelements_ *= dims_[n]; + } + } + + // init dims_ 
+ void initDims(size_t ndims) { + size_t count = ndims < 4 ? 4 : ndims; + dims_.assign(count, 1); + } + + // number of dimensions + // ndims_ may be not equeal dims_.size() + size_t ndims_; + // number of elements + size_t nelements_; + std::vector dims_; +}; + +} // namespace paddle diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp new file mode 100644 index 0000000000..99c25f42a1 --- /dev/null +++ b/paddle/function/TensorTypeTest.cpp @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "TensorType.h" +#include + +namespace paddle { + +TEST(TensorShape, Constructor) { + TensorShape t1; + EXPECT_EQ(t1.ndims(), 0); + EXPECT_EQ(t1.getElements(), 0); + + TensorShape t2(3); + EXPECT_EQ(t2.ndims(), 3); + EXPECT_EQ(t2.getElements(), 1); + + TensorShape t3({8, 10}); + EXPECT_EQ(t3.ndims(), 2); + EXPECT_EQ(t3.getElements(), 80); + + TensorShape t4(t3); + EXPECT_EQ(t4.ndims(), t3.ndims()); + EXPECT_EQ(t4.getElements(), t3.getElements()); + + TensorShape t5({1, 2, 3, 4, 5}); + EXPECT_EQ(t5.ndims(), 5); + EXPECT_EQ(t5.getElements(), 120); +} + +TEST(TensorShape, GetAndSet) { + TensorShape t({1, 2, 3}); + EXPECT_EQ(t.ndims(), 3); + EXPECT_EQ(t.getElements(), 6); + + EXPECT_EQ(t[1], 2); + t.setDim(1, 100); + EXPECT_EQ(t.getElements(), 300); + EXPECT_EQ(t[1], 100); +} + +} // namespace paddle From 77839826a4030efd5f692c4e6ab2cf8cc011a363 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Tue, 3 Jan 2017 17:09:59 +0800 Subject: [PATCH 10/41] change FLAGS to proto config in PServerUtil --- paddle/pserver/PServerUtil.cpp | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/paddle/pserver/PServerUtil.cpp b/paddle/pserver/PServerUtil.cpp index 68a9174330..bf4cf0771c 100644 --- a/paddle/pserver/PServerUtil.cpp +++ b/paddle/pserver/PServerUtil.cpp @@ -23,35 +23,36 @@ PServerUtil::PServerUtil(const ParameterServerConfig& config) { int onlineCpus = rdma::numCpus(); int numPorts = config.ports_num() + config.ports_num_for_sparse(); - if (FLAGS_nics.empty()) { + if (config.nics().empty()) { pservers_.resize(numPorts); for (int i = 0; i < numPorts; ++i) { - if (FLAGS_rdma_tcp == "rdma") { + if (config.rdma_tcp() == "rdma") { pservers_[i].reset( - new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++)); + new ParameterServer2(std::string(), config.port() + i, rdmaCpu++)); rdmaCpu = rdmaCpu % onlineCpus; } else { - pservers_[i].reset(new ParameterServer2(std::string(), FLAGS_port + i)); + pservers_[i].reset( + new 
ParameterServer2(std::string(), config.port() + i)); } CHECK(pservers_[i]->init()) << "Fail to initialize parameter server" - << FLAGS_port + i; + << config.port() + i; } } else { - str::split(FLAGS_nics, ',', &devices); + str::split(config.nics(), ',', &devices); pservers_.resize(devices.size() * numPorts); for (int i = 0; i < numPorts; ++i) { for (size_t j = 0; j < devices.size(); ++j) { - if (FLAGS_rdma_tcp == "rdma") { + if (config.rdma_tcp() == "rdma") { pservers_[i * devices.size() + j].reset(new ParameterServer2( - getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++)); + getIpAddr(devices[j]), config.port() + i, rdmaCpu++)); rdmaCpu = rdmaCpu % onlineCpus; } else { pservers_[i * devices.size() + j].reset( - new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i)); + new ParameterServer2(getIpAddr(devices[j]), config.port() + i)); } CHECK(pservers_[i * devices.size() + j]->init()) << "Fail to initialize parameter server" << devices[j] - << FLAGS_port + i; + << config.port() + i; } } } From 93e74f89c6b91db1d57014c2b7b7ac4f59486121 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 4 Jan 2017 10:17:07 +0800 Subject: [PATCH 11/41] rename PServerUtil to PServerController --- paddle/pserver/CMakeLists.txt | 4 ++-- ...{PServerUtil.cpp => PServerController.cpp} | 21 ++++++++++--------- .../{PServerUtil.h => PServerController.h} | 14 ++++++------- paddle/pserver/ParameterServer2Main.cpp | 7 +++---- paddle/trainer/TrainerMain.cpp | 6 +++--- 5 files changed, 26 insertions(+), 26 deletions(-) rename paddle/pserver/{PServerUtil.cpp => PServerController.cpp} (83%) rename paddle/pserver/{PServerUtil.h => PServerController.h} (83%) diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt index 9bc48294f0..ac52b8dbec 100644 --- a/paddle/pserver/CMakeLists.txt +++ b/paddle/pserver/CMakeLists.txt @@ -25,14 +25,14 @@ set(PSERVER_SOURCES ParameterClient2.cpp ParameterServer2.cpp SparseParameterDistribution.cpp - PServerUtil.cpp) + PServerController.cpp) 
set(PSERVER_HEADERS BaseClient.h ParameterClient2.h ParameterServer2.h SparseParameterDistribution.h - PServerUtil.h) + PServerController.h) add_library(paddle_pserver STATIC ${PSERVER_SOURCES}) diff --git a/paddle/pserver/PServerUtil.cpp b/paddle/pserver/PServerController.cpp similarity index 83% rename from paddle/pserver/PServerUtil.cpp rename to paddle/pserver/PServerController.cpp index bf4cf0771c..2d00019cef 100644 --- a/paddle/pserver/PServerUtil.cpp +++ b/paddle/pserver/PServerController.cpp @@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "PServerUtil.h" +#include "PServerController.h" namespace paddle { -PServerUtil::PServerUtil(const ParameterServerConfig& config) { +PServerController::PServerController(const ParameterServerConfig& config) { // round robin to load balance RDMA server ENGINE std::vector devices; int rdmaCpu = 0; @@ -58,9 +58,9 @@ PServerUtil::PServerUtil(const ParameterServerConfig& config) { } } -PServerUtil::~PServerUtil() { this->join(); } +PServerController::~PServerController() { this->join(); } -ParameterServerConfig* PServerUtil::initConfig() { +ParameterServerConfig* PServerController::initConfigByGflags() { ParameterServerConfig* config = new ParameterServerConfig(); config->set_nics(FLAGS_nics); config->set_port(FLAGS_port); @@ -69,16 +69,17 @@ ParameterServerConfig* PServerUtil::initConfig() { return config; } -PServerUtil* PServerUtil::createWithGflags() { - auto& pServerConfig = *paddle::PServerUtil::initConfig(); +PServerController* PServerController::createByGflags() { + auto& pServerConfig = *paddle::PServerController::initConfigByGflags(); return create(pServerConfig); } -PServerUtil* PServerUtil::create(const ParameterServerConfig& config) { - return new PServerUtil(config); +PServerController* PServerController::create( + const ParameterServerConfig& config) { + 
return new PServerController(config); } -void PServerUtil::start() { +void PServerController::start() { LOG(INFO) << "pserver sizes : " << pservers_.size(); int i = 0; for (const auto& pserver : pservers_) { @@ -88,7 +89,7 @@ void PServerUtil::start() { } } -void PServerUtil::join() { +void PServerController::join() { LOG(INFO) << "pserver sizes : " << pservers_.size(); int i = 0; for (const auto& pserver : pservers_) { diff --git a/paddle/pserver/PServerUtil.h b/paddle/pserver/PServerController.h similarity index 83% rename from paddle/pserver/PServerUtil.h rename to paddle/pserver/PServerController.h index 117dde37e3..6fb7e0a31a 100644 --- a/paddle/pserver/PServerUtil.h +++ b/paddle/pserver/PServerController.h @@ -21,31 +21,31 @@ limitations under the License. */ namespace paddle { -class PServerUtil { +class PServerController { public: - DISABLE_COPY(PServerUtil); + DISABLE_COPY(PServerController); /** * @brief Ctor, Create a PServerUtil from ParameterServerConfig. */ - explicit PServerUtil(const ParameterServerConfig& config); + explicit PServerController(const ParameterServerConfig& config); /** * @brief Dtor. */ - ~PServerUtil(); + ~PServerController(); /** * @brief create PServerUtil from gflags, this is used for * compatibility with the old usage of configuration by gflags. */ - static PServerUtil* createWithGflags(); + static PServerController* createByGflags(); /** * @brief create PServerUtil with ParameterServerConfig, remove gflags * from ParameterServer. Init all pservers thread according to the config. */ - static PServerUtil* create(const ParameterServerConfig& config); + static PServerController* create(const ParameterServerConfig& config); /** * @brief start all pserver thread in this PServerUtil. @@ -64,7 +64,7 @@ private: * @brief create ParameterServerConfig from gflags, this is used for * compatibility with the old usage of configuration by gflags. 
*/ - static ParameterServerConfig* initConfig(); + static ParameterServerConfig* initConfigByGflags(); }; } // namespace paddle diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp index 8c1baea0ce..6e683cdd2c 100644 --- a/paddle/pserver/ParameterServer2Main.cpp +++ b/paddle/pserver/ParameterServer2Main.cpp @@ -13,16 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "PServerUtil.h" -#include "paddle/trainer/ParamUtil.h" +#include "PServerController.h" using namespace paddle; // NOLINT int main(int argc, char** argv) { initMain(argc, argv); - std::unique_ptr pServerPtr( - paddle::PServerUtil::createWithGflags()); + std::unique_ptr pServerPtr( + paddle::PServerController::createByGflags()); pServerPtr->start(); pServerPtr->join(); diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index 52983e46eb..3ce3d67842 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include "paddle/pserver/PServerUtil.h" +#include "paddle/pserver/PServerController.h" #include "paddle/utils/Excepts.h" #include "paddle/utils/PythonUtil.h" @@ -37,9 +37,9 @@ int main(int argc, char** argv) { initMain(argc, argv); initPython(argc, argv); - std::unique_ptr pServerPtr(nullptr); + std::unique_ptr pServerPtr(nullptr); if (FLAGS_start_pserver) { - pServerPtr.reset(paddle::PServerUtil::createWithGflags()); + pServerPtr.reset(paddle::PServerController::createByGflags()); pServerPtr->start(); } Trainer trainer; From 3f6c2b3621f4ec7fdf051f7c8e21faff31e2881d Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 4 Jan 2017 11:25:28 +0800 Subject: [PATCH 12/41] rm initConfigByGflags of PServerController, use stack value instead --- paddle/pserver/PServerController.cpp | 20 +++++++++----------- paddle/pserver/PServerController.h | 10 ++-------- 2 files changed, 11 insertions(+), 19 deletions(-) diff --git a/paddle/pserver/PServerController.cpp b/paddle/pserver/PServerController.cpp index 2d00019cef..8d2e026bca 100644 --- a/paddle/pserver/PServerController.cpp +++ b/paddle/pserver/PServerController.cpp @@ -60,18 +60,16 @@ PServerController::PServerController(const ParameterServerConfig& config) { PServerController::~PServerController() { this->join(); } -ParameterServerConfig* PServerController::initConfigByGflags() { - ParameterServerConfig* config = new ParameterServerConfig(); - config->set_nics(FLAGS_nics); - config->set_port(FLAGS_port); - config->set_ports_num(FLAGS_ports_num); - config->set_rdma_tcp(FLAGS_rdma_tcp); - return config; -} - PServerController* PServerController::createByGflags() { - auto& pServerConfig = *paddle::PServerController::initConfigByGflags(); - return create(pServerConfig); + ParameterServerConfig config; + + config.set_nics(FLAGS_nics); + config.set_rdma_tcp(FLAGS_rdma_tcp); + config.set_port(FLAGS_port); + config.set_ports_num(FLAGS_ports_num); + config.set_ports_num_for_sparse(FLAGS_ports_num_for_sparse); + + 
return create(config); } PServerController* PServerController::create( diff --git a/paddle/pserver/PServerController.h b/paddle/pserver/PServerController.h index 6fb7e0a31a..cecf729009 100644 --- a/paddle/pserver/PServerController.h +++ b/paddle/pserver/PServerController.h @@ -21,7 +21,7 @@ limitations under the License. */ namespace paddle { -class PServerController { +class PServerController final { public: DISABLE_COPY(PServerController); @@ -58,13 +58,7 @@ public: void join(); private: - std::vector> pservers_; - - /** - * @brief create ParameterServerConfig from gflags, this is used for - * compatibility with the old usage of configuration by gflags. - */ - static ParameterServerConfig* initConfigByGflags(); + std::vector> pservers_; }; } // namespace paddle From 0c4be7e6a687b5ec9a722fc1c9dbded70b1aa8ea Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 4 Jan 2017 16:51:49 +0800 Subject: [PATCH 13/41] add TensorType.h --- paddle/function/TensorShape.h | 97 +++++++++++++++++++++++++ paddle/function/TensorShapeTest.cpp | 53 ++++++++++++++ paddle/function/TensorType.h | 107 +++++++++++++--------------- paddle/function/TensorTypeTest.cpp | 52 ++++++-------- 4 files changed, 222 insertions(+), 87 deletions(-) create mode 100644 paddle/function/TensorShape.h create mode 100644 paddle/function/TensorShapeTest.cpp diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h new file mode 100644 index 0000000000..e70484a1af --- /dev/null +++ b/paddle/function/TensorShape.h @@ -0,0 +1,97 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { + +/** + * TensorShape used to represent shape of normal tensor. + */ +class TensorShape { +public: + TensorShape() : ndims_(0), nelements_(0) { initDims(0); } + + TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); }; + + TensorShape(std::initializer_list dims) { + ndims_ = dims.size(); + initDims(ndims_); + std::copy(dims.begin(), dims.end(), dims_.begin()); + numElements(); + }; + + TensorShape(const TensorShape& t) + : ndims_(t.ndims_), nelements_(t.nelements_) { + initDims(ndims_); + std::copy(t.dims_.begin(), t.dims_.end(), dims_.begin()); + }; + + // get the size of specified dimension + size_t operator[](size_t dim) const { + CHECK_GE(dim, 0); + CHECK_LT(dim, ndims_); + return dims_[dim]; + } + + // set the size of specified dimension + void setDim(size_t dim, size_t size) { + CHECK_GE(dim, 0); + CHECK_LT(dim, ndims_); + dims_[dim] = size; + numElements(); + } + + // number of dimensions of the tensor + size_t ndims() const { return ndims_; } + + size_t getElements() const { return nelements_; } + + bool operator==(const TensorShape& t) const { + if (ndims() != t.ndims()) return false; + for (size_t i = 0; i < ndims(); i++) { + if (dims_[i] != t.dims_[i]) return false; + } + + return true; + } + + bool operator!=(const TensorShape& t) const { return !(*this == t); } + +private: + // compute number of elements + void numElements() { + nelements_ = 1; + for (size_t n = 0; n < ndims_; n++) { + nelements_ *= dims_[n]; + } + } + + // init dims_ + void initDims(size_t ndims) { + size_t count = 
ndims < 4 ? 4 : ndims; + dims_.assign(count, 1); + } + + // number of dimensions + // ndims_ may be not equeal dims_.size() + size_t ndims_; + // number of elements + size_t nelements_; + std::vector dims_; +}; + +} // namespace paddle diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/function/TensorShapeTest.cpp new file mode 100644 index 0000000000..45a2e106e7 --- /dev/null +++ b/paddle/function/TensorShapeTest.cpp @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "TensorShape.h" +#include + +namespace paddle { + +TEST(TensorShape, Constructor) { + TensorShape t1; + EXPECT_EQ(t1.ndims(), 0); + EXPECT_EQ(t1.getElements(), 0); + + TensorShape t2(3); + EXPECT_EQ(t2.ndims(), 3); + EXPECT_EQ(t2.getElements(), 1); + + TensorShape t3({8, 10}); + EXPECT_EQ(t3.ndims(), 2); + EXPECT_EQ(t3.getElements(), 80); + + TensorShape t4(t3); + EXPECT_EQ(t4.ndims(), t3.ndims()); + EXPECT_EQ(t4.getElements(), t3.getElements()); + + TensorShape t5({1, 2, 3, 4, 5}); + EXPECT_EQ(t5.ndims(), 5); + EXPECT_EQ(t5.getElements(), 120); +} + +TEST(TensorShape, GetAndSet) { + TensorShape t({1, 2, 3}); + EXPECT_EQ(t.ndims(), 3); + EXPECT_EQ(t.getElements(), 6); + + EXPECT_EQ(t[1], 2); + t.setDim(1, 100); + EXPECT_EQ(t.getElements(), 300); + EXPECT_EQ(t[1], 100); +} + +} // namespace paddle diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h index 0b860f2046..800f71a5b9 100644 --- a/paddle/function/TensorType.h +++ b/paddle/function/TensorType.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include +#include "paddle/math/Matrix.h" namespace paddle { @@ -57,69 +57,60 @@ struct DataType { static const ValueType value = VALUE_TYPE_DOUBLE; }; -/** - * TensorShape used to represent shape of normal tensor. 
- */ -class TensorShape { -public: - TensorShape() : ndims_(0), nelements_(0) { initDims(0); } - - TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); }; - - TensorShape(std::initializer_list dims) { - ndims_ = dims.size(); - initDims(ndims_); - std::copy(dims.begin(), dims.end(), dims_.begin()); - numElements(); - }; - - TensorShape(const TensorShape& t) - : ndims_(t.ndims_), nelements_(t.nelements_) { - initDims(ndims_); - std::copy(t.dims_.begin(), t.dims_.end(), dims_.begin()); - }; - - // get the size of specified dimension - size_t operator[](size_t dim) const { - CHECK_GE(dim, 0); - CHECK_LT(dim, ndims_); - return dims_[dim]; - } +namespace detail { - // set the size of specified dimension - void setDim(size_t dim, size_t size) { - CHECK_GE(dim, 0); - CHECK_LT(dim, ndims_); - dims_[dim] = size; - numElements(); - } +template +struct MatrixT; - // number of dimensions of the tensor - size_t ndims() const { return ndims_; } +template <> +struct MatrixT { + using type = CpuMatrix; +}; - size_t getElements() const { return nelements_; } +template <> +struct MatrixT { + using type = GpuMatrix; +}; -private: - // compute number of elements - void numElements() { - nelements_ = 1; - for (size_t n = 0; n < ndims_; n++) { - nelements_ *= dims_[n]; - } - } +template <> +struct MatrixT { + using type = void; // Not implemented +}; - // init dims_ - void initDims(size_t ndims) { - size_t count = ndims < 4 ? 
4 : ndims; - dims_.assign(count, 1); - } +template <> +struct MatrixT { + using type = void; // Not implemented +}; + +template +struct VectorT; + +template <> +struct VectorT { + using type = CpuVector; +}; + +template <> +struct VectorT { + using type = GpuVector; +}; + +template <> +struct VectorT { + using type = CpuIVector; +}; + +template <> +struct VectorT { + using type = GpuIVector; +}; + +} // namespace detail - // number of dimensions - // ndims_ may be not equeal dims_.size() - size_t ndims_; - // number of elements - size_t nelements_; - std::vector dims_; +template +struct Tensor { + typedef typename detail::MatrixT::type Matrix; + typedef typename detail::VectorT::type Vector; }; } // namespace paddle diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp index 99c25f42a1..4a86245c2a 100644 --- a/paddle/function/TensorTypeTest.cpp +++ b/paddle/function/TensorTypeTest.cpp @@ -17,37 +17,31 @@ limitations under the License. */ namespace paddle { -TEST(TensorShape, Constructor) { - TensorShape t1; - EXPECT_EQ(t1.ndims(), 0); - EXPECT_EQ(t1.getElements(), 0); - - TensorShape t2(3); - EXPECT_EQ(t2.ndims(), 3); - EXPECT_EQ(t2.getElements(), 1); - - TensorShape t3({8, 10}); - EXPECT_EQ(t3.ndims(), 2); - EXPECT_EQ(t3.getElements(), 80); - - TensorShape t4(t3); - EXPECT_EQ(t4.ndims(), t3.ndims()); - EXPECT_EQ(t4.getElements(), t3.getElements()); - - TensorShape t5({1, 2, 3, 4, 5}); - EXPECT_EQ(t5.ndims(), 5); - EXPECT_EQ(t5.getElements(), 120); +TEST(TensorType, Matrix) { + Tensor::Matrix matrix(100, 200); + EXPECT_EQ(matrix.getHeight(), 100); + EXPECT_EQ(matrix.getWidth(), 200); + EXPECT_EQ(matrix.getElementCnt(), 100 * 200); + EXPECT_EQ(matrix.useGpu(), false); + + Tensor::Matrix testGpu(100, 200); + EXPECT_EQ(testGpu.useGpu(), true); } -TEST(TensorShape, GetAndSet) { - TensorShape t({1, 2, 3}); - EXPECT_EQ(t.ndims(), 3); - EXPECT_EQ(t.getElements(), 6); - - EXPECT_EQ(t[1], 2); - t.setDim(1, 100); - EXPECT_EQ(t.getElements(), 
300); - EXPECT_EQ(t[1], 100); +TEST(TensorType, Vector) { + Tensor::Vector cpuVector(100); + Tensor::Vector gpuVector(100); + EXPECT_EQ(cpuVector.useGpu(), false); + EXPECT_EQ(gpuVector.useGpu(), true); + EXPECT_EQ(cpuVector.getSize(), 100); + EXPECT_EQ(gpuVector.getSize(), 100); + + Tensor::Vector cpuIVector(100); + Tensor::Vector gpuIVector(100); + EXPECT_EQ(cpuIVector.useGpu(), false); + EXPECT_EQ(gpuIVector.useGpu(), true); + EXPECT_EQ(cpuIVector.getSize(), 100); + EXPECT_EQ(gpuIVector.getSize(), 100); } } // namespace paddle From c5c8051657611025eeaf8bc095da09a81fb76a1d Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Wed, 4 Jan 2017 21:17:56 +0800 Subject: [PATCH 14/41] add BufferArg --- paddle/function/BufferArg.cpp | 43 +++++ paddle/function/BufferArg.h | 260 ++++++++++++++++++++++++++++++ paddle/function/BufferArgTest.cpp | 128 +++++++++++++++ paddle/function/TensorType.h | 5 + 4 files changed, 436 insertions(+) create mode 100644 paddle/function/BufferArg.cpp create mode 100644 paddle/function/BufferArg.h create mode 100644 paddle/function/BufferArgTest.cpp diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp new file mode 100644 index 0000000000..08031917b2 --- /dev/null +++ b/paddle/function/BufferArg.cpp @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "BufferArg.h" + +namespace paddle { + +const SequenceArg& BufferArg::sequence() const { + // CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA); + return dynamic_cast(*this); +} + +const SparseMatrixArg& BufferArg::sparse() const { + // CHECK_EQ(bufferType_, TENSOR_SPARSE); + return dynamic_cast(*this); +} + +void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape) { + args_.push_back(std::make_shared(arg, shape)); +} + +void BufferArgs::addArg(const CpuSparseMatrix& arg) { + args_.push_back(std::make_shared(arg)); +} + +void BufferArgs::addArg(const GpuSparseMatrix& arg) { + args_.push_back(std::make_shared(arg)); +} + +} // namespace paddle diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h new file mode 100644 index 0000000000..9fcda7a878 --- /dev/null +++ b/paddle/function/BufferArg.h @@ -0,0 +1,260 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "TensorShape.h" +#include "TensorType.h" +#include "paddle/math/CpuSparseMatrix.h" +#include "paddle/math/Matrix.h" +#include "paddle/math/SparseMatrix.h" + +namespace paddle { + +enum BufferType { + TENSOR_NORMAL = 0, + TENSOR_SEQUENCE_ID = 1, + TENSOR_SEQUENCE_DATA = 2, + TENSOR_SPARSE = 3 +}; + +enum SparseDataType { + SPARSE_NO_VALUE = 0, // do not need value pointer, all values are 1 + SPARSE_FLOAT_VALUE = 1 +}; + +enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 }; + +/** + * BufferArg used as the argument type for Function. + */ +class BufferArg; +class SequenceArg; +class SparseMatrixArg; +typedef std::shared_ptr BufferArgPtr; + +class BufferArgs { +public: + BufferArgs() {} + size_t size() const { return args_.size(); } + + // add argument into BufferArgss + template + void addArg(const Tensor& arg) { + args_.push_back(std::make_shared(arg)); + } + + void addArg(const Matrix& arg, const TensorShape& shape); + + void addArg(const CpuSparseMatrix& arg); + void addArg(const GpuSparseMatrix& arg); + + // get argument + const BufferArg& operator[](size_t num) const { + CHECK_LT(num, args_.size()); + return *args_[num]; + } + +private: + std::vector args_; +}; + +// an array of arbitrary dimensions +class BufferArg { +public: + BufferArg(void* buf, ValueType valueType, const TensorShape& shape) + : buf_(buf), valueType_(valueType), shape_(shape) {} + + BufferArg(void* buf, ValueType valueType) + : buf_(buf), valueType_(valueType) {} + + BufferArg(const Matrix& matrix) + : buf_((void*)matrix.getData()), + valueType_(DataType::value), + shape_(2) { + shape_.setDim(0, matrix.getHeight()); + shape_.setDim(1, matrix.getWidth()); + } + + BufferArg(const Matrix& matrix, const TensorShape& shape) + : buf_((void*)matrix.getData()), + valueType_(DataType::value), + shape_(shape) { + CHECK_EQ(matrix.getElementCnt(), shape.getElements()); + } + + BufferArg(const Vector& vector) + : buf_((void*)vector.getData()), 
+ valueType_(DataType::value), + shape_(1) { + shape_.setDim(0, vector.getSize()); + } + + BufferArg(const IVector& vector) + : buf_((void*)vector.getData()), valueType_(VALUE_TYPE_INT32), shape_(1) { + shape_.setDim(0, vector.getSize()); + } + + template + typename Tensor::Matrix matrix() const { + CHECK(buf_); + CHECK(valueType_ == DataType::value); + // CHECK(deviceType_ == DType); + CHECK_EQ(2, shape_.ndims()); + return typename Tensor::Matrix( + reinterpret_cast(buf_), shape_[0], shape_[1]); + } + + template + typename Tensor::Vector vector() const { + CHECK(buf_); + CHECK(valueType_ == DataType::value); + // CHECK(deviceType_ == DType); + CHECK_EQ(1, shape_.ndims()); + return typename Tensor::Vector( + shape_[0], reinterpret_cast(buf_)); + } + + virtual ~BufferArg() {} + + template + T* data() const { + return reinterpret_cast(buf_); + } + + void* data() const { return buf_; } + ValueType valueType() const { return valueType_; } + BufferType bufferType() const { return bufferType_; } + const TensorShape& shape() const { return shape_; } + + const SequenceArg& sequence() const; + const SparseMatrixArg& sparse() const; + +protected: + void* buf_; + ValueType valueType_; + TensorShape shape_; + BufferType bufferType_; + // leading dimensions. 
The size is dims_.size() + // Dims lds_; +}; + +// sequence start positions in a mini-batch of sequences +// shape_.ndims() == 1 +// valueType_ = int32 +// if a < b than value_.buf_[a] < value_.buf_[b] +class SequenceIdArg : public BufferArg { +public: + SequenceIdArg(void* buf, const TensorShape& shape) + : BufferArg(buf, VALUE_TYPE_INT32, shape) { + CHECK_EQ(shape_.ndims(), 1); + numSeqs_ = shape_[0] - 1; + } + + SequenceIdArg(const IVector& vector) : BufferArg(vector) { + numSeqs_ = shape_[0] - 1; + } + + ~SequenceIdArg() {} + + size_t numSeqs() const { return numSeqs_; } + +private: + size_t numSeqs_; +}; + +// sequence data +class SequenceArg : public BufferArg { +public: + SequenceArg(void* buf, + ValueType valueType, + const TensorShape& shape, + const SequenceIdArg& startPositions) + : BufferArg(buf, valueType, shape), startPositions_(startPositions) {} + + SequenceArg(const Matrix& matrix, const IVector& vector) + : BufferArg(matrix), startPositions_(vector) {} + + ~SequenceArg() {} + + void* getIdBuf() const { return startPositions_.data(); } + size_t numSeqs() const { return startPositions_.numSeqs(); } + +private: + SequenceIdArg startPositions_; +}; + +// sparse matrix +// valueType_ == float or double +// shape_.ndims() == 2 +class SparseMatrixArg : public BufferArg { +public: + SparseMatrixArg(void* buf, + ValueType valueType, + const TensorShape& shape, + const BufferArg& row, + const BufferArg& col, + size_t nnz, + SparseDataFormat format, + SparseDataType type) + : BufferArg(buf, valueType, shape), + row_(row), + col_(col), + nnz_(nnz), + format_(format), + type_(type) { + CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); + CHECK_EQ(shape_.ndims(), 2); + CHECK_EQ(row_.shape().ndims(), 1); + CHECK_EQ(col_.shape().ndims(), 1); + if (format == SPARSE_CSR_FORMAT) { + CHECK_EQ(nnz, col.shape()[0]); + } else if (format == SPARSE_CSC_FORMAT) { + CHECK_EQ(nnz, row.shape()[0]); + } + } + + SparseMatrixArg(const CpuSparseMatrix& 
sparse) + : BufferArg(sparse), + row_((void*)sparse.getRows(), VALUE_TYPE_INT32), + col_((void*)sparse.getCols(), VALUE_TYPE_INT32) {} + + SparseMatrixArg(const GpuSparseMatrix& sparse) + : BufferArg(sparse), + row_((void*)sparse.getRows(), VALUE_TYPE_INT32), + col_((void*)sparse.getCols(), VALUE_TYPE_INT32) {} + + ~SparseMatrixArg() {} + + void* getRowBuf() const { return row_.data(); } + + void* getColBuf() const { return col_.data(); } + + size_t nnz() const { return nnz_; } + + SparseDataFormat dataFormat() const { return format_; } + + SparseDataType dataType() const { return type_; } + +private: + BufferArg row_; + BufferArg col_; + size_t nnz_; + SparseDataFormat format_; + SparseDataType type_; +}; + +} // namespace paddle diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp new file mode 100644 index 0000000000..5d669b8137 --- /dev/null +++ b/paddle/function/BufferArgTest.cpp @@ -0,0 +1,128 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "BufferArg.h" +#include +#include "paddle/math/MemoryHandle.h" + +namespace paddle { + +TEST(BufferTest, BufferArg) { + TensorShape shape({8, 10}); + CpuMemoryHandle memory(shape.getElements() * + sizeOfValuType(VALUE_TYPE_FLOAT)); + BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape); + EXPECT_EQ(buffer.data(), memory.getBuf()); +} + +TEST(BufferTest, SequenceIdArg) { + TensorShape shape({10}); + CpuMemoryHandle memory(shape.getElements() * + sizeOfValuType(VALUE_TYPE_INT32)); + SequenceIdArg buffer(memory.getBuf(), shape); + EXPECT_EQ(buffer.data(), memory.getBuf()); + EXPECT_EQ(buffer.numSeqs(), 9); +} + +TEST(BufferTest, asArgument) { + MatrixPtr matrix = Matrix::create(100, 200); + VectorPtr vector = Vector::create(100, false); + CpuSparseMatrix sparse(200, 300, 50); + + // prepare arguments + BufferArgs argments; + argments.addArg(*matrix); + argments.addArg(*vector); + argments.addArg(sparse); + + // function + auto function = [=](const BufferArgs& inputs) { + EXPECT_EQ(inputs.size(), 3); + + // check inputs[0] + EXPECT_EQ(inputs[0].shape().ndims(), 2); + EXPECT_EQ(inputs[0].shape()[0], 100); + EXPECT_EQ(inputs[0].shape()[1], 200); + EXPECT_EQ(inputs[0].data(), matrix->getData()); + + EXPECT_EQ(inputs[0].matrix().getHeight(), + matrix->getHeight()); + EXPECT_EQ(inputs[0].matrix().getWidth(), + matrix->getWidth()); + EXPECT_EQ(inputs[0].matrix().getData(), matrix->getData()); + + // check inputs[1] + EXPECT_EQ(inputs[1].shape().ndims(), 1); + EXPECT_EQ(inputs[1].shape()[0], 100); + EXPECT_EQ(inputs[1].data(), vector->getData()); + CpuVector inVector = inputs[1].vector(); + EXPECT_EQ(inVector.getSize(), vector->getSize()); + EXPECT_EQ(inVector.getData(), vector->getData()); + + // check inputs[2] + EXPECT_EQ(inputs[2].shape().ndims(), 2); + EXPECT_EQ(inputs[2].shape()[0], 200); + EXPECT_EQ(inputs[2].shape()[1], 300); + EXPECT_EQ(inputs[2].data(), sparse.getData()); + // CHECK_EQ(inputs[2].sparse().nnz(), 50); + // 
CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT); + // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE); + EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows()); + EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols()); + }; + + // call function + function(argments); +} + +template +void FunctionApi(typename Tensor::Matrix& output, + const typename Tensor::Matrix& input); + +template <> +void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { + EXPECT_EQ(output.getHeight(), 100); + EXPECT_EQ(output.getWidth(), 200); +} + +template <> +void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { + EXPECT_EQ(output.getHeight(), 10); + EXPECT_EQ(output.getWidth(), 20); +} + +template +void Function(const BufferArgs& arguments) { + auto input = arguments[0].matrix(); + auto output = arguments[1].matrix(); + FunctionApi(output, input); +} + +TEST(BufferTest, Function) { + CpuMatrix cpuInput = CpuMatrix(100, 200); + CpuMatrix cpuOutput = CpuMatrix(100, 200); + BufferArgs cpuArgments; + cpuArgments.addArg(cpuInput); + cpuArgments.addArg(cpuOutput); + Function(cpuArgments); + + GpuMatrix gpuInput = GpuMatrix(10, 20); + GpuMatrix gpuOutput = GpuMatrix(10, 20); + BufferArgs gpuArgments; + gpuArgments.addArg(gpuInput); + gpuArgments.addArg(gpuOutput); + Function(gpuArgments); +} + +} // namespace paddle diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h index 800f71a5b9..98942cff9e 100644 --- a/paddle/function/TensorType.h +++ b/paddle/function/TensorType.h @@ -57,6 +57,11 @@ struct DataType { static const ValueType value = VALUE_TYPE_DOUBLE; }; +template <> +struct DataType { + static const ValueType value = VALUE_TYPE_INT32; +}; + namespace detail { template From 811f24de2df9aba4b22dcc05c461eb8294e3f10f Mon Sep 17 00:00:00 2001 From: chenguoyan01 Date: Thu, 5 Jan 2017 14:26:33 +0800 Subject: [PATCH 15/41] add some explanation for 'your_repo' --- doc/howto/usage/k8s/k8s_distributed_cn.md | 4 ++++ 1 file changed, 
4 insertions(+) diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md index b63b8437a0..2063b98ca8 100644 --- a/doc/howto/usage/k8s/k8s_distributed_cn.md +++ b/doc/howto/usage/k8s/k8s_distributed_cn.md @@ -159,6 +159,8 @@ docker build -t your_repo/paddle:mypaddle . docker push your_repo/paddle:mypaddle ``` +注意上述命令中`your_repo`表示读者所使用的Docker镜像仓库地址,读者需要替换成自己使用的仓库地址。下文使用`your_repo/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。 + ### 上传训练文件 本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容,我们将训练文件与数据放在一个job name命名的目录中,上传到MFS共享存储。完成后MFS上的文件内容大致如下: @@ -244,6 +246,8 @@ spec: `CONF_PADDLE_GRADIENT_NUM`表示训练节点数量,即`--num_gradient_servers`参数 +这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。 + 编写完YAML文件后,可以使用Kubernetes的命令行工具创建job。 ```bash From 68156c88c50aff2c614ecc69b56bd5f814dc30be Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 5 Jan 2017 19:45:12 +0800 Subject: [PATCH 16/41] Modify the argument type of Function --- paddle/function/CrossMapNormalOp.cpp | 68 +++++++++---------- paddle/function/Function.h | 53 ++------------- paddle/gserver/layers/NormProjectionLayer.cpp | 30 +++++--- paddle/gserver/layers/NormProjectionLayer.h | 2 +- 4 files changed, 56 insertions(+), 97 deletions(-) diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp index f13eb78d27..ec27db9c21 100644 --- a/paddle/function/CrossMapNormalOp.cpp +++ b/paddle/function/CrossMapNormalOp.cpp @@ -125,27 +125,25 @@ public: pow_ = config.get("pow"); } - void calc(const Arguments& inputs, - const Arguments& outputs, - const Arguments& inouts) override { + void calc(const BufferArgs& inputs, + const BufferArgs& outputs, + const BufferArgs& inouts) override { CHECK_EQ(1, inputs.size()); CHECK_EQ(2, outputs.size()); CHECK_EQ(0, inouts.size()); - CHECK_EQ(inputs[0].dims_.size(), 4); - for 
(size_t i = 0; i < inputs[0].dims_.size(); i++) { - CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]); - CHECK_EQ(inputs[0].dims_[i], outputs[1].dims_[i]); - } + CHECK_EQ(inputs[0].shape().ndims(), 4); + CHECK(inputs[0].shape() == outputs[0].shape()); + CHECK(inputs[0].shape() == outputs[1].shape()); - size_t samples = inputs[0].dims_[0]; - size_t channels = inputs[0].dims_[1]; - size_t height = inputs[0].dims_[2]; - size_t width = inputs[0].dims_[3]; + size_t samples = inputs[0].shape()[0]; + size_t channels = inputs[0].shape()[1]; + size_t height = inputs[0].shape()[2]; + size_t width = inputs[0].shape()[3]; - CrossMapNormal(outputs[0].getData(), - outputs[1].getData(), - inputs[0].getData(), + CrossMapNormal(outputs[0].data(), + outputs[1].data(), + inputs[0].data(), samples, channels, height, @@ -177,31 +175,29 @@ public: pow_ = config.get("pow"); } - void calc(const Arguments& inputs, - const Arguments& outputs, - const Arguments& inouts) override { + void calc(const BufferArgs& inputs, + const BufferArgs& outputs, + const BufferArgs& inouts) override { CHECK_EQ(4, inputs.size()); CHECK_EQ(1, outputs.size()); CHECK_EQ(0, inouts.size()); - CHECK_EQ(inputs[0].dims_.size(), 4); - for (size_t i = 0; i < inputs[0].dims_.size(); i++) { - CHECK_EQ(inputs[0].dims_[i], inputs[1].dims_[i]); - CHECK_EQ(inputs[0].dims_[i], inputs[2].dims_[i]); - CHECK_EQ(inputs[0].dims_[i], inputs[3].dims_[i]); - CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]); - } - - size_t samples = inputs[0].dims_[0]; - size_t channels = inputs[0].dims_[1]; - size_t height = inputs[0].dims_[2]; - size_t width = inputs[0].dims_[3]; - - CrossMapNormalGrad(outputs[0].getData(), - inputs[0].getData(), - inputs[1].getData(), - inputs[2].getData(), - inputs[3].getData(), + CHECK_EQ(inputs[0].shape().ndims(), 4); + CHECK(inputs[0].shape() == inputs[1].shape()); + CHECK(inputs[0].shape() == inputs[2].shape()); + CHECK(inputs[0].shape() == inputs[3].shape()); + CHECK(inputs[0].shape() == outputs[0].shape()); 
+ + size_t samples = inputs[0].shape()[0]; + size_t channels = inputs[0].shape()[1]; + size_t height = inputs[0].shape()[2]; + size_t width = inputs[0].shape()[3]; + + CrossMapNormalGrad(outputs[0].data(), + inputs[0].data(), + inputs[1].data(), + inputs[2].data(), + inputs[3].data(), samples, channels, height, diff --git a/paddle/function/Function.h b/paddle/function/Function.h index 9e8cbb8e48..024575b4f7 100644 --- a/paddle/function/Function.h +++ b/paddle/function/Function.h @@ -16,57 +16,12 @@ limitations under the License. */ #include #include +#include "BufferArg.h" #include "paddle/math/Matrix.h" #include "paddle/utils/ClassRegistrar.h" namespace paddle { -enum DeviceType { - DEVICE_TYPE_UNSPECIFIED = 0, - DEVICE_TYPE_CPU = 1, - DEVICE_TYPE_GPU = 2, -}; - -template -struct MatrixT; - -template <> -struct MatrixT { - using type = CpuMatrix; -}; - -template <> -struct MatrixT { - using type = GpuMatrix; -}; - -template -struct SequenceT; - -template <> -struct SequenceT { - using type = CpuIVector; -}; - -template <> -struct SequenceT { - using type = GpuIVector; -}; - -typedef std::vector Dims; - -class Tensor { -public: - Tensor(real* data, const Dims& dim) : buf_(data), dims_(dim) {} - - real* getData() const { return buf_; } - - real* buf_; - Dims dims_; -}; - -typedef std::vector Arguments; - class FuncConfig { public: union value { @@ -92,9 +47,9 @@ public: virtual void init(const FuncConfig& config) {} - virtual void calc(const Arguments& inputs, - const Arguments& outputs, - const Arguments& inouts) {} + virtual void calc(const BufferArgs& inputs, + const BufferArgs& outputs, + const BufferArgs& inouts) {} static ClassRegistrar funcRegistrar_; }; diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp index 262d757c67..573de152fd 100644 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ b/paddle/gserver/layers/NormProjectionLayer.cpp @@ -71,11 +71,16 @@ void 
CMRProjectionNormLayer::forward(PassType passType) { Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_); - dims_ = {batchSize, channels_, imgSizeH_, imgSizeW_}; - forward_[0]->calc( - {Tensor(input->getData(), dims_)}, - {Tensor(outV->getData(), dims_), Tensor(denoms_->getData(), dims_)}, - {}); + shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); + + BufferArgs inputs; + BufferArgs outputs; + BufferArgs inouts; + inputs.addArg(*input, shape_); + outputs.addArg(*outV, shape_); + outputs.addArg(*denoms_, shape_); + + forward_[0]->calc(inputs, outputs, inouts); } void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { @@ -90,11 +95,14 @@ void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { MatrixPtr localOutV = getOutputValue(); MatrixPtr preOutV = inputLayers_[0]->getOutputValue(); - backward_[0]->calc({Tensor(preOutV->getData(), dims_), - Tensor(localOutV->getData(), dims_), - Tensor(localGrad->getData(), dims_), - Tensor(denoms_->getData(), dims_)}, - {Tensor(preOutGrad->getData(), dims_)}, - {}); + BufferArgs inputs; + BufferArgs outputs; + BufferArgs inouts; + inputs.addArg(*preOutV, shape_); + inputs.addArg(*localOutV, shape_); + inputs.addArg(*localGrad, shape_); + inputs.addArg(*denoms_, shape_); + outputs.addArg(*preOutGrad, shape_); + backward_[0]->calc(inputs, outputs, inouts); } } // namespace paddle diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h index 6b2c5dde0d..2c0d8a3a71 100644 --- a/paddle/gserver/layers/NormProjectionLayer.h +++ b/paddle/gserver/layers/NormProjectionLayer.h @@ -41,6 +41,6 @@ public: void backward(const UpdateCallback& callback = nullptr); protected: - Dims dims_; + TensorShape shape_; }; } // namespace paddle From 41c52d3b0ce619ba25ff9d681ef39613daa1c868 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 5 Jan 2017 20:33:09 +0800 Subject: [PATCH 17/41] Modify the argument type of 
ContextProjectionFunc --- paddle/function/CMakeLists.txt | 12 +- paddle/function/ContextProjectionOp.cpp | 161 +- paddle/function/ContextProjectionOp.h | 54 +- paddle/function/ContextProjectionOpGpu.cu | 44 +- paddle/function/TensorTypeTest.cpp | 17 + paddle/gserver/layers/ContextProjection.cpp | 42 +- paddle/math/Matrix.h | 4 + paddle/math/Matrix.h~RFbb8b484f.TMP | 1870 +++++++++++++++++++ 8 files changed, 2048 insertions(+), 156 deletions(-) create mode 100644 paddle/math/Matrix.h~RFbb8b484f.TMP diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 0b3126155d..37c011549e 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -3,6 +3,7 @@ file(GLOB cpp_files . *Op.cpp) list(APPEND h_files Function.h) list(APPEND cpp_files Function.cpp) +list(APPEND cpp_files BufferArg.cpp) if(WITH_GPU) file(GLOB cu_files . *OpGpu.cu) @@ -16,10 +17,13 @@ if(WITH_TESTING) # TODO: # file(GLOB test_files . *OpTest.cpp) # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files}) - add_simple_unittest(CrossMapNormalOpTest) - add_unittest(ContextProjectionOpTest - ContextProjectionOpTest.cpp - ../gserver/tests/TestUtil.cpp) + # add_simple_unittest(CrossMapNormalOpTest) + add_simple_unittest(TensorShapeTest) + add_simple_unittest(TensorTypeTest) + add_simple_unittest(BufferArgTest) + # add_unittest(ContextProjectionOpTest + # ContextProjectionOpTest.cpp + # ../gserver/tests/TestUtil.cpp) endif() endif() diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp index bd367a859e..1a483c4795 100644 --- a/paddle/function/ContextProjectionOp.cpp +++ b/paddle/function/ContextProjectionOp.cpp @@ -19,17 +19,15 @@ limitations under the License. 
*/ namespace paddle { template <> -void ContextProjectionForward(CpuMatrix* out_mat, - const CpuMatrix* input_mat, - const CpuMatrix* weight_mat, +void ContextProjectionForward(CpuMatrix& out_mat, + const CpuMatrix& input_mat, + const CpuMatrix& weight_mat, const CpuIVector& seq_vec, size_t context_length, int context_start, size_t begin_pad) { const int* starts = seq_vec.getData(); const size_t num_sequences = seq_vec.getSize() - 1; - auto w_mat = const_cast(weight_mat); - auto in_mat = const_cast(input_mat); for (size_t i = 0; i < num_sequences; ++i) { for (size_t j = 0; j < context_length; ++j) { int begin = starts[i] + context_start + j; @@ -39,10 +37,11 @@ void ContextProjectionForward(CpuMatrix* out_mat, if (begin < starts[i]) { int64_t pad_size = std::min(starts[i] - begin, starts[i + 1] - starts[i]); - MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size); - if (w_mat) { - MatrixPtr sub = w_mat->subMatrix(j, pad_size); - mat->addAtOffset(*sub, j * in_mat->getWidth()); + MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size); + if (weight_mat) { + MatrixPtr sub = + const_cast(weight_mat).subMatrix(j, pad_size); + mat->addAtOffset(*sub, j * input_mat.getWidth()); } dst_begin = starts[i] + pad_size; begin = starts[i]; @@ -50,19 +49,22 @@ void ContextProjectionForward(CpuMatrix* out_mat, if (end > starts[i + 1]) { int64_t pad_size = std::min(end - starts[i + 1], starts[i + 1] - starts[i]); - MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size); - if (w_mat) { - MatrixPtr sub = w_mat->subMatrix( - begin_pad + context_start + j - pad_size, pad_size); - mat->addAtOffset(*sub, j * in_mat->getWidth()); + MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size); + if (weight_mat) { + MatrixPtr sub = + const_cast(weight_mat) + .subMatrix(begin_pad + context_start + j - pad_size, + pad_size); + mat->addAtOffset(*sub, j * input_mat.getWidth()); } dst_end = starts[i + 1] - pad_size; end = starts[i + 1]; } if (end <= begin) continue; - 
MatrixPtr src = in_mat->subMatrix(begin, end - begin); - MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin); - dst->addAtOffset(*src, j * in_mat->getWidth()); + MatrixPtr src = + const_cast(input_mat).subMatrix(begin, end - begin); + MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin); + dst->addAtOffset(*src, j * input_mat.getWidth()); } } } @@ -82,40 +84,34 @@ public: begin_pad_ = config.get("begin_pad"); } - void calc(const Arguments& inputs, - const Arguments& outputs, - const Arguments& inouts) override { + void calc(const BufferArgs& inputs, + const BufferArgs& outputs, + const BufferArgs& inouts) override { CHECK_EQ(3, inputs.size()); CHECK_EQ(1, outputs.size()); CHECK_EQ(0, inouts.size()); - CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData()); - CHECK_EQ(outputs[0].dims_.size(), 2); - CHECK_EQ(inputs[0].dims_.size(), 2); - CHECK_EQ(inputs[1].dims_.size(), 2); - CHECK_EQ(inputs[2].dims_.size(), 1); + CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data()); + CHECK_EQ(outputs[0].shape().ndims(), 2); + CHECK_EQ(inputs[0].shape().ndims(), 2); + CHECK_EQ(inputs[1].shape().ndims(), 2); + CHECK_EQ(inputs[2].shape().ndims(), 1); /// dim of output = dim of input * context_length - CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_); + CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_); /// dim of input == dim of weight - CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]); + CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]); /// input and output has the same batch_size - CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]); - - auto out_mat = std::make_shared::type>( - outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]); - const auto in_mat = std::make_shared::type>( - inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]); - const auto w_mat = - !inputs[1].getData() - ? 
nullptr - : std::make_shared::type>( - inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]); - typename SequenceT::type seq_vec( - inputs[2].dims_[0], reinterpret_cast(inputs[2].getData())); - - ContextProjectionForward(out_mat.get(), - in_mat.get(), - w_mat.get(), + CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); + + auto out_mat = outputs[0].matrix(); + auto in_mat = inputs[0].matrix(); + auto w_mat = !inputs[1].data() + ? typename Tensor::Matrix(nullptr, 0, 0) + : inputs[1].matrix(); + auto seq_vec = inputs[2].vector(); + ContextProjectionForward(out_mat, + in_mat, + w_mat, seq_vec, context_length_, context_start_, @@ -129,18 +125,17 @@ private: }; template <> -void ContextProjectionBackward(CpuMatrix* out_grad_mat, - CpuMatrix* in_grad_mat, - CpuMatrix* w_grad_mat, +void ContextProjectionBackward(CpuMatrix& out_grad_mat, + CpuMatrix& in_grad_mat, + CpuMatrix& w_grad_mat, const CpuIVector& seq_vec, size_t context_length, int context_start, size_t begin_pad, bool is_padding, size_t total_pad) { - CHECK(out_grad_mat); - size_t input_dim = in_grad_mat ? in_grad_mat->getWidth() - : w_grad_mat ? w_grad_mat->getWidth() : 0; + size_t input_dim = in_grad_mat ? in_grad_mat.getWidth() + : w_grad_mat ? 
w_grad_mat.getWidth() : 0; const int* starts = seq_vec.getData(); size_t num_sequences = seq_vec.getSize() - 1; for (size_t i = 0; i < num_sequences; ++i) { @@ -153,8 +148,8 @@ void ContextProjectionBackward(CpuMatrix* out_grad_mat, int64_t pad_size = std::min(starts[i] - begin, starts[i + 1] - starts[i]); if (is_padding && w_grad_mat) { - MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size); - MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size); + MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size); + MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size); sub->addAtOffset(*mat, j * input_dim); } dst_begin = starts[i] + pad_size; @@ -165,8 +160,8 @@ void ContextProjectionBackward(CpuMatrix* out_grad_mat, std::min(end - starts[i + 1], starts[i + 1] - starts[i]); if (is_padding && w_grad_mat) { MatrixPtr mat = - out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size); - MatrixPtr sub = w_grad_mat->subMatrix( + out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size); + MatrixPtr sub = w_grad_mat.subMatrix( begin_pad + context_start + j - pad_size, pad_size); sub->addAtOffset(*mat, j * input_dim); } @@ -175,8 +170,8 @@ void ContextProjectionBackward(CpuMatrix* out_grad_mat, } if (end <= begin) continue; if (!in_grad_mat) continue; - MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin); - MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin); + MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin); + MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin); src->addAtOffset(*dst, j * input_dim); } } @@ -199,44 +194,37 @@ public: total_pad_ = config.get("total_pad"); } - void calc(const Arguments& inputs, - const Arguments& outputs, - const Arguments& inouts) override { + void calc(const BufferArgs& inputs, + const BufferArgs& outputs, + const BufferArgs& inouts) override { CHECK_EQ(3, inputs.size()); CHECK_EQ(1, outputs.size()); CHECK_EQ(0, inouts.size()); - CHECK(outputs[0].getData() && inputs[2].getData()); - 
CHECK_EQ(outputs[0].dims_.size(), 2); - CHECK_EQ(inputs[0].dims_.size(), 2); - CHECK_EQ(inputs[1].dims_.size(), 2); - CHECK_EQ(inputs[2].dims_.size(), 1); + CHECK(outputs[0].data() && inputs[2].data()); + CHECK_EQ(outputs[0].shape().ndims(), 2); + CHECK_EQ(inputs[0].shape().ndims(), 2); + CHECK_EQ(inputs[1].shape().ndims(), 2); + CHECK_EQ(inputs[2].shape().ndims(), 1); /// dim of input == dim of weight - CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]); + CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]); /// input and output has the same batch_size - CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]); + CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); /// dim of output = dim of input * context_length - CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_); + CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_); - auto out_grad_mat = std::make_shared::type>( - outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]); + auto out_grad_mat = outputs[0].matrix(); auto in_grad_mat = - !inputs[0].getData() - ? nullptr - : std::make_shared::type>( - inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]); - auto w_grad_mat = - !inputs[1].getData() - ? nullptr - : std::make_shared::type>( - inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]); - typename SequenceT::type seq_vec( - inputs[2].dims_[0], reinterpret_cast(inputs[2].getData())); - - ContextProjectionBackward(out_grad_mat.get(), - in_grad_mat ? in_grad_mat.get() : nullptr, - w_grad_mat ? w_grad_mat.get() : nullptr, + !inputs[0].data() ? typename Tensor::Matrix(nullptr, 0, 0) + : inputs[0].matrix(); + auto w_grad_mat = !inputs[1].data() + ? typename Tensor::Matrix(nullptr, 0, 0) + : inputs[1].matrix(); + auto seq_vec = inputs[2].vector(); + ContextProjectionBackward(out_grad_mat, + in_grad_mat, + w_grad_mat, seq_vec, context_length_, context_start_, @@ -253,6 +241,7 @@ private: size_t total_pad_; }; +#if 0 /** * \param inputs[0] input grad. 
* \param inputs[1] input sequence. @@ -272,6 +261,7 @@ public: CHECK_EQ(2, inputs.size()); CHECK_EQ(1, outputs.size()); CHECK_EQ(0, inouts.size()); + CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData()); CHECK_EQ(outputs[0].dims_.size(), 2); CHECK_EQ(inputs[0].dims_.size(), 2); @@ -349,6 +339,7 @@ private: size_t begin_pad_; size_t total_pad_; }; +#endif REGISTER_TYPED_FUNC(ContextProjectionForward, CPU, @@ -363,6 +354,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward, REGISTER_TYPED_FUNC(ContextProjectionBackward, GPU, ContextProjectionBackwardFunc); +#if 0 REGISTER_TYPED_FUNC(ContextProjectionBackwardData, GPU, ContextProjectionBackwardDataFunc); @@ -370,4 +362,5 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight, GPU, ContextProjectionBackwardWeightFunc); #endif +#endif } // namespace paddle diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h index 93eb050fde..a558df5e07 100644 --- a/paddle/function/ContextProjectionOp.h +++ b/paddle/function/ContextProjectionOp.h @@ -31,14 +31,15 @@ namespace paddle { * \param[in] is_padding whether padding 0 or not. * */ -template -void ContextProjectionForward(typename MatrixT::type* output, - const typename MatrixT::type* input, - const typename MatrixT::type* weight, - const typename SequenceT::type& sequence, - size_t context_length, - int context_start, - size_t begin_pad); +template +void ContextProjectionForward( + typename Tensor::Matrix& output, + const typename Tensor::Matrix& input, + const typename Tensor::Matrix& weight, + const typename Tensor::Vector& sequence, + size_t context_length, + int context_start, + size_t begin_pad); /** * \brief Context Projection Backward. @@ -53,30 +54,31 @@ void ContextProjectionForward(typename MatrixT::type* output, * \param[in] is_padding whether padding 0 or not. 
* */ -template -void ContextProjectionBackward(typename MatrixT::type* out_grad, - typename MatrixT::type* in_grad, - typename MatrixT::type* w_grad, - const typename SequenceT::type& seq_vec, - size_t context_length, - int context_start, - size_t begin_pad, - bool is_padding, - size_t total_pad); +template +void ContextProjectionBackward( + typename Tensor::Matrix& out_grad, + typename Tensor::Matrix& in_grad, + typename Tensor::Matrix& w_grad, + const typename Tensor::Vector& seq_vec, + size_t context_length, + int context_start, + size_t begin_pad, + bool is_padding, + size_t total_pad); -template +template void ContextProjectionBackwardData( - typename MatrixT::type* out_grad, - typename MatrixT::type* in_grad, - const typename SequenceT::type& sequence, + typename Tensor::Matrix& out_grad, + typename Tensor::Matrix& in_grad, + const typename Tensor::Vector& sequence, size_t context_length, int context_start); -template +template void ContextProjectionBackwardWeight( - typename MatrixT::type* out_grad, - typename MatrixT::type* w_grad, - const typename SequenceT::type& seq_vec, + typename Tensor::Matrix& out_grad, + typename Tensor::Matrix& w_grad, + const typename Tensor::Vector& seq_vec, size_t context_length, int context_start, size_t total_pad, diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu index 1ec7058f96..6a4a01a651 100644 --- a/paddle/function/ContextProjectionOpGpu.cu +++ b/paddle/function/ContextProjectionOpGpu.cu @@ -120,20 +120,19 @@ void hl_context_projection_forward(const real* input, } template <> -void ContextProjectionForward(GpuMatrix* output, - const GpuMatrix* input, - const GpuMatrix* weight, +void ContextProjectionForward(GpuMatrix& output, + const GpuMatrix& input, + const GpuMatrix& weight, const GpuIVector& sequence, size_t context_length, int context_start, size_t begin_pad) { - CHECK(input && output); - hl_context_projection_forward(input->getData(), + 
hl_context_projection_forward(input.getData(), sequence.getData(), - weight ? weight->getData() : nullptr, - output->getData(), + weight ? weight.getData() : nullptr, + output.getData(), sequence.getSize() - 1, - input->getWidth(), + input.getWidth(), context_length, context_start, begin_pad); @@ -217,17 +216,16 @@ void hl_context_projection_backward_data(real* out_grad, } template <> -void ContextProjectionBackwardData(GpuMatrix* out_grad, - GpuMatrix* in_grad, +void ContextProjectionBackwardData(GpuMatrix& out_grad, + GpuMatrix& in_grad, const GpuIVector& sequence, size_t context_length, int context_start) { - CHECK(in_grad && out_grad); - hl_context_projection_backward_data(out_grad->getData(), + hl_context_projection_backward_data(out_grad.getData(), sequence.getData(), - in_grad->getData(), + in_grad.getData(), sequence.getSize() - 1, - in_grad->getWidth(), + in_grad.getWidth(), context_length, context_start); } @@ -348,19 +346,18 @@ void hl_context_projection_backward_weight(real* out_grad, template <> void ContextProjectionBackwardWeight( - GpuMatrix* out_grad, - GpuMatrix* w_grad, + GpuMatrix& out_grad, + GpuMatrix& w_grad, const GpuIVector& seq_vec, size_t context_length, int context_start, size_t total_pad, size_t begin_pad) { - CHECK(out_grad && w_grad); - hl_context_projection_backward_weight(out_grad->getData(), + hl_context_projection_backward_weight(out_grad.getData(), seq_vec.getData(), - w_grad->getData(), + w_grad.getData(), seq_vec.getSize() - 1, - w_grad->getWidth(), + w_grad.getWidth(), total_pad, context_length, context_start, @@ -368,16 +365,15 @@ void ContextProjectionBackwardWeight( } template <> -void ContextProjectionBackward(GpuMatrix* out_grad, - GpuMatrix* in_grad, - GpuMatrix* w_grad, +void ContextProjectionBackward(GpuMatrix& out_grad, + GpuMatrix& in_grad, + GpuMatrix& w_grad, const GpuIVector& sequence, size_t context_length, int context_start, size_t begin_pad, bool is_padding, size_t total_pad) { - CHECK(out_grad); if (in_grad) { 
ContextProjectionBackwardData( out_grad, diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp index 4a86245c2a..e50e46f3e9 100644 --- a/paddle/function/TensorTypeTest.cpp +++ b/paddle/function/TensorTypeTest.cpp @@ -44,4 +44,21 @@ TEST(TensorType, Vector) { EXPECT_EQ(gpuIVector.getSize(), 100); } +TEST(TensorType, EmptyMatrix) { + CpuMatrix empty(nullptr, 0, 0); + CpuMatrix nonEmpty(10, 10); + EXPECT_EQ(empty.isEmpty(), true); + EXPECT_EQ(nonEmpty.isEmpty(), false); + CHECK(nonEmpty); + auto function = [](const CpuMatrix& matrix) { + if (matrix) { + EXPECT_NE(matrix.getData(), nullptr); + } else { + EXPECT_EQ(matrix.getData(), nullptr); + } + }; + function(empty); + function(nonEmpty); +} + } // namespace paddle diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp index e947b2b9ec..26783a42ca 100644 --- a/paddle/gserver/layers/ContextProjection.cpp +++ b/paddle/gserver/layers/ContextProjection.cpp @@ -110,7 +110,7 @@ void ContextProjection::forward() { size_t input_dim = in_->value->getWidth(); size_t dim = out_->value->getWidth(); CHECK_EQ(dim, input_dim * config_.context_length()); - size_t batch_size = in_->value->getHeight(); + // size_t batch_size = in_->value->getHeight(); CHECK_EQ(forward_.size(), 1) << "Only one forward function here"; REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str()); @@ -119,14 +119,17 @@ void ContextProjection::forward() { auto w_ptr = state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr; auto start_pos = in_->sequenceStartPositions; - forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}), - Tensor(w_ptr ? w_ptr->getData() : nullptr, - Dims{w_ptr ? 
w_ptr->getHeight() : 0, input_dim}), - Tensor(reinterpret_cast( - const_cast(start_pos->getData(useGpu_))), - Dims{start_pos->getSize()})}, - {Tensor(out_->value->getData(), Dims{batch_size, dim})}, - {}); + + BufferArgs inputs; + BufferArgs outputs; + BufferArgs inouts; + inputs.addArg(*in_->value); + inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr, + w_ptr ? w_ptr->getHeight() : 0, + input_dim)); + inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_)); + outputs.addArg(*out_->value); + forward_[0]->calc(inputs, outputs, inouts); if (state_ && config_.context_start() < 0) { CHECK_EQ(1, in_->getNumSequences()); @@ -160,15 +163,18 @@ void ContextProjection::backward(const UpdateCallback& callback) { bool is_padding = config_.trainable_padding(); auto start_pos = in_->sequenceStartPositions; auto w_ptr = is_padding ? weight_->getWGrad() : nullptr; - backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr, - Dims{batch_size, input_dim}), - Tensor(w_ptr ? w_ptr->getData() : nullptr, - Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}), - Tensor(reinterpret_cast( - const_cast(start_pos->getData(useGpu_))), - Dims{start_pos->getSize()})}, - {Tensor(out_->grad->getData(), Dims{batch_size, dim})}, - {}); + + BufferArgs inputs; + BufferArgs outputs; + BufferArgs inouts; + inputs.addArg(CpuMatrix( + in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim)); + inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr, + w_ptr ? 
w_ptr->getHeight() : 0, + input_dim)); + inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_)); + outputs.addArg(*out_->grad); + backward_[0]->calc(inputs, outputs, inouts); if (config_.trainable_padding()) { weight_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 4865a081a5..60c6560396 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -1091,6 +1091,10 @@ public: TensorCpuApply(*this, expr); } } + + bool isEmpty() const { return data_ == nullptr; } + + explicit operator bool() const { return !isEmpty(); } }; inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { diff --git a/paddle/math/Matrix.h~RFbb8b484f.TMP b/paddle/math/Matrix.h~RFbb8b484f.TMP new file mode 100644 index 0000000000..d89b0f67b3 --- /dev/null +++ b/paddle/math/Matrix.h~RFbb8b484f.TMP @@ -0,0 +1,1870 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/utils/Logging.h" +#include "paddle/utils/ThreadLocal.h" + +#include + +#include "BaseMatrix.h" +#include "MemoryHandle.h" +#include "Vector.h" +#include "paddle/utils/ThreadLocal.h" +#include "paddle/utils/common.h" + +namespace paddle { + +enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 }; + +/** + * @brief matrix sparse_format . + * + * nnz represents nonzero number in sparse matrix. + * + * SPARSE_CSR: row major matrix. 
length of row is height_ + 1, each element + * represents row start index in Matrix. length of col and value are nnz. + * + * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element + * represents col start index in Matrix. length of col and value are nnz. + * + * @code + * for example: [0, 1, 0, 2, 0; + * 1, 0, 0, 0, 0; + * 0, 0, 0, 2, 5]; + * SPARSE_CSR row [0, 2, 3, 5]; + * col [1, 3, 0, 3, 4]; + * value [1, 2, 1, 2, 5] + * SPARSE_CSC col [0, 1, 2, 2, 4, 5]; + * row [1, 0, 0, 2, 2]; + * value [1, 1, 2, 2, 5] + * @endcode + */ +enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 }; + +class Matrix; +class GpuMatrix; +class CpuMatrix; +class CpuSparseMatrix; +class GpuSparseMatrix; +typedef std::shared_ptr MatrixPtr; +typedef std::shared_ptr GpuMatrixPtr; +typedef std::shared_ptr CpuMatrixPtr; +typedef std::shared_ptr GpuSparseMatrixPtr; +typedef std::shared_ptr CpuSparseMatrixPtr; + +/** + * Copy or assignemnt constructor will share the data as opposed to making a + * copy of the original data. To make a copy of the orinal data, use copyFrom() + * instead. 
+ */ +class Matrix : public BaseMatrix { +protected: + Matrix(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans, + bool use_gpu); + + Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu); + + Matrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans, + bool use_gpu); + + static ThreadLocal tmpMat_; + +public: + size_t elementCnt_; // maximal number of elements which can be held in data_ + MemoryHandlePtr memoryHandle_; + +public: + virtual ~Matrix() {} + + static MatrixPtr create(MemoryHandlePtr memHandle, + size_t height, + size_t width, + bool trans = false); + static MatrixPtr create(size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + static MatrixPtr create(real* data, + size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + static MatrixPtr create(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false, + bool useGpu = false); + + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + bool trans = false, + bool useGpu = false); + static MatrixPtr createSparseMatrix(size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + SparseFormat foramt = SPARSE_CSR, + bool trans = false, + bool useGpu = false); + + static MatrixPtr createSparseMatrix(real* data, + int* row, + int* col, + size_t height, + size_t width, + size_t nnz, /* used to allocate space */ + SparseValueType valueType, /*value type*/ + SparseFormat format, + bool trans, + bool useGpu); + + static void resizeOrCreateSparseMatrix( + MatrixPtr& matrix, + size_t height, + size_t width, + size_t nnz, + SparseValueType valueType = FLOAT_VALUE, + SparseFormat foramt = SPARSE_CSR, + bool trans = false, + bool useGpu = false); + + static void resizeOrCreate(MatrixPtr& a, + size_t height, + size_t width, + bool trans = false, + bool useGpu = false); + + /** + * 
@brief set the data buffer used to hold the matrix data. + * + * caller should make sure that the size of data is at least + * sizeof(real)*height*width. + */ + void setData(real* data) { + BaseMatrix::setData(data); + memoryHandle_.reset(); + } + + /// the data should be contiguous + void setData(real* data, size_t newHeight, size_t newWidth) { + setData(data); + height_ = newHeight; + width_ = newWidth; + elementCnt_ = newHeight * newWidth; + stride_ = width_; + } + + size_t getWidth() const { return width_; } + size_t getHeight() const { return height_; } + size_t getStride() const { return stride_; } + size_t getElementCnt() const { return elementCnt_; } + virtual real* getData() { return data_; } + virtual const real* getData() const { return data_; } + bool isTransposed() const { return trans_; } + bool isContiguous() const { return stride_ == width_ || height_ == 1; } + + // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix + // befor call the following functions. + // Declare these functions in the base class just easy to call them. + // And these declarations should be moved to base class of sparse matrix + // if refactor sparse matrix + virtual int* getRows() const { + LOG(FATAL) << "Not implemented"; + return nullptr; //! suppress warning for no return value. + } + + virtual int* getCols() const { + LOG(FATAL) << "Not implemented"; + return nullptr; //! suppress warning for no return value. + } + + virtual SparseFormat getFormat() const { + LOG(FATAL) << "Not implemented"; + return SPARSE_CSR; //! suppress warning for no return value. + } + + virtual SparseValueType getValueType() const { + LOG(FATAL) << "Not implemented"; + return NO_VALUE; //! suppress warning for no return value. + } + + /** + * @brief matrix elment-wise add + * + * Named add3 just because add/add2 has been used in BaseMatrix.cu + * and they are not virtual function. 
+ */ + virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; } + + MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; } + + virtual void zeroMem() { LOG(FATAL) << "Not implemented"; } + + virtual void resetOne() { LOG(FATAL) << "Not implemented"; } + + void setDiag(real value); + + virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; } + + virtual void trimFrom(const CpuSparseMatrix& src) { + LOG(FATAL) << "Not implemented"; + } + + // asynchronous copy + virtual void copyFrom(const Matrix& src, hl_stream_t stream) { + LOG(FATAL) << "Not implemented"; + } + + MatrixPtr subMatrix(size_t startRow, + size_t endRow, + size_t startCol, + size_t endCol); + + MatrixPtr subRowMatrix(size_t startRow, size_t endRow) { + return subMatrix(startRow, endRow, 0, getWidth()); + } + + MatrixPtr subColMatrix(size_t startCol, size_t endCol) { + return subMatrix(0, getHeight(), startCol, endCol); + } + + virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) { + CHECK_LE(startRow + numRows, getHeight()); + return Matrix::create(getData() + startRow * getWidth(), + numRows, + getWidth(), + trans_, + useGpu_); + } + virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) { + CHECK_LE(startRow + numRows, getHeight()); + CHECK_EQ(useGpu_, dest->useGpu_); + dest->setData(this->rowBuf(startRow), numRows, getWidth()); + return dest; + } + + /** + * If this is GpuMatrix, src is assumed to be CPU memory + * + * If this is CpuMatrix, src is assumed to be CPU memory + */ + virtual void copyFrom(const real* src, size_t size) { + LOG(FATAL) << "Not implemented"; + } + + virtual void copyFrom(const real* src, const int64_t* seq) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief convert a int vector to a real matrix. + * + * (1) source and dest are both in CPU. + * + * (2) sizes are exactly match. 
+ */ + virtual void copyFrom(const IVector& src) { + LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; + } + + virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix, + * NonValueSparseMatrix, etc.) as this. + * + * If height and width is zero, the new matrix will have the same size + * as this, otherwise the new matrix will have the specified size. + * + */ + virtual MatrixPtr clone(size_t height = 0, + size_t width = 0, + bool useGpu = false) { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + virtual real* getRowBuf(size_t row) { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + virtual real getElement(size_t x, size_t y) const { + LOG(FATAL) << "Not implemented"; + return 0; + } + + virtual real getSum() { + LOG(FATAL) << "Not implemented"; + return 0; + } + + virtual void accumulateColSum(Matrix& src) { + LOG(FATAL) << "Not implemented"; + } + + virtual real getAbsSum() { + LOG(FATAL) << "Not implemented"; + return 0; + } + + /** + * @note Original data may not be preserved after resize(). + */ + virtual void resize(size_t newHeight, size_t newWidth) = 0; + + /** + * @note This should only be used for sparse matrix. + */ + virtual void resize(size_t newHeight, + size_t newWidth, + size_t newNnz, /* total item used to allocate space */ + SparseValueType valueType, + SparseFormat format) = 0; + + /** + * @brief This should only be used for sparse matrix. + * + * Currently must be called for each row in order. + * The matrix is not valid until setRow is called for the last row. + */ + virtual void setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) = 0; + + virtual MatrixPtr getTranspose() = 0; + + /** + * @brief hard transpose. + * + * allocate matTrans' memory outside, then set memAlloc as false; + * else set as true. 
+ */ + virtual void transpose(MatrixPtr matTrans, bool memAlloc) { + LOG(FATAL) << "Not implemented"; + } + + virtual MatrixPtr getInverse() { + LOG(FATAL) << "Not implemented"; + return nullptr; + } + + /** + * @brief inverse. + * + * if allocate matInv's memory outside, then set memAlloc as false; + * else set as true. + */ + virtual void inverse(MatrixPtr matInv, bool memAlloc) { + LOG(FATAL) << "Not implemented"; + } + +public: + /// Only set all variables to 0 or NULL but not free them. + virtual void clear() { + height_ = 0; + width_ = 0; + data_ = NULL; + } + + void reshape(size_t height, size_t width); + + /// add b to each sample of this. + virtual void addBias(Matrix& b, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void addSharedBias(Matrix& b, real scale) { + LOG(FATAL) << "Not implemented"; + } + + void addBias(Matrix& b, real scale, bool sharedBias) { + if (!sharedBias) { + addBias(b, scale); + } else { + addSharedBias(b, scale); + } + } + + /// add each sample from a to this. + virtual void collectBias(Matrix& a, real scale) { + LOG(FATAL) << "Not implemented"; + } + + virtual void collectSharedBias(Matrix& a, real scale) { + LOG(FATAL) << "Not implemented"; + } + + void collectBias(Matrix& a, real scale, bool sharedBias) { + if (!sharedBias) { + collectBias(a, scale); + } else { + collectSharedBias(a, scale); + } + } + + virtual void sequenceAvgForward(Matrix& a, + const IVector& startsPos, + int mode) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = scaleAB*(a*b) + scaleT*this + * @endcode + */ + virtual void mul(const Matrix& a, + const Matrix& b, + real scaleAB, + real scaleT) { + LOG(FATAL) << "Not implemented"; + } + + /// Add a vector (column) b to matrix a, column by column. 
+ virtual void addColumnVector(const Matrix& b) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * For j < codeLength: + * this(i, j) += vec(index(i, j), 0) + * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 + * @endcode + */ + virtual void addByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& vec) { + (void)numClasses; + (void)codes; + (void)vec; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength: + * vec(index(i, j), 0) += this(i, j) + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void addByBitCodeBackward(size_t numClasses, + const IVector& codes, + Matrix& vec) { + (void)numClasses; + (void)codes; + (void)vec; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength: + * this(i, j) += + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, + const Matrix& input) { + (void)numClasses; + (void)codes; + (void)mat; + (void)input; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength: + * mat.row(index(i, j)) += this(i, j) * input.row(i) + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void mulByBitCodeBackwardWeight(size_t numClasses, + const IVector& codes, + Matrix& mat, + const Matrix& input) { + (void)numClasses; + (void)codes; + (void)mat; + (void)input; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength: + * input.row(i) += this(i, j) * mat.row(index(i, j)) + * where index is same as the index for addByBitCode + * @endcode + */ + virtual void mulByBitCodeBackwardError(size_t numClasses, + const IVector& codes, + const Matrix& mat, + Matrix& input) { + (void)numClasses; + (void)codes; + (void)mat; + (void)input; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength + * sum(i, 0) = scaleSum * \sum_j bit(i, j) * this(i, 
j) + * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0 + * @endcode + */ + virtual void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, + real scaleSum) { + (void)numClasses; + (void)codes; + (void)sum; + (void)scaleSum; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * For j < codeLength + * this(i, j) -= bit(i, j) + * where bit(i, j) is same as that for sumByBitCode + * @endcode + */ + virtual void subByBitCode(size_t numClasses_, IVector& codes) { + (void)numClasses_; + (void)codes; + LOG(FATAL) << "Not implemeted"; + } + + /** + * add the sum of each row of this to mat + */ + virtual void rowSum(Matrix& sum) { + (void)sum; + LOG(FATAL) << "Not implemeted"; + } + + /** + * set the max of each row of this to mat + */ + virtual void rowMax(Matrix& max) { + (void)max; + LOG(FATAL) << "Not implemeted"; + } + + /** + * set the max of each column of this to mat + */ + virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } + + /** + * @brief Get the top k elements of each column of this matrix. + * + * The row ids and values of these elements are stored in + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. + */ + virtual void colMax(IVector& maxIds, Matrix& maxVal) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutForward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + + virtual void maxoutBackward(Matrix& a, + IVector& id, + size_t channels, + size_t groups) { + LOG(FATAL) << "not implemented"; + } + + virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } + + /** + * @brief Get the top k elements of each row of this matrix. + * + * The column ids and values of these elements are stored in + * maxIds and max respectively. where k is the size of maxIds. + * And note that the top k elements are not sorted. 
+ */ + virtual void rowMax(IVector& maxIds, Matrix& max) { + LOG(FATAL) << "Not implemented"; + } + + /// normalize each row so that the sum of each row is 1. + virtual void rowNormalizeL1(Matrix& out) { + (void)out; + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * this = a*b + * @endcode + */ + virtual void mul(const Matrix& a, const Matrix& b) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = scaleAB*(this*b) + scaleT*this + * @endcode + */ + virtual void rightMul(Matrix& b, real scaleAB, real scaleT) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = this* b + * @endcode + */ + virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; } + + /** + * @code + * this = scaleAB*(a*this) + scaleT*this + * @endcode + */ + virtual void leftMul(Matrix& a, real scaleAB, real scaleT) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this = a*this) + * @endcode + */ + virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; } + + /// merge the element for each col. + virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; } + + /// copy -log(output[label]) to this->data[i]. + virtual void oneHotCrossEntropy(Matrix& output, IVector& label) { + LOG(FATAL) << "Not implemented"; + } + + /// calculate the error of outputV according to label. + virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { + LOG(FATAL) << "Not implemented"; + } + + /// copy -log(output[label]) to this->data[i]. + virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; + } + + /// calculate the error of outputV according to label. 
+ virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha) { + LOG(FATAL) << "Not implemented"; + } + + /** + * \f[ + * a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} + * \f] + * + * b contains M elements, + * c contains N elements (N is odd), + * b's index arithmetic is computed modulo M, + * c's index arithmetic is computed modulo N. + */ + virtual void circularConv(Matrix& b, Matrix& c) { + LOG(FATAL) << "Not implemented"; + } + + virtual void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2) { + LOG(FATAL) << "Not implemented"; + } + + /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */ + virtual void softmax(Matrix& output) { + (void)output; + LOG(FATAL) << "Not implemeted"; + } + virtual void sequenceSoftmax(Matrix& output, const IVector& index) { + (void)output; + LOG(FATAL) << "Not implemeted"; + } + + virtual void softmaxBackward(Matrix& outputV) { + (void)outputV; + LOG(FATAL) << "Not implemeted"; + } + + /* + sum_i = sum_j this_ij * output_ij + this_ij = output_ij* (this_ij - sum_i) + */ + virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { + LOG(FATAL) << "Not implemented"; + } + + /// calculate the sum of squares diff cost. + virtual void sumOfSquares(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + /// gradient of sumOfSquares. 
+ virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; } + + virtual void tanhDerivative(Matrix& output) { + LOG(FATAL) << "Not implemented"; + } + + virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; } + + virtual void softreluDerivative(Matrix& output) { + LOG(FATAL) << "Not implemented"; + } + + virtual void scaledTanh(Matrix& output, real p1, real p2) { + LOG(FATAL) << "Not implemented"; + } + + /** + * cosine similarity, for each row i, + * this[i] = cos(output1[i], output2[i]) + * + * output2 can only have one row, then for each row i, + * this[i] = cos(output1[i], output2[0]) + */ + virtual void cosSim(Matrix& output1, Matrix& output2, real scale = 1.0f) { + LOG(FATAL) << "Not implemented"; + } + + virtual void cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale = 1.0f) { + LOG(FATAL) << "Not implemented"; + } + + /// print out the values of elements to os + virtual void print(std::ostream& os) const { + LOG(FATAL) << "Not implemented"; + } + + /** + * print a part of the matrix + * from the (top,left) value to the (height, width) value (not included) + */ + virtual void print(std::ostream& os, size_t height, size_t width) const { + LOG(FATAL) << "Not implemented"; + } + + /// print one row to os + virtual void printOneRow(std::ostream& os, size_t idx) const { + LOG(FATAL) << "Not implemented"; + } + + virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {} + + virtual real getMin() { + LOG(FATAL) << "Not implemented"; + return 0; + } + virtual real getMax() { + LOG(FATAL) << "Not implemented"; + return 0; + } + + virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; } + + /** + * @brief calulate the error of classification + * + * output[i] = 1 if row i is an error. 
+ * + * output[i] = 0 if row i is correct. + */ + virtual void classificationError(Matrix& output, IVector& label) { + LOG(FATAL) << "Not implemented"; + } + + /** + * This function is used to calculate the convolution: + * + * It will expand a feature matrix according to the + * convolution filters + */ + virtual void convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW) { + LOG(FATAL) << "Not implemeted"; + } + + /** + * This function is the reverse implementation of convExpand: + * + * Its function is to restore a expanded-matrix into a feature matrix + */ + virtual void convShrink(Matrix& expandColMat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha = 1.0f, + real beta = 0.0f) { + LOG(FATAL) << "Not implemeted"; + } + + /** + * Pooling forward operation, pick out the largest element + * in the sizeX of value + */ + virtual void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + /// Pooling backward operation. + virtual void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + /// Pooling forward operation, caculate the average of sizeX elements. 
+ virtual void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW) { + LOG(FATAL) << "Not implemeted"; + } + + /** + * Input: one or more sequences. Each sequence contains some instances. + * + * Output: output size is the number of input sequences (NOT input + * instances). + * + * output[i] is set to max_input[i]. + */ + virtual void maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index) { + LOG(FATAL) << "Not implemeted"; + } + + virtual void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index) { + LOG(FATAL) << "Not implemeted"; + } + + /** + * @code + * this.row[i] += table.row[ids[i]] + * if ids[i] == -1, it will be ignored + * @endcode + */ + virtual void selectRows(Matrix& table, IVector& ids) { + (void)table; + (void)ids; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * this[i] = table[i, id[i]] + * @endcode + */ + virtual void selectElements(Matrix& table, IVector& ids) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * table.row[ids[i]] += this.row[i] + * if ids[i] == -1, it will be ignored + * @endcode + */ + virtual void addToRows(Matrix& table, IVector& ids) { + (void)table; + (void)ids; + LOG(FATAL) << "Not implemented"; + } + + /** + * @code + * table[i, id[i]] += this[i] + * @endcode + */ + virtual void addElements(Matrix& table, IVector& ids) { + LOG(FATAL) << "Not implemented"; + } + /** + * @brief cross entropy for multi binary labels + * + * @code + * this[i] = 
-sum(label[i][j]*log(output[i][j]) + * + (1-label[i][j])*log(1-output[i][j])) + * @endcode + */ + virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief The gradient of cross entropy for multi binary labels on output + * + * @code + * this[i][j] = -label[i][j]/output[i][j] + * + (1-label[i][j])/(1-output[i][j]) + * @endcode + */ + virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + /** + * @brief Calculate the classification error for multi binary labels + * + * @code + * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0) + * || (output[i][j] < threshold && label[i][j] == 1)) + * / output->getWidth() + * @endcode + */ + virtual void classificationErrorMulti(Matrix& output, + Matrix& label, + real threshold) { + LOG(FATAL) << "Not implemented"; + } + + virtual void paramReluForward(Matrix& data, Matrix& W) { + LOG(FATAL) << "Not implemented"; + } + virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) { + LOG(FATAL) << "Not implemented"; + } + virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { + LOG(FATAL) << "Not implemented"; + } + + virtual void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + LOG(FATAL) << "Not implemented"; + } + virtual void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW) { + LOG(FATAL) << "Not implemented"; + } + + template + void operator=(const ExpressionType& expr) { + if (useGpu_) { + TensorGpuApply(*this, expr); + } else { + TensorCpuApply(*this, expr); + } + } + + bool isEmpty() const { + return data_ == nullptr; + } + + explicit operator bool() const 
{ + return !isEmpty(); + } +}; + +inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { + mat.print(os); + return os; +} + +class GpuMatrix : public Matrix { +public: + GpuMatrix(); + + GpuMatrix(size_t height, size_t width, bool trans = false); + GpuMatrix(real* data, size_t height, size_t width, bool trans = false) + : Matrix(data, height, width, trans, true) {} + GpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false) + : Matrix(data, height, width, stride, trans, true) {} + GpuMatrix(GpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : Matrix(dataHandle, height, width, trans, true) {} + ~GpuMatrix(); + + void zeroMem(); + void resetOne(); + void setDiag(real value); + + void resize(size_t newHeight, size_t newWidth); + void resize(size_t newHeight, + size_t newWidth, + size_t newNnz, /* used to allocate space */ + SparseValueType valueType, + SparseFormat format) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + + /** + * Copy the data from cpu_memory buffer + */ + void copyFrom(const real* hostSrc, size_t size); + + void copyFrom(const real* hostSrc, const int64_t* seq); + + void copyFrom(const Matrix& src, hl_stream_t stream); + + void copyFrom(const Matrix& src); + + void copyFrom(const IVector& src); + + void copyByRowIndex(Matrix& b, const IVector& rowIndex); + + MatrixPtr clone(size_t height, size_t width, bool useGpu = false); + + real getElement(size_t x, size_t y) const; + + real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } + virtual real* getRowBuf(size_t row) { return getRow(row); } + + real getSum(); + void accumulateColSum(Matrix& src); + real getAbsSum(); + + MatrixPtr getTranspose(); + void transpose(MatrixPtr matTrans, bool memAlloc); + + MatrixPtr getInverse(); + void inverse(MatrixPtr matInv, bool 
memAlloc); + + /// add b to each sample of this. + void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); + + /** + * @code + * add each sample from a to this. + * @endcode + */ + void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); + + void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + + /** + * @code + * this.row[i] += table.row[ids[i]] + * @endcode + */ + virtual void selectRows(Matrix& table, IVector& ids); + + /** + * @code + * this[i] = table[i, id[i]] + * @endcode + */ + virtual void selectElements(Matrix& table, IVector& ids); + + /** + * @code + * table.row[ids[i]] += this.row[i] + * @endcode + */ + virtual void addToRows(Matrix& table, IVector& ids); + + void addColumnVector(const Matrix& b); + + /** + * @code + * this = scaleAB*(a*b) + scaleT*this + * @endcode + */ + void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); + + /** + * @code + * this = a*b + * @endcode + */ + void mul(const Matrix& a, const Matrix& b); + + void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); + + void mul(const GpuSparseMatrix& a, + const GpuMatrix& b, + real scaleAB, + real scaleT); + + void mul(const GpuMatrix& a, + const GpuSparseMatrix& b, + real scaleAB, + real scaleT); + + /** + * @code + * this = scaleAB*(this*b) + scaleT*this + * @endcode + */ + void rightMul(Matrix& b, real scaleAB, real scaleT); + + /** + * @code + * this = this* b + * @endcode + */ + void rightMul(Matrix& b); + + /** + * @code + * this = scaleAB*(a*this) + scaleT*this + * @endcode + */ + void leftMul(Matrix& a, real scaleAB, real scaleT); + + /** + * @code + * this = a*this + * @endcode + */ + void leftMul(Matrix& a); + + void colMerge(Matrix& src); + void rowSum(Matrix& sum); + void rowMax(Matrix& max); + void rowMax(IVector& maxIds, Matrix& max); + void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& max); + void maxoutForward(Matrix& a, IVector& id, size_t 
channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); + + void oneHotCrossEntropy(Matrix& output, IVector& label); + void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha); + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha); + + void softmax(Matrix& output); + void sequenceSoftmax(Matrix& output, const IVector& index); + void softmaxBackward(Matrix& outputV); + void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); + + /// calculate the sum of squares diff cost. + void sumOfSquares(Matrix& output, Matrix& label); + + /// gradient of sumOfSquares. + void sumOfSquaresBp(Matrix& outputV, Matrix& label); + void tanh(Matrix& output); + void tanhDerivative(Matrix& output); + void softrelu(Matrix& output); + void softreluDerivative(Matrix& output); + void scaledTanh(Matrix& output, real p1, real p2); + + void cosSim(Matrix& output1, Matrix& output2, real scale); + void cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale); + + virtual void print(std::ostream& os) const; + virtual void print(std::ostream& os, size_t height, size_t width) const; + + void paramReluForward(Matrix& data, Matrix& W); + void paramReluBackwardW(Matrix& oGrad, Matrix& data); + void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); + + void check(std::ostream& os, Matrix& refMat, bool printDiff = true); + void randomizeUniform(); + + void classificationError(Matrix& output, IVector& label); + + void convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW); + + void convShrink(Matrix& expandColMat, + int thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blochW, + int 
strideH, + int strideW, + int paddingH, + int paddingWreal, + int outputH, + int outputW, + real alpha = 1.0f, + real beta = 0.0f); + + void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index); + + void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index); + + void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); + + void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); + + template + void operator=(const ExpressionType& expr) { + TensorGpuApply(*this, 
expr); + } +}; + +class CpuMatrix : public Matrix { +public: + CpuMatrix(size_t height, size_t width, bool trans = false); + CpuMatrix(real* data, size_t height, size_t width, bool trans = false) + : Matrix(data, height, width, trans, false) {} + CpuMatrix(real* data, + size_t height, + size_t width, + size_t stride, + bool trans = false) + : Matrix(data, height, width, stride, trans, false) {} + + CpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : Matrix(dataHandle, height, width, trans, false) {} + + ~CpuMatrix(); + + void zeroMem(); + void resetOne(); + void setDiag(real value); + + void resize(size_t newHeight, size_t newWidth); + void resize(size_t newHeight, + size_t newWidth, + size_t newNnz, /* used to allocate space */ + SparseValueType valueType, + SparseFormat format) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + void setRow(size_t row, + size_t colNum, + const unsigned int* cols, + const real* values) { + LOG(FATAL) << "Only Support Sparse Matrix"; + } + + real getElement(size_t x, size_t y) const; + real getSum(); + void accumulateColSum(Matrix& src); + real getAbsSum(); + + MatrixPtr getTranspose(); + void transpose(MatrixPtr matTrans, bool memAlloc); + + MatrixPtr getInverse(); + void inverse(MatrixPtr matInv, bool memAlloc); + + void copyFrom(const Matrix& src); + + void copyFrom(const Matrix& src, hl_stream_t stream); + + void copyFrom(const real* cpuSrc, size_t size); + + void copyFrom(const real* cpuSrc, const int64_t* seq); + + void copyFrom(const IVector& src); + + void copyFrom(CpuSparseMatrix& src); + + void copyByRowIndex(Matrix& b, const IVector& rowIndex); + + MatrixPtr clone(size_t height, size_t width, bool useGpu = false); + + void convExpand(Matrix& feature, + int feaImgHeight, + int feaImgWidth, + int channels, + int blcokH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW); + + void convShrink(Matrix& expandFeat, + int 
thisImgHeight, + int thisImgWidth, + int channels, + int blockH, + int blockW, + int strideH, + int strideW, + int paddingH, + int paddingW, + int outputH, + int outputW, + real alpha = 1.0f, + real beta = 0.0f); + + void maxPoolForward(Matrix& inputMat, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void maxPoolBackward(Matrix& image, + size_t imgSizeH, + size_t imgSizeW, + Matrix& outGrad, + Matrix& outV, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void avgPoolForward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t channels, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + size_t paddingH, + size_t paddingW); + + void avgPoolBackward(Matrix& input, + size_t imgSizeH, + size_t imgSizeW, + size_t sizeX, + size_t sizeY, + size_t strideH, + size_t strideW, + size_t outputH, + size_t outputW, + real scaleTargets, + real scaleOutput, + size_t paddingH, + size_t paddingW); + + void maxSequenceForward(Matrix& input, + const IVector& sequence, + IVector& index); + + void maxSequenceBackward(Matrix& outputGrad, + const IVector& sequence, + IVector& index); + + real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } + virtual real* getRowBuf(size_t row) { return getRow(row); } + +public: + /// add b to each sample of this. + void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); + + /// add each sample of a to this. 
+ void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); + + void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); + + /** + * @code + * this.row[i] += table.row[ids[i]] + * @endcode + */ + virtual void selectRows(Matrix& table, IVector& ids); + + /** + * @code + * table.row[ids[i]] += this.row[i] + * @endcode + */ + virtual void addToRows(Matrix& table, IVector& ids); + + /** + * @code + * this[i] = table[i, id[i]] + * @endcode + */ + virtual void selectElements(Matrix& table, IVector& ids); + + /** + * @code + * table[i, id[i]] += this[i] + * @endcode + */ + virtual void addElements(Matrix& table, IVector& ids); + + /** + * use abstract getRow() to get row from table. + * + * Define table as template instead of virtual class for performance sake. + * internal used by above two virtual funcs. + */ + template + void selectRowsImp(TableMatType& table, IVector& ids); + template + void addToRowsImp(TableMatType& table, IVector& ids); + + void addColumnVector(const Matrix& b); + + void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); + void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + + void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); + + static void mul(CpuMatrix* a, + CpuMatrix* b, + CpuSparseMatrix* c, + real scaleAB, + real scaleT); + + /** + * c = a * b + * + * use abstract getRow() to get row from B,C. + * Define B,C as template instead of virtual class for performance sake. 
+ */ + template + static void mul( + CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT); + + virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + + void mul(const Matrix& a, const Matrix& b); + + void rightMul(Matrix& b, real scaleAB, real scaleT); + void rightMul(Matrix& b); + + void leftMul(Matrix& a, real scaleAB, real scaleT); + void leftMul(Matrix& a); + void colMerge(Matrix& src); + void rowSum(Matrix& sum); + void rowMaxId(IVector& maxIds); + void rowMax(Matrix& max); + void rowMax(IVector& maxIds, Matrix& maxVal); + void colMax(Matrix& max); + void colMax(IVector& maxIds, Matrix& maxVal); + void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); + void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); + void rowNormalizeL1(Matrix& out); + + void oneHotCrossEntropy(Matrix& output, IVector& label); + void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); + void oneHotCrossEntropyWithSelfNorm(Matrix& output, + IVector& label, + real alpha); + void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, + IVector& label, + real alpha); + + void circularConv(Matrix& b, Matrix& c); + void circularConvDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2); + + void softmax(Matrix& output); + void sequenceSoftmax(Matrix& output, const IVector& index); + void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); + + /// calculate the sum of squares diff cost. + void sumOfSquares(Matrix& output, Matrix& label); + + /// gradient of sumOfSquares. 
+ void sumOfSquaresBp(Matrix& outputV, Matrix& label); + + void tanh(Matrix& output); + void tanhDerivative(Matrix& output); + + void softrelu(Matrix& output); + void softreluDerivative(Matrix& output); + void scaledTanh(Matrix& output, real p1, real p2); + + void cosSim(Matrix& output1, Matrix& output2, real scale); + void cosSimDerivative(Matrix& output, + Matrix& prevOut1, + Matrix& prevOut2, + Matrix& prevGrad1, + Matrix& prevGrad2, + real scale); + + void print(std::ostream& os) const; + void print(std::ostream& os, size_t height, size_t width) const; + void printOneRow(std::ostream& os, size_t idx) const; + + void paramReluForward(Matrix& data, Matrix& W); + void paramReluBackwardW(Matrix& oGrad, Matrix& data); + void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); + + void check(std::ostream& os, Matrix& refMat, bool printDiff = true); + + real getMin(); + real getMax(); + + void randomizeUniform(); + + void classificationError(Matrix& output, IVector& label); + + void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); + + void addByBitCodeBackward(size_t numClasses, + const IVector& codes, + Matrix& vec); + + void mulByBitCode(size_t numClasses, + const IVector& codes, + const Matrix& mat, + const Matrix& input); + + void mulByBitCodeBackwardWeight(size_t numClasses, + const IVector& codes, + Matrix& mat, + const Matrix& input); + + void mulByBitCodeBackwardError(size_t numClasses, + const IVector& codes, + const Matrix& mat, + Matrix& input); + + void sumByBitCode(size_t numClasses, + IVector& codes, + Matrix& sum, + real scaleSum); + + void subByBitCode(size_t numClasses_, IVector& codes); + + void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); + void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); + void classificationErrorMulti(Matrix& output, Matrix& label, real threshold); + + void bilinearForward(const Matrix& in, + const size_t inImgH, + const size_t inImgW, + const size_t outImgH, + 
const size_t outImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + void bilinearBackward(const Matrix& out, + const size_t outImgH, + const size_t outImgW, + const size_t inImgH, + const size_t inImgW, + const size_t numChannels, + const real ratioH, + const real ratioW); + + template + void operator=(const ExpressionType& expr) { + TensorCpuApply(*this, expr); + } +}; + +class SharedCpuMatrix : public CpuMatrix { +public: + /* blockNum is number of partitions of the matrix */ + SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false) + : CpuMatrix(height, width, trans) { + initShared(blockNum); + } + SharedCpuMatrix( + int blockNum, real* data, size_t height, size_t width, bool trans = false) + : CpuMatrix(data, height, width, trans) { + initShared(blockNum); + } + + SharedCpuMatrix(int blockNum, + CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : CpuMatrix(dataHandle, height, width, trans) { + initShared(blockNum); + } + + SharedCpuMatrix(CpuMemHandlePtr dataHandle, + size_t height, + size_t width, + bool trans = false) + : CpuMatrix(dataHandle, height, width, trans) { + initBlock(1); + } + + ~SharedCpuMatrix() {} + +public: + virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); + virtual void add(Matrix& b, real p1, real p2); + virtual void add(real p1, real p2); + +private: + using Matrix::mul; + void initShared(int blockNum); + void initBlock(int blockNum); + + int blockNum_; + std::vector> blockLocks_; + ThreadLocal localBuf_; + ThreadLocal> localBufRows_; + ThreadLocal> blockSeq_; +}; + +typedef struct { unsigned int col; } sparse_non_value_t; + +typedef struct { + unsigned int col; + float value; +} sparse_float_value_t; + +} // namespace paddle +#include "ExecViaCpu.h" From f3fdfd941f170fbcfa5162246803b4cf8be6131c Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 5 Jan 2017 21:32:55 +0800 Subject: [PATCH 18/41] add some comments for 
Function.h --- paddle/function/BufferArg.h | 26 ----------------------- paddle/function/Function.h | 42 +++++++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+), 26 deletions(-) diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h index 9fcda7a878..52494afed3 100644 --- a/paddle/function/BufferArg.h +++ b/paddle/function/BufferArg.h @@ -46,32 +46,6 @@ class SequenceArg; class SparseMatrixArg; typedef std::shared_ptr BufferArgPtr; -class BufferArgs { -public: - BufferArgs() {} - size_t size() const { return args_.size(); } - - // add argument into BufferArgss - template - void addArg(const Tensor& arg) { - args_.push_back(std::make_shared(arg)); - } - - void addArg(const Matrix& arg, const TensorShape& shape); - - void addArg(const CpuSparseMatrix& arg); - void addArg(const GpuSparseMatrix& arg); - - // get argument - const BufferArg& operator[](size_t num) const { - CHECK_LT(num, args_.size()); - return *args_[num]; - } - -private: - std::vector args_; -}; - // an array of arbitrary dimensions class BufferArg { public: diff --git a/paddle/function/Function.h b/paddle/function/Function.h index 024575b4f7..27ebe808aa 100644 --- a/paddle/function/Function.h +++ b/paddle/function/Function.h @@ -22,6 +22,11 @@ limitations under the License. */ namespace paddle { +/** + * Function Configuration. + * The argument type of Function::init. + * Follow-up will consider moving this data structure to Proto inside. + */ class FuncConfig { public: union value { @@ -41,6 +46,43 @@ protected: std::map valueMap_; }; +/** + * Argument type for Function::calc(). + * A BufferArgs contains a set of BufferArg, + * because Function can have multiple inputs, outputs and inouts. 
+ */ +class BufferArgs { +public: + BufferArgs() {} + size_t size() const { return args_.size(); } + + // add argument into BufferArgss + template + void addArg(const Tensor& arg) { + args_.push_back(std::make_shared(arg)); + } + + void addArg(const Matrix& arg, const TensorShape& shape); + + void addArg(const CpuSparseMatrix& arg); + void addArg(const GpuSparseMatrix& arg); + + // get argument + const BufferArg& operator[](size_t num) const { + CHECK_LT(num, args_.size()); + return *args_[num]; + } + +private: + std::vector args_; +}; + +/** + * Base class for Function. + * The basic Function implementation requires override init and calc interfaces. + * Need to pay attention to the inouts argument. For the input argument + * that will be modified, it needs to be passed through inouts. + */ class FunctionBase { public: virtual ~FunctionBase() {} From ccf0b1bb2e32e0b52b56af18c1c8e339eface97e Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Thu, 5 Jan 2017 21:45:00 +0800 Subject: [PATCH 19/41] add FunctionTest.cpp --- paddle/function/BufferArg.cpp | 12 ------- paddle/function/BufferArgTest.cpp | 40 +-------------------- paddle/function/CMakeLists.txt | 1 + paddle/function/Function.cpp | 12 +++++++ paddle/function/FunctionTest.cpp | 59 +++++++++++++++++++++++++++++++ 5 files changed, 73 insertions(+), 51 deletions(-) create mode 100644 paddle/function/FunctionTest.cpp diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp index 08031917b2..65c6f30304 100644 --- a/paddle/function/BufferArg.cpp +++ b/paddle/function/BufferArg.cpp @@ -28,16 +28,4 @@ const SparseMatrixArg& BufferArg::sparse() const { return dynamic_cast(*this); } -void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape) { - args_.push_back(std::make_shared(arg, shape)); -} - -void BufferArgs::addArg(const CpuSparseMatrix& arg) { - args_.push_back(std::make_shared(arg)); -} - -void BufferArgs::addArg(const GpuSparseMatrix& arg) { - args_.push_back(std::make_shared(arg)); -} - 
} // namespace paddle diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp index 5d669b8137..a9ee3ab079 100644 --- a/paddle/function/BufferArgTest.cpp +++ b/paddle/function/BufferArgTest.cpp @@ -14,6 +14,7 @@ limitations under the License. */ #include "BufferArg.h" #include +#include "Function.h" #include "paddle/math/MemoryHandle.h" namespace paddle { @@ -86,43 +87,4 @@ TEST(BufferTest, asArgument) { function(argments); } -template -void FunctionApi(typename Tensor::Matrix& output, - const typename Tensor::Matrix& input); - -template <> -void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 100); - EXPECT_EQ(output.getWidth(), 200); -} - -template <> -void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { - EXPECT_EQ(output.getHeight(), 10); - EXPECT_EQ(output.getWidth(), 20); -} - -template -void Function(const BufferArgs& arguments) { - auto input = arguments[0].matrix(); - auto output = arguments[1].matrix(); - FunctionApi(output, input); -} - -TEST(BufferTest, Function) { - CpuMatrix cpuInput = CpuMatrix(100, 200); - CpuMatrix cpuOutput = CpuMatrix(100, 200); - BufferArgs cpuArgments; - cpuArgments.addArg(cpuInput); - cpuArgments.addArg(cpuOutput); - Function(cpuArgments); - - GpuMatrix gpuInput = GpuMatrix(10, 20); - GpuMatrix gpuOutput = GpuMatrix(10, 20); - BufferArgs gpuArgments; - gpuArgments.addArg(gpuInput); - gpuArgments.addArg(gpuOutput); - Function(gpuArgments); -} - } // namespace paddle diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 37c011549e..31c395c848 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -21,6 +21,7 @@ if(WITH_TESTING) add_simple_unittest(TensorShapeTest) add_simple_unittest(TensorTypeTest) add_simple_unittest(BufferArgTest) + add_simple_unittest(FunctionTest) # add_unittest(ContextProjectionOpTest # ContextProjectionOpTest.cpp # ../gserver/tests/TestUtil.cpp) diff --git 
a/paddle/function/Function.cpp b/paddle/function/Function.cpp index 6f82a8d053..2f56cfc1b5 100644 --- a/paddle/function/Function.cpp +++ b/paddle/function/Function.cpp @@ -72,6 +72,18 @@ FuncConfig& FuncConfig::set(const std::string& key, bool v) { return *this; } +void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape) { + args_.push_back(std::make_shared(arg, shape)); +} + +void BufferArgs::addArg(const CpuSparseMatrix& arg) { + args_.push_back(std::make_shared(arg)); +} + +void BufferArgs::addArg(const GpuSparseMatrix& arg) { + args_.push_back(std::make_shared(arg)); +} + ClassRegistrar FunctionBase::funcRegistrar_; } // namespace paddle diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp new file mode 100644 index 0000000000..7c3d6684cd --- /dev/null +++ b/paddle/function/FunctionTest.cpp @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "Function.h" +#include + +namespace paddle { + +template +void FunctionApi(typename Tensor::Matrix& output, + const typename Tensor::Matrix& input); + +template <> +void FunctionApi(CpuMatrix& output, const CpuMatrix& input) { + EXPECT_EQ(output.getHeight(), 100); + EXPECT_EQ(output.getWidth(), 200); +} + +template <> +void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { + EXPECT_EQ(output.getHeight(), 10); + EXPECT_EQ(output.getWidth(), 20); +} + +template +void Function(const BufferArgs& arguments) { + auto input = arguments[0].matrix(); + auto output = arguments[1].matrix(); + FunctionApi(output, input); +} + +TEST(Function, BufferArgs) { + CpuMatrix cpuInput = CpuMatrix(100, 200); + CpuMatrix cpuOutput = CpuMatrix(100, 200); + BufferArgs cpuArgments; + cpuArgments.addArg(cpuInput); + cpuArgments.addArg(cpuOutput); + Function(cpuArgments); + + GpuMatrix gpuInput = GpuMatrix(10, 20); + GpuMatrix gpuOutput = GpuMatrix(10, 20); + BufferArgs gpuArgments; + gpuArgments.addArg(gpuInput); + gpuArgments.addArg(gpuOutput); + Function(gpuArgments); +} + +} // namespace paddle From 5aaaef446818e52106f652d8b070c803cbbcce20 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Fri, 6 Jan 2017 00:57:21 +0800 Subject: [PATCH 20/41] rename PServerController to ParameterServerController --- paddle/pserver/CMakeLists.txt | 4 +-- paddle/pserver/ParameterServer2Main.cpp | 6 ++--- ...ller.cpp => ParameterServerController.cpp} | 17 ++++++------ ...ntroller.h => ParameterServerController.h} | 26 ++++++++++--------- paddle/trainer/TrainerMain.cpp | 6 ++--- 5 files changed, 31 insertions(+), 28 deletions(-) rename paddle/pserver/{PServerController.cpp => ParameterServerController.cpp} (85%) rename paddle/pserver/{PServerController.h => ParameterServerController.h} (56%) diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt index ac52b8dbec..b7f85ea1a6 100644 --- a/paddle/pserver/CMakeLists.txt +++ b/paddle/pserver/CMakeLists.txt @@ -25,14 +25,14 
@@ set(PSERVER_SOURCES ParameterClient2.cpp ParameterServer2.cpp SparseParameterDistribution.cpp - PServerController.cpp) + ParameterServerController.cpp) set(PSERVER_HEADERS BaseClient.h ParameterClient2.h ParameterServer2.h SparseParameterDistribution.h - PServerController.h) + ParameterServerController.h) add_library(paddle_pserver STATIC ${PSERVER_SOURCES}) diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp index 6e683cdd2c..1145052522 100644 --- a/paddle/pserver/ParameterServer2Main.cpp +++ b/paddle/pserver/ParameterServer2Main.cpp @@ -13,15 +13,15 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "PServerController.h" +#include "ParameterServerController.h" using namespace paddle; // NOLINT int main(int argc, char** argv) { initMain(argc, argv); - std::unique_ptr pServerPtr( - paddle::PServerController::createByGflags()); + std::unique_ptr pServerPtr( + paddle::ParameterServerController::createByGflags()); pServerPtr->start(); pServerPtr->join(); diff --git a/paddle/pserver/PServerController.cpp b/paddle/pserver/ParameterServerController.cpp similarity index 85% rename from paddle/pserver/PServerController.cpp rename to paddle/pserver/ParameterServerController.cpp index 8d2e026bca..ec24bc7e57 100644 --- a/paddle/pserver/PServerController.cpp +++ b/paddle/pserver/ParameterServerController.cpp @@ -12,11 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "PServerController.h" +#include "ParameterServerController.h" namespace paddle { -PServerController::PServerController(const ParameterServerConfig& config) { +ParameterServerController::ParameterServerController( + const ParameterServerConfig& config) { // round robin to load balance RDMA server ENGINE std::vector devices; int rdmaCpu = 0; @@ -58,9 +59,9 @@ PServerController::PServerController(const ParameterServerConfig& config) { } } -PServerController::~PServerController() { this->join(); } +ParameterServerController::~ParameterServerController() { this->join(); } -PServerController* PServerController::createByGflags() { +ParameterServerController* ParameterServerController::createByGflags() { ParameterServerConfig config; config.set_nics(FLAGS_nics); @@ -72,12 +73,12 @@ PServerController* PServerController::createByGflags() { return create(config); } -PServerController* PServerController::create( +ParameterServerController* ParameterServerController::create( const ParameterServerConfig& config) { - return new PServerController(config); + return new ParameterServerController(config); } -void PServerController::start() { +void ParameterServerController::start() { LOG(INFO) << "pserver sizes : " << pservers_.size(); int i = 0; for (const auto& pserver : pservers_) { @@ -87,7 +88,7 @@ void PServerController::start() { } } -void PServerController::join() { +void ParameterServerController::join() { LOG(INFO) << "pserver sizes : " << pservers_.size(); int i = 0; for (const auto& pserver : pservers_) { diff --git a/paddle/pserver/PServerController.h b/paddle/pserver/ParameterServerController.h similarity index 56% rename from paddle/pserver/PServerController.h rename to paddle/pserver/ParameterServerController.h index cecf729009..ee249de9d8 100644 --- a/paddle/pserver/PServerController.h +++ b/paddle/pserver/ParameterServerController.h @@ -21,39 +21,41 @@ limitations under the License. 
*/ namespace paddle { -class PServerController final { +class ParameterServerController final { public: - DISABLE_COPY(PServerController); + DISABLE_COPY(ParameterServerController); /** - * @brief Ctor, Create a PServerUtil from ParameterServerConfig. + * @brief Ctor, Create a ParameterServerController from ParameterServerConfig. */ - explicit PServerController(const ParameterServerConfig& config); + explicit ParameterServerController(const ParameterServerConfig& config); /** * @brief Dtor. */ - ~PServerController(); + ~ParameterServerController(); /** - * @brief create PServerUtil from gflags, this is used for + * @brief create ParameterServerController from gflags, this is used for * compatibility with the old usage of configuration by gflags. */ - static PServerController* createByGflags(); + static ParameterServerController* createByGflags(); /** - * @brief create PServerUtil with ParameterServerConfig, remove gflags - * from ParameterServer. Init all pservers thread according to the config. + * @brief create ParameterServerController with ParameterServerConfig, remove + * gflags from ParameterServer. Init all pservers thread according to the + * config. */ - static PServerController* create(const ParameterServerConfig& config); + static ParameterServerController* create(const ParameterServerConfig& config); /** - * @brief start all pserver thread in this PServerUtil. + * @brief start all pserver thread in this ParameterServerController. */ void start(); /** - * @brief join and wait for all pserver thread in this PServerUtil. + * @brief join and wait for all pserver thread in this + * ParameterServerController. */ void join(); diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index 3ce3d67842..ccf67f96fa 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include -#include "paddle/pserver/PServerController.h" +#include "paddle/pserver/ParameterServerController.h" #include "paddle/utils/Excepts.h" #include "paddle/utils/PythonUtil.h" @@ -37,9 +37,9 @@ int main(int argc, char** argv) { initMain(argc, argv); initPython(argc, argv); - std::unique_ptr pServerPtr(nullptr); + std::unique_ptr pServerPtr(nullptr); if (FLAGS_start_pserver) { - pServerPtr.reset(paddle::PServerController::createByGflags()); + pServerPtr.reset(paddle::ParameterServerController::createByGflags()); pServerPtr->start(); } Trainer trainer; From 6c4831a096ce21ec1b1d90656783ef4a3179b800 Mon Sep 17 00:00:00 2001 From: zhouyingfeng Date: Fri, 6 Jan 2017 19:42:37 +0800 Subject: [PATCH 21/41] add "paddle usage" scripts in all demos resolve #965 --- demo/image_classification/train.sh | 1 + demo/introduction/train.sh | 2 +- demo/mnist/train.sh | 1 + demo/quick_start/predict.sh | 2 ++ demo/quick_start/train.sh | 1 + demo/recommendation/run.sh | 1 + demo/semantic_role_labeling/test.sh | 1 + demo/semantic_role_labeling/train.sh | 1 + demo/sentiment/test.sh | 1 + demo/sentiment/train.sh | 1 + demo/seqToseq/paraphrase/train.sh | 1 + demo/seqToseq/translation/gen.sh | 1 + demo/seqToseq/translation/train.sh | 1 + demo/sequence_tagging/train.sh | 4 +++- demo/sequence_tagging/train_linear.sh | 2 ++ 15 files changed, 19 insertions(+), 2 deletions(-) diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh index 6fc11caf1c..e45bd47ad5 100755 --- a/demo/image_classification/train.sh +++ b/demo/image_classification/train.sh @@ -27,5 +27,6 @@ paddle train \ --num_passes=300 \ --save_dir=$output \ 2>&1 | tee $log +paddle usage -l $log -e $? 
-n "image_classification_train" >/dev/null 2>&1 python -m paddle.utils.plotcurve -i $log > plot.png diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh index a7e184300c..2ce6446d7c 100755 --- a/demo/introduction/train.sh +++ b/demo/introduction/train.sh @@ -19,4 +19,4 @@ paddle train \ --save_dir=./output \ --num_passes=30 \ 2>&1 |tee 'train.log' -paddle usage -l "train.log" -e $? -n "linear_intro" >/dev/null 2>&1 +paddle usage -l "train.log" -e $? -n "introduction" >/dev/null 2>&1 diff --git a/demo/mnist/train.sh b/demo/mnist/train.sh index da90cd749a..ca2b1ad9eb 100755 --- a/demo/mnist/train.sh +++ b/demo/mnist/train.sh @@ -27,5 +27,6 @@ paddle train \ --num_passes=100 \ --save_dir=$output \ 2>&1 | tee $log +paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1 python -m paddle.utils.plotcurve -i $log > plot.png diff --git a/demo/quick_start/predict.sh b/demo/quick_start/predict.sh index f02e5038e9..e47c2dd01f 100755 --- a/demo/quick_start/predict.sh +++ b/demo/quick_start/predict.sh @@ -26,5 +26,7 @@ paddle train \ --init_model_path=$model \ --config_args=is_predict=1 \ --predict_output_dir=. \ +2>&1 | tee 'predict.log' +paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1 mv rank-00000 result.txt diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh index e3595fce75..01697fed48 100755 --- a/demo/quick_start/train.sh +++ b/demo/quick_start/train.sh @@ -31,3 +31,4 @@ paddle train \ --show_parameter_stats_period=100 \ --test_all_data_in_one_period=1 \ 2>&1 | tee 'train.log' +paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1 diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh index e341d1cc7a..22aef55608 100755 --- a/demo/recommendation/run.sh +++ b/demo/recommendation/run.sh @@ -22,3 +22,4 @@ paddle train \ --log_period=100 \ --dot_period=1 \ --num_passes=50 2>&1 | tee 'log.txt' +paddle usage -l log.txt -e $? 
-n "recommendation" >/dev/null 2>&1 diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh index 11d9d6a19c..eabfb3fe3d 100755 --- a/demo/semantic_role_labeling/test.sh +++ b/demo/semantic_role_labeling/test.sh @@ -38,3 +38,4 @@ paddle train \ --config_args=is_test=1 \ --test_all_data_in_one_period=1 \ 2>&1 | tee 'test.log' +paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1 \ No newline at end of file diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh index 9354e72f46..eee14010d7 100755 --- a/demo/semantic_role_labeling/train.sh +++ b/demo/semantic_role_labeling/train.sh @@ -27,3 +27,4 @@ paddle train \ --load_missing_parameter_strategy=rand \ --test_all_data_in_one_period=1 \ 2>&1 | tee 'train.log' +paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1 diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh index 8af827c338..85c4f3ccfc 100755 --- a/demo/sentiment/test.sh +++ b/demo/sentiment/test.sh @@ -37,3 +37,4 @@ paddle train --config=$net_conf \ --trainer_count=4 \ --config_args=is_test=1 \ 2>&1 | tee 'test.log' +paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1 diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh index 5ce8bf4b99..14620f733b 100755 --- a/demo/sentiment/train.sh +++ b/demo/sentiment/train.sh @@ -27,3 +27,4 @@ paddle train --config=$config \ --show_parameter_stats_period=100 \ --test_all_data_in_one_period=1 \ 2>&1 | tee 'train.log' +paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1 diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh index 33a42f6eff..9bb6dbdb1d 100755 --- a/demo/seqToseq/paraphrase/train.sh +++ b/demo/seqToseq/paraphrase/train.sh @@ -27,3 +27,4 @@ paddle train \ --log_period=10 \ --dot_period=5 \ 2>&1 | tee 'paraphrase/train.log' +paddle usage -l 'paraphrase/train.log' -e $? 
-n "seqToseq_paraphrase_train" >/dev/null 2>&1 diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh index a700ae2134..64b78f5e96 100755 --- a/demo/seqToseq/translation/gen.sh +++ b/demo/seqToseq/translation/gen.sh @@ -24,3 +24,4 @@ paddle train \ --test_pass=12 \ --trainer_count=1 \ 2>&1 | tee 'translation/gen.log' +paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1 diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh index bdece693e5..b0ec9854b1 100755 --- a/demo/seqToseq/translation/train.sh +++ b/demo/seqToseq/translation/train.sh @@ -25,3 +25,4 @@ paddle train \ --log_period=10 \ --dot_period=5 \ 2>&1 | tee 'translation/train.log' +paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1 diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh index 9a706b98d8..37e196c842 100755 --- a/demo/sequence_tagging/train.sh +++ b/demo/sequence_tagging/train.sh @@ -7,4 +7,6 @@ paddle train \ --dot_period=10 \ --log_period=1000 \ --test_period=0 \ - --num_passes=10 + --num_passes=10 \ +2>&1 | tee 'train.log' +paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1 diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh index 597b5afea9..ad6e2d8ee7 100755 --- a/demo/sequence_tagging/train_linear.sh +++ b/demo/sequence_tagging/train_linear.sh @@ -7,3 +7,5 @@ paddle train \ --log_period=10000 \ --test_period=0 \ --num_passes=10 +2>&1 | tee 'train_linear.log' +paddle usage -l 'train_linear.log' -e $? 
-n "sequence_tagging_train_linear" >/dev/null 2>&1 From d35ef9de10b3b97f63fa0156a8c7d36e7e89c8b8 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 9 Jan 2017 11:47:18 +0800 Subject: [PATCH 22/41] follow commit --- paddle/function/BufferArg.h | 20 +++++++++++--------- paddle/function/TensorShape.h | 4 ++-- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h index 52494afed3..d787d2814d 100644 --- a/paddle/function/BufferArg.h +++ b/paddle/function/BufferArg.h @@ -56,7 +56,7 @@ public: : buf_(buf), valueType_(valueType) {} BufferArg(const Matrix& matrix) - : buf_((void*)matrix.getData()), + : buf_(reinterpret_cast(matrix.getData())), valueType_(DataType::value), shape_(2) { shape_.setDim(0, matrix.getHeight()); @@ -64,21 +64,23 @@ public: } BufferArg(const Matrix& matrix, const TensorShape& shape) - : buf_((void*)matrix.getData()), + : buf_(reinterpret_cast(matrix.getData())), valueType_(DataType::value), shape_(shape) { CHECK_EQ(matrix.getElementCnt(), shape.getElements()); } BufferArg(const Vector& vector) - : buf_((void*)vector.getData()), + : buf_(reinterpret_cast(vector.getData())), valueType_(DataType::value), shape_(1) { shape_.setDim(0, vector.getSize()); } BufferArg(const IVector& vector) - : buf_((void*)vector.getData()), valueType_(VALUE_TYPE_INT32), shape_(1) { + : buf_(reinterpret_cast(vector.getData())), + valueType_(VALUE_TYPE_INT32), + shape_(1) { shape_.setDim(0, vector.getSize()); } @@ -129,7 +131,7 @@ protected: // sequence start positions in a mini-batch of sequences // shape_.ndims() == 1 // valueType_ = int32 -// if a < b than value_.buf_[a] < value_.buf_[b] +// if a < b then value_.buf_[a] < value_.buf_[b] class SequenceIdArg : public BufferArg { public: SequenceIdArg(void* buf, const TensorShape& shape) @@ -203,13 +205,13 @@ public: SparseMatrixArg(const CpuSparseMatrix& sparse) : BufferArg(sparse), - row_((void*)sparse.getRows(), VALUE_TYPE_INT32), - 
col_((void*)sparse.getCols(), VALUE_TYPE_INT32) {} + row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), + col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {} SparseMatrixArg(const GpuSparseMatrix& sparse) : BufferArg(sparse), - row_((void*)sparse.getRows(), VALUE_TYPE_INT32), - col_((void*)sparse.getCols(), VALUE_TYPE_INT32) {} + row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), + col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {} ~SparseMatrixArg() {} diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h index e70484a1af..0333fe1831 100644 --- a/paddle/function/TensorShape.h +++ b/paddle/function/TensorShape.h @@ -30,14 +30,14 @@ public: TensorShape(std::initializer_list dims) { ndims_ = dims.size(); initDims(ndims_); - std::copy(dims.begin(), dims.end(), dims_.begin()); + dims_.assign(dims); numElements(); }; TensorShape(const TensorShape& t) : ndims_(t.ndims_), nelements_(t.nelements_) { initDims(ndims_); - std::copy(t.dims_.begin(), t.dims_.end(), dims_.begin()); + dims_.assign(t.dims_.begin(), t.dims_.end()); }; // get the size of specified dimension From f30c25fe2d5a3f3d86f9f6bcfadfbc446a3c9071 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 9 Jan 2017 15:19:07 +0800 Subject: [PATCH 23/41] Add THIRD_PARTY_PATH --- CMakeLists.txt | 10 ++++++++++ cmake/external/glog.cmake | 4 ++-- cmake/external/gtest.cmake | 4 ++-- cmake/external/openblas.cmake | 4 ++-- cmake/external/protobuf.cmake | 4 ++-- cmake/external/python.cmake | 4 ++-- cmake/external/swig.cmake | 4 ++-- cmake/external/warpctc.cmake | 4 ++-- cmake/external/zlib.cmake | 4 ++-- cmake/flags.cmake | 6 ------ 10 files changed, 26 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index abe7b5228c..8f53abacb4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,6 +43,16 @@ option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(ON_COVERALLS "Compile PaddlePaddle with code coverage" OFF) 
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) + +# CMAKE_BUILD_TYPE +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING + "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" + FORCE) +endif() + +set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING + "A path setting third party libraries download & build directories.") ######################################################################################## include(external/zlib) # download, build, install zlib diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index bec69f3ddf..71e20c8527 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -14,8 +14,8 @@ INCLUDE(ExternalProject) -SET(GLOG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/glog) -SET(GLOG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/glog) +SET(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog) +SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) IF(WIN32) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 2fcb7893fa..11d829a9e2 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -16,8 +16,8 @@ IF(WITH_TESTING) ENABLE_TESTING() INCLUDE(ExternalProject) - SET(GTEST_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gtest) - SET(GTEST_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gtest) + SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest) + SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest) SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." 
FORCE) INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR}) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 66a72cd243..0e8c29c831 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -18,8 +18,8 @@ IF(NOT ${CBLAS_FOUND}) MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.") INCLUDE(ExternalProject) - SET(CBLAS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas) - SET(CBLAS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/openblas) + SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) + SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) IF(WIN32) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 2f2769b4c6..c0cf2719f9 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -14,8 +14,8 @@ INCLUDE(ExternalProject) -SET(PROTOBUF_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/protobuf) -SET(PROTOBUF_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/protobuf) +SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) +SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." 
FORCE) INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index cbb6940221..55787f75f8 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -28,8 +28,8 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) FIND_PACKAGE(NumPy REQUIRED) ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) ##################################### PYTHON ######################################## - SET(PYTHON_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/python) - SET(PYTHON_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/python) + SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python) + SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python) SET(_python_DIR ${PYTHON_INSTALL_DIR}) IF(UNIX) diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake index 40088c65ef..63e8bd2546 100644 --- a/cmake/external/swig.cmake +++ b/cmake/external/swig.cmake @@ -18,8 +18,8 @@ IF(NOT SWIG_FOUND) # build swig as an external project INCLUDE(ExternalProject) - SET(SWIG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/swig) - SET(SWIG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/swig) + SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig) + SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig) SET(SWIG_TARGET_VERSION "3.0.2") SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41") SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f") diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 7386d935b8..f5e4b3e1eb 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -14,8 +14,8 @@ INCLUDE(ExternalProject) -SET(WARPCTC_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/warpctc) -SET(WARPCTC_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/warpctc) +SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc) +SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE 
PATH "Warp-ctc Directory" FORCE) INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 916f6816aa..47fa8817fb 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -14,8 +14,8 @@ INCLUDE(ExternalProject) -SET(ZLIB_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/zlib) -SET(ZLIB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/zlib) +SET(ZLIB_SOURCES_DIR ${THIRD_PARTY_PATH}/zlib) +SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 0983d83b73..0d1ef5cd84 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -3,12 +3,6 @@ include(CheckCXXCompilerFlag) include(CheckCCompilerFlag) include(CheckCXXSymbolExists) -if(NOT CMAKE_BUILD_TYPE) - set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING - "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" - FORCE) -endif() - function(CheckCompilerCXX11Flag) if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) From 57e252119eee99523a92ecd323532bec355f9144 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 9 Jan 2017 15:21:47 +0800 Subject: [PATCH 24/41] BufferArg add ArgType and Function remove inouts --- paddle/function/BufferArg.h | 45 +- paddle/function/Function.h | 28 +- paddle/function/FunctionTest.cpp | 2 +- paddle/math/Matrix.h~RFbb8b484f.TMP | 1870 --------------------------- 4 files changed, 59 insertions(+), 1886 deletions(-) delete mode 100644 paddle/math/Matrix.h~RFbb8b484f.TMP diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h index d787d2814d..3d28249f69 100644 --- a/paddle/function/BufferArg.h +++ b/paddle/function/BufferArg.h @@ -38,16 +38,40 @@ enum SparseDataType { enum SparseDataFormat { 
SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 }; -/** - * BufferArg used as the argument type for Function. - */ class BufferArg; class SequenceArg; class SparseMatrixArg; typedef std::shared_ptr BufferArgPtr; -// an array of arbitrary dimensions +/** + * \brief BufferArg used as the argument type of Function. + * + * The arguments of the Paddle Function have four Buffer types. + * 1. BufferArg for a dense Buffer of any dimension. + * 2. SequenceIdArg for a Buffer of sequence start positions. + * 3. SequenceArg for a Buffer of sequence data. + * 4. SparseMatrixArg for a Buffer of sparse matrix. + * + * There is an ArgType property for the BufferArg used as Function Output. + * Whether the result of the Function calculation is assigned to the + * output Buffer or added to the output Buffer is determined by the + * argType_ property of the output BufferArg. + */ class BufferArg { +public: + // ArgType is only used by output BufferArg. + // For input argument, argType_ is ignored. + // For output argument, need to set the argType_ of the BufferArg. 
+ enum ArgType { + UNSPECIFIED = 0, + ASSIGN_TO = 1, + ADD_TO = 2, + }; + + void setArgType(ArgType argType) { argType_ = argType; } + + ArgType getArgType() const { return argType_; } + public: BufferArg(void* buf, ValueType valueType, const TensorShape& shape) : buf_(buf), valueType_(valueType), shape_(shape) {} @@ -56,7 +80,8 @@ public: : buf_(buf), valueType_(valueType) {} BufferArg(const Matrix& matrix) - : buf_(reinterpret_cast(matrix.getData())), + : buf_( + const_cast(reinterpret_cast(matrix.getData()))), valueType_(DataType::value), shape_(2) { shape_.setDim(0, matrix.getHeight()); @@ -64,21 +89,24 @@ public: } BufferArg(const Matrix& matrix, const TensorShape& shape) - : buf_(reinterpret_cast(matrix.getData())), + : buf_( + const_cast(reinterpret_cast(matrix.getData()))), valueType_(DataType::value), shape_(shape) { CHECK_EQ(matrix.getElementCnt(), shape.getElements()); } BufferArg(const Vector& vector) - : buf_(reinterpret_cast(vector.getData())), + : buf_( + const_cast(reinterpret_cast(vector.getData()))), valueType_(DataType::value), shape_(1) { shape_.setDim(0, vector.getSize()); } BufferArg(const IVector& vector) - : buf_(reinterpret_cast(vector.getData())), + : buf_( + const_cast(reinterpret_cast(vector.getData()))), valueType_(VALUE_TYPE_INT32), shape_(1) { shape_.setDim(0, vector.getSize()); @@ -124,6 +152,7 @@ protected: ValueType valueType_; TensorShape shape_; BufferType bufferType_; + ArgType argType_ = UNSPECIFIED; // leading dimensions. The size is dims_.size() // Dims lds_; }; diff --git a/paddle/function/Function.h b/paddle/function/Function.h index 27ebe808aa..88d6824aa3 100644 --- a/paddle/function/Function.h +++ b/paddle/function/Function.h @@ -56,12 +56,18 @@ public: BufferArgs() {} size_t size() const { return args_.size(); } - // add argument into BufferArgss + // add argument into BufferArgs + // Tensor can be Matrix, Vector, IVector. 
template void addArg(const Tensor& arg) { args_.push_back(std::make_shared(arg)); } + // Add arg into BufferArgs and reshape the arg. + // + // For example, arg represents an image buffer, + // but Matrix can only represent a two-dimensional Tensor. + // So need an extra argument to describe the shape of the image buffer. void addArg(const Matrix& arg, const TensorShape& shape); void addArg(const CpuSparseMatrix& arg); @@ -78,10 +84,20 @@ private: }; /** - * Base class for Function. + * \brief Base class for Function. * The basic Function implementation requires override init and calc interfaces. - * Need to pay attention to the inouts argument. For the input argument - * that will be modified, it needs to be passed through inouts. + * + * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO + * and ADD_TO. + * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation + * result of Function assigned to the output BufferArg. + * If output.getArgType() == ADD_TO, this is add mode, and the calculation + * result of Function need added to the output BufferArg. + * + * For example: + * ASSIGN_TO: output = Function(inputs) + * ADD_TO: output += Function(inputs) + * If Function has more than one output, each output can have different modes. 
*/ class FunctionBase { public: @@ -89,9 +105,7 @@ public: virtual void init(const FuncConfig& config) {} - virtual void calc(const BufferArgs& inputs, - const BufferArgs& outputs, - const BufferArgs& inouts) {} + virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {} static ClassRegistrar funcRegistrar_; }; diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp index 7c3d6684cd..7ce908320a 100644 --- a/paddle/function/FunctionTest.cpp +++ b/paddle/function/FunctionTest.cpp @@ -35,7 +35,7 @@ void FunctionApi(GpuMatrix& output, const GpuMatrix& input) { template void Function(const BufferArgs& arguments) { - auto input = arguments[0].matrix(); + const auto input = arguments[0].matrix(); auto output = arguments[1].matrix(); FunctionApi(output, input); } diff --git a/paddle/math/Matrix.h~RFbb8b484f.TMP b/paddle/math/Matrix.h~RFbb8b484f.TMP deleted file mode 100644 index d89b0f67b3..0000000000 --- a/paddle/math/Matrix.h~RFbb8b484f.TMP +++ /dev/null @@ -1,1870 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include - -#include "paddle/utils/Logging.h" -#include "paddle/utils/ThreadLocal.h" - -#include - -#include "BaseMatrix.h" -#include "MemoryHandle.h" -#include "Vector.h" -#include "paddle/utils/ThreadLocal.h" -#include "paddle/utils/common.h" - -namespace paddle { - -enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 }; - -/** - * @brief matrix sparse_format . - * - * nnz represents nonzero number in sparse matrix. - * - * SPARSE_CSR: row major matrix. length of row is height_ + 1, each element - * represents row start index in Matrix. length of col and value are nnz. - * - * SPARSE_CSC: col major matrix. length of col is width_ + 1, each element - * represents col start index in Matrix. length of col and value are nnz. - * - * @code - * for example: [0, 1, 0, 2, 0; - * 1, 0, 0, 0, 0; - * 0, 0, 0, 2, 5]; - * SPARSE_CSR row [0, 2, 3, 5]; - * col [1, 3, 0, 3, 4]; - * value [1, 2, 1, 2, 5] - * SPARSE_CSC col [0, 1, 2, 2, 4, 5]; - * row [1, 0, 0, 2, 2]; - * value [1, 1, 2, 2, 5] - * @endcode - */ -enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 }; - -class Matrix; -class GpuMatrix; -class CpuMatrix; -class CpuSparseMatrix; -class GpuSparseMatrix; -typedef std::shared_ptr MatrixPtr; -typedef std::shared_ptr GpuMatrixPtr; -typedef std::shared_ptr CpuMatrixPtr; -typedef std::shared_ptr GpuSparseMatrixPtr; -typedef std::shared_ptr CpuSparseMatrixPtr; - -/** - * Copy or assignemnt constructor will share the data as opposed to making a - * copy of the original data. To make a copy of the orinal data, use copyFrom() - * instead. 
- */ -class Matrix : public BaseMatrix { -protected: - Matrix(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans, - bool use_gpu); - - Matrix(real* data, size_t height, size_t width, bool trans, bool use_gpu); - - Matrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans, - bool use_gpu); - - static ThreadLocal tmpMat_; - -public: - size_t elementCnt_; // maximal number of elements which can be held in data_ - MemoryHandlePtr memoryHandle_; - -public: - virtual ~Matrix() {} - - static MatrixPtr create(MemoryHandlePtr memHandle, - size_t height, - size_t width, - bool trans = false); - static MatrixPtr create(size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - static MatrixPtr create(real* data, - size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - static MatrixPtr create(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false, - bool useGpu = false); - - static MatrixPtr createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - bool trans = false, - bool useGpu = false); - static MatrixPtr createSparseMatrix(size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - SparseFormat foramt = SPARSE_CSR, - bool trans = false, - bool useGpu = false); - - static MatrixPtr createSparseMatrix(real* data, - int* row, - int* col, - size_t height, - size_t width, - size_t nnz, /* used to allocate space */ - SparseValueType valueType, /*value type*/ - SparseFormat format, - bool trans, - bool useGpu); - - static void resizeOrCreateSparseMatrix( - MatrixPtr& matrix, - size_t height, - size_t width, - size_t nnz, - SparseValueType valueType = FLOAT_VALUE, - SparseFormat foramt = SPARSE_CSR, - bool trans = false, - bool useGpu = false); - - static void resizeOrCreate(MatrixPtr& a, - size_t height, - size_t width, - bool trans = false, - bool useGpu = false); - - /** - * 
@brief set the data buffer used to hold the matrix data. - * - * caller should make sure that the size of data is at least - * sizeof(real)*height*width. - */ - void setData(real* data) { - BaseMatrix::setData(data); - memoryHandle_.reset(); - } - - /// the data should be contiguous - void setData(real* data, size_t newHeight, size_t newWidth) { - setData(data); - height_ = newHeight; - width_ = newWidth; - elementCnt_ = newHeight * newWidth; - stride_ = width_; - } - - size_t getWidth() const { return width_; } - size_t getHeight() const { return height_; } - size_t getStride() const { return stride_; } - size_t getElementCnt() const { return elementCnt_; } - virtual real* getData() { return data_; } - virtual const real* getData() const { return data_; } - bool isTransposed() const { return trans_; } - bool isContiguous() const { return stride_ == width_ || height_ == 1; } - - // If sparse matrix, need to dynamic_cast to CpuSparseMatrix/GpuSparseMatrix - // befor call the following functions. - // Declare these functions in the base class just easy to call them. - // And these declarations should be moved to base class of sparse matrix - // if refactor sparse matrix - virtual int* getRows() const { - LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. - } - - virtual int* getCols() const { - LOG(FATAL) << "Not implemented"; - return nullptr; //! suppress warning for no return value. - } - - virtual SparseFormat getFormat() const { - LOG(FATAL) << "Not implemented"; - return SPARSE_CSR; //! suppress warning for no return value. - } - - virtual SparseValueType getValueType() const { - LOG(FATAL) << "Not implemented"; - return NO_VALUE; //! suppress warning for no return value. - } - - /** - * @brief matrix elment-wise add - * - * Named add3 just because add/add2 has been used in BaseMatrix.cu - * and they are not virtual function. 
- */ - virtual void add3(MatrixPtr b) { LOG(FATAL) << "Not implemented"; } - - MemoryHandlePtr getMemoryHandle() const { return memoryHandle_; } - - virtual void zeroMem() { LOG(FATAL) << "Not implemented"; } - - virtual void resetOne() { LOG(FATAL) << "Not implemented"; } - - void setDiag(real value); - - virtual void copyFrom(const Matrix& src) { LOG(FATAL) << "Not implemented"; } - - virtual void trimFrom(const CpuSparseMatrix& src) { - LOG(FATAL) << "Not implemented"; - } - - // asynchronous copy - virtual void copyFrom(const Matrix& src, hl_stream_t stream) { - LOG(FATAL) << "Not implemented"; - } - - MatrixPtr subMatrix(size_t startRow, - size_t endRow, - size_t startCol, - size_t endCol); - - MatrixPtr subRowMatrix(size_t startRow, size_t endRow) { - return subMatrix(startRow, endRow, 0, getWidth()); - } - - MatrixPtr subColMatrix(size_t startCol, size_t endCol) { - return subMatrix(0, getHeight(), startCol, endCol); - } - - virtual MatrixPtr subMatrix(size_t startRow, size_t numRows) { - CHECK_LE(startRow + numRows, getHeight()); - return Matrix::create(getData() + startRow * getWidth(), - numRows, - getWidth(), - trans_, - useGpu_); - } - virtual MatrixPtr subMatrix(size_t startRow, size_t numRows, MatrixPtr dest) { - CHECK_LE(startRow + numRows, getHeight()); - CHECK_EQ(useGpu_, dest->useGpu_); - dest->setData(this->rowBuf(startRow), numRows, getWidth()); - return dest; - } - - /** - * If this is GpuMatrix, src is assumed to be CPU memory - * - * If this is CpuMatrix, src is assumed to be CPU memory - */ - virtual void copyFrom(const real* src, size_t size) { - LOG(FATAL) << "Not implemented"; - } - - virtual void copyFrom(const real* src, const int64_t* seq) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief convert a int vector to a real matrix. - * - * (1) source and dest are both in CPU. - * - * (2) sizes are exactly match. 
- */ - virtual void copyFrom(const IVector& src) { - LOG(FATAL) << "copy data from int vector only available on CpuMatrix."; - } - - virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief Create a matrix with the same type (GpuMatrix, CpuMatrix, - * NonValueSparseMatrix, etc.) as this. - * - * If height and width is zero, the new matrix will have the same size - * as this, otherwise the new matrix will have the specified size. - * - */ - virtual MatrixPtr clone(size_t height = 0, - size_t width = 0, - bool useGpu = false) { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - virtual real* getRowBuf(size_t row) { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - virtual real getElement(size_t x, size_t y) const { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual real getSum() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual void accumulateColSum(Matrix& src) { - LOG(FATAL) << "Not implemented"; - } - - virtual real getAbsSum() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - /** - * @note Original data may not be preserved after resize(). - */ - virtual void resize(size_t newHeight, size_t newWidth) = 0; - - /** - * @note This should only be used for sparse matrix. - */ - virtual void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* total item used to allocate space */ - SparseValueType valueType, - SparseFormat format) = 0; - - /** - * @brief This should only be used for sparse matrix. - * - * Currently must be called for each row in order. - * The matrix is not valid until setRow is called for the last row. - */ - virtual void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) = 0; - - virtual MatrixPtr getTranspose() = 0; - - /** - * @brief hard transpose. - * - * allocate matTrans' memory outside, then set memAlloc as false; - * else set as true. 
- */ - virtual void transpose(MatrixPtr matTrans, bool memAlloc) { - LOG(FATAL) << "Not implemented"; - } - - virtual MatrixPtr getInverse() { - LOG(FATAL) << "Not implemented"; - return nullptr; - } - - /** - * @brief inverse. - * - * if allocate matInv's memory outside, then set memAlloc as false; - * else set as true. - */ - virtual void inverse(MatrixPtr matInv, bool memAlloc) { - LOG(FATAL) << "Not implemented"; - } - -public: - /// Only set all variables to 0 or NULL but not free them. - virtual void clear() { - height_ = 0; - width_ = 0; - data_ = NULL; - } - - void reshape(size_t height, size_t width); - - /// add b to each sample of this. - virtual void addBias(Matrix& b, real scale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void addSharedBias(Matrix& b, real scale) { - LOG(FATAL) << "Not implemented"; - } - - void addBias(Matrix& b, real scale, bool sharedBias) { - if (!sharedBias) { - addBias(b, scale); - } else { - addSharedBias(b, scale); - } - } - - /// add each sample from a to this. - virtual void collectBias(Matrix& a, real scale) { - LOG(FATAL) << "Not implemented"; - } - - virtual void collectSharedBias(Matrix& a, real scale) { - LOG(FATAL) << "Not implemented"; - } - - void collectBias(Matrix& a, real scale, bool sharedBias) { - if (!sharedBias) { - collectBias(a, scale); - } else { - collectSharedBias(a, scale); - } - } - - virtual void sequenceAvgForward(Matrix& a, - const IVector& startsPos, - int mode) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = scaleAB*(a*b) + scaleT*this - * @endcode - */ - virtual void mul(const Matrix& a, - const Matrix& b, - real scaleAB, - real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /// Add a vector (column) b to matrix a, column by column. 
- virtual void addColumnVector(const Matrix& b) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * For j < codeLength: - * this(i, j) += vec(index(i, j), 0) - * where index(i, j) = ((codes(i) + numClasses) >> (j + 1)) - 1 - * @endcode - */ - virtual void addByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& vec) { - (void)numClasses; - (void)codes; - (void)vec; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * vec(index(i, j), 0) += this(i, j) - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void addByBitCodeBackward(size_t numClasses, - const IVector& codes, - Matrix& vec) { - (void)numClasses; - (void)codes; - (void)vec; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * this(i, j) += - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& mat, - const Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * mat.row(index(i, j)) += this(i, j) * input.row(i) - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, - Matrix& mat, - const Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength: - * input.row(i) += this(i, j) * mat.row(index(i, j)) - * where index is same as the index for addByBitCode - * @endcode - */ - virtual void mulByBitCodeBackwardError(size_t numClasses, - const IVector& codes, - const Matrix& mat, - Matrix& input) { - (void)numClasses; - (void)codes; - (void)mat; - (void)input; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength - * sum(i, 0) = scaleSum * \sum_j bit(i, j) * this(i, 
j) - * where bit(i, j) = ((codes(i) + numClasses) & 2^j) ? 1 : 0 - * @endcode - */ - virtual void sumByBitCode(size_t numClasses, - IVector& codes, - Matrix& sum, - real scaleSum) { - (void)numClasses; - (void)codes; - (void)sum; - (void)scaleSum; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * For j < codeLength - * this(i, j) -= bit(i, j) - * where bit(i, j) is same as that for sumByBitCode - * @endcode - */ - virtual void subByBitCode(size_t numClasses_, IVector& codes) { - (void)numClasses_; - (void)codes; - LOG(FATAL) << "Not implemeted"; - } - - /** - * add the sum of each row of this to mat - */ - virtual void rowSum(Matrix& sum) { - (void)sum; - LOG(FATAL) << "Not implemeted"; - } - - /** - * set the max of each row of this to mat - */ - virtual void rowMax(Matrix& max) { - (void)max; - LOG(FATAL) << "Not implemeted"; - } - - /** - * set the max of each column of this to mat - */ - virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; } - - /** - * @brief Get the top k elements of each column of this matrix. - * - * The row ids and values of these elements are stored in - * maxIds and max respectively. where k is the size of maxIds. - * And note that the top k elements are not sorted. - */ - virtual void colMax(IVector& maxIds, Matrix& maxVal) { - LOG(FATAL) << "not implemented"; - } - - virtual void maxoutForward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - LOG(FATAL) << "not implemented"; - } - - virtual void maxoutBackward(Matrix& a, - IVector& id, - size_t channels, - size_t groups) { - LOG(FATAL) << "not implemented"; - } - - virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; } - - /** - * @brief Get the top k elements of each row of this matrix. - * - * The column ids and values of these elements are stored in - * maxIds and max respectively. where k is the size of maxIds. - * And note that the top k elements are not sorted. 
- */ - virtual void rowMax(IVector& maxIds, Matrix& max) { - LOG(FATAL) << "Not implemented"; - } - - /// normalize each row so that the sum of each row is 1. - virtual void rowNormalizeL1(Matrix& out) { - (void)out; - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * this = a*b - * @endcode - */ - virtual void mul(const Matrix& a, const Matrix& b) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = scaleAB*(this*b) + scaleT*this - * @endcode - */ - virtual void rightMul(Matrix& b, real scaleAB, real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = this* b - * @endcode - */ - virtual void rightMul(Matrix& b) { LOG(FATAL) << "Not implemented"; } - - /** - * @code - * this = scaleAB*(a*this) + scaleT*this - * @endcode - */ - virtual void leftMul(Matrix& a, real scaleAB, real scaleT) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this = a*this) - * @endcode - */ - virtual void leftMul(Matrix& a) { LOG(FATAL) << "Not implemented"; } - - /// merge the element for each col. - virtual void colMerge(Matrix& src) { LOG(FATAL) << "Not implemented"; } - - /// copy -log(output[label]) to this->data[i]. - virtual void oneHotCrossEntropy(Matrix& output, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the error of outputV according to label. - virtual void oneHotCrossEntropyBp(Matrix& outputV, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /// copy -log(output[label]) to this->data[i]. - virtual void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the error of outputV according to label. 
- virtual void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha) { - LOG(FATAL) << "Not implemented"; - } - - /** - * \f[ - * a[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} b_{i+j} * c_{j} - * \f] - * - * b contains M elements, - * c contains N elements (N is odd), - * b's index arithmetic is computed modulo M, - * c's index arithmetic is computed modulo N. - */ - virtual void circularConv(Matrix& b, Matrix& c) { - LOG(FATAL) << "Not implemented"; - } - - virtual void circularConvDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2) { - LOG(FATAL) << "Not implemented"; - } - - /* output_ij = exp(this_{ij}) / (sum_j exp(this_ij)) */ - virtual void softmax(Matrix& output) { - (void)output; - LOG(FATAL) << "Not implemeted"; - } - virtual void sequenceSoftmax(Matrix& output, const IVector& index) { - (void)output; - LOG(FATAL) << "Not implemeted"; - } - - virtual void softmaxBackward(Matrix& outputV) { - (void)outputV; - LOG(FATAL) << "Not implemeted"; - } - - /* - sum_i = sum_j this_ij * output_ij - this_ij = output_ij* (this_ij - sum_i) - */ - virtual void softmaxDerivative(Matrix& output, Matrix& sftmaxSum) { - LOG(FATAL) << "Not implemented"; - } - - /// calculate the sum of squares diff cost. - virtual void sumOfSquares(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /// gradient of sumOfSquares. 
- virtual void sumOfSquaresBp(Matrix& outputV, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; } - - virtual void tanhDerivative(Matrix& output) { - LOG(FATAL) << "Not implemented"; - } - - virtual void softrelu(Matrix& output) { LOG(FATAL) << "Not implemented"; } - - virtual void softreluDerivative(Matrix& output) { - LOG(FATAL) << "Not implemented"; - } - - virtual void scaledTanh(Matrix& output, real p1, real p2) { - LOG(FATAL) << "Not implemented"; - } - - /** - * cosine similarity, for each row i, - * this[i] = cos(output1[i], output2[i]) - * - * output2 can only have one row, then for each row i, - * this[i] = cos(output1[i], output2[0]) - */ - virtual void cosSim(Matrix& output1, Matrix& output2, real scale = 1.0f) { - LOG(FATAL) << "Not implemented"; - } - - virtual void cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale = 1.0f) { - LOG(FATAL) << "Not implemented"; - } - - /// print out the values of elements to os - virtual void print(std::ostream& os) const { - LOG(FATAL) << "Not implemented"; - } - - /** - * print a part of the matrix - * from the (top,left) value to the (height, width) value (not included) - */ - virtual void print(std::ostream& os, size_t height, size_t width) const { - LOG(FATAL) << "Not implemented"; - } - - /// print one row to os - virtual void printOneRow(std::ostream& os, size_t idx) const { - LOG(FATAL) << "Not implemented"; - } - - virtual void check(std::ostream& os, Matrix& refMat, bool printDiff = true) {} - - virtual real getMin() { - LOG(FATAL) << "Not implemented"; - return 0; - } - virtual real getMax() { - LOG(FATAL) << "Not implemented"; - return 0; - } - - virtual void randomizeUniform() { LOG(FATAL) << "Not implemented"; } - - /** - * @brief calulate the error of classification - * - * output[i] = 1 if row i is an error. 
- * - * output[i] = 0 if row i is correct. - */ - virtual void classificationError(Matrix& output, IVector& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * This function is used to calculate the convolution: - * - * It will expand a feature matrix according to the - * convolution filters - */ - virtual void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * This function is the reverse implementation of convExpand: - * - * Its function is to restore a expanded-matrix into a feature matrix - */ - virtual void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * Pooling forward operation, pick out the largest element - * in the sizeX of value - */ - virtual void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - /// Pooling backward operation. - virtual void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - /// Pooling forward operation, caculate the average of sizeX elements. 
- virtual void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * Input: one or more sequences. Each sequence contains some instances. - * - * Output: output size is the number of input sequences (NOT input - * instances). - * - * output[i] is set to max_input[i]. - */ - virtual void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index) { - LOG(FATAL) << "Not implemeted"; - } - - virtual void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * @code - * this.row[i] += table.row[ids[i]] - * if ids[i] == -1, it will be ignored - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids) { - (void)table; - (void)ids; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * table.row[ids[i]] += this.row[i] - * if ids[i] == -1, it will be ignored - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids) { - (void)table; - (void)ids; - LOG(FATAL) << "Not implemented"; - } - - /** - * @code - * table[i, id[i]] += this[i] - * @endcode - */ - virtual void addElements(Matrix& table, IVector& ids) { - LOG(FATAL) << "Not implemented"; - } - /** - * @brief cross entropy for multi binary labels - * - * @code - * this[i] = 
-sum(label[i][j]*log(output[i][j]) - * + (1-label[i][j])*log(1-output[i][j])) - * @endcode - */ - virtual void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief The gradient of cross entropy for multi binary labels on output - * - * @code - * this[i][j] = -label[i][j]/output[i][j] - * + (1-label[i][j])/(1-output[i][j]) - * @endcode - */ - virtual void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label) { - LOG(FATAL) << "Not implemented"; - } - - /** - * @brief Calculate the classification error for multi binary labels - * - * @code - * this[i] = sum((output[i][j] >= threshold && label[i][j] == 0) - * || (output[i][j] < threshold && label[i][j] == 1)) - * / output->getWidth() - * @endcode - */ - virtual void classificationErrorMulti(Matrix& output, - Matrix& label, - real threshold) { - LOG(FATAL) << "Not implemented"; - } - - virtual void paramReluForward(Matrix& data, Matrix& W) { - LOG(FATAL) << "Not implemented"; - } - virtual void paramReluBackwardW(Matrix& oGrad, Matrix& data) { - LOG(FATAL) << "Not implemented"; - } - virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { - LOG(FATAL) << "Not implemented"; - } - - virtual void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - LOG(FATAL) << "Not implemented"; - } - virtual void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW) { - LOG(FATAL) << "Not implemented"; - } - - template - void operator=(const ExpressionType& expr) { - if (useGpu_) { - TensorGpuApply(*this, expr); - } else { - TensorCpuApply(*this, expr); - } - } - - bool isEmpty() const { - return data_ == nullptr; - } - - explicit operator bool() const 
{ - return !isEmpty(); - } -}; - -inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { - mat.print(os); - return os; -} - -class GpuMatrix : public Matrix { -public: - GpuMatrix(); - - GpuMatrix(size_t height, size_t width, bool trans = false); - GpuMatrix(real* data, size_t height, size_t width, bool trans = false) - : Matrix(data, height, width, trans, true) {} - GpuMatrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false) - : Matrix(data, height, width, stride, trans, true) {} - GpuMatrix(GpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : Matrix(dataHandle, height, width, trans, true) {} - ~GpuMatrix(); - - void zeroMem(); - void resetOne(); - void setDiag(real value); - - void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* used to allocate space */ - SparseValueType valueType, - SparseFormat format) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - - /** - * Copy the data from cpu_memory buffer - */ - void copyFrom(const real* hostSrc, size_t size); - - void copyFrom(const real* hostSrc, const int64_t* seq); - - void copyFrom(const Matrix& src, hl_stream_t stream); - - void copyFrom(const Matrix& src); - - void copyFrom(const IVector& src); - - void copyByRowIndex(Matrix& b, const IVector& rowIndex); - - MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - - real getElement(size_t x, size_t y) const; - - real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } - virtual real* getRowBuf(size_t row) { return getRow(row); } - - real getSum(); - void accumulateColSum(Matrix& src); - real getAbsSum(); - - MatrixPtr getTranspose(); - void transpose(MatrixPtr matTrans, bool memAlloc); - - MatrixPtr getInverse(); - void inverse(MatrixPtr matInv, bool 
memAlloc); - - /// add b to each sample of this. - void addBias(Matrix& b, real scale); - void addSharedBias(Matrix& b, real scale); - - /** - * @code - * add each sample from a to this. - * @endcode - */ - void collectBias(Matrix& a, real scale); - void collectSharedBias(Matrix& a, real scale); - - void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - - /** - * @code - * this.row[i] += table.row[ids[i]] - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids); - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids); - - /** - * @code - * table.row[ids[i]] += this.row[i] - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids); - - void addColumnVector(const Matrix& b); - - /** - * @code - * this = scaleAB*(a*b) + scaleT*this - * @endcode - */ - void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); - - /** - * @code - * this = a*b - * @endcode - */ - void mul(const Matrix& a, const Matrix& b); - - void mul(const GpuMatrix& a, const GpuMatrix& b, real scaleAB, real scaleT); - - void mul(const GpuSparseMatrix& a, - const GpuMatrix& b, - real scaleAB, - real scaleT); - - void mul(const GpuMatrix& a, - const GpuSparseMatrix& b, - real scaleAB, - real scaleT); - - /** - * @code - * this = scaleAB*(this*b) + scaleT*this - * @endcode - */ - void rightMul(Matrix& b, real scaleAB, real scaleT); - - /** - * @code - * this = this* b - * @endcode - */ - void rightMul(Matrix& b); - - /** - * @code - * this = scaleAB*(a*this) + scaleT*this - * @endcode - */ - void leftMul(Matrix& a, real scaleAB, real scaleT); - - /** - * @code - * this = a*this - * @endcode - */ - void leftMul(Matrix& a); - - void colMerge(Matrix& src); - void rowSum(Matrix& sum); - void rowMax(Matrix& max); - void rowMax(IVector& maxIds, Matrix& max); - void colMax(Matrix& max); - void colMax(IVector& maxIds, Matrix& max); - void maxoutForward(Matrix& a, IVector& id, size_t 
channels, size_t groups); - void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); - - void oneHotCrossEntropy(Matrix& output, IVector& label); - void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha); - - void softmax(Matrix& output); - void sequenceSoftmax(Matrix& output, const IVector& index); - void softmaxBackward(Matrix& outputV); - void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); - - /// calculate the sum of squares diff cost. - void sumOfSquares(Matrix& output, Matrix& label); - - /// gradient of sumOfSquares. - void sumOfSquaresBp(Matrix& outputV, Matrix& label); - void tanh(Matrix& output); - void tanhDerivative(Matrix& output); - void softrelu(Matrix& output); - void softreluDerivative(Matrix& output); - void scaledTanh(Matrix& output, real p1, real p2); - - void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale); - - virtual void print(std::ostream& os) const; - virtual void print(std::ostream& os, size_t height, size_t width) const; - - void paramReluForward(Matrix& data, Matrix& W); - void paramReluBackwardW(Matrix& oGrad, Matrix& data); - void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); - - void check(std::ostream& os, Matrix& refMat, bool printDiff = true); - void randomizeUniform(); - - void classificationError(Matrix& output, IVector& label); - - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blochW, - int 
strideH, - int strideW, - int paddingH, - int paddingWreal, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - - void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW); - - void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW); - - void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index); - - void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index); - - void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); - - void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); - - template - void operator=(const ExpressionType& expr) { - TensorGpuApply(*this, 
expr); - } -}; - -class CpuMatrix : public Matrix { -public: - CpuMatrix(size_t height, size_t width, bool trans = false); - CpuMatrix(real* data, size_t height, size_t width, bool trans = false) - : Matrix(data, height, width, trans, false) {} - CpuMatrix(real* data, - size_t height, - size_t width, - size_t stride, - bool trans = false) - : Matrix(data, height, width, stride, trans, false) {} - - CpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : Matrix(dataHandle, height, width, trans, false) {} - - ~CpuMatrix(); - - void zeroMem(); - void resetOne(); - void setDiag(real value); - - void resize(size_t newHeight, size_t newWidth); - void resize(size_t newHeight, - size_t newWidth, - size_t newNnz, /* used to allocate space */ - SparseValueType valueType, - SparseFormat format) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - void setRow(size_t row, - size_t colNum, - const unsigned int* cols, - const real* values) { - LOG(FATAL) << "Only Support Sparse Matrix"; - } - - real getElement(size_t x, size_t y) const; - real getSum(); - void accumulateColSum(Matrix& src); - real getAbsSum(); - - MatrixPtr getTranspose(); - void transpose(MatrixPtr matTrans, bool memAlloc); - - MatrixPtr getInverse(); - void inverse(MatrixPtr matInv, bool memAlloc); - - void copyFrom(const Matrix& src); - - void copyFrom(const Matrix& src, hl_stream_t stream); - - void copyFrom(const real* cpuSrc, size_t size); - - void copyFrom(const real* cpuSrc, const int64_t* seq); - - void copyFrom(const IVector& src); - - void copyFrom(CpuSparseMatrix& src); - - void copyByRowIndex(Matrix& b, const IVector& rowIndex); - - MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blcokH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandFeat, - int 
thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - - void maxPoolForward(Matrix& inputMat, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW); - - void maxPoolBackward(Matrix& image, - size_t imgSizeH, - size_t imgSizeW, - Matrix& outGrad, - Matrix& outV, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void avgPoolForward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t channels, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - size_t paddingH, - size_t paddingW); - - void avgPoolBackward(Matrix& input, - size_t imgSizeH, - size_t imgSizeW, - size_t sizeX, - size_t sizeY, - size_t strideH, - size_t strideW, - size_t outputH, - size_t outputW, - real scaleTargets, - real scaleOutput, - size_t paddingH, - size_t paddingW); - - void maxSequenceForward(Matrix& input, - const IVector& sequence, - IVector& index); - - void maxSequenceBackward(Matrix& outputGrad, - const IVector& sequence, - IVector& index); - - real* getRow(size_t row) { return BaseMatrix::rowBuf(row); } - virtual real* getRowBuf(size_t row) { return getRow(row); } - -public: - /// add b to each sample of this. - void addBias(Matrix& b, real scale); - void addSharedBias(Matrix& b, real scale); - - /// add each sample of a to this. 
- void collectBias(Matrix& a, real scale); - void collectSharedBias(Matrix& a, real scale); - - void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); - - /** - * @code - * this.row[i] += table.row[ids[i]] - * @endcode - */ - virtual void selectRows(Matrix& table, IVector& ids); - - /** - * @code - * table.row[ids[i]] += this.row[i] - * @endcode - */ - virtual void addToRows(Matrix& table, IVector& ids); - - /** - * @code - * this[i] = table[i, id[i]] - * @endcode - */ - virtual void selectElements(Matrix& table, IVector& ids); - - /** - * @code - * table[i, id[i]] += this[i] - * @endcode - */ - virtual void addElements(Matrix& table, IVector& ids); - - /** - * use abstract getRow() to get row from table. - * - * Define table as template instead of virtual class for performance sake. - * internal used by above two virtual funcs. - */ - template - void selectRowsImp(TableMatType& table, IVector& ids); - template - void addToRowsImp(TableMatType& table, IVector& ids); - - void addColumnVector(const Matrix& b); - - void mul(const Matrix& a, const Matrix& b, real scaleAB, real scaleT); - void mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - void mul(CpuMatrix* a, CpuSparseMatrix* b, real scaleAB, real scaleT); - - static void mul(CpuMatrix* a, - CpuMatrix* b, - CpuSparseMatrix* c, - real scaleAB, - real scaleT); - - /** - * c = a * b - * - * use abstract getRow() to get row from B,C. - * Define B,C as template instead of virtual class for performance sake. 
- */ - template - static void mul( - CpuSparseMatrix* a, MatBType* b, MatCType* c, real scaleAB, real scaleT); - - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - - void mul(const Matrix& a, const Matrix& b); - - void rightMul(Matrix& b, real scaleAB, real scaleT); - void rightMul(Matrix& b); - - void leftMul(Matrix& a, real scaleAB, real scaleT); - void leftMul(Matrix& a); - void colMerge(Matrix& src); - void rowSum(Matrix& sum); - void rowMaxId(IVector& maxIds); - void rowMax(Matrix& max); - void rowMax(IVector& maxIds, Matrix& maxVal); - void colMax(Matrix& max); - void colMax(IVector& maxIds, Matrix& maxVal); - void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups); - void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups); - void rowNormalizeL1(Matrix& out); - - void oneHotCrossEntropy(Matrix& output, IVector& label); - void oneHotCrossEntropyBp(Matrix& outputV, IVector& label); - void oneHotCrossEntropyWithSelfNorm(Matrix& output, - IVector& label, - real alpha); - void oneHotCrossEntropyWithSelfNormBp(Matrix& outputV, - IVector& label, - real alpha); - - void circularConv(Matrix& b, Matrix& c); - void circularConvDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2); - - void softmax(Matrix& output); - void sequenceSoftmax(Matrix& output, const IVector& index); - void softmaxDerivative(Matrix& output, Matrix& sftmaxSum); - - /// calculate the sum of squares diff cost. - void sumOfSquares(Matrix& output, Matrix& label); - - /// gradient of sumOfSquares. 
- void sumOfSquaresBp(Matrix& outputV, Matrix& label); - - void tanh(Matrix& output); - void tanhDerivative(Matrix& output); - - void softrelu(Matrix& output); - void softreluDerivative(Matrix& output); - void scaledTanh(Matrix& output, real p1, real p2); - - void cosSim(Matrix& output1, Matrix& output2, real scale); - void cosSimDerivative(Matrix& output, - Matrix& prevOut1, - Matrix& prevOut2, - Matrix& prevGrad1, - Matrix& prevGrad2, - real scale); - - void print(std::ostream& os) const; - void print(std::ostream& os, size_t height, size_t width) const; - void printOneRow(std::ostream& os, size_t idx) const; - - void paramReluForward(Matrix& data, Matrix& W); - void paramReluBackwardW(Matrix& oGrad, Matrix& data); - void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W); - - void check(std::ostream& os, Matrix& refMat, bool printDiff = true); - - real getMin(); - real getMax(); - - void randomizeUniform(); - - void classificationError(Matrix& output, IVector& label); - - void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec); - - void addByBitCodeBackward(size_t numClasses, - const IVector& codes, - Matrix& vec); - - void mulByBitCode(size_t numClasses, - const IVector& codes, - const Matrix& mat, - const Matrix& input); - - void mulByBitCodeBackwardWeight(size_t numClasses, - const IVector& codes, - Matrix& mat, - const Matrix& input); - - void mulByBitCodeBackwardError(size_t numClasses, - const IVector& codes, - const Matrix& mat, - Matrix& input); - - void sumByBitCode(size_t numClasses, - IVector& codes, - Matrix& sum, - real scaleSum); - - void subByBitCode(size_t numClasses_, IVector& codes); - - void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); - void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); - void classificationErrorMulti(Matrix& output, Matrix& label, real threshold); - - void bilinearForward(const Matrix& in, - const size_t inImgH, - const size_t inImgW, - const size_t outImgH, - 
const size_t outImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - void bilinearBackward(const Matrix& out, - const size_t outImgH, - const size_t outImgW, - const size_t inImgH, - const size_t inImgW, - const size_t numChannels, - const real ratioH, - const real ratioW); - - template - void operator=(const ExpressionType& expr) { - TensorCpuApply(*this, expr); - } -}; - -class SharedCpuMatrix : public CpuMatrix { -public: - /* blockNum is number of partitions of the matrix */ - SharedCpuMatrix(int blockNum, size_t height, size_t width, bool trans = false) - : CpuMatrix(height, width, trans) { - initShared(blockNum); - } - SharedCpuMatrix( - int blockNum, real* data, size_t height, size_t width, bool trans = false) - : CpuMatrix(data, height, width, trans) { - initShared(blockNum); - } - - SharedCpuMatrix(int blockNum, - CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : CpuMatrix(dataHandle, height, width, trans) { - initShared(blockNum); - } - - SharedCpuMatrix(CpuMemHandlePtr dataHandle, - size_t height, - size_t width, - bool trans = false) - : CpuMatrix(dataHandle, height, width, trans) { - initBlock(1); - } - - ~SharedCpuMatrix() {} - -public: - virtual void mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB, real scaleT); - virtual void add(Matrix& b, real p1, real p2); - virtual void add(real p1, real p2); - -private: - using Matrix::mul; - void initShared(int blockNum); - void initBlock(int blockNum); - - int blockNum_; - std::vector> blockLocks_; - ThreadLocal localBuf_; - ThreadLocal> localBufRows_; - ThreadLocal> blockSeq_; -}; - -typedef struct { unsigned int col; } sparse_non_value_t; - -typedef struct { - unsigned int col; - float value; -} sparse_float_value_t; - -} // namespace paddle -#include "ExecViaCpu.h" From c33d6417fb075c10f357a6f8fa2b4eca4d75cc9d Mon Sep 17 00:00:00 2001 From: zhouyingfeng Date: Mon, 9 Jan 2017 15:24:52 +0800 Subject: [PATCH 25/41] fix code style error in 
semantic_role_labelling --- demo/semantic_role_labeling/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh index eabfb3fe3d..095bbff2ea 100755 --- a/demo/semantic_role_labeling/test.sh +++ b/demo/semantic_role_labeling/test.sh @@ -38,4 +38,4 @@ paddle train \ --config_args=is_test=1 \ --test_all_data_in_one_period=1 \ 2>&1 | tee 'test.log' -paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1 \ No newline at end of file +paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1 From e14a2288c4d3170d08130420f4506ac40a6e374d Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 9 Jan 2017 15:35:34 +0800 Subject: [PATCH 26/41] Add cache to cmake third_party in TravisCI. --- .travis.yml | 7 ++++++- paddle/scripts/travis/build_and_test.sh | 4 ++-- paddle/scripts/travis/common.sh | 2 ++ paddle/scripts/travis/docs.sh | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index eecf5e81f0..40ac9a9f03 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,9 @@ language: cpp -cache: ccache +cache: + directories: + - $HOME/third_party + - $HOME/.ccache + sudo: required dist: trusty os: @@ -35,6 +39,7 @@ addons: - clang-format-3.8 - automake - libtool + - ccache before_install: - | if [ ${JOB} == "BUILD_AND_TEST" ]; then diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index db98504ba4..fd3aeb02b2 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -6,14 +6,14 @@ if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages export PYTHONHOME=/opt/python/2.7.12 export PATH=/opt/python/2.7.12/bin:${PATH} - cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON + cmake .. 
-DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} NRPOC=`nproc` make -j $NPROC make coveralls sudo make install elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then export PYTHONPATH=/usr/local/lib/python2.7/site-packages - cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON + cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} NPROC=`sysctl -n hw.ncpu` make -j $NPROC fi diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh index 9b6e420ca7..f05c7530a3 100755 --- a/paddle/scripts/travis/common.sh +++ b/paddle/scripts/travis/common.sh @@ -2,3 +2,5 @@ set -e mkdir -p ../../../build cd ../../../build +mkdir -p $HOME/third_party +EXTRA_CMAKE_OPTS="-DTHIRD_PARTY_PATH=${HOME}/third_party" diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh index 8690fe1d40..bdafb145bc 100755 --- a/paddle/scripts/travis/docs.sh +++ b/paddle/scripts/travis/docs.sh @@ -4,7 +4,7 @@ source ./common.sh # Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON +cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS} make paddle_docs paddle_docs_cn # check websites for broken links From af02d7cda84c83b35ab28f740df54b2bb5e38126 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Mon, 9 Jan 2017 16:02:04 +0800 Subject: [PATCH 27/41] Add pip for TravisCI cache --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 40ac9a9f03..0705baa1ac 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,7 @@ cache: directories: - $HOME/third_party - $HOME/.ccache - + - $HOME/.cache/pip sudo: required dist: trusty os: From df9be2d483cc3073e7b8680c1f687654710d2865 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Mon, 9 Jan 2017 22:57:29 +0800 Subject: [PATCH 28/41] fix CrossMapNormalFunc and ContextProjectionFunc(remove inouts argument) --- paddle/function/BufferArg.h | 78 +++++++++++-------- paddle/function/ContextProjectionOp.cpp | 13 ++-- paddle/function/CrossMapNormalOp.cpp | 19 +++-- paddle/function/Function.cpp | 14 ++-- paddle/function/Function.h | 16 ++-- paddle/gserver/layers/ContextProjection.cpp | 10 +-- paddle/gserver/layers/NormProjectionLayer.cpp | 33 ++++---- 7 files changed, 98 insertions(+), 85 deletions(-) diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h index 3d28249f69..6576d18dae 100644 --- a/paddle/function/BufferArg.h +++ b/paddle/function/BufferArg.h @@ -57,58 +57,67 @@ typedef std::shared_ptr BufferArgPtr; * output Buffer or added to the output Buffer is determined by the * argType_ property of the output BufferArg. */ + +// ArgType is only used by output BufferArg. +// For input argument, argType_ is ignored. +// For output argument, need to set the argType_ of the BufferArg. +enum ArgType { + UNSPECIFIED = 0, + ASSIGN_TO = 1, + ADD_TO = 2, +}; class BufferArg { public: - // ArgType is only used by output BufferArg. - // For input argument, argType_ is ignored. 
- // For output argument, need to set the argType_ of the BufferArg. - enum ArgType { - UNSPECIFIED = 0, - ASSIGN_TO = 1, - ADD_TO = 2, - }; - void setArgType(ArgType argType) { argType_ = argType; } ArgType getArgType() const { return argType_; } public: - BufferArg(void* buf, ValueType valueType, const TensorShape& shape) - : buf_(buf), valueType_(valueType), shape_(shape) {} + BufferArg(void* buf, + ValueType valueType, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {} BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {} - BufferArg(const Matrix& matrix) + BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED) : buf_( const_cast(reinterpret_cast(matrix.getData()))), valueType_(DataType::value), - shape_(2) { + shape_(2), + argType_(argType) { shape_.setDim(0, matrix.getHeight()); shape_.setDim(1, matrix.getWidth()); } - BufferArg(const Matrix& matrix, const TensorShape& shape) + BufferArg(const Matrix& matrix, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) : buf_( const_cast(reinterpret_cast(matrix.getData()))), valueType_(DataType::value), - shape_(shape) { + shape_(shape), + argType_(argType) { CHECK_EQ(matrix.getElementCnt(), shape.getElements()); } - BufferArg(const Vector& vector) + BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED) : buf_( const_cast(reinterpret_cast(vector.getData()))), valueType_(DataType::value), - shape_(1) { + shape_(1), + argType_(argType) { shape_.setDim(0, vector.getSize()); } - BufferArg(const IVector& vector) + BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED) : buf_( const_cast(reinterpret_cast(vector.getData()))), valueType_(VALUE_TYPE_INT32), - shape_(1) { + shape_(1), + argType_(argType) { shape_.setDim(0, vector.getSize()); } @@ -163,8 +172,10 @@ protected: // if a < b then value_.buf_[a] < value_.buf_[b] class SequenceIdArg : public BufferArg { public: - 
SequenceIdArg(void* buf, const TensorShape& shape) - : BufferArg(buf, VALUE_TYPE_INT32, shape) { + SequenceIdArg(void* buf, + const TensorShape& shape, + ArgType argType = UNSPECIFIED) + : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) { CHECK_EQ(shape_.ndims(), 1); numSeqs_ = shape_[0] - 1; } @@ -187,11 +198,15 @@ public: SequenceArg(void* buf, ValueType valueType, const TensorShape& shape, - const SequenceIdArg& startPositions) - : BufferArg(buf, valueType, shape), startPositions_(startPositions) {} + const SequenceIdArg& startPositions, + ArgType argType = UNSPECIFIED) + : BufferArg(buf, valueType, shape, argType), + startPositions_(startPositions) {} - SequenceArg(const Matrix& matrix, const IVector& vector) - : BufferArg(matrix), startPositions_(vector) {} + SequenceArg(const Matrix& matrix, + const IVector& vector, + ArgType argType = UNSPECIFIED) + : BufferArg(matrix, argType), startPositions_(vector) {} ~SequenceArg() {} @@ -214,8 +229,9 @@ public: const BufferArg& col, size_t nnz, SparseDataFormat format, - SparseDataType type) - : BufferArg(buf, valueType, shape), + SparseDataType type, + ArgType argType = UNSPECIFIED) + : BufferArg(buf, valueType, shape, argType), row_(row), col_(col), nnz_(nnz), @@ -232,13 +248,13 @@ public: } } - SparseMatrixArg(const CpuSparseMatrix& sparse) - : BufferArg(sparse), + SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED) + : BufferArg(sparse, argType), row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {} - SparseMatrixArg(const GpuSparseMatrix& sparse) - : BufferArg(sparse), + SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED) + : BufferArg(sparse, argType), row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {} diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp index 1a483c4795..b50098c521 
100644 --- a/paddle/function/ContextProjectionOp.cpp +++ b/paddle/function/ContextProjectionOp.cpp @@ -84,12 +84,9 @@ public: begin_pad_ = config.get("begin_pad"); } - void calc(const BufferArgs& inputs, - const BufferArgs& outputs, - const BufferArgs& inouts) override { + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(3, inputs.size()); CHECK_EQ(1, outputs.size()); - CHECK_EQ(0, inouts.size()); CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data()); CHECK_EQ(outputs[0].shape().ndims(), 2); @@ -103,6 +100,7 @@ public: /// input and output has the same batch_size CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); auto out_mat = outputs[0].matrix(); auto in_mat = inputs[0].matrix(); auto w_mat = !inputs[1].data() @@ -194,12 +192,9 @@ public: total_pad_ = config.get("total_pad"); } - void calc(const BufferArgs& inputs, - const BufferArgs& outputs, - const BufferArgs& inouts) override { + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(3, inputs.size()); CHECK_EQ(1, outputs.size()); - CHECK_EQ(0, inouts.size()); CHECK(outputs[0].data() && inputs[2].data()); CHECK_EQ(outputs[0].shape().ndims(), 2); @@ -214,6 +209,8 @@ public: /// dim of output = dim of input * context_length CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); + auto out_grad_mat = outputs[0].matrix(); auto in_grad_mat = !inputs[0].data() ? typename Tensor::Matrix(nullptr, 0, 0) diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp index ec27db9c21..23ee357a53 100644 --- a/paddle/function/CrossMapNormalOp.cpp +++ b/paddle/function/CrossMapNormalOp.cpp @@ -112,6 +112,8 @@ void CrossMapNormalGrad(real* inputsGrad, } /** + * \brief {o_0, o_1} = calc(i_0) + * * \param inputs[0] input value. * \param outputs[0] output value. * \param outputs[1] denoms. 
@@ -125,17 +127,16 @@ public: pow_ = config.get("pow"); } - void calc(const BufferArgs& inputs, - const BufferArgs& outputs, - const BufferArgs& inouts) override { + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(1, inputs.size()); CHECK_EQ(2, outputs.size()); - CHECK_EQ(0, inouts.size()); CHECK_EQ(inputs[0].shape().ndims(), 4); CHECK(inputs[0].shape() == outputs[0].shape()); CHECK(inputs[0].shape() == outputs[1].shape()); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO); size_t samples = inputs[0].shape()[0]; size_t channels = inputs[0].shape()[1]; size_t height = inputs[0].shape()[2]; @@ -160,6 +161,8 @@ private: }; /** + * \brief {o_0} = calc(i_0, i_1, i_2, i_3) + * * \param inputs[0] input value. * \param inputs[1] output value. * \param inputs[2] output grad. @@ -175,12 +178,9 @@ public: pow_ = config.get("pow"); } - void calc(const BufferArgs& inputs, - const BufferArgs& outputs, - const BufferArgs& inouts) override { + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { CHECK_EQ(4, inputs.size()); CHECK_EQ(1, outputs.size()); - CHECK_EQ(0, inouts.size()); CHECK_EQ(inputs[0].shape().ndims(), 4); CHECK(inputs[0].shape() == inputs[1].shape()); @@ -188,6 +188,9 @@ public: CHECK(inputs[0].shape() == inputs[3].shape()); CHECK(inputs[0].shape() == outputs[0].shape()); + // TODO(hedaoyuan): need support ASSIGN_TO mode. 
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO); + size_t samples = inputs[0].shape()[0]; size_t channels = inputs[0].shape()[1]; size_t height = inputs[0].shape()[2]; diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp index 2f56cfc1b5..46af4e9462 100644 --- a/paddle/function/Function.cpp +++ b/paddle/function/Function.cpp @@ -72,16 +72,18 @@ FuncConfig& FuncConfig::set(const std::string& key, bool v) { return *this; } -void BufferArgs::addArg(const Matrix& arg, const TensorShape& shape) { - args_.push_back(std::make_shared(arg, shape)); +void BufferArgs::addArg(const Matrix& arg, + const TensorShape& shape, + ArgType argType) { + args_.push_back(std::make_shared(arg, shape, argType)); } -void BufferArgs::addArg(const CpuSparseMatrix& arg) { - args_.push_back(std::make_shared(arg)); +void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) { + args_.push_back(std::make_shared(arg, argType)); } -void BufferArgs::addArg(const GpuSparseMatrix& arg) { - args_.push_back(std::make_shared(arg)); +void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) { + args_.push_back(std::make_shared(arg, argType)); } ClassRegistrar FunctionBase::funcRegistrar_; diff --git a/paddle/function/Function.h b/paddle/function/Function.h index 88d6824aa3..249f8f9cfa 100644 --- a/paddle/function/Function.h +++ b/paddle/function/Function.h @@ -49,7 +49,7 @@ protected: /** * Argument type for Function::calc(). * A BufferArgs contains a set of BufferArg, - * because Function can have multiple inputs, outputs and inouts. + * because Function can have multiple inputs and outputs. */ class BufferArgs { public: @@ -58,9 +58,11 @@ public: // add argument into BufferArgs // Tensor can be Matrix, Vector, IVector. + // For inputs, do not need argType. + // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO. 
template - void addArg(const Tensor& arg) { - args_.push_back(std::make_shared(arg)); + void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) { + args_.push_back(std::make_shared(arg, argType)); } // Add arg into BufferArgs and reshape the arg. @@ -68,10 +70,12 @@ public: // For example, arg represents an image buffer, // but Matrix can only represent a two-dimensional Tensor. // So need an extra argument to describe the shape of the image buffer. - void addArg(const Matrix& arg, const TensorShape& shape); + void addArg(const Matrix& arg, + const TensorShape& shape, + ArgType argType = UNSPECIFIED); - void addArg(const CpuSparseMatrix& arg); - void addArg(const GpuSparseMatrix& arg); + void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); + void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); // get argument const BufferArg& operator[](size_t num) const { diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp index 26783a42ca..04d06cf33f 100644 --- a/paddle/gserver/layers/ContextProjection.cpp +++ b/paddle/gserver/layers/ContextProjection.cpp @@ -122,14 +122,13 @@ void ContextProjection::forward() { BufferArgs inputs; BufferArgs outputs; - BufferArgs inouts; inputs.addArg(*in_->value); inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr, w_ptr ? w_ptr->getHeight() : 0, input_dim)); inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_)); - outputs.addArg(*out_->value); - forward_[0]->calc(inputs, outputs, inouts); + outputs.addArg(*out_->value, ADD_TO); + forward_[0]->calc(inputs, outputs); if (state_ && config_.context_start() < 0) { CHECK_EQ(1, in_->getNumSequences()); @@ -166,15 +165,14 @@ void ContextProjection::backward(const UpdateCallback& callback) { BufferArgs inputs; BufferArgs outputs; - BufferArgs inouts; inputs.addArg(CpuMatrix( in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim)); inputs.addArg(CpuMatrix(w_ptr ? 
w_ptr->getData() : nullptr, w_ptr ? w_ptr->getHeight() : 0, input_dim)); inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_)); - outputs.addArg(*out_->grad); - backward_[0]->calc(inputs, outputs, inouts); + outputs.addArg(*out_->grad, ADD_TO); + backward_[0]->calc(inputs, outputs); if (config_.trainable_padding()) { weight_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp index 573de152fd..4331009de7 100644 --- a/paddle/gserver/layers/NormProjectionLayer.cpp +++ b/paddle/gserver/layers/NormProjectionLayer.cpp @@ -59,7 +59,6 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap, void CMRProjectionNormLayer::forward(PassType passType) { Layer::forward(passType); - /* malloc memory for the output_ if necessary */ /* note: one sample correspond to one row */ MatrixPtr input = inputLayers_[0]->getOutputValue(); @@ -67,42 +66,36 @@ void CMRProjectionNormLayer::forward(PassType passType) { int size = getSize(); resetOutput(batchSize, size); - MatrixPtr outV = getOutputValue(); - Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_); shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_}); + // prepare forward arguments BufferArgs inputs; BufferArgs outputs; - BufferArgs inouts; - inputs.addArg(*input, shape_); - outputs.addArg(*outV, shape_); - outputs.addArg(*denoms_, shape_); + inputs.addArg(*getInputValue(0), shape_); + outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO); + outputs.addArg(*denoms_, shape_, ASSIGN_TO); - forward_[0]->calc(inputs, outputs, inouts); + forward_[0]->calc(inputs, outputs); } void CMRProjectionNormLayer::backward(const UpdateCallback& callback) { (void)callback; - if (NULL == inputLayers_[0]->getOutputGrad()) { + if (NULL == getInputGrad(0)) { return; } - /* Do derivation */ - MatrixPtr preOutGrad = inputLayers_[0]->getOutputGrad(); - MatrixPtr localGrad = getOutputGrad(); - MatrixPtr 
localOutV = getOutputValue(); - MatrixPtr preOutV = inputLayers_[0]->getOutputValue(); + // prepare backward arguments BufferArgs inputs; BufferArgs outputs; - BufferArgs inouts; - inputs.addArg(*preOutV, shape_); - inputs.addArg(*localOutV, shape_); - inputs.addArg(*localGrad, shape_); + inputs.addArg(*getInputValue(0), shape_); + inputs.addArg(*getOutputValue(), shape_); + inputs.addArg(*getOutputGrad(), shape_); inputs.addArg(*denoms_, shape_); - outputs.addArg(*preOutGrad, shape_); - backward_[0]->calc(inputs, outputs, inouts); + outputs.addArg(*getInputGrad(0), shape_, ADD_TO); + + backward_[0]->calc(inputs, outputs); } } // namespace paddle From 8b64446cd3d7d784ea16828ad5c37d8cd3237c7b Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 10 Jan 2017 12:42:22 +0800 Subject: [PATCH 29/41] Update build and install docs --- .../build_and_install/build_from_source_en.md | 99 +++++-------------- 1 file changed, 25 insertions(+), 74 deletions(-) diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index aaa07d49d3..1980c3624d 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -11,32 +11,21 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle ```bash git clone https://github.com/PaddlePaddle/Paddle paddle cd paddle -git submodule update --init --recursive ``` - -If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last line of the above codes in your PaddlePaddle home directory to initialize your submodule folder. 
- -If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command -``` -git submodule update --remote -``` - ## Requirements To compile the source code, your computer must be equipped with the following dependencies. - **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) -- **CMake**: version >= 2.8 +- **CMake**: version >= 3.0 - **BLAS**: MKL, OpenBlas or ATLAS -- **Protocol Buffers**: version >= 2.4, **Note: 3.x is not supported** -- **Python**: only python 2.7 is supported currently **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! For CUDA 8.0, GCC versions later than 5.3 are not supported! ### Options -PaddlePaddle supports some build options. To enable it, first you need to install the related libraries. +PaddlePaddle supports some build options. @@ -47,12 +36,21 @@ PaddlePaddle supports some build options. To enable it, first you need to instal - - - - - - + + + + + + + + + + + + + + +
WITH_GPUCompile with GPU mode.
WITH_DOUBLECompile with double precision floating-point, default: single precision.
WITH_TESTINGCompile with gtest for PaddlePaddle's unit testing.
WITH_DOC Compile to generate PaddlePaddle's docs, default: disabled (OFF).
WITH_SWIG_PYCompile with python predict API, default: disabled (OFF).
WITH_STYLE_CHECKCompile with code style check, default: enabled (ON).
WITH_GPUCompile PaddlePaddle with NVIDIA GPU
WITH_AVXCompile PaddlePaddle with AVX intrinsics
WITH_DSOCompile PaddlePaddle with dynamic linked CUDA
WITH_TESTINGCompile PaddlePaddle with unit testing
WITH_SWIG_PYCompile PaddlePaddle with inference api
WITH_STYLE_CHECKCompile PaddlePaddle with style check
WITH_PYTHONCompile PaddlePaddle with python interpreter
WITH_DOUBLECompile PaddlePaddle with double precision
WITH_RDMACompile PaddlePaddle with RDMA support
WITH_TIMERCompile PaddlePaddle with stats timer
WITH_PROFILERCompile PaddlePaddle with GPU profiler
WITH_DOCCompile PaddlePaddle with documentation
ON_COVERALLSCompile PaddlePaddle with code coverage
COVERALLS_UPLOADPackage code coverage data to coveralls
ON_TRAVISExclude special unit test on Travis CI
@@ -66,14 +64,9 @@ As a simple example, consider the following: 1. **Python Dependencies(optional)** - To compile PaddlePaddle with python predict API, make sure swig installed and set `-DWITH_SWIG_PY=ON` as follows: + To compile PaddlePaddle with python predict API, set `-DWITH_SWIG_PY=ON` as follows: ```bash - # install swig on ubuntu - sudo apt-get install swig - # install swig on Mac OS X - brew install swig - # active swig in cmake cmake .. -DWITH_SWIG_PY=ON ``` @@ -104,17 +97,9 @@ As a simple example, consider the following: ```bash # necessary sudo apt-get update - sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git - # optional - sudo apt-get install libgoogle-glog-dev - sudo apt-get install libgflags-dev - sudo apt-get install libgtest-dev - sudo pip install wheel - pushd /usr/src/gtest - cmake . - make - sudo cp *.a /usr/lib - popd + sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git + sudo pip install wheel numpy + sudo pip install 'protobuf>=3.0.0' ``` - **GPU Dependencies (optional)** @@ -149,51 +134,17 @@ As usual, the best option is to create build folder under paddle project directo ```bash mkdir build && cd build -cmake .. -``` - -CMake first check PaddlePaddle's dependencies in system default path. After installing some optional -libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags). -If still not found, you can manually set it based on CMake error information from your screen. - -As a simple example, consider the following: +``` -- **Only CPU with swig** - - ```bash - cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON - ``` -- **GPU with swig** - - ```bash - cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON - ``` - -- **GPU with doc and swig** - - ```bash - cmake .. 
-DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON - ``` - -Finally, you can build PaddlePaddle: +Finally, you can build and install PaddlePaddle: ```bash # you can add build option here, such as: -cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX= -DWITH_SWIG_PY=ON +cmake .. -DCMAKE_INSTALL_PREFIX= # please use sudo make install, if you want to install PaddlePaddle into the system make -j `nproc` && make install # set PaddlePaddle installation path in ~/.bashrc export PATH=/bin:$PATH -``` - -If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed. -Otherwise, PaddlePaddle will automatically install python dependencies -at first time when user run paddle commands, such as `paddle version`, `paddle train`. -It may require sudo privileges: - -```bash -# you can run +# install PaddlePaddle Python modules. sudo pip install /opt/paddle/share/wheels/*.whl -# or just run -sudo paddle version ``` From ae4400beda6ce14e78d137ff60da4196f7e6c70c Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 10 Jan 2017 13:17:23 +0800 Subject: [PATCH 30/41] Bug fix for mac os --- paddle/function/BufferArg.h | 12 ++++++------ paddle/function/ContextProjectionOp.cpp | 8 ++++---- paddle/function/CrossMapNormalOp.cpp | 4 ++-- paddle/function/TensorShape.h | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h index 6576d18dae..9649913fa8 100644 --- a/paddle/function/BufferArg.h +++ b/paddle/function/BufferArg.h @@ -126,7 +126,7 @@ public: CHECK(buf_); CHECK(valueType_ == DataType::value); // CHECK(deviceType_ == DType); - CHECK_EQ(2, shape_.ndims()); + CHECK_EQ((size_t)2, shape_.ndims()); return typename Tensor::Matrix( reinterpret_cast(buf_), shape_[0], shape_[1]); } @@ -136,7 +136,7 @@ public: CHECK(buf_); CHECK(valueType_ == DataType::value); // CHECK(deviceType_ == DType); - CHECK_EQ(1, shape_.ndims()); + CHECK_EQ((size_t)1, shape_.ndims()); return typename Tensor::Vector( shape_[0], 
reinterpret_cast(buf_)); } @@ -176,7 +176,7 @@ public: const TensorShape& shape, ArgType argType = UNSPECIFIED) : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) { - CHECK_EQ(shape_.ndims(), 1); + CHECK_EQ(shape_.ndims(), (size_t)1); numSeqs_ = shape_[0] - 1; } @@ -238,9 +238,9 @@ public: format_(format), type_(type) { CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); - CHECK_EQ(shape_.ndims(), 2); - CHECK_EQ(row_.shape().ndims(), 1); - CHECK_EQ(col_.shape().ndims(), 1); + CHECK_EQ(shape_.ndims(), (size_t)2); + CHECK_EQ(row_.shape().ndims(), (size_t)1); + CHECK_EQ(col_.shape().ndims(), (size_t)1); if (format == SPARSE_CSR_FORMAT) { CHECK_EQ(nnz, col.shape()[0]); } else if (format == SPARSE_CSC_FORMAT) { diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp index ca7a11f936..cb448562eb 100644 --- a/paddle/function/ContextProjectionOp.cpp +++ b/paddle/function/ContextProjectionOp.cpp @@ -85,8 +85,8 @@ public: } void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(3, inputs.size()); - CHECK_EQ(1, outputs.size()); + CHECK_EQ((size_t)3, inputs.size()); + CHECK_EQ((size_t)1, outputs.size()); CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data()); CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); @@ -193,8 +193,8 @@ public: } void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ(3, inputs.size()); - CHECK_EQ(1, outputs.size()); + CHECK_EQ((size_t)3, inputs.size()); + CHECK_EQ((size_t)1, outputs.size()); CHECK(outputs[0].data() && inputs[2].data()); CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp index cf98946840..92980c503f 100644 --- a/paddle/function/CrossMapNormalOp.cpp +++ b/paddle/function/CrossMapNormalOp.cpp @@ -131,7 +131,7 @@ public: CHECK_EQ((size_t)1, inputs.size()); CHECK_EQ((size_t)2, outputs.size()); - 
CHECK_EQ(inputs[0].shape().ndims(), 4); + CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); CHECK(inputs[0].shape() == outputs[0].shape()); CHECK(inputs[0].shape() == outputs[1].shape()); @@ -182,7 +182,7 @@ public: CHECK_EQ((size_t)4, inputs.size()); CHECK_EQ((size_t)1, outputs.size()); - CHECK_EQ(inputs[0].shape().ndims(), 4); + CHECK_EQ(inputs[0].shape().ndims(), (size_t)4); CHECK(inputs[0].shape() == inputs[1].shape()); CHECK(inputs[0].shape() == inputs[2].shape()); CHECK(inputs[0].shape() == inputs[3].shape()); diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h index 0333fe1831..e491e3f1d6 100644 --- a/paddle/function/TensorShape.h +++ b/paddle/function/TensorShape.h @@ -42,14 +42,14 @@ public: // get the size of specified dimension size_t operator[](size_t dim) const { - CHECK_GE(dim, 0); + CHECK_GE(dim, (size_t)0); CHECK_LT(dim, ndims_); return dims_[dim]; } // set the size of specified dimension void setDim(size_t dim, size_t size) { - CHECK_GE(dim, 0); + CHECK_GE(dim, (size_t)0); CHECK_LT(dim, ndims_); dims_[dim] = size; numElements(); From 1e6c917ecc76905fa0f33090d757f2b6376e437f Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 10 Jan 2017 15:30:24 +0800 Subject: [PATCH 31/41] fix unit test of paramRelu --- paddle/math/Matrix.cpp | 24 ++++++++++++++++++------ paddle/math/tests/test_Matrix.cpp | 5 +++-- paddle/math/tests/test_matrixCompare.cpp | 5 +++-- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 90813a8996..3ae237bc7d 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1311,7 +1311,9 @@ void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) { real* w = W.getData(); size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); - size_t partial_sum = numElements / (W.getHeight() * W.getWidth()); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + 
size_t partial_sum = numElements / paraSize; real* output = getData(); hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum); } @@ -1324,7 +1326,9 @@ void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { real* wgrad = data_; size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); - size_t partial_sum = numElements / (this->getHeight() * this->getWidth()); + size_t paraSize = this->getHeight() * this->getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; hl_param_relu_backward_w( wgrad, ograd, input, numElements, numSamples, partial_sum); } @@ -1336,7 +1340,9 @@ void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { real* w = W.getData(); size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); - size_t partial_sum = numElements / (W.getHeight() * W.getWidth()); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; hl_param_relu_backward_diff( ograd, input, w, diff, numElements, numSamples, partial_sum); } @@ -3764,7 +3770,9 @@ void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) { real* w = W.getData(); size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); - size_t partial_sum = numElements / (W.getHeight() * W.getWidth()); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; for (size_t n = 0, k = 0; n < numSamples; ++n) { for (size_t i = 0; i < numElements; ++i, ++k) { data_[k] = input[k] > 0 ? 
input[k] : input[k] * w[i / partial_sum]; @@ -3778,7 +3786,9 @@ void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) { real* wgrad = data_; size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); - size_t partial_sum = numElements / (this->getHeight() * this->getWidth()); + size_t paraSize = this->getHeight() * this->getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; for (size_t n = 0, k = 0; n < numSamples; ++n) { for (size_t i = 0; i < numElements; ++i, ++k) { wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]); @@ -3793,7 +3803,9 @@ void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { real* w = W.getData(); size_t numElements = data.getWidth(); size_t numSamples = data.getHeight(); - size_t partial_sum = numElements / (W.getHeight() * W.getWidth()); + size_t paraSize = W.getHeight() * W.getWidth(); + CHECK(!(numElements % paraSize)); // this check from ParameterReluLayer::init + size_t partial_sum = numElements / paraSize; for (size_t n = 0, k = 0; n < numSamples; ++n) { for (size_t i = 0; i < numElements; ++i, ++k) { diff[k] += ograd[k] * (input[k] > 0 ? 
1 : w[i / partial_sum]); diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp index 6899769144..a4084bdf7c 100644 --- a/paddle/math/tests/test_Matrix.cpp +++ b/paddle/math/tests/test_Matrix.cpp @@ -224,10 +224,11 @@ void testParamReluBackwardW(int height, int width, int w_height, int w_width) { } TEST(Matrix, paramRelu) { - for (auto height : {10, 100}) { - for (auto width : {10, 100}) { + for (auto height : {10, 40, 100}) { + for (auto width : {10, 40, 100}) { for (auto w_height : {1, 2}) { for (auto w_width : {1, 2}) { + if (width % (w_height * w_width)) continue; testParamReluForward(height, width, w_height, w_width); testParamReluBackwardW(height, width, w_height, w_width); } diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 3a780d26c0..f0c49791d7 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -773,10 +773,11 @@ void testParamReluBackwardDiff(int height, } TEST(Matrix, paramReluBackwardDiff) { - for (auto height : {10, 100}) { - for (auto width : {10, 100}) { + for (auto height : {10, 40, 100}) { + for (auto width : {10, 40, 100}) { for (auto w_height : {1, 2}) { for (auto w_width : {1, 2}) { + if (width % (w_height * w_width)) continue; testParamReluBackwardDiff(height, width, w_height, w_width); } } From bce067b002eb1660ea25d60bb1251376b3f157f6 Mon Sep 17 00:00:00 2001 From: Yu Yang Date: Tue, 10 Jan 2017 15:41:22 +0800 Subject: [PATCH 32/41] Make gflags third_party path configurable --- cmake/external/gflags.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index d38b7d1ba2..2a49d76eb3 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -14,8 +14,8 @@ INCLUDE(ExternalProject) -SET(GFLAGS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gflags) -SET(GFLAGS_INSTALL_DIR 
${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gflags) +SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags) +SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) IF(WIN32) set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) From f953a6e669108e55a388c1e0448ea84b3fe1c42e Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 10 Jan 2017 16:04:15 +0800 Subject: [PATCH 33/41] Add CBLAS in build doc --- .../build_and_install/build_from_source_en.md | 7 ++- .../cmake/build_from_source_cn.rst | 43 ------------------- .../cmake/cblas_settings.csv | 5 --- .../cmake/compile_options.csv | 12 ------ 4 files changed, 3 insertions(+), 64 deletions(-) delete mode 100644 doc/getstarted/build_and_install/cmake/build_from_source_cn.rst delete mode 100644 doc/getstarted/build_and_install/cmake/cblas_settings.csv delete mode 100644 doc/getstarted/build_and_install/cmake/compile_options.csv diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index 1980c3624d..6c2b6e794e 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -62,13 +62,12 @@ PaddlePaddle supports some build options. As a simple example, consider the following: -1. **Python Dependencies(optional)** +1. **BLAS Dependencies(optional)** - To compile PaddlePaddle with python predict API, set `-DWITH_SWIG_PY=ON` as follows: + Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `CBLAS_INC_DIR` and `CBLAS_LIBRARIES`. ```bash - # active swig in cmake - cmake .. -DWITH_SWIG_PY=ON + cmake .. -DCBLAS_INC_DIR= -DCBLAS_LIBRARIES= ``` 2. 
**Doc Dependencies(optional)** diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst deleted file mode 100644 index 3a52c8723b..0000000000 --- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst +++ /dev/null @@ -1,43 +0,0 @@ -PaddlePaddle的编译选项 -====================== - -PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 `_ 。 - -Bool型的编译选项 ----------------- -用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如 - -.. code-block:: bash - - cmake .. -DWITH_GPU=OFF - -.. csv-table:: Bool型的编译选项 - :widths: 1, 7, 2 - :file: compile_options.csv - -BLAS/CUDA/Cudnn的编译选项 --------------------------- -BLAS -+++++ - -PaddlePaddle支持以下任意一种BLAS库:`MKL `_ ,`ATLAS `_ ,`OpenBlAS `_ 和 `REFERENCE BLAS `_ 。 - -.. csv-table:: BLAS路径相关的编译选项 - :widths: 1, 2, 7 - :file: cblas_settings.csv - -CUDA/Cudnn -+++++++++++ - -PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。 - -编译选项的设置 -++++++++++++++ - -PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 - -.. code-block:: bash - - cmake .. 
-DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5 - -注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。 \ No newline at end of file diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv deleted file mode 100644 index a6356baf16..0000000000 --- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv +++ /dev/null @@ -1,5 +0,0 @@ -编译选项,描述,注意 -MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。 -ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。 -OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。 -REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。 \ No newline at end of file diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv deleted file mode 100644 index 463b825470..0000000000 --- a/doc/getstarted/build_and_install/cmake/compile_options.csv +++ /dev/null @@ -1,12 +0,0 @@ -选项,说明,默认值 -WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链 -WITH_DOUBLE,是否使用双精度浮点数。,否 -WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是 -WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是 -WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是 -WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 -WITH_RDMA,是否开启RDMA,否 -WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否 -WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST -WITH_DOC,是否编译中英文文档,否 -WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG \ No newline at end of file From 19a0a32f62deb98a5a05e129e8024131ecf447cd Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 10 Jan 2017 16:40:50 +0800 Subject: [PATCH 34/41] revert build cn docs --- .../cmake/build_from_source_cn.rst | 43 +++++++++++++++++++ .../cmake/cblas_settings.csv | 5 +++ .../cmake/compile_options.csv | 12 ++++++ 3 files changed, 60 insertions(+) create 
mode 100644 doc/getstarted/build_and_install/cmake/build_from_source_cn.rst create mode 100644 doc/getstarted/build_and_install/cmake/cblas_settings.csv create mode 100644 doc/getstarted/build_and_install/cmake/compile_options.csv diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst new file mode 100644 index 0000000000..be0c1ffa45 --- /dev/null +++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst @@ -0,0 +1,43 @@ +PaddlePaddle的编译选项 +====================== + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 `_ 。 + +Bool型的编译选项 +---------------- +用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: Bool型的编译选项 + :widths: 1, 7, 2 + :file: compile_options.csv + +BLAS/CUDA/Cudnn的编译选项 +-------------------------- +BLAS ++++++ + +PaddlePaddle支持以下任意一种BLAS库:`MKL `_ ,`ATLAS `_ ,`OpenBlAS `_ 和 `REFERENCE BLAS `_ 。 + +.. csv-table:: BLAS路径相关的编译选项 + :widths: 1, 2, 7 + :file: cblas_settings.csv + +CUDA/Cudnn ++++++++++++ + +PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。 + +编译选项的设置 +++++++++++++++ + +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. 
-DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5 + +注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。 diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv new file mode 100644 index 0000000000..a6356baf16 --- /dev/null +++ b/doc/getstarted/build_and_install/cmake/cblas_settings.csv @@ -0,0 +1,5 @@ +编译选项,描述,注意 +MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。 +ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。 +OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。 +REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。 \ No newline at end of file diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv new file mode 100644 index 0000000000..463b825470 --- /dev/null +++ b/doc/getstarted/build_and_install/cmake/compile_options.csv @@ -0,0 +1,12 @@ +选项,说明,默认值 +WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链 +WITH_DOUBLE,是否使用双精度浮点数。,否 +WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是 +WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是 +WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是 +WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 +WITH_RDMA,是否开启RDMA,否 +WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否 +WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST +WITH_DOC,是否编译中英文文档,否 +WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG \ No newline at end of file From d623378ea0bb97b466c82980b31ee98d7c2d88e9 Mon Sep 17 00:00:00 2001 From: hedaoyuan Date: Tue, 10 Jan 2017 17:57:32 +0800 Subject: [PATCH 35/41] fix some warning about CpuSparseMatrix --- paddle/function/BufferArg.cpp | 11 +++++++++++ paddle/function/BufferArg.h | 12 ++---------- paddle/function/BufferArgTest.cpp | 1 + 3 files changed, 14 insertions(+), 10 deletions(-) diff --git 
a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp index 65c6f30304..fde48a73b6 100644 --- a/paddle/function/BufferArg.cpp +++ b/paddle/function/BufferArg.cpp @@ -15,6 +15,7 @@ limitations under the License. */ #include #include "BufferArg.h" +#include "paddle/math/SparseMatrix.h" namespace paddle { @@ -28,4 +29,14 @@ const SparseMatrixArg& BufferArg::sparse() const { return dynamic_cast(*this); } +SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType) + : BufferArg(sparse, argType), + row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), + col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {} + +SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType) + : BufferArg(sparse, argType), + row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), + col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {} + } // namespace paddle diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h index 9649913fa8..12352ba29e 100644 --- a/paddle/function/BufferArg.h +++ b/paddle/function/BufferArg.h @@ -18,9 +18,7 @@ limitations under the License. 
*/ #include "TensorShape.h" #include "TensorType.h" -#include "paddle/math/CpuSparseMatrix.h" #include "paddle/math/Matrix.h" -#include "paddle/math/SparseMatrix.h" namespace paddle { @@ -248,15 +246,9 @@ public: } } - SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED) - : BufferArg(sparse, argType), - row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), - col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {} + SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); - SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED) - : BufferArg(sparse, argType), - row_(reinterpret_cast(sparse.getRows()), VALUE_TYPE_INT32), - col_(reinterpret_cast(sparse.getCols()), VALUE_TYPE_INT32) {} + SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED); ~SparseMatrixArg() {} diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp index a9ee3ab079..b345597435 100644 --- a/paddle/function/BufferArgTest.cpp +++ b/paddle/function/BufferArgTest.cpp @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include #include "Function.h" #include "paddle/math/MemoryHandle.h" +#include "paddle/math/SparseMatrix.h" namespace paddle { From 298a6cd35c3d693af2b040ae4b08721dea3497ad Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 10 Jan 2017 18:00:24 +0800 Subject: [PATCH 36/41] Upgrade python protobuf to 3.0.0 --- cmake/external/python.cmake | 4 ++++ cmake/python_module.cmake | 13 +++++++++++++ paddle/setup.py.in | 2 +- 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index cbb6940221..fc681453ef 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -26,6 +26,10 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) find_python_module(wheel REQUIRED) find_python_module(google.protobuf REQUIRED) FIND_PACKAGE(NumPy REQUIRED) + IF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") + MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " + "please use pip to upgrade protobuf.") + ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) ##################################### PYTHON ######################################## SET(PYTHON_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/python) diff --git a/cmake/python_module.cmake b/cmake/python_module.cmake index 2eb3441428..1412b7f7f2 100644 --- a/cmake/python_module.cmake +++ b/cmake/python_module.cmake @@ -26,5 +26,18 @@ function(find_python_module module) if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED) message(FATAL_ERROR "python module ${module} is not found") endif() + + execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import sys, ${module}; sys.stdout.write(${module}.__version__)" + OUTPUT_VARIABLE _${module}_version + RESULT_VARIABLE _${module}_status + ERROR_QUIET + OUTPUT_STRIP_TRAILING_WHITESPACE) + if(NOT _${module}_status) + set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING + "Version of Python module ${module}") + 
endif(NOT _${module}_status) + set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE) + set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE) endfunction(find_python_module) diff --git a/paddle/setup.py.in b/paddle/setup.py.in index e3650bf1c0..c79666bc81 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -70,6 +70,6 @@ setup(name="py_paddle", include_dirs = include_dirs, install_requires = [ 'numpy>=1.8.0', # The numpy is required. - 'protobuf>=2.4.1' # The paddle protobuf version + 'protobuf>=3.0.0' # The paddle protobuf version ], ) From 80c1679284c898bc8126dee255ca00c1e111a414 Mon Sep 17 00:00:00 2001 From: liaogang Date: Tue, 10 Jan 2017 21:54:43 +0800 Subject: [PATCH 37/41] Fix cudart bugs before initMain --- cmake/util.cmake | 1 + paddle/api/paddle_api_config.py.in | 2 +- paddle/cuda/include/hl_dso_loader.h | 8 -- paddle/cuda/src/hl_cuda_device.cc | 172 +++++++----------------- paddle/cuda/src/hl_cudart_wrap.cc | 200 ---------------------------- paddle/cuda/src/hl_dso_loader.cc | 14 +- 6 files changed, 51 insertions(+), 346 deletions(-) delete mode 100644 paddle/cuda/src/hl_cudart_wrap.cc diff --git a/cmake/util.cmake b/cmake/util.cmake index 7da52bb758..24ad5c815c 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -108,6 +108,7 @@ function(link_paddle_exe TARGET_NAME) endif() if(WITH_GPU) + target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY}) if(NOT WITH_DSO OR WITH_METRIC) target_link_libraries(${TARGET_NAME} ${CUDNN_LIBRARY} diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in index e11ee92036..82f45ba6cc 100644 --- a/paddle/api/paddle_api_config.py.in +++ b/paddle/api/paddle_api_config.py.in @@ -13,5 +13,5 @@ GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@" GFLAGS_LOCATION="@GFLAGS_LOCATION@" CBLAS_LIBRARIES="@CBLAS_LIBRARIES@" -CUDA_LIBRARIES="@CUDA_cudart_shared_LIBRARY@" +CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@" WITH_COVERALLS="@ON_COVERALLS@" diff --git 
a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h index 20c13f21e6..276a07d3c7 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/cuda/include/hl_dso_loader.h @@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle); */ void GetCudnnDsoHandle(void** dso_handle); -/** - * @brief load the DSO of CUDA Run Time - * - * @param **dso_handle dso handler - * - */ -void GetCudartDsoHandle(void** dso_handle); - /** * @brief load the DSO of CURAND * diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index a71eecba27..6dfb12e00b 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -22,10 +22,9 @@ limitations under the License. */ #include #include #include -#include "hl_cuda.h" #include "hl_cuda.ph" -#include "hl_dso_loader.h" #include "hl_thread.ph" +#include "hl_dso_loader.h" #include "paddle/utils/Logging.h" // clang-format on @@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) #undef CURAND_RAND_ROUTINE_EACH #undef DYNAMIC_LOAD_CURAND_WRAP -std::once_flag cudart_dso_flag; -void *cudart_dso_handle = nullptr; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cuda routine - * via operator overloading. - * - * note: default dynamic linked libs - */ -#ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDART_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... args) -> decltype(__name(args...)) { \ - using cudart_func = decltype(__name(args...)) (*)(Args...); \ - std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \ - void *p_##__name = dlsym(cudart_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ -#else -#define DYNAMIC_LOAD_CUDART_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - auto operator()(Args... 
args) -> decltype(__name(args...)) { \ - return __name(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ -#endif - -/* include all needed cuda functions in HPPL */ -// clang-format off -#define CUDA_ROUTINE_EACH(__macro) \ - __macro(cudaMalloc) \ - __macro(cudaHostAlloc) \ - __macro(cudaFree) \ - __macro(cudaFreeHost) \ - __macro(cudaMemcpy) \ - __macro(cudaMemset) \ - __macro(cudaMemcpyAsync) \ - __macro(cudaSetDevice) \ - __macro(cudaGetDevice) \ - __macro(cudaGetDeviceCount) \ - __macro(cudaGetDeviceProperties) \ - __macro(cudaDeviceSynchronize) \ - __macro(cudaDeviceCanAccessPeer) \ - __macro(cudaDeviceEnablePeerAccess) \ - __macro(cudaStreamCreate) \ - __macro(cudaStreamDestroy) \ - __macro(cudaStreamSynchronize) \ - __macro(cudaStreamWaitEvent) \ - __macro(cudaEventCreate) \ - __macro(cudaEventRecord) \ - __macro(cudaEventQuery) \ - __macro(cudaEventDestroy) \ - __macro(cudaEventSynchronize) \ - __macro(cudaEventElapsedTime) \ - __macro(cudaSetDeviceFlags) \ - __macro(cudaGetLastError) \ - __macro(cudaFuncSetCacheConfig) \ - __macro(cudaRuntimeGetVersion) \ - __macro(cudaGetErrorString) \ - __macro(cudaProfilerStart) \ - __macro(cudaProfilerStop) -// clang-format on - -CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP) - -#undef CUDA_ROUNTINE_EACH -#undef DYNAMIC_LOAD_CUDART_WRAP - } /* namespace dynload */ /** @@ -171,11 +98,11 @@ int g_cuda_lib_version = 0; * Check build-in cuda function using glog and it **does not** * support << operator for more details error info. 
*/ -#define CHECK_CUDA(cudaFunc) \ - do { \ - cudaError_t cudaStat = cudaFunc; \ - CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ - << dynload::cudaGetErrorString(cudaStat); \ +#define CHECK_CUDA(cudaFunc) \ + do { \ + cudaError_t cudaStat = cudaFunc; \ + CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: " \ + << cudaGetErrorString(cudaStat); \ } while (0) /** @@ -284,13 +211,13 @@ void hl_fini() { tmp_stream = (char *)t_device[dev]->stream; } for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { - CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j])); + CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j])); } /* free device memory */ hl_free_mem_device(t_device[dev]->gpu_mem); hl_free_mem_host(t_device[dev]->cpu_mem); - CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event)); + CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event)); } free(tmp); @@ -308,7 +235,7 @@ void hl_set_device(int device) { CHECK(device >= 0 && device < g_system_device_num && g_device[device]) << "Device: " << device << " is not specified in startup."; - CHECK_CUDA(dynload::cudaSetDevice(device)); + CHECK_CUDA(cudaSetDevice(device)); /* switch thread stream */ for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) { @@ -336,7 +263,7 @@ void hl_set_device(int device) { int hl_get_device() { int device; - CHECK_CUDA(dynload::cudaGetDevice(&device)); + CHECK_CUDA(cudaGetDevice(&device)); return device; } @@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) { void *dest_d; CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size)); + CHECK_CUDA(cudaMalloc((void **)&dest_d, size)); return dest_d; } @@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) { void hl_free_mem_device(void *dest_d) { CHECK_NOTNULL(dest_d); - cudaError_t err = dynload::cudaFree(dest_d); + cudaError_t err = cudaFree(dest_d); CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) << hl_get_device_error_string(); } @@ 
-361,8 +288,7 @@ void *hl_malloc_host(size_t size) { void *dest_h; CHECK(size) << __func__ << ": the size for device memory is 0, please check."; - CHECK_CUDA( - dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault)); + CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault)); return dest_h; } @@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) { void hl_free_mem_host(void *dest_h) { CHECK_NOTNULL(dest_h); - cudaError_t err = dynload::cudaFreeHost(dest_h); + cudaError_t err = cudaFreeHost(dest_h); CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err) << hl_get_device_error_string(); } @@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) { } CHECK_NOTNULL(dst); CHECK_NOTNULL(src); - CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault)); + CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault)); } void hl_memset_device(void *dest_d, int value, size_t size) { - CHECK_CUDA(dynload::cudaMemset(dest_d, value, size)); + CHECK_CUDA(cudaMemset(dest_d, value, size)); } void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { @@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) { } CHECK_NOTNULL(src_h); CHECK_NOTNULL(dest_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); + CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice)); } void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { @@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) { } CHECK_NOTNULL(dest_h); CHECK_NOTNULL(src_d); - CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); + CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost)); } void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { @@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) { } CHECK_NOTNULL(dest_d); CHECK_NOTNULL(src_d); - CHECK_CUDA( - 
dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice)); + CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice)); } void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { @@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) { CHECK_LT(stream, HPPL_STREAM_END); cu_stream = t_resource.stream[stream]; - CHECK_CUDA( - dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream)); + CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream)); } void hl_start() { @@ -439,8 +363,7 @@ void hl_start() { bool hl_device_can_access_peer(int device, int peerDevice) { int canAccessPeer; - CHECK_CUDA( - dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice)); + CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice)); if (canAccessPeer == 1) { return true; @@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) { } void hl_device_enable_peer_access(int peerDevice) { - cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0); + cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0); if (cudaErrorPeerAccessAlreadyEnabled == err) { - dynload::cudaGetLastError(); + cudaGetLastError(); } else { CHECK_CUDA(err); } @@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) { int device = device_prop->device; global_device_resources device_res = device_prop->device_resources; - CHECK_CUDA(dynload::cudaSetDevice(device)); + CHECK_CUDA(cudaSetDevice(device)); /* device properties */ - CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device)); + CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device)); device_prop->major = cu_prop.major; device_prop->minor = cu_prop.minor; @@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) { /* create device stream */ for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) { - 
CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j])); + CHECK_CUDA(cudaStreamCreate(&device_res->stream[j])); } /* cublas init */ @@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) { device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t))); pthread_mutex_init(device_res->gen_mutex, NULL); - CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version)); + CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version)); } int hl_get_cuda_version() { return g_cuda_lib_version; } void hl_create_thread_resources(int device, thread_device_resources device_res) { - CHECK_CUDA(dynload::cudaSetDevice(device)); + CHECK_CUDA(cudaSetDevice(device)); /* create thread stream */ for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) { - CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j])); + CHECK_CUDA(cudaStreamCreate(&device_res->stream[j])); } /* allocation device memory */ @@ -521,14 +444,14 @@ void hl_create_thread_resources(int device, /* allocation host memory */ device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE); - CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event)); + CHECK_CUDA(cudaEventCreate(&device_res->mem_event)); } void hl_specify_devices_start(int *device, int number) { if (hl_start_flag) return; /* 1. 
get the number of devices */ - CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num)); + CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num)); CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device"; if (device == NULL) { number = g_system_device_num; @@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) { << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream)); + CHECK_CUDA(cudaStreamSynchronize(cu_stream)); } void hl_create_event(hl_event_t *event) { @@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) { struct _hl_event_st *st_event = (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st)); - CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event)); + CHECK_CUDA(cudaEventCreate(&st_event->cu_event)); *event = st_event; } @@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) { CHECK_NOTNULL(start); CHECK_NOTNULL(end); - CHECK_CUDA( - dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event)); + CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event)); return time; } @@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) { << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream)); + CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream)); } void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { @@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) { << ": the parameter stream is error."; cu_stream = t_resource.stream[stream]; - CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); + CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0)); } void hl_destroy_event(hl_event_t event) { CHECK_NOTNULL(event); - CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event)); + 
CHECK_CUDA(cudaEventDestroy(event->cu_event)); free(event); event = NULL; @@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) { void hl_event_synchronize(hl_event_t event) { CHECK_NOTNULL(event); - CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event)); + CHECK_CUDA(cudaEventSynchronize(event->cu_event)); } void hl_get_device_name(char *name, int len, int device) { @@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) { *minor = g_device[device]->minor; } -int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); } +int hl_get_device_last_error() { return (int)cudaGetLastError(); } const char *hl_get_device_error_string() { - cudaError_t err = dynload::cudaGetLastError(); - return dynload::cudaGetErrorString(err); + cudaError_t err = cudaGetLastError(); + return cudaGetErrorString(err); } const char *hl_get_device_error_string(size_t err) { - return dynload::cudaGetErrorString((cudaError_t)err); + return cudaGetErrorString((cudaError_t)err); } -void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); } +void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); } void hl_set_device_flags_block() { - CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); + CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync)); } bool hl_cuda_event_is_ready(hl_event_t event) { - cudaError_t err = dynload::cudaEventQuery(event->cu_event); + cudaError_t err = cudaEventQuery(event->cu_event); CHECK(cudaSuccess == err || cudaErrorNotReady == err); if (cudaErrorNotReady == err) { @@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) { return true; } -void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); } +void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); } -void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); } +void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); } diff --git 
a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc deleted file mode 100644 index ecc03a729d..0000000000 --- a/paddle/cuda/src/hl_cudart_wrap.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#ifdef PADDLE_USE_DSO - -#include -#include -#include "hl_dso_loader.h" - -/** - * cudart wrapper: for dynamic load libcudart.so. - * When nvcc compile cuda kernels, it will insert - * some build-in runtime routines, which must be - * provided by us if PADDLE_USE_DSO is true. If - * PADDLE_USE_DSO is false, all of them must be - * ignored to avoid multiple definitions. - */ -namespace dynload { - -extern std::once_flag cudart_dso_flag; -extern void *cudart_dso_handle; - -/** - * The following macro definition can generate structs - * (for each function) to dynamic load cuda routine - * via operator overloading. - **/ -#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type) \ - struct DynLoad__##__name { \ - template \ - __type operator()(Args... 
args) { \ - typedef __type (*cudartFunc)(Args...); \ - std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \ - void *p_##__name = dlsym(cudart_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ - -/* include all needed cuda functions in HPPL */ -// clang-format off -#define CUDA_ROUTINE_EACH(__macro) \ - __macro(cudaLaunch, cudaError_t) \ - __macro(cudaSetupArgument, cudaError_t) \ - __macro(cudaConfigureCall, cudaError_t) \ - __macro(__cudaRegisterFatBinary, void**) \ - __macro(__cudaUnregisterFatBinary, void) \ - __macro(__cudaRegisterFunction, void) \ - __macro(__cudaRegisterVar, void) \ - __macro(__cudaRegisterManagedVar, void) \ - __macro(__cudaInitModule, char) \ - __macro(__cudaRegisterTexture, void) \ - __macro(__cudaRegisterSurface, void) -// clang-format on - -CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP) - -#if CUDART_VERSION >= 7000 -DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t) -#endif - -#undef CUDA_ROUNTINE_EACH - -} /* namespace dynload */ - -#if CUDART_VERSION >= 7000 -__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func, - dim3 gridDim, - dim3 blockDim, - void **args, - size_t sharedMem, - cudaStream_t stream) { - return dynload::cudaLaunchKernel( - func, gridDim, blockDim, args, sharedMem, stream); -} -#endif /* CUDART_VERSION >= 7000 */ - -__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) { - return dynload::cudaLaunch(func); -} - -__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg, - size_t size, - size_t offset) { - return dynload::cudaSetupArgument(arg, size, offset); -} - -__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim, - dim3 blockDim, - size_t sharedMem, - cudaStream_t stream) { - return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream); -} - -extern "C" { - -void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) { - return dynload::__cudaRegisterFatBinary(fatCubin); -} - 
-void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) { - return dynload::__cudaUnregisterFatBinary(fatCubinHandle); -} - -void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle, - const char *hostFun, - char *deviceFun, - const char *deviceName, - int thread_limit, - uint3 *tid, - uint3 *bid, - dim3 *bDim, - dim3 *gDim, - int *wSize) { - return dynload::__cudaRegisterFunction(fatCubinHandle, - hostFun, - deviceFun, - deviceName, - thread_limit, - tid, - bid, - bDim, - gDim, - wSize); -} - -void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle, - char *hostVar, - char *deviceAddress, - const char *deviceName, - int ext, - int size, - int constant, - int global) { - return dynload::__cudaRegisterVar(fatCubinHandle, - hostVar, - deviceAddress, - deviceName, - ext, - size, - constant, - global); -} - -extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle, - void **hostVarPtrAddress, - char *deviceAddress, - const char *deviceName, - int ext, - int size, - int constant, - int global) { - return dynload::__cudaRegisterManagedVar(fatCubinHandle, - hostVarPtrAddress, - deviceAddress, - deviceName, - ext, - size, - constant, - global); -} - -char CUDARTAPI __cudaInitModule(void **fatCubinHandle) { - return dynload::__cudaInitModule(fatCubinHandle); -} - -void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle, - const struct textureReference *hostVar, - const void **deviceAddress, - const char *deviceName, - int dim, - int norm, - int ext) { - return dynload::__cudaRegisterTexture( - fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext); -} - -void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle, - const struct surfaceReference *hostVar, - const void **deviceAddress, - const char *deviceName, - int dim, - int ext) { - return dynload::__cudaRegisterSurface( - fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext); -} - -} /* extern "C" */ - -#endif diff --git a/paddle/cuda/src/hl_dso_loader.cc 
b/paddle/cuda/src/hl_dso_loader.cc index c92909de53..53164dd27c 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir, DEFINE_string(cuda_dir, "", "Specify path for loading cuda library, such as libcublas, " - "libcurand. For instance, /usr/local/cuda/lib64. (Note: " - "libcudart can not be specified by cuda_dir, since some " - "build-in function in cudart already ran before main entry). " - "If default, dlopen will search cuda from LD_LIBRARY_PATH"); + "libcurand. For instance, /usr/local/cuda/lib64. If default, " + "dlopen will search cuda from LD_LIBRARY_PATH"); DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); @@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) { #endif } -void GetCudartDsoHandle(void** dso_handle) { -#if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle); -#else - GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle); -#endif -} - void GetCurandDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); From eda4254af09df054cca6ed25f3243e19e0ac2839 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 11 Jan 2017 12:39:02 +0800 Subject: [PATCH 38/41] Remove hl_cudart_wrap.cc in CMake --- paddle/cuda/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index 57fb89608f..a28ccd6f07 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -15,7 +15,6 @@ else() endif() set(CUDA_CXX_WITH_GPU_SOURCES - src/hl_cudart_wrap.cc src/hl_cuda_cublas.cc src/hl_cuda_cudnn.cc src/hl_cuda_device.cc) From aa9f5162605d317caa72dcc4cfa1c08fc8faee55 Mon Sep 17 00:00:00 2001 From: qiaolongfei Date: Wed, 11 Jan 2017 12:59:23 +0800 Subject: [PATCH 39/41] code refine, add comment and some naming problem --- 
paddle/pserver/ParameterServer2Main.cpp | 8 ++-- paddle/pserver/ParameterServerController.cpp | 43 ++++++++++---------- paddle/pserver/ParameterServerController.h | 22 ++++++---- paddle/trainer/TrainerMain.cpp | 7 ++-- proto/ParameterServerConfig.proto | 7 ++++ 5 files changed, 52 insertions(+), 35 deletions(-) diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp index 1145052522..845a2c27e2 100644 --- a/paddle/pserver/ParameterServer2Main.cpp +++ b/paddle/pserver/ParameterServer2Main.cpp @@ -20,10 +20,10 @@ using namespace paddle; // NOLINT int main(int argc, char** argv) { initMain(argc, argv); - std::unique_ptr pServerPtr( - paddle::ParameterServerController::createByGflags()); - pServerPtr->start(); - pServerPtr->join(); + std::unique_ptr parameterServerPtr( + paddle::ParameterServerController::createFromGflags()); + parameterServerPtr->start(); + parameterServerPtr->wait(); return 0; } diff --git a/paddle/pserver/ParameterServerController.cpp b/paddle/pserver/ParameterServerController.cpp index ec24bc7e57..1d11a2e1ac 100644 --- a/paddle/pserver/ParameterServerController.cpp +++ b/paddle/pserver/ParameterServerController.cpp @@ -25,43 +25,44 @@ ParameterServerController::ParameterServerController( int numPorts = config.ports_num() + config.ports_num_for_sparse(); if (config.nics().empty()) { - pservers_.resize(numPorts); + parameterServers_.resize(numPorts); for (int i = 0; i < numPorts; ++i) { if (config.rdma_tcp() == "rdma") { - pservers_[i].reset( + parameterServers_[i].reset( new ParameterServer2(std::string(), config.port() + i, rdmaCpu++)); rdmaCpu = rdmaCpu % onlineCpus; } else { - pservers_[i].reset( + parameterServers_[i].reset( new ParameterServer2(std::string(), config.port() + i)); } - CHECK(pservers_[i]->init()) << "Fail to initialize parameter server" - << config.port() + i; + CHECK(parameterServers_[i]->init()) << "Fail to initialize parameter " + "server on port " + << config.port() + i; } } else { 
str::split(config.nics(), ',', &devices); - pservers_.resize(devices.size() * numPorts); + parameterServers_.resize(devices.size() * numPorts); for (int i = 0; i < numPorts; ++i) { for (size_t j = 0; j < devices.size(); ++j) { if (config.rdma_tcp() == "rdma") { - pservers_[i * devices.size() + j].reset(new ParameterServer2( + parameterServers_[i * devices.size() + j].reset(new ParameterServer2( getIpAddr(devices[j]), config.port() + i, rdmaCpu++)); rdmaCpu = rdmaCpu % onlineCpus; } else { - pservers_[i * devices.size() + j].reset( + parameterServers_[i * devices.size() + j].reset( new ParameterServer2(getIpAddr(devices[j]), config.port() + i)); } - CHECK(pservers_[i * devices.size() + j]->init()) - << "Fail to initialize parameter server" << devices[j] + CHECK(parameterServers_[i * devices.size() + j]->init()) + << "Fail to initialize parameter server with device " << devices[j] << config.port() + i; } } } } -ParameterServerController::~ParameterServerController() { this->join(); } +ParameterServerController::~ParameterServerController() { this->wait(); } -ParameterServerController* ParameterServerController::createByGflags() { +ParameterServerController* ParameterServerController::createFromGflags() { ParameterServerConfig config; config.set_nics(FLAGS_nics); @@ -79,21 +80,21 @@ ParameterServerController* ParameterServerController::create( } void ParameterServerController::start() { - LOG(INFO) << "pserver sizes : " << pservers_.size(); + LOG(INFO) << "number of parameterServer instances: " + << parameterServers_.size(); int i = 0; - for (const auto& pserver : pservers_) { - LOG(INFO) << "pserver started : " << i; - pserver->start(); + for (const auto& parameterServer : parameterServers_) { + LOG(INFO) << "Starting parameterServer[" << i << "]"; + parameterServer->start(); i++; } } -void ParameterServerController::join() { - LOG(INFO) << "pserver sizes : " << pservers_.size(); +void ParameterServerController::wait() { int i = 0; - for (const auto& pserver : 
pservers_) { - LOG(INFO) << "pserver join : " << i; - pserver->join(); + for (const auto& parameterServer : parameterServers_) { + LOG(INFO) << "Waiting parameterServer[" << i << "]"; + parameterServer->join(); i++; } } diff --git a/paddle/pserver/ParameterServerController.h b/paddle/pserver/ParameterServerController.h index ee249de9d8..fe9bb0b4d0 100644 --- a/paddle/pserver/ParameterServerController.h +++ b/paddle/pserver/ParameterServerController.h @@ -21,6 +21,12 @@ limitations under the License. */ namespace paddle { +/** + * @brief ParameterServerController is used for create, init and manage multi + * parameter server instances. The num of the instances is decided by port + * num(the ports number for parameter send) and network devices configured + * by gflags or proto. + */ class ParameterServerController final { public: DISABLE_COPY(ParameterServerController); @@ -39,28 +45,30 @@ public: * @brief create ParameterServerController from gflags, this is used for * compatibility with the old usage of configuration by gflags. */ - static ParameterServerController* createByGflags(); + static ParameterServerController* createFromGflags(); /** * @brief create ParameterServerController with ParameterServerConfig, remove - * gflags from ParameterServer. Init all pservers thread according to the - * config. + * gflags from ParameterServer. Init all ParameterServer2 instances according + * to + * the config. */ static ParameterServerController* create(const ParameterServerConfig& config); /** - * @brief start all pserver thread in this ParameterServerController. + * @brief start all ParameterServer2 instances in this + * ParameterServerController. */ void start(); /** - * @brief join and wait for all pserver thread in this + * @brief join and wait for all ParameterServer2 instances thread in this * ParameterServerController. 
*/ - void join(); + void wait(); private: - std::vector> pservers_; + std::vector> parameterServers_; }; } // namespace paddle diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp index 61de728f2a..c5c1d484e5 100644 --- a/paddle/trainer/TrainerMain.cpp +++ b/paddle/trainer/TrainerMain.cpp @@ -36,10 +36,11 @@ int main(int argc, char** argv) { initMain(argc, argv); initPython(argc, argv); - std::unique_ptr pServerPtr(nullptr); + std::unique_ptr parameterServerPtr(nullptr); if (FLAGS_start_pserver) { - pServerPtr.reset(paddle::ParameterServerController::createByGflags()); - pServerPtr->start(); + parameterServerPtr.reset( + paddle::ParameterServerController::createFromGflags()); + parameterServerPtr->start(); } Trainer trainer; auto config = TrainerConfigHelper::createFromFlags(); diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto index b4fbf901c2..3068bba8b1 100644 --- a/proto/ParameterServerConfig.proto +++ b/proto/ParameterServerConfig.proto @@ -15,10 +15,17 @@ syntax = "proto2"; package paddle; + +/** + * Configuration structure for ParameterClient2. + */ message ParameterClientConfig { required int32 trainer_id = 1; } +/** + * Configuration structure for ParameterServer2. 
+ */ message ParameterServerConfig { // The ports number for parameter send, // increment based on default port number From 4de77970d1b7937217df52fb4d5db018a886eaf9 Mon Sep 17 00:00:00 2001 From: liaogang Date: Wed, 11 Jan 2017 14:05:39 +0800 Subject: [PATCH 40/41] Remove git submodule from docs --- doc/faq/index_cn.rst | 19 ------------------- .../build_and_install/build_from_source_en.md | 9 ++++++--- .../build_and_install/docker_install_en.rst | 15 ++------------- doc/howto/dev/contribute_to_paddle_cn.md | 1 - doc/howto/dev/contribute_to_paddle_en.md | 1 - 5 files changed, 8 insertions(+), 37 deletions(-) diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index 7d425a05d4..6d5367177d 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -286,22 +286,3 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字 .. code-block:: bash paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 - -12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况 ---------------------------------------------------- - -目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\ -:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\: - -.. code-block:: bash - - git clone --recursive https://github.com/PaddlePaddle/Paddle.git - -来获取所有源码。对于已经clone的git版本库,可以在Paddle的源码目录中执行\: - -.. code-block:: bash - - git submodule init - git submodule update - -来获得所有第三方模块。 diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index 6c2b6e794e..6954be3b2b 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -17,7 +17,7 @@ cd paddle To compile the source code, your computer must be equipped with the following dependencies. - **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) -- **CMake**: version >= 3.0 +- **CMake**: version >= 3.0 (at least CMake 3.4 on Mac OS X) - **BLAS**: MKL, OpenBlas or ATLAS **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! 
@@ -64,10 +64,13 @@ As a simple example, consider the following: 1. **BLAS Dependencies(optional)** - Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `CBLAS_INC_DIR` and `CBLAS_LIBRARIES`. + Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`. ```bash - cmake .. -DCBLAS_INC_DIR= -DCBLAS_LIBRARIES= + # specify MKL + cmake .. -DMKL_ROOT= + # or specify OpenBLAS + cmake .. -DOPENBLAS_ROOT= ``` 2. **Doc Dependencies(optional)** diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 34279a29b2..51a1a11674 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -16,23 +16,13 @@ Developers can work on PaddlePaddle using Docker. This allows developers to work on different platforms -- Linux, Mac OS X, and Windows -- in a consistent way. -The general development workflow with Docker and Bazel is as follows: +The general development workflow with Docker and CMake is as follows: 1. Get the source code of Paddle: .. code-block:: bash - git clone --recursive https://github.com/PaddlePaddle/Paddle.git - - - Here **git clone --recursive is required** as we have a submodule `warp-ctc `_. - - If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is - empty, please use the following command to get the submodule. - - .. code-block:: bash - - git submodule update --init --recursive + git clone https://github.com/PaddlePaddle/Paddle.git 2. 
Build a development Docker image :code:`paddle:dev` from the source @@ -162,7 +152,6 @@ source code: cd ~ git clone https://github.com/PaddlePaddle/Paddle.git cd Paddle - git submodule update --init --recursive docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md index e0a63f5a14..ee1b3213ea 100644 --- a/doc/howto/dev/contribute_to_paddle_cn.md +++ b/doc/howto/dev/contribute_to_paddle_cn.md @@ -33,7 +33,6 @@ cd Paddle git checkout -b develop # 创建 develop 分支 git remote add upstream https://github.com/PaddlePaddle/Paddle.git # 添加 upstream 到 baidu/Paddle git pull upstream develop # 更新 upstream -git submodule update --init --recursive ``` 然后你可以通过做一个本地开发分支开始开发 diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md index e578f6fce8..9b0d3e83c0 100644 --- a/doc/howto/dev/contribute_to_paddle_en.md +++ b/doc/howto/dev/contribute_to_paddle_en.md @@ -38,7 +38,6 @@ cd Paddle git checkout -b develop # create develop branch. 
git remote add upstream https://github.com/PaddlePaddle/Paddle.git # add upstream to baidu/Paddle git pull upstream develop # update to upstream -git submodule update --init --recursive ``` Then you can start to develop by making a local developement branch From dc251c017bd93c2c79f6dbac0ae0f39c912c066d Mon Sep 17 00:00:00 2001 From: liaogang Date: Fri, 13 Jan 2017 13:26:41 +0800 Subject: [PATCH 41/41] Enable cuda dynamic link libs --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index abe7b5228c..ec62dfb4b0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -30,7 +30,7 @@ include(simd) ################################ Configurations ####################################### option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND}) -option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" OFF) +option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON) option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON) option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check" ON)