Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into stat_usage

8 years ago · 766adaea7c
parent 7193cbf119 495649af57
commit 766adaea7c
56 changed files with 2142 additions and 959 deletions
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@ -107,7 +107,6 @@ function(link_paddle_exe TARGET_NAME)
        paddle_parameter
        paddle_proto
        paddle_cuda
-        paddle_test_main
        ${METRIC_LIBS}
        ${PROTOBUF_LIBRARY}
        ${LIBGLOG_LIBRARY}
@ -155,7 +154,8 @@ endfunction()
 # Rest Arguemnts: not used.
 function(link_paddle_test TARGET_NAME)
    link_paddle_exe(${TARGET_NAME})
-    target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES}
+    target_link_libraries(${TARGET_NAME}
+                          paddle_test_main
                          ${GTEST_LIBRARIES})
 endfunction()

--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@ -16,10 +16,9 @@ set -e
 set -x
 BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'

-wget ${BASE_URL}/baidu.dict
-
-DOWNLOAD_ITEMS=(model_32.emb model_64.emb model_128.emb model_256.emb)
-ITEM_MD5=(f88c8325ee6da6187f1080e8fe66c1cd
+DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
+ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
+          f88c8325ee6da6187f1080e8fe66c1cd
          927cf70f27f860aff1a5703ebf7f1584
 	  a52e43655cd25d279777ed509a1ae27b
 	  b92c67fe9ff70fea53596080e351ac80)
--- a/demo/recommendation/evaluate.py
+++ b/demo/recommendation/evaluate.py
@ -0,0 +1,37 @@
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import re
+import math
+
+
+def get_best_pass(log_filename):
+    with open(log_filename, 'r') as f:
+        text = f.read()
+        pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
+                             re.S)
+        results = re.findall(pattern, text)
+        sorted_results = sorted(results, key=lambda result: float(result[0]))
+        return sorted_results[0]
+
+
+log_filename = sys.argv[1]
+log = get_best_pass(log_filename)
+predict_error = math.sqrt(float(log[0])) / 2
+print 'Best pass is %s, error is %s, which means predict get error as %f' % (
+    log[1], log[0], predict_error)
+
+evaluate_pass = "output/pass-%s" % log[1]
+print "evaluating from pass %s" % evaluate_pass
--- a/doc/howto/usage/cluster/k8s-aws/README.md
+++ b/doc/howto/usage/cluster/k8s-aws/README.md
--- a/doc/howto/usage/cluster/k8s-aws/add_security_group.png
+++ b/doc/howto/usage/cluster/k8s-aws/add_security_group.png
--- a/doc/howto/usage/cluster/k8s-aws/create_efs.png
+++ b/doc/howto/usage/cluster/k8s-aws/create_efs.png
--- a/doc/howto/usage/cluster/k8s-aws/efs_mount.png
+++ b/doc/howto/usage/cluster/k8s-aws/efs_mount.png
--- a/doc/howto/usage/cluster/k8s-aws/managed_policy.png
+++ b/doc/howto/usage/cluster/k8s-aws/managed_policy.png
--- a/doc/howto/usage/cluster/k8s/k8s_en.md
+++ b/doc/howto/usage/cluster/k8s/k8s_en.md
@ -0,0 +1,201 @@
+# Paddle On Kubernetes
+
+>In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+
+## Build Docker Image
+
+In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+
+Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
+And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
+  
+### Run Docker Container
+
+```
+$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+```
+
+### Download Training Data
+
+Getting into `/root/paddle/demo/quick_start/data` Directory，using `get_data.sh` to download training data.
+Then getting into `/root/paddle/demo/quick_start` Directory, using `preprocess.sh` to pre-process training data.
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+
+```
+
+### Modify Startup Script
+
+After downloading the data，modify `/root/paddle/demo/quick_start/train.sh` file contents are as follows (one more cd cmd):
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+
+### Commit Docker Image
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## Use Kubernetes For Training
+
+>We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+
+### Create Yaml Files
+
+The output result in container will be demolished when job finished (container stopped running), so we need to mount the volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+
+### Start Paddle Job
+
+Using the above yaml file to start the Kubernetes job.
+
+```
+$ kubectl  create -f paddle.yaml
+```
+
+Get the detailed status of the job:
+
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+
+### Get Training Result
+
+We can use kubectl command to take a look at the status of related pod.
+
+```
+$ kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+
+We can also ssh to Kubernetes node to take a look at the training result.
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@ -1,6 +1,7 @@
 add_subdirectory(cuda)
 add_subdirectory(function)
 add_subdirectory(utils)
+add_subdirectory(testing)
 add_subdirectory(math)
 add_subdirectory(parameter)
 add_subdirectory(gserver)
--- a/paddle/api/Paddle.swig
+++ b/paddle/api/Paddle.swig
@ -178,6 +178,7 @@ namespace std {
 %newobject ParameterOptimizer::create;
 %newobject ParameterOptimizer::needSpecialTraversal;
 %newobject ParameterUpdater::createLocalUpdater;
+%newobject ParameterUpdater::createRemoteUpdater;

 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@ -803,6 +803,8 @@ private:

 public:
  static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
+  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
+                                               int passCount);
  ~ParameterUpdater();

  /**
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@ -15,15 +15,25 @@ limitations under the License. */
 #include "PaddleAPI.h"

 #include "PaddleAPIPrivate.h"
+#include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"

 ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}

 ParameterUpdater *ParameterUpdater::createLocalUpdater(
    OptimizationConfig *config) {
-  auto param = new ParameterUpdater();
-  param->m->updater.reset(new paddle::SgdThreadUpdater(config->m->getConfig()));
-  return param;
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(
+      new paddle::SgdThreadUpdater(config->m->getConfig()));
+  return updater;
+}
+
+ParameterUpdater *ParameterUpdater::createRemoteUpdater(
+    OptimizationConfig *config, int passCount) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr));
+  return updater;
 }

 ParameterUpdater::~ParameterUpdater() { delete m; }
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@ -48,78 +48,6 @@ extern void hl_max_sequence_forward(real* input,
 extern void hl_max_sequence_backward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);

-/**
- * @brief   Context projection forward.
- *
- * @param[in]   input           input sequence.
- * @param[in]   sequence        sequence index.
- * @param[in]   weightData      padding data.
- * @param[out]  output          output sequence.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- * @param[in]   isPadding       trainable padding.
- *
- */
-extern void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding);
-
-/**
- * @brief   Context projection backward data.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  inputGrad       input gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- *
- */
-extern void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart);
-
-/**
- * @brief   Context projection backward weight.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  weightGrad      weight gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   weightDim       input sequence dimension.
- * @param[in]   totalPad        number of extra timesteps.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- *
- */
-extern void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad);
-
 /**
 * @brief   Memory copy from sequence to batch.
 *
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@ -27,35 +27,6 @@ inline void hl_max_sequence_forward(real* input,
 inline void hl_max_sequence_backward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}

-inline void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding) {}
-
-inline void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {}
-
-inline void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {}
-
 inline void hl_sequence2batch_copy(real* batch,
                                   real* sequence,
                                   const int* batchIndex,
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@ -11,13 +11,16 @@ endif()

 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})

-add_library(paddle_test_main STATIC TestMain.cpp)
-
 if(WITH_GPU)
+if(WITH_TESTING)
    # TODO:
    # file(GLOB test_files . *OpTest.cpp)
    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
    add_simple_unittest(CrossMapNormalOpTest)
+    add_unittest(ContextProjectionOpTest
+        ContextProjectionOpTest.cpp
+        ../gserver/tests/TestUtil.cpp)
+endif()
 endif()

 add_style_check_target(paddle_function ${h_files})
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Context Projection Forward.
+ *
+ * \param[out]  outputs           output data.
+ * \param[in]   input             input data.
+ * \param[in]   weight            input weight.
+ * \param[in]   sequence          input data.
+ * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]   context_start     context start position.
+ * \param[in]   begin_pad         begining pad position.
+ * \param[in]   is_padding        whether padding 0 or not.
+ *
+ */
+template <DeviceType Device>
+void ContextProjectionForward(typename MatrixT<Device>::type* output,
+                              const typename MatrixT<Device>::type* input,
+                              const typename MatrixT<Device>::type* weight,
+                              const typename SequenceT<Device>::type& sequence,
+                              size_t context_length,
+                              int context_start,
+                              size_t begin_pad);
+
+/**
+ * \brief   Context Projection Backward.
+ *
+ * \param[out]  outputs           output gradient.
+ * \param[in]   input             input gradient.
+ * \param[in]   weight            input weight gradient.
+ * \param[in]   sequence          input data.
+ * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]   context_start     context start position.
+ * \param[in]   begin_pad         begining pad position.
+ * \param[in]   is_padding        whether padding 0 or not.
+ *
+ */
+template <DeviceType Device>
+void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
+                               typename MatrixT<Device>::type* in_grad,
+                               typename MatrixT<Device>::type* w_grad,
+                               const typename SequenceT<Device>::type& seq_vec,
+                               size_t context_length,
+                               int context_start,
+                               size_t begin_pad,
+                               bool is_padding,
+                               size_t total_pad);
+
+template <DeviceType Device>
+void ContextProjectionBackwardData(
+    typename MatrixT<Device>::type* out_grad,
+    typename MatrixT<Device>::type* in_grad,
+    const typename SequenceT<Device>::type& sequence,
+    size_t context_length,
+    int context_start);
+
+template <DeviceType Device>
+void ContextProjectionBackwardWeight(
+    typename MatrixT<Device>::type* out_grad,
+    typename MatrixT<Device>::type* w_grad,
+    const typename SequenceT<Device>::type& seq_vec,
+    size_t context_length,
+    int context_start,
+    size_t total_pad,
+    size_t begin_pad);
+
+}  // namespace paddle
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/math/Matrix.h"
+
+using namespace paddle;  // NOLINT
+
+void testMatrixProjectionForward(int context_start,
+                                 size_t context_length,
+                                 bool is_padding,
+                                 size_t batch_size,
+                                 size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  FunctionCompare compare("ContextProjectionForward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", std::max(0, -context_start)));
+
+  CpuMatrix cpu_in(batch_size, input_dim);
+  cpu_in.randomizeUniform();
+  GpuMatrix gpu_in(batch_size, input_dim);
+  gpu_in.copyFrom(cpu_in);
+  auto cpu_weight =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_weight =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_weight->randomizeUniform();
+    gpu_weight->copyFrom(*cpu_weight);
+  }
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+
+  CpuMatrix cpu_out(batch_size, input_dim * context_length);
+  GpuMatrix gpu_out(batch_size, input_dim * context_length);
+  cpu_out.randomizeUniform();
+  gpu_out.copyFrom(cpu_out);
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+
+  autotest::TensorCheckEqual(cpu_out, gpu_out);
+}
+
+void testMatrixProjectionBackward(int context_start,
+                                  int context_length,
+                                  bool is_padding,
+                                  size_t batch_size,
+                                  size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  FunctionCompare compare("ContextProjectionBackward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", std::max(0, -context_start))
+                              .set("is_padding", is_padding)
+                              .set("total_pad", pad));
+
+  CpuMatrix cpu_in_grad(batch_size, input_dim);
+  cpu_in_grad.randomizeUniform();
+  GpuMatrix gpu_in_grad(batch_size, input_dim);
+  gpu_in_grad.copyFrom(cpu_in_grad);
+
+  CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
+  cpu_out_grad.randomizeUniform();
+  GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
+  gpu_out_grad.copyFrom(cpu_out_grad);
+
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+
+  auto cpu_w_grad =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_w_grad =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_w_grad->randomizeUniform();
+    gpu_w_grad->copyFrom(*cpu_w_grad);
+  }
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+
+  autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
+  if (is_padding) {
+    autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
+  }
+}
+
+TEST(ContextProjection, projection) {
+  for (auto context_start : {-5, -3, -1, 0, 3}) {
+    for (auto context_length : {1, 2, 5, 7}) {
+      for (auto trainable_padding : {false, true}) {
+        for (auto batch_size : {1, 2, 5, 20, 100}) {
+          for (auto input_dim : {15, 32, 63, 128, 200}) {
+            VLOG(3) << " context_start=" << context_start
+                    << " context_length=" << context_length
+                    << " trainable_padding=" << trainable_padding
+                    << " batch_size=" << batch_size
+                    << " input_dim=" << input_dim;
+            testMatrixProjectionForward(context_start,
+                                        context_length,
+                                        trainable_padding,
+                                        batch_size,
+                                        input_dim);
+            testMatrixProjectionBackward(context_start,
+                                         context_length,
+                                         trainable_padding,
+                                         batch_size,
+                                         input_dim);
+          }
+        }
+      }
+    }
+  }
+}
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@ -30,6 +30,20 @@ real FuncConfig::get<real>(const std::string& key) const {
  return it->second.r;
 }

+template <>
+int FuncConfig::get<int>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.i;
+}
+
+template <>
+bool FuncConfig::get<bool>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.b;
+}
+
 template <>
 FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
@ -44,6 +58,20 @@ FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
  return *this;
 }

+template <>
+FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
+  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
+  valueMap_[key].i = v;
+  return *this;
+}
+
+template <>
+FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
+  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
+  valueMap_[key].b = v;
+  return *this;
+}
+
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;

 }  // namespace paddle
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@ -40,6 +40,19 @@ struct MatrixT<DEVICE_TYPE_GPU> {
  using type = GpuMatrix;
 };

+template <DeviceType Device>
+struct SequenceT;
+
+template <>
+struct SequenceT<DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+
+template <>
+struct SequenceT<DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
+
 typedef std::vector<size_t> Dims;

 class Tensor {
@ -59,6 +72,8 @@ public:
  union value {
    size_t s;
    real r;
+    int i;
+    bool b;
  };

  template <typename T>
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@ -33,18 +33,25 @@ public:
    // init cpu and gpu arguments
    auto initArgs = [=](
        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (auto arg : inArgs) {
+      for (const auto arg : inArgs) {
        size_t size = sizeof(real);
-        for (auto dim : arg.dims_) {
+        for (const auto dim : arg.dims_) {
          size *= dim;
        }
+        if (arg.getData()) {
+          // todo(tianbing), waste unnecessary mem here
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          // already init outside
+        } else {
          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
          cpuArgs.emplace_back(
              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
          gpuArgs.emplace_back(
              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-
          // will use an api to refactor this code.
          CpuVector cpuVector(size / sizeof(real),
                              (real*)cpuArgs.back().getData());
@ -53,6 +60,7 @@ public:
          cpuVector.uniform(0.001, 1);
          gpuVector.copyFrom(cpuVector);
        }
+      }
    };
    initArgs(cpuInputs, gpuInputs, inputs);
    initArgs(cpuOutputs, gpuOutputs, outputs);
@ -81,6 +89,10 @@ public:
    checkArgs(cpuInouts, gpuInouts);
  }

+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+
 protected:
  std::shared_ptr<FunctionBase> cpu;
  std::shared_ptr<FunctionBase> gpu;
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@ -38,6 +38,32 @@ ContextProjection::ContextProjection(const ProjectionConfig& config,
    CHECK_EQ(inputDim * totalPad, parameter->getSize());
    weight_.reset(new Weight(totalPad, inputDim, parameter));
  }
+  // init forward_ and backward_ functions
+  init();
+}
+
+bool ContextProjection::init() {
+  size_t context_length = config_.context_length();
+  int context_start = config_.context_start();
+  bool is_padding = config_.trainable_padding();
+  size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
+
+  createFunction(forward_,
+                 "ContextProjectionForward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_));
+  createFunction(backward_,
+                 "ContextProjectionBackward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_)
+                     .set("is_padding", is_padding)
+                     .set("total_pad", total_pad));
+
+  return true;
 }

 void ContextProjection::resetState() {
@ -78,25 +104,29 @@ LayerStatePtr ContextProjection::getState() {
 }

 void ContextProjection::forward() {
-  CHECK(in_->value);
+  CHECK(in_->value && out_->value);
  CHECK(in_->sequenceStartPositions);

-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
-
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(forward_.size(), 1) << "Only one forward function here";

  REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  out_->value->contextProjectionForward(
-      *(in_->value),
-      state_ ? state_.get() : isPadding ? weight_->getW().get() : nullptr,
-      *startPositions,
-      config_.context_length(),
-      config_.context_start(),
-      beginPad_,
-      state_ ? true : isPadding);
+  bool is_padding = config_.trainable_padding();
+  /// first use state_, otherwise use weight_(padding false === w nullptr)
+  auto w_ptr =
+      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
+  auto start_pos = in_->sequenceStartPositions;
+  forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
+                     Tensor(w_ptr ? w_ptr->getData() : nullptr,
+                            Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
+                     Tensor(reinterpret_cast<real*>(
+                                const_cast<int*>(start_pos->getData(useGpu_))),
+                            Dims{start_pos->getSize()})},
+                    {Tensor(out_->value->getData(), Dims{batch_size, dim})},
+                    {});

  if (state_ && config_.context_start() < 0) {
    CHECK_EQ(1, in_->getNumSequences());
@ -118,41 +148,27 @@ void ContextProjection::forward() {
 }

 void ContextProjection::backward(const UpdateCallback& callback) {
-  CHECK(in_->value);
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
+  CHECK(in_->value && out_->value && out_->grad);
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(batch_size, out_->value->getHeight());
+  CHECK_EQ(backward_.size(), 1) << "Only one backward function here";

  REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  if (!out_->grad->useGpu()) {
-    out_->grad->contextProjectionBackward(
-        in_->grad.get(),
-        isPadding ? weight_->getWGrad().get() : nullptr,
-        *startPositions,
-        config_.context_length(),
-        config_.context_start(),
-        beginPad_,
-        isPadding);
-  } else {
-    if (in_->grad) {
-      out_->grad->contextProjectionBackwardData(*(in_->grad),
-                                                *startPositions,
-                                                config_.context_length(),
-                                                config_.context_start());
-    }
-
-    if (isPadding && weight_->getWGrad()) {
-      out_->grad->contextProjectionBackwardWeight(
-          *(weight_->getWGrad()),
-          *startPositions,
-          config_.context_length(),
-          config_.context_start(),
-          weight_->getWGrad()->getHeight(),
-          beginPad_);
-    }
-  }
+  bool is_padding = config_.trainable_padding();
+  auto start_pos = in_->sequenceStartPositions;
+  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
+  backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
+                             Dims{batch_size, input_dim}),
+                      Tensor(w_ptr ? w_ptr->getData() : nullptr,
+                             Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
+                      Tensor(reinterpret_cast<real*>(
+                                 const_cast<int*>(start_pos->getData(useGpu_))),
+                             Dims{start_pos->getSize()})},
+                     {Tensor(out_->grad->getData(), Dims{batch_size, dim})},
+                     {});

  if (config_.trainable_padding()) {
    weight_->getParameterPtr()->incUpdate(callback);
--- a/Show More
+++ b/Show More