Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_depthwiseConv_op_gpu

7 years ago · 3074ae7b8d
parent 2ffa3a8bf6 71bd0dfa25
commit 3074ae7b8d
54 changed files with 3462 additions and 283 deletions
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@ -0,0 +1,18 @@
+#FROM python:2.7.14
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
+# The `pip install` steps below need pip, which the distro `python` package
+# alone does not provide, so install python-pip explicitly.
+RUN apt-get update && apt-get install -y python python-pip
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+#       so we must build one with distribute support to install in this image.
+RUN pip install paddlepaddle
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
+
+# below lines may change a lot for debugging
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@ -0,0 +1,76 @@
+# Performance for Distributed vgg16
+
+## Test Result
+
+### Hardware Information
+
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- cpu MHz		: 2101.000
+- cache size	: 20480 KB
+
+### Single Node Single Thread
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
+| TensorFlow | - | - | - | - |
+
+### Different Batch Size
+
+- PServer Count: 10
+- Trainer Count: 20
+- Per trainer CPU Core: 1
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
+| TensorFlow | - | - | - | - |
+
+
+### Accelerate Rate
+
+- Pserver Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
+| TensorFlow | - | - | - | - |
+
+### Different Pserver Count
+
+- Trainer Count: 60
+- Batch Size: 128
+- Metrics: samples / sec
+
+| PServer Count | 3 | 6 |10 | 20 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
+| TensorFlow | - | - | - | - |
+
+*The performance gap between Fluid and v2 comes from network interference.*
+
+
+## Steps to Run the Performance Test
+
+1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so that kubernetes can find it.
+1. Run `kubectl create -f fluid_pserver.yaml && kubectl create -f fluid_trainer.yaml` (or the `v2_pserver.yaml` / `v2_trainer.yaml` pair for the v2 benchmark) to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
+
+Check the logs for the distributed training progress and analyze the performance.
+
+## Enable Verbose Logs
+
+Edit the pserver and trainer YAML files (`fluid_pserver.yaml` / `fluid_trainer.yaml`, or their `v2_` counterparts) and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.
--- a/benchmark/cluster/vgg16/fluid_pserver.yaml
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
@ -0,0 +1,72 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: MKL_NUM_THREADS
+          value: "1"
+        - name: TRAINING_ROLE
+          value: "PSERVER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        command: ["paddle_k8s", "start_fluid"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@ -0,0 +1,69 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_fluid"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: TRAINING_ROLE
+          value: "TRAINER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@ -0,0 +1,64 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16v2job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16v2job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "python train.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        command: ["paddle_k8s", "start_pserver"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@ -0,0 +1,65 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16v2job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16v2job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_trainer", "v2"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16v2job
+        - name: BATCH_SIZE
+          value: "256"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "2"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@ -0,0 +1,154 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
+import gzip
+
+import paddle.v2.dataset.cifar as cifar
+import paddle.v2 as paddle
+import time
+import os
+
+DATA_DIM = 3 * 32 * 32
+CLASS_DIM = 10
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+    BATCH_SIZE = int(BATCH_SIZE)
+else:
+    BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
+NODE_COUNT = int(os.getenv("TRAINERS"))
+ts = 0
+
+
+def vgg(input, nums, class_dim):
+    def conv_block(input, num_filter, groups, num_channels=None):
+        return paddle.networks.img_conv_group(
+            input=input,
+            num_channels=num_channels,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            pool_type=paddle.pooling.Max())
+
+    assert len(nums) == 5
+    # the channel of input feature is 3
+    conv1 = conv_block(input, 64, nums[0], 3)
+    conv2 = conv_block(conv1, 128, nums[1])
+    conv3 = conv_block(conv2, 256, nums[2])
+    conv4 = conv_block(conv3, 512, nums[3])
+    conv5 = conv_block(conv4, 512, nums[4])
+
+    fc_dim = 512
+    fc1 = paddle.layer.fc(input=conv5,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    fc2 = paddle.layer.fc(input=fc1,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    out = paddle.layer.fc(input=fc2,
+                          size=class_dim,
+                          act=paddle.activation.Softmax())
+    return out
+
+
+def vgg13(input, class_dim):
+    nums = [2, 2, 2, 2, 2]
+    return vgg(input, nums, class_dim)
+
+
+def vgg16(input, class_dim):
+    nums = [2, 2, 3, 3, 3]
+    return vgg(input, nums, class_dim)
+
+
+def vgg19(input, class_dim):
+    nums = [2, 2, 4, 4, 4]
+    return vgg(input, nums, class_dim)
+
+
+def main():
+    """Builds the VGG16 network and trains it with the v2 SGD trainer.
+
+    Training data comes from `cifar.train10()`. The cost and elapsed time of
+    every batch are printed, and the test metrics on `cifar.test10()` are
+    printed at the end of every pass.
+    """
+    global ts
+    paddle.init(use_gpu=False)
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+
+    extra_layers = None
+    # NOTE: v2 distributed training needs averaged updates, so the base
+    # learning rate is divided by the number of trainers.
+    learning_rate = 1e-3 / NODE_COUNT
+    out = vgg16(image, class_dim=CLASS_DIM)
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # Create optimizer
+    # The regularization rate is multiplied by, and the learning rate divided
+    # by, BATCH_SIZE.
+    optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
+                                                         BATCH_SIZE),
+        learning_rate=learning_rate / BATCH_SIZE,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=128000 * 35,
+        learning_rate_schedule="discexp", )
+
+    # Shuffled training batches and unshuffled test batches.
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            cifar.train10(),
+            # To use other data, replace the above line with:
+            # reader.train_reader('train.list'),
+            buf_size=1000),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        cifar.test10(),
+        # To use other data, replace the above line with:
+        # reader.test_reader('val.list'),
+        batch_size=BATCH_SIZE)
+
+    # Create trainer
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 extra_layers=extra_layers,
+                                 is_local=False)
+
+    # End batch and end pass event handler
+    def event_handler(event):
+        # ts is the module-level timestamp; ts_pass is first created by the
+        # BeginPass branch below.
+        global ts, ts_pass
+        if isinstance(event, paddle.event.BeginPass):
+            ts_pass = time.time()
+        if isinstance(event, paddle.event.BeginIteration):
+            ts = time.time()
+        if isinstance(event, paddle.event.EndIteration):
+            # `% 1` is always 0, so every batch is logged; raise the modulus
+            # to log less often.
+            if event.batch_id % 1 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    time.time() - ts)
+        if isinstance(event, paddle.event.EndPass):
+            print "Pass %d end, spent: %f" % (event.pass_id,
+                                              time.time() - ts_pass)
+            # Evaluate on the test set after every pass.
+            result = trainer.test(reader=test_reader)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    trainer.train(
+        reader=train_reader, num_passes=200, event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@ -87,6 +87,11 @@ roi_pool
 ..  autoclass:: paddle.v2.layer.roi_pool
    :noindex:

+pad
+----
+..  autoclass:: paddle.v2.layer.pad
+    :noindex:
+
 Norm Layer
 ==========

@ -133,6 +138,11 @@ grumemory
 ..  autoclass:: paddle.v2.layer.grumemory
    :noindex:

+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
+    
 Recurrent Layer Group
 =====================

@ -340,6 +350,11 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
    :noindex:

+dropout
+--------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
+    
 dot_prod
 ---------
 .. autoclass:: paddle.v2.layer.dot_prod
@ -402,6 +417,11 @@ scale_shift
 ..  autoclass:: paddle.v2.layer.scale_shift
    :noindex:

+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
+
 Sampling Layers
 ===============

@ -420,22 +440,6 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
    :noindex:

-Factorization Machine Layer
-============================
-
-factorization_machine
---------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
-    :noindex:
-
-Slicing and Joining Layers
-==========================
-
-pad
----
-..  autoclass:: paddle.v2.layer.pad
-    :noindex:
-
 ..  _api_v2.layer_costs:

 Cost Layers
@ -526,6 +530,11 @@ multibox_loss
 ..  autoclass:: paddle.v2.layer.multibox_loss
    :noindex:

+detection_output
+----------------
+..  autoclass:: paddle.v2.layer.detection_output
+    :noindex:
+    
 Check Layer
 ============

@ -534,31 +543,10 @@ eos
 ..  autoclass:: paddle.v2.layer.eos
    :noindex:

-Miscs
-=====
-
-dropout
--------
-..  autoclass:: paddle.v2.layer.dropout
-    :noindex:
-
-Activation with learnable parameter
-===================================
+Activation
+==========

 prelu
 --------
 ..  autoclass:: paddle.v2.layer.prelu
    :noindex:
-
-gated_unit
-----------
-..  autoclass:: paddle.v2.layer.gated_unit
-    :noindex:
-
-Detection output Layer
-======================
-
-detection_output
----------------
-..  autoclass:: paddle.v2.layer.detection_output
-    :noindex:
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@ -73,3 +73,10 @@ wmt14
 ..  automodule:: paddle.v2.dataset.wmt14
    :members:
    :noindex:
+
+wmt16
+++++
+
+..  automodule:: paddle.v2.dataset.wmt16
+    :members:
+    :noindex:
--- a/doc/design/switch.md
+++ b/doc/design/switch.md
@ -0,0 +1,32 @@
+### Design Doc: Switch
+
+### Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+switch = fluid.switch()
+with switch.block():
+    with switch.case(fluid.less_equal(a, 10)):
+        fluid.print("Case 1")
+    with switch.case(fluid.larger(a, 0)):
+        fluid.print("Case 2")
+    with switch.default():
+        fluid.print("Case 3")
+```
+
+### The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a scalar boolean value. This differs from the `fluid.if_else` control-flow, whose condition can be a vector of boolean values.
+1. It runs the first matched case, or the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch.  It is as if there were a C `break` statement at the end of each case.
+
+The above program should print and print only "Case 1".
+
+The implementation of the backward pass of the `switch` control-flow is easier than the backward pass of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@ -29,16 +29,16 @@ TEST(Channel, MakeAndClose) {
  {
    // MakeChannel should return a buffered channel is buffer_size > 0.
    auto ch = MakeChannel<int>(10);
-    EXPECT_NE(dynamic_cast<Buffered<int>*>(ch), nullptr);
-    EXPECT_EQ(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
    CloseChannel(ch);
    delete ch;
  }
  {
    // MakeChannel should return an un-buffered channel is buffer_size = 0.
    auto ch = MakeChannel<int>(0);
-    EXPECT_EQ(dynamic_cast<Buffered<int>*>(ch), nullptr);
-    EXPECT_NE(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
    CloseChannel(ch);
    delete ch;
  }
@ -78,3 +78,132 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
  t.join();
  delete ch;
 }
+
+// Sends the values 0..4 through an unbuffered channel from a helper thread
+// and checks that the main thread receives them in the same order.
+TEST(Channel, SimpleUnbufferedChannelTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  // Sender thread: sum_send accumulates every value whose Send() returned.
+  std::thread t([&]() {
+    for (int i = 0; i < 5; i++) {
+      ch->Send(&i);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    int recv;
+    ch->Receive(&recv);
+    EXPECT_EQ(recv, i);
+  }
+
+  CloseChannel(ch);
+  t.join();
+  // 0 + 1 + 2 + 3 + 4; read only after the sender thread has been joined.
+  EXPECT_EQ(sum_send, 10U);
+  delete ch;
+}
+
+// This tests that closing an unbuffered channel unblocks
+// any receivers that are waiting for senders.
+TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  // A compile-time constant, so the arrays below are ordinary fixed-size
+  // arrays rather than variable-length arrays (a non-standard extension).
+  const size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          ch->Receive(&data);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+
+  // Verify that all the threads are blocked
+  // NOTE(review): thread_ended[] is written by the workers and read here
+  // without synchronization; the sleeps only make the expected order likely.
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+
+// This tests that closing an unbuffered channel unblocks
+// any senders that are waiting for receivers.
+TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  // A compile-time constant, so the arrays below are ordinary fixed-size
+  // arrays rather than variable-length arrays (a non-standard extension).
+  const size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data = 10;
+          ch->Send(&data);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+
+  // Verify that all the threads are blocked
+  // NOTE(review): thread_ended[] is written by the workers and read here
+  // without synchronization; the sleeps only make the expected order likely.
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all senders
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+
+// Checks that a sender without a matching receiver stays blocked: the sender
+// attempts four sends while only three receives are issued.
+TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  // Send should block after three iterations
+  // since we only have three receivers.
+  std::thread t([&]() {
+    // Try to send more number of times
+    // than receivers
+    for (int i = 0; i < 4; i++) {
+      ch->Send(&i);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 3; i++) {
+    int recv;
+    ch->Receive(&recv);
+    EXPECT_EQ(recv, i);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.1 sec
+  // Only 0 + 1 + 2 are expected; the fourth Send() has no receiver.
+  EXPECT_EQ(sum_send, 3U);
+
+  // Closing the channel lets the blocked sender return so it can be joined.
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <atomic>
 #include <condition_variable>
-#include <deque>
 #include <mutex>

 #include "paddle/framework/channel.h"
@ -36,20 +36,104 @@ class UnBuffered : public paddle::framework::Channel<T> {
  virtual ~UnBuffered();

 private:
-  UnBuffered() {}
+  std::mutex mu_ch_;
+  // Mutex for readers and writers who are waiting for other reader
+  // and writer to complete execution
+  std::recursive_mutex mu_read_, mu_write_;
+  // reader_found_ is set true when a reader is ready to accept data
+  // writer_found_ is set true when a writer is ready to send data
+  // A transaction occurs only when both are true
+  std::atomic<bool> reader_found_{false}, writer_found_{false};
+  std::condition_variable cv_channel_;
+  std::condition_variable_any cv_reader_, cv_writer_;
+  T* item{nullptr};
+  std::atomic<bool> closed_{false};
+
+  UnBuffered() : closed_(false) {}
+
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };

+// This function implements the concept of how data should
+// be sent from a writer to a reader.
+// It returns once a reader has taken the item (Receive resets `item` to
+// nullptr) or once the channel has been closed.
+template <typename T>
+void UnBuffered<T>::Send(T* data) {
+  // Prevent other writers from entering
+  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
+  writer_found_ = true;
+  // Second lock on the same recursive mutex, used only for the wait below.
+  // NOTE(review): presumably this is so that wait() gives up just one
+  // ownership level and other writers stay blocked on mu_write_ - confirm.
+  std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
+  // If writer comes first, it should wait till a reader arrives
+  cv_writer_.wait(cv_lock,
+                  [this]() { return reader_found_ == true || closed_; });
+  // Wake a reader that may be waiting for writer_found_.
+  cv_reader_.notify_one();
+  if (!closed_) {
+    std::unique_lock<std::mutex> channel_lock(mu_ch_);
+    // Publish the pointer to the caller's data for the reader to move from.
+    item = data;
+    channel_lock.unlock();
+    cv_channel_.notify_one();
+    channel_lock.lock();
+    // Wait until the reader has consumed the item or the channel is closed.
+    cv_channel_.wait(channel_lock,
+                     [this]() { return item == nullptr || closed_; });
+  }
+  writer_found_ = false;
+}
+
+// This function implements the concept of how
+// data that was sent by a writer is read from a reader.
+// On success the sent value is moved into *data; if the channel is closed
+// first, *data is left untouched.
 template <typename T>
-void UnBuffered<T>::Send(T* channel_element) {}
+void UnBuffered<T>::Receive(T* data) {
+  // Prevent other readers from entering
+  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
+  reader_found_ = true;
+  // Second lock on the same recursive mutex, used only for the wait below.
+  std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
+  // If reader comes first, it should wait till a writer arrives
+  cv_reader_.wait(cv_lock,
+                  [this]() { return writer_found_ == true || closed_; });
+  // Wake a writer that may be waiting for reader_found_.
+  cv_writer_.notify_one();
+  if (!closed_) {
+    std::unique_lock<std::mutex> lock_ch{mu_ch_};
+    // Reader should wait for the writer to first write its data
+    cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
+    if (!closed_) {
+      *data = std::move(*item);
+      // Resetting item tells the writer waiting in Send() that it was taken.
+      item = nullptr;
+      lock_ch.unlock();
+    }
+    cv_channel_.notify_one();
+  }
+  reader_found_ = false;
+}

+// This function implements the sequence of events
+// that take place once the channel is closed.
+// It clears any pending item, marks the channel closed and wakes every
+// thread waiting on the reader, writer and channel condition variables.
 template <typename T>
-void UnBuffered<T>::Receive(T*) {}
+void UnBuffered<T>::Close() {
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  // NotifyAllParticipants releases `lock` before notifying.
+  NotifyAllParticipants(&lock);
+}

+// This function implements the sequence of events
+// that are executed once the object of an UnBuffered
+// channel is destroyed.
+// It performs the same steps as Close().
 template <typename T>
-void UnBuffered<T>::Close() {}
+UnBuffered<T>::~UnBuffered() {
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  // NotifyAllParticipants releases `lock` before notifying.
+  NotifyAllParticipants(&lock);
+}

+// This function notifies all the readers, writers and
+// the channel condition variables.
+// `lock` must currently be locked; it is unlocked here before notifying.
 template <typename T>
-UnBuffered<T>::~UnBuffered() {}
+void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
+  lock->unlock();
+  cv_writer_.notify_all();
+  cv_channel_.notify_all();
+  cv_reader_.notify_all();
+}

 }  // namespace details
 }  // namespace framework
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {

  bool HasOutputs(const std::string &name) const override;

-  DDim GetInputDim(const std::string &name) const override;
-
-  void SetOutputDim(const std::string &name, const DDim &dim) override;
-
  AttrReader Attrs() const override;

  const std::vector<std::string> &Inputs(
@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
  return true;
 }

-DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
-  std::vector<DDim> ddims = GetInputsDim(name);
-  auto length = ddims.size();
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have 1 value, "
-                    "but it has %d now",
-                    name, length);
-  return ddims[0];
-}
-
-void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
-                                                const DDim &dim) {
-  SetOutputsDim(name, {dim});
-}
-
 AttrReader CompileTimeInferShapeContext::Attrs() const {
  return AttrReader(op_.GetAttrMap());
 }
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@ -366,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
    return true;
  }

-  DDim GetInputDim(const std::string& name) const override {
-    return GetDim(op_.Input(name));
-  }
-
-  void SetOutputDim(const std::string& name, const DDim& dim) override {
-    SetDim(op_.Output(name), dim);
-  }
-
  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }

  const std::vector<std::string>& Inputs(
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@ -18,10 +18,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-std::vector<framework::DDim> InferShapeContext::GetInputsDim(
+// Returns the dimensions of the single variable bound to input slot `name`.
+// Enforces that the slot holds exactly one variable.
+DDim InferShapeContext::GetInputDim(const std::string &name) const {
+  const auto &bound_vars = Inputs(name);
+  const size_t bound_count = bound_vars.size();
+  PADDLE_ENFORCE_EQ(bound_count, 1UL,
+                    "Input(%s) should hold one element, but now it holds %d",
+                    name, bound_count);
+  return GetDim(bound_vars.front());
+}
+
+std::vector<DDim> InferShapeContext::GetInputsDim(
    const std::string &name) const {
-  const std::vector<std::string> &names = Inputs(name);
-  return GetDims(names);
+  const std::vector<std::string> &arg_names = Inputs(name);
+  return GetDims(arg_names);
 }

 DDim InferShapeContext::GetInputsElementDim(const std::string &name,
@ -30,24 +38,31 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
  return this->GetDim(names[idx]);
 }

-void InferShapeContext::SetOutputsDim(
-    const std::string &name, const std::vector<framework::DDim> &dims) {
+// Sets the dimensions of the single variable bound to output slot `name`.
+// Enforces that the slot holds exactly one variable.
+void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
+  const auto &bound_vars = Outputs(name);
+  const size_t bound_count = bound_vars.size();
+  PADDLE_ENFORCE_EQ(bound_count, 1UL,
+                    "Output(%s) should hold one element, but now it holds %d",
+                    name, bound_count);
+  this->SetDim(bound_vars.front(), dim);
+}
+
+void InferShapeContext::SetOutputsDim(const std::string &name,
+                                      const std::vector<DDim> &dims) {
  auto &names = Outputs(name);
  SetDims(names, dims);
 }

-std::vector<framework::DDim> InferShapeContext::GetDims(
+std::vector<DDim> InferShapeContext::GetDims(
    const std::vector<std::string> &names) const {
-  std::vector<framework::DDim> ret;
+  std::vector<DDim> ret;
  ret.reserve(names.size());
  std::transform(
      names.begin(), names.end(), std::back_inserter(ret),
      [this](const std::string &name) { return this->GetDim(name); });
  return ret;
 }
-
 void InferShapeContext::SetDims(const std::vector<std::string> &names,
-                                const std::vector<framework::DDim> &dims) {
+                                const std::vector<DDim> &dims) {
  size_t length = names.size();
  PADDLE_ENFORCE_EQ(length, dims.size());
  for (size_t i = 0; i < length; ++i) {
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@ -35,14 +35,13 @@ class InferShapeContext {
  virtual bool HasInputs(const std::string &name) const = 0;
  virtual bool HasOutputs(const std::string &name) const = 0;

-  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
+  DDim GetInputDim(const std::string &name) const;

-  std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+  std::vector<DDim> GetInputsDim(const std::string &name) const;
  DDim GetInputsElementDim(const std::string &name, int idx) const;

-  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
-  void SetOutputsDim(const std::string &name,
-                     const std::vector<framework::DDim> &dims);
+  void SetOutputDim(const std::string &name, const DDim &dim);
+  void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);

  virtual AttrReader Attrs() const = 0;
  virtual const std::vector<std::string> &Inputs(
@ -57,15 +56,13 @@ class InferShapeContext {

  // Note: In while op, we need this to be public
  void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims);
+               const std::vector<DDim> &dims);

 protected:
-  virtual framework::DDim GetDim(const std::string &name) const = 0;
-  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
-
-  std::vector<framework::DDim> GetDims(
-      const std::vector<std::string> &names) const;
+  virtual DDim GetDim(const std::string &name) const = 0;
+  virtual void SetDim(const std::string &name, const DDim &dim) = 0;

+  std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
  std::vector<proto::VarDesc::VarType> GetVarTypes(
      const std::vector<std::string> &names) const;

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@ -178,19 +178,22 @@ public:
    real* inputData = inputs[0].data<real>();
    real* filterData = inputs[1].data<real>();
    real* outputData = outputs[0].data<real>();
+    real* colData = NULL;
    bool needIm2col = isNeedIm2col(filter);

    TensorShape imShape =
        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
    TensorShape colShape;
-    real* colData = NULL;

-    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
-    size_t colWidth = outputHeight * outputWidth;
-    // Max col matrix height 256, Max col matrix width 1024
-    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
-    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+    // Max col matrix width 4096, Max col matrix size 4M.
+    size_t outputHeightSteps =
+        std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
+    size_t maxColWidth = outputHeightSteps * outputWidth;
+    size_t channelSteps =
+        std::min(std::max((1048576 / maxColWidth) / filterHeight * filterWidth,
+                          (size_t)1),
+                 inputChannels / groups_);
+    size_t maxColHeight = channelSteps * filterHeight * filterWidth;

    if (needIm2col) {
      colShape = TensorShape({inputChannels / groups_,
@ -199,7 +202,7 @@ public:
                              outputHeight,
                              outputWidth});

-      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
      colData = reinterpret_cast<real*>(memory_->getBuf());
    }

@ -209,20 +212,24 @@ public:
        (outputChannels / groups_) * outputHeight * outputWidth;
    size_t filterOffset = filter.getElements() / groups_;

-    int nStride = colWidth;
-    int kStride = colHeight;
+    int nStride = outputHeight * outputWidth;
+    int kStride = inputChannels / groups_ * filterHeight * filterWidth;
    for (size_t i = 0; i < batchSize; i++) {
+      filterData = inputs[1].data<real>();
      for (size_t g = 0; g < groups_; g++) {
        if (needIm2col) {
          real beta_ = beta;
-          for (size_t colHeightStart = 0; colHeightStart < colHeight;
-               colHeightStart += stepColHeight) {
-            for (size_t colWidthStart = 0; colWidthStart < colWidth;
-                 colWidthStart += stepColWidth) {
-              int N = std::min(colWidth - colWidthStart, stepColWidth);
-              int K = std::min(colHeight - colHeightStart, stepColHeight);
+          for (size_t ic = 0; ic < inputChannels / groups_;
+               ic += channelSteps) {
+            int channels = std::min(inputChannels / groups_ - ic, channelSteps);
+            for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
+              int height = std::min(outputHeight - oh, outputHeightSteps);
+
+              int M = outputChannels / groups_;
+              int N = height * outputWidth;
+              int K = channels * filterHeight * filterWidth;
              // im2col
-              im2col(inputData + g * inputOffset,
+              im2col(inputData,
                     imShape,
                     colData,
                     colShape,
@ -232,13 +239,12 @@ public:
                     paddingW(),
                     dilationH(),
                     dilationW(),
-                     colHeightStart,
-                     K,
-                     colWidthStart,
+                     channels,
+                     oh,
+                     height,
                     N);

              // gemm
-              int M = outputChannels / groups_;
              BlasGemm<Device, real>::compute(
                  false,
                  false,
@ -246,12 +252,12 @@ public:
                  N,
                  K,
                  1.0f,
-                  filterData + g * filterOffset + colHeightStart,
+                  filterData + ic * filterHeight * filterWidth,
                  kStride,
                  colData,
                  N,
                  beta_,
-                  outputData + g * outputOffset + colWidthStart,
+                  outputData + oh * outputWidth,
                  nStride);
            }
            beta_ = 1.0;
@ -266,17 +272,18 @@ public:
                                          N,
                                          K,
                                          1.0f,
-                                          filterData + g * filterOffset,
+                                          filterData,
                                          K,
-                                          inputData + g * inputOffset,
+                                          inputData,
                                          N,
                                          beta,
-                                          outputData + g * outputOffset,
+                                          outputData,
                                          N);
        }
+        inputData += inputOffset;
+        outputData += outputOffset;
+        filterData += filterOffset;
      }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
    }

    memory_.reset();
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@ -111,39 +111,42 @@ public:
                  int paddingWidth,
                  int dilationHeight,
                  int dilationWidth,
-                  int colHeightStart,
-                  int colHeightSize,
-                  int colWidthStart,
-                  int colWidthSize) {
+                  int inputChannels,
+                  int colOffset,
+                  int colOutputHeight,
+                  int colWidth) {
    int inputHeight = imShape[1];
    int inputWidth = imShape[2];
    int filterHeight = colShape[1];
    int filterWidth = colShape[2];
    int outputWidth = colShape[4];

-    for (int colh = 0; colh < colHeightSize; colh++) {
-      int wOffset = (colHeightStart + colh) % filterWidth;
-      int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
-      int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
-
-      for (int colw = 0; colw < colWidthSize; colw++) {
-        int h = (colWidthStart + colw) / outputWidth;
-        int w = (colWidthStart + colw) % outputWidth;
-
-        int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-        int imColIdx = w * strideWidth + wOffset * dilationWidth;
-        if ((imRowIdx - paddingHeight) < 0 ||
-            (imRowIdx - paddingHeight) >= inputHeight ||
-            (imColIdx - paddingWidth) < 0 ||
-            (imColIdx - paddingWidth) >= inputWidth) {
-          colData[colh * colWidthSize + colw] = static_cast<T>(0);
-        } else {
-          imRowIdx += c_im * inputHeight - paddingHeight;
-          imColIdx -= paddingWidth;
-          colData[colh * colWidthSize + colw] =
-              imData[imRowIdx * inputWidth + imColIdx];
+    for (int ic = 0; ic < inputChannels; ic++) {
+      for (int oh = 0; oh < colOutputHeight; oh++) {
+        T* dstData = colData + oh * outputWidth;
+        for (int fh = 0; fh < filterHeight; fh++) {
+          for (int fw = 0; fw < filterWidth; fw++) {
+            int imRowIdx = (oh + colOffset) * strideHeight +
+                           fh * dilationHeight - paddingHeight;
+            if (imRowIdx < 0 || imRowIdx >= inputHeight) {
+              memset(dstData, 0, outputWidth * sizeof(T));
+            } else {
+              for (int ow = 0; ow < outputWidth; ow++) {
+                int imColIdx =
+                    ow * strideWidth + fw * dilationWidth - paddingWidth;
+                if (imColIdx < 0 || imColIdx >= inputWidth) {
+                  dstData[ow] = T(0);
+                } else {
+                  dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
+                }
+              }
+            }
+            dstData += colWidth;
+          }
        }
      }
+      colData += filterHeight * filterWidth * colWidth;
+      imData += inputHeight * inputWidth;
    }
  }
 };
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
                          padding,
                          dilation,
                          dilation,
+                          channels,
                          0,
-                          height,
-                          0,
-                          width);
+                          outputHeight,
+                          outputHeight * outputWidth);

                  autotest::TensorCheckEqual(*output1, *output2);
                }
--- a/paddle/inference/tests/book/CMakeLists.txt
+++ b/paddle/inference/tests/book/CMakeLists.txt
@ -4,4 +4,4 @@ cc_test(test_inference_recognize_digits_mlp
    DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
    ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
 set_tests_properties(test_inference_recognize_digits_mlp
-    PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
+    PROPERTIES DEPENDS test_recognize_digits)
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
    CHECK_EQ(channels * outLength, maskMatP->getWidth());
  }

-  /* initialize the data_ */
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      outData[i * outStride + j] = -(real)FLT_MAX;
-    }
-  }
-
  /* pool max one by one */
  for (size_t n = 0; n < num; ++n) {  // frame by frame
    if (!isContiguous()) {
@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
    for (size_t c = 0; c < channels; ++c) {  // channel by channel
      for (size_t ph = 0; ph < outputH; ++ph) {
        int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
+        int hend = hstart + sizeY;
+        hstart = hstart < 0 ? 0 : hstart;
+        hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
        for (size_t pw = 0; pw < outputW; ++pw) {
          int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
+          int wend = wstart + sizeX;
+          wstart = wstart < 0 ? 0 : wstart;
+          wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
          if (maskData == NULL) {
+            real tmp = -(real)FLT_MAX;
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
-                outData[ph * outputW + pw] = std::max(
-                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+                tmp = tmp < inputData[h * imgSizeW + w]
+                          ? inputData[h * imgSizeW + w]
+                          : tmp;
              }
            }
+            outData[ph * outputW + pw] = tmp;
          } else {
            for (int h = hstart; h < hend; ++h) {
              for (int w = wstart; w < wend; ++w) {
--- a/paddle/operators/bipartite_match_op.cc
+++ b/paddle/operators/bipartite_match_op.cc
@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("DistMat"),
                   "Input(DistMat) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchIndices"),
+        "Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchDist"),
+        "Output(ColToRowMatchDist) of BipartiteMatch should not be null.");

    auto dims = ctx->GetInputDim("DistMat");
    PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");

    ctx->SetOutputDim("ColToRowMatchIndices", dims);
-    ctx->SetOutputDim("ColToRowMatchDis", dims);
+    ctx->SetOutputDim("ColToRowMatchDist", dims);
  }
 };

@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& context) const override {
    auto* dist_mat = context.Input<LoDTensor>("DistMat");
    auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
-    auto* match_dist = context.Output<Tensor>("ColToRowMatchDis");
+    auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");

    auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();

@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
              "Otherwise, it means B[j] is matched to row "
              "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
              "i-th instance is saved in ColToRowMatchIndices[i][j].");
-    AddOutput("ColToRowMatchDis",
+    AddOutput("ColToRowMatchDist",
              "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
              "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
-              "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed "
+              "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
              "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
              "instance are called LoD. Then "
-              "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]");
+              "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
    AddComment(R"DOC(
 This operator is a greedy bipartite matching algorithm, which is used to
 obtain the matching with the maximum distance based on the input
--- a/paddle/operators/box_coder_op.cc
+++ b/paddle/operators/box_coder_op.cc
@ -0,0 +1,121 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/box_coder_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator definition for box_coder: validates the inputs and infers the
+// shape of OutputBox.
+class BoxCoderOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  // Checks that every input/output is present, that PriorBox and PriorBoxVar
+  // are identically shaped rank-2 tensors with 4 columns and that TargetBox is
+  // rank-2 with 4 columns. OutputBox is then set to
+  // [TargetBox rows, PriorBox rows, 4] and shares TargetBox's LoD.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+                   "Input(PriorBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
+                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
+                   "Input(TargetBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
+                   "Output(OutputBox) of BoxCoderOp should not be null.");
+
+    auto prior_box_dims = ctx->GetInputDim("PriorBox");
+    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
+    auto target_box_dims = ctx->GetInputDim("TargetBox");
+
+    // The message names PriorBox because that is the tensor being checked;
+    // PriorBoxVar is covered by the dims equality check below.
+    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
+                      "The rank of Input of PriorBox must be 2");
+    // Shape letters follow the op documentation: PriorBox is [M, 4],
+    // TargetBox is [N, 4] and OutputBox is [N, M, 4].
+    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [M, 4]");
+    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                      "The rank of Input of TargetBox must be 2");
+    PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
+                      "The shape of TargetBox is [N, 4]");
+
+    // Result is discarded; presumably called so that an unsupported code_type
+    // is rejected during shape inference -- TODO confirm in box_coder_op.h.
+    GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+
+    ctx->SetOutputDim(
+        "OutputBox",
+        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
+    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
+  }
+};
+
+// Declares the inputs, outputs, attributes and user-facing documentation of
+// the box_coder operator.
+class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "PriorBox",
+        "(Tensor, default Tensor<float>) "
+        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
+        "each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the anchor box, "
+        "if the input is image feature map, they are close to the origin "
+        "of the coordinate system. [xmax, ymax] is the right bottom "
+        "coordinate of the anchor box.");
+    AddInput("PriorBoxVar",
+             "(Tensor, default Tensor<float>) "
+             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
+             "of variance.");
+    AddInput(
+        "TargetBox",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the box if the input "
+        "is image feature map, they are close to the origin of the coordinate "
+        "system. [xmax, ymax] is the right bottom coordinate of the box. "
+        "This tensor can contain LoD information to represent a batch "
+        "of inputs. One instance of this batch can contain different "
+        "numbers of entities.");
+    // Only the two listed code types are accepted by the attribute checker.
+    AddAttr<std::string>("code_type",
+                         "(string, default encode_center_size) "
+                         "the code type used with the target box")
+        .SetDefault("encode_center_size")
+        .InEnum({"encode_center_size", "decode_center_size"});
+    AddOutput(
+        "OutputBox",
+        "(LoDTensor or Tensor) "
+        "The output of box_coder_op, a tensor with shape [N, M, 4] "
+        "representing the result of N target boxes encoded/decoded with "
+        "M Prior boxes and variances.");
+
+    // The decoding formulas below mirror DecodeCenterSizeKernel in
+    // box_coder_op.cu: the decoded box is emitted as corner coordinates.
+    AddComment(R"DOC(
+Bounding Box Coder Operator.
+Encode/Decode the target bounding box with the priorbox information.
+The Encoding schema described below:
+ox = (tx - px) / pw / pxv
+oy = (ty - py) / ph / pyv
+ow = log(abs(tw / pw)) / pwv 
+oh = log(abs(th / ph)) / phv 
+The Decoding schema described below:
+ox = (pw * pxv * tx + px) - dw / 2
+oy = (ph * pyv * ty + py) - dh / 2
+ow = (pw * pxv * tx + px) + dw / 2
+oh = (ph * pyv * ty + py) + dh / 2
+with dw = exp(pwv * tw) * pw and dh = exp(phv * th) * ph
+where, when encoding, tx, ty, tw, th denote the target box's center
+coordinates, width and height respectively; when decoding they denote the
+encoded values stored in TargetBox. Similarly, px, py, pw, ph denote the
+priorbox's(anchor) center coordinates, width and height. pxv, pyv, pwv, phv
+denote the variance of the priorbox. When encoding, ox, oy, ow, oh denote the
+encoded center coordinates, width and height; when decoding they denote the
+decoded box as [xmin, ymin, xmax, ymax].
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Registers box_coder as an operator without a gradient op, together with its
+// CPU kernels for float and double.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
+REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
+                       ops::BoxCoderKernel<double>);
--- a/paddle/operators/box_coder_op.cu
+++ b/paddle/operators/box_coder_op.cu
@ -0,0 +1,150 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/box_coder_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+// Encodes every (target box, prior box) pair in center-size form.
+// Each thread handles one pair: idx / col selects the target box row and
+// idx % col selects the prior box row. `len` is the stride between boxes in
+// all buffers; elements 0..3 of each box are read as xmin, ymin, xmax, ymax.
+// The result for pair idx is written to output[idx * len .. idx * len + 3].
+template <typename T>
+__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  // Threads past the last pair have nothing to do.
+  if (idx >= row * col) return;
+
+  const T* prior = prior_box_data + (idx % col) * len;
+  const T* var = prior_box_var_data + (idx % col) * len;
+  const T* target = target_box_data + (idx / col) * len;
+  T* out = output + idx * len;
+
+  // Prior box in center-size form.
+  T prior_w = prior[2] - prior[0];
+  T prior_h = prior[3] - prior[1];
+  T prior_cx = (prior[2] + prior[0]) / 2;
+  T prior_cy = (prior[3] + prior[1]) / 2;
+
+  // Target box in center-size form.
+  T target_cx = (target[2] + target[0]) / 2;
+  T target_cy = (target[3] + target[1]) / 2;
+  T target_w = target[2] - target[0];
+  T target_h = target[3] - target[1];
+
+  // Center offsets are scaled by the prior size, sizes are log-ratios; each
+  // component is finally divided by its variance.
+  out[0] = (target_cx - prior_cx) / prior_w / var[0];
+  out[1] = (target_cy - prior_cy) / prior_h / var[1];
+  out[2] = log(fabs(target_w / prior_w)) / var[2];
+  out[3] = log(fabs(target_h / prior_h)) / var[3];
+}
+
+// Decodes every (target box, prior box) pair from center-size form back to
+// corner coordinates.
+// Each thread handles one pair: idx / col selects the target row and
+// idx % col selects the prior box row. `len` is the stride between boxes in
+// all buffers. Elements 0..3 of a target row are used as the encoded
+// x, y, width and height values; the decoded [xmin, ymin, xmax, ymax] is
+// written to output[idx * len .. idx * len + 3].
+template <typename T>
+__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  // Threads past the last pair have nothing to do.
+  if (idx >= row * col) return;
+
+  const T* prior = prior_box_data + (idx % col) * len;
+  const T* var = prior_box_var_data + (idx % col) * len;
+  const T* target = target_box_data + (idx / col) * len;
+  T* out = output + idx * len;
+
+  // Prior box in center-size form.
+  T prior_w = prior[2] - prior[0];
+  T prior_h = prior[3] - prior[1];
+  T prior_cx = (prior[2] + prior[0]) / 2;
+  T prior_cy = (prior[3] + prior[1]) / 2;
+
+  // Undo the encoding: sizes via exp of the variance-scaled value, centers
+  // via the variance-scaled offset relative to the prior box.
+  T box_w = exp(var[2] * target[2]) * prior_w;
+  T box_h = exp(var[3] * target[3]) * prior_h;
+  T box_cx = var[0] * target[0] * prior_w + prior_cx;
+  T box_cy = var[1] * target[1] * prior_h + prior_cy;
+
+  // Convert center-size back to corners.
+  out[0] = box_cx - box_w / 2;
+  out[1] = box_cy - box_h / 2;
+  out[2] = box_cx + box_w / 2;
+  out[3] = box_cy + box_h / 2;
+}
+
+// GPU kernel class for box_coder. Compute() launches one CUDA thread per
+// (target box, prior box) pair and dispatches to the encode or decode kernel
+// according to the "code_type" attribute.
+template <typename T>
+class BoxCoderCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+
+    auto* priors = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_vars = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* targets = context.Input<framework::LoDTensor>("TargetBox");
+    auto* result = context.Output<framework::Tensor>("OutputBox");
+
+    // A LoD on TargetBox is optional, but only a single level is accepted.
+    if (targets->lod().size()) {
+      PADDLE_ENFORCE_EQ(targets->lod().size(), 1,
+                        "Only support 1 level of LoD.");
+    }
+
+    // row: number of target boxes, col: number of prior boxes,
+    // len: number of elements per prior box.
+    auto row = targets->dims()[0];
+    auto col = priors->dims()[0];
+    auto len = priors->dims()[1];
+
+    // One thread per pair, rounded up to whole blocks.
+    int block = 512;
+    int grid = (row * col + block - 1) / block;
+    auto& device_ctx = context.cuda_device_context();
+
+    const T* prior_box_data = priors->data<T>();
+    const T* prior_box_var_data = prior_vars->data<T>();
+    const T* target_box_data = targets->data<T>();
+
+    // Allocate the [row, col, len] output before taking its data pointer.
+    result->mutable_data<T>({row, col, len}, context.GetPlace());
+    T* output = result->data<T>();
+
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
+          output);
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
+          output);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+// Registers the CUDA kernels of box_coder for float and double.
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
+                        ops::BoxCoderCUDAKernel<double>);
--- a/Show More
+++ b/Show More