Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_depthwiseConv_op_gpu
commit 3074ae7b8d
@@ -0,0 +1,18 @@
#FROM python:2.7.14
FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
RUN apt-get update && apt-get install -y python
RUN pip install -U kubernetes opencv-python && apt-get update -y && apt-get install -y iputils-ping libgtk2.0-dev
# NOTE: By default, CI-built wheel packages are built with WITH_DISTRIBUTE=OFF,
# so we must build one with distribute support to install in this image.
RUN pip install paddlepaddle
RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
RUN pip uninstall -y paddlepaddle

# The lines below may change a lot during debugging.
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
ADD *.whl /
RUN pip install /*.whl && rm -f /*.whl && \
    chmod +x /usr/bin/paddle_k8s
ENV LD_LIBRARY_PATH=/usr/local/lib
ADD vgg16_fluid.py vgg16_v2.py /workspace/
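The temporary `pip install paddlepaddle` / `RUN sh -c '... | python'` / `pip uninstall` sequence above appears to exist only to pre-populate the image's CIFAR-10 dataset cache while a stock wheel is briefly installed, so the benchmark pods do not download data at runtime. In plain Python (Python 2, `paddle.v2` API), the piped command is simply:

```python
# Equivalent of the piped command in the RUN step above (Python 2, paddle.v2 API).
import paddle.v2 as paddle

# Downloads and caches the CIFAR-10 training set inside the image.
paddle.dataset.cifar.train10()
```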
@@ -0,0 +1,76 @@
# Performance for Distributed vgg16

## Test Result

### Hardware Information

- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
- cpu MHz: 2101.000
- cache size: 20480 KB

### Single Node Single Thread

- PServer Count: 10
- Trainer Count: 20
- Metrics: samples / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
| TensorFlow | - | - | - | - |

### Different Batch Size

- PServer Count: 10
- Trainer Count: 20
- Per trainer CPU Core: 1
- Metrics: samples / sec

| Batch Size | 32 | 64 | 128 | 256 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
| TensorFlow | - | - | - | - |

### Acceleration Rate

- PServer Count: 20
- Batch Size: 128
- Metrics: samples / sec

| Trainer Count | 20 | 40 | 80 | 100 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
| TensorFlow | - | - | - | - |
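The percentages in the table above are the scaling efficiency relative to a perfect linear speed-up over the single-node single-thread throughput at batch size 128 (16.74 samples/sec for Fluid, 17.60 for v2); they reproduce the table values. A quick Python check of that arithmetic:

```python
def scaling_efficiency(cluster_throughput, single_thread_throughput, trainer_count):
    """Ratio of measured cluster throughput to ideal linear scaling."""
    return cluster_throughput / (single_thread_throughput * trainer_count)

# Fluid, batch size 128, single-thread baseline 16.74 samples/sec
for trainers, throughput in [(20, 263.29), (40, 518.80), (80, 836.26), (100, 1019.29)]:
    eff = scaling_efficiency(throughput, 16.74, trainers)
    print("%d trainers: %.2f%%" % (trainers, eff * 100))  # 78.64%, 77.47%, 62.44%, 60.89%
```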
### Different PServer Count

- Trainer Count: 60
- Batch Size: 128
- Metrics: samples / sec

| PServer Count | 3 | 6 | 10 | 20 |
| -- | -- | -- | -- | -- |
| PaddlePaddle Fluid (should be fixed in the next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
| TensorFlow | - | - | - | - |

*The performance gap between Fluid and v2 comes from network interference.*

## Steps to Run the Performance Test

1. Re-compile PaddlePaddle with `-DWITH_DISTRIBUTE` enabled to build it with distributed support.
1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
1. Run `docker build -t [image:tag] .` to build the Docker image, and run `docker push [image:tag]` to push the image to a repository that your Kubernetes cluster can pull from.
1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your Kubernetes cluster (you must configure the `kubectl` client before this step).
1. Run `kubectl get po` to list the running pods, and run `kubectl logs [podID]` to fetch the logs of the pserver and trainer pods.

Check the logs for the distributed training progress and analyze the performance.

## Enable Verbose Logs

Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happens in detail.
@@ -0,0 +1,72 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: vgg16job-pserver
spec:
  replicas: 10
  template:
    metadata:
      labels:
        paddle-job-pserver: vgg16job
    spec:
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        ports:
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16job
        - name: MKL_NUM_THREADS
          value: "1"
        - name: TRAINING_ROLE
          value: "PSERVER"
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        command: ["paddle_k8s", "start_fluid"]
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4
@@ -0,0 +1,69 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16job-trainer
spec:
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        paddle-job: vgg16job
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        command: ["paddle_k8s", "start_fluid"]
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16job
        - name: TRAINING_ROLE
          value: "TRAINER"
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      restartPolicy: Never
@@ -0,0 +1,64 @@
apiVersion: extensions/v1beta1
kind: ReplicaSet
metadata:
  name: vgg16v2job-pserver
spec:
  replicas: 10
  template:
    metadata:
      labels:
        paddle-job-pserver: vgg16v2job
    spec:
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        ports:
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16v2job
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "python train.py"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        command: ["paddle_k8s", "start_pserver"]
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4
@@ -0,0 +1,65 @@
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16v2job-trainer
spec:
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        paddle-job: vgg16v2job
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        command: ["paddle_k8s", "start_trainer", "v2"]
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16v2job
        - name: BATCH_SIZE
          value: "256"
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        - name: ENTRY
          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "2"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      restartPolicy: Never
File diff suppressed because it is too large
@@ -0,0 +1,154 @@
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gzip

import paddle.v2.dataset.cifar as cifar
import paddle.v2 as paddle
import time
import os

DATA_DIM = 3 * 32 * 32
CLASS_DIM = 10
BATCH_SIZE = os.getenv("BATCH_SIZE")
if BATCH_SIZE:
    BATCH_SIZE = int(BATCH_SIZE)
else:
    BATCH_SIZE = 128
print "batch_size", BATCH_SIZE
NODE_COUNT = int(os.getenv("TRAINERS"))
ts = 0


def vgg(input, nums, class_dim):
    def conv_block(input, num_filter, groups, num_channels=None):
        return paddle.networks.img_conv_group(
            input=input,
            num_channels=num_channels,
            pool_size=2,
            pool_stride=2,
            conv_num_filter=[num_filter] * groups,
            conv_filter_size=3,
            conv_act=paddle.activation.Relu(),
            pool_type=paddle.pooling.Max())

    assert len(nums) == 5
    # the channel of the input feature is 3
    conv1 = conv_block(input, 64, nums[0], 3)
    conv2 = conv_block(conv1, 128, nums[1])
    conv3 = conv_block(conv2, 256, nums[2])
    conv4 = conv_block(conv3, 512, nums[3])
    conv5 = conv_block(conv4, 512, nums[4])

    fc_dim = 512
    fc1 = paddle.layer.fc(input=conv5,
                          size=fc_dim,
                          act=paddle.activation.Relu(),
                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
    fc2 = paddle.layer.fc(input=fc1,
                          size=fc_dim,
                          act=paddle.activation.Relu(),
                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
    out = paddle.layer.fc(input=fc2,
                          size=class_dim,
                          act=paddle.activation.Softmax())
    return out


def vgg13(input, class_dim):
    nums = [2, 2, 2, 2, 2]
    return vgg(input, nums, class_dim)


def vgg16(input, class_dim):
    nums = [2, 2, 3, 3, 3]
    return vgg(input, nums, class_dim)


def vgg19(input, class_dim):
    nums = [2, 2, 4, 4, 4]
    return vgg(input, nums, class_dim)


def main():
    global ts
    paddle.init(use_gpu=False)
    image = paddle.layer.data(
        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
    lbl = paddle.layer.data(
        name="label", type=paddle.data_type.integer_value(CLASS_DIM))

    extra_layers = None
    # NOTE: v2 distributed training needs to average updates, hence the
    # learning rate is divided by the node count.
    learning_rate = 1e-3 / NODE_COUNT
    out = vgg16(image, class_dim=CLASS_DIM)
    cost = paddle.layer.classification_cost(input=out, label=lbl)

    # Create parameters
    parameters = paddle.parameters.create(cost)

    # Create optimizer
    optimizer = paddle.optimizer.Momentum(
        momentum=0.9,
        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
                                                         BATCH_SIZE),
        learning_rate=learning_rate / BATCH_SIZE,
        learning_rate_decay_a=0.1,
        learning_rate_decay_b=128000 * 35,
        learning_rate_schedule="discexp", )

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            cifar.train10(),
            # To use other data, replace the above line with:
            # reader.train_reader('train.list'),
            buf_size=1000),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        cifar.test10(),
        # To use other data, replace the above line with:
        # reader.test_reader('val.list'),
        batch_size=BATCH_SIZE)

    # Create trainer
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer,
                                 extra_layers=extra_layers,
                                 is_local=False)

    # End batch and end pass event handler
    def event_handler(event):
        global ts, ts_pass
        if isinstance(event, paddle.event.BeginPass):
            ts_pass = time.time()
        if isinstance(event, paddle.event.BeginIteration):
            ts = time.time()
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 1 == 0:
                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics,
                    time.time() - ts)
        if isinstance(event, paddle.event.EndPass):
            print "Pass %d end, spent: %f" % (event.pass_id,
                                              time.time() - ts_pass)
            result = trainer.test(reader=test_reader)
            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)

    trainer.train(
        reader=train_reader, num_passes=200, event_handler=event_handler)


if __name__ == '__main__':
    main()
@@ -0,0 +1,32 @@
### Design Doc: Switch

### Background

Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.

The following example shows the usage of `fluid.switch`.

```python
a = fluid.Var(10)
b = fluid.Var(0)

switch = fluid.switch()
with switch.block():
    with switch.case(fluid.less_equal(a, 10)):
        fluid.print("Case 1")
    with switch.case(fluid.larger(a, 0)):
        fluid.print("Case 2")
    with switch.default():
        fluid.print("Case 3")
```

### The Semantics

1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar. This differs from the `fluid.if_else` control-flow, whose condition can be a vector of boolean values.
1. It runs the first matched case; if no case matches, it runs the default case if there is one.
1. Once it matches a case, it runs the corresponding branch and only that branch, as if a C-style `break` were placed at the end of each case.

The above program should print, and print only, "Case 1".
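For intuition only, here is a plain-Python analogy (not Fluid code) of the first-match-only semantics described above: both conditions are true, yet only the first matching branch runs.

```python
# Plain-Python illustration of the switch semantics above (not Fluid code).
a = 10

if a <= 10:        # first matching case wins ...
    print("Case 1")
elif a > 0:        # ... even though this condition is also true
    print("Case 2")
else:
    print("Case 3")
# prints only "Case 1"
```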
The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if-else` could run more than one branch.
@@ -0,0 +1,121 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/box_coder_op.h"

namespace paddle {
namespace operators {

class BoxCoderOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
                   "Input(PriorBox) of BoxCoderOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
                   "Input(TargetBox) of BoxCoderOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
                   "Output(OutputBox) of BoxCoderOp should not be null.");

    auto prior_box_dims = ctx->GetInputDim("PriorBox");
    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
    auto target_box_dims = ctx->GetInputDim("TargetBox");

    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
                      "The rank of Input of PriorBox must be 2");
    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
                      "The rank of Input of TargetBox must be 2");
    PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
                      "The shape of TargetBox is [M, 4]");

    GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));

    ctx->SetOutputDim(
        "OutputBox",
        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
  }
};

class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput(
        "PriorBox",
        "(Tensor, default Tensor<float>) "
        "Box list PriorBox is a 2-D Tensor with shape [M, 4] which holds M "
        "boxes; each box is represented as [xmin, ymin, xmax, ymax]. "
        "[xmin, ymin] is the left top coordinate of the anchor box; if the "
        "input is an image feature map, it is close to the origin of the "
        "coordinate system. [xmax, ymax] is the right bottom coordinate of "
        "the anchor box.");
    AddInput("PriorBoxVar",
             "(Tensor, default Tensor<float>) "
             "PriorBoxVar is a 2-D Tensor with shape [M, 4] which holds M "
             "groups of variance.");
    AddInput(
        "TargetBox",
        "(LoDTensor or Tensor) This input is a 2-D LoDTensor with shape "
        "[N, 4]; each box is represented as [xmin, ymin, xmax, ymax]. "
        "[xmin, ymin] is the left top coordinate of the box; if the input "
        "is an image feature map, it is close to the origin of the "
        "coordinate system. [xmax, ymax] is the right bottom coordinate of "
        "the box. This tensor can contain LoD information to represent a "
        "batch of inputs. One instance of this batch can contain different "
        "numbers of entities.");
    AddAttr<std::string>("code_type",
                         "(string, default encode_center_size) "
                         "the code type used with the target box")
        .SetDefault("encode_center_size")
        .InEnum({"encode_center_size", "decode_center_size"});
    AddOutput("OutputBox",
              "(LoDTensor or Tensor) The output of box_coder_op, a tensor "
              "with shape [N, M, 4] representing the result of N target "
              "boxes encoded/decoded with M prior boxes and variances.");

    AddComment(R"DOC(
Bounding Box Coder Operator.
Encode/Decode the target bounding box with the prior box information.
The Encoding schema is described below:
ox = (tx - px) / pw / pxv
oy = (ty - py) / ph / pyv
ow = log(abs(tw / pw)) / pwv
oh = log(abs(th / ph)) / phv
The Decoding schema is described below:
ox = (pw * pxv * tx + px) - tw / 2
oy = (ph * pyv * ty + py) - th / 2
ow = exp(pwv * tw) * pw + tw / 2
oh = exp(phv * th) * ph + th / 2
where tx, ty, tw, th denote the target box's center coordinates, width and
height respectively. Similarly, px, py, pw, ph denote the prior box's (anchor)
center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
of the prior box, and ox, oy, ow, oh denote the encoded/decoded coordinates,
width and height.
)DOC");
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
                       ops::BoxCoderKernel<double>);
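As a reference for the formulas in the operator's DOC comment above, here is a hedged, per-box Python sketch of the center-size encode/decode math. It is not the operator's actual kernel; it simply restates the formulas (and mirrors the CUDA kernels below), with boxes given as `[xmin, ymin, xmax, ymax]` and variances as `[pxv, pyv, pwv, phv]`.

```python
import math

def encode_center_size(prior, prior_var, target):
    """Encode one target box against one prior (anchor) box."""
    pw, ph = prior[2] - prior[0], prior[3] - prior[1]
    px, py = (prior[2] + prior[0]) / 2.0, (prior[3] + prior[1]) / 2.0
    tw, th = target[2] - target[0], target[3] - target[1]
    tx, ty = (target[2] + target[0]) / 2.0, (target[3] + target[1]) / 2.0
    pxv, pyv, pwv, phv = prior_var
    return [(tx - px) / pw / pxv,
            (ty - py) / ph / pyv,
            math.log(abs(tw / pw)) / pwv,
            math.log(abs(th / ph)) / phv]

def decode_center_size(prior, prior_var, code):
    """Decode an encoded box back to [xmin, ymin, xmax, ymax]."""
    pw, ph = prior[2] - prior[0], prior[3] - prior[1]
    px, py = (prior[2] + prior[0]) / 2.0, (prior[3] + prior[1]) / 2.0
    pxv, pyv, pwv, phv = prior_var
    w = math.exp(pwv * code[2]) * pw
    h = math.exp(phv * code[3]) * ph
    cx = pxv * code[0] * pw + px
    cy = pyv * code[1] * ph + py
    return [cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2]

# Round trip: decode(encode(box)) recovers the box (up to floating point error).
prior, var, target = [0.0, 0.0, 2.0, 2.0], [0.1, 0.1, 0.2, 0.2], [0.5, 0.5, 1.5, 1.5]
print(decode_center_size(prior, var, encode_center_size(prior, var, target)))
# -> [0.5, 0.5, 1.5, 1.5]
```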
@@ -0,0 +1,150 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/operators/box_coder_op.h"
#include "paddle/platform/cuda_helper.h"

namespace paddle {
namespace operators {

template <typename T>
__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
                                       const T* prior_box_var_data,
                                       const T* target_box_data, const int row,
                                       const int col, const int len,
                                       T* output) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < row * col) {
    const int row_idx = idx / col;
    const int col_idx = idx % col;
    T prior_box_width =
        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
    T prior_box_height =
        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
    T prior_box_center_x =
        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
                            prior_box_data[col_idx * len + 1]) /
                           2;

    T target_box_center_x =
        (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
        2;
    T target_box_center_y = (target_box_data[row_idx * len + 3] +
                             target_box_data[row_idx * len + 1]) /
                            2;
    T target_box_width =
        target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
    T target_box_height =
        target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];

    output[idx * len] = (target_box_center_x - prior_box_center_x) /
                        prior_box_width / prior_box_var_data[col_idx * len];
    output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
                            prior_box_height /
                            prior_box_var_data[col_idx * len + 1];
    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
                            prior_box_var_data[col_idx * len + 2];
    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
                            prior_box_var_data[col_idx * len + 3];
  }
}

template <typename T>
__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
                                       const T* prior_box_var_data,
                                       const T* target_box_data, const int row,
                                       const int col, const int len,
                                       T* output) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx < row * col) {
    const int row_idx = idx / col;
    const int col_idx = idx % col;
    T prior_box_width =
        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
    T prior_box_height =
        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
    T prior_box_center_x =
        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
                            prior_box_data[col_idx * len + 1]) /
                           2;

    T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
                             target_box_data[row_idx * len + 2]) *
                         prior_box_width;
    T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
                              target_box_data[row_idx * len + 3]) *
                          prior_box_height;
    T target_box_center_x = prior_box_var_data[col_idx * len] *
                                target_box_data[row_idx * len] *
                                prior_box_width +
                            prior_box_center_x;
    T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
                                target_box_data[row_idx * len + 1] *
                                prior_box_height +
                            prior_box_center_y;

    output[idx * len] = target_box_center_x - target_box_width / 2;
    output[idx * len + 1] = target_box_center_y - target_box_height / 2;
    output[idx * len + 2] = target_box_center_x + target_box_width / 2;
    output[idx * len + 3] = target_box_center_y + target_box_height / 2;
  }
}

template <typename T>
class BoxCoderCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
                   "This kernel only runs on GPU device.");
    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
    auto* output_box = context.Output<framework::Tensor>("OutputBox");

    if (target_box->lod().size()) {
      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
                        "Only support 1 level of LoD.");
    }
    auto row = target_box->dims()[0];
    auto col = prior_box->dims()[0];
    auto len = prior_box->dims()[1];
    int block = 512;
    int grid = (row * col + block - 1) / block;
    auto& device_ctx = context.cuda_device_context();

    const T* prior_box_data = prior_box->data<T>();
    const T* prior_box_var_data = prior_box_var->data<T>();
    const T* target_box_data = target_box->data<T>();

    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
    T* output = output_box->data<T>();

    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
    if (code_type == BoxCodeType::kEncodeCenterSize) {
      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
          output);
    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
          output);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
                        ops::BoxCoderCUDAKernel<double>);
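The launch above assigns one thread per (target box, prior box) pair: `row * col` threads in blocks of 512. A purely illustrative Python sketch of the same grid and index arithmetic (names mirror the kernel's `row`, `col`, `block`, `idx`):

```python
def launch_config(row, col, block=512):
    """Mirror the grid/thread-index math used by the CUDA kernels above."""
    grid = (row * col + block - 1) // block    # ceil division: one thread per pair
    pairs = []
    for idx in range(row * col):               # idx = threadIdx.x + blockIdx.x * blockDim.x
        row_idx, col_idx = idx // col, idx % col
        pairs.append((row_idx, col_idx))       # target box row_idx vs. prior box col_idx
    return grid, pairs

grid, pairs = launch_config(row=3, col=2)
print(grid, pairs[:4])  # 1 [(0, 0), (0, 1), (1, 0), (1, 1)]
```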
Some files were not shown because too many files have changed in this diff.