From 2b202f754bcb4f035eb22b316aea6f9d96c7386b Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Wed, 10 Jan 2018 21:40:46 +0800
Subject: [PATCH 001/102] Optimize maxPoolForward.

---
 paddle/math/Matrix.cpp | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 1ec4336cab..cc86b12be0 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -2015,13 +2015,6 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
     CHECK_EQ(channels * outLength, maskMatP->getWidth());
   }
 
-  /* initialize the data_ */
-  for (size_t i = 0; i < height_; i++) {
-    for (size_t j = 0; j < width_; j++) {
-      outData[i * outStride + j] = -(real)FLT_MAX;
-    }
-  }
-
   /* pool max one by one */
   for (size_t n = 0; n < num; ++n) {  // frame by frame
     if (!isContiguous()) {
@@ -2030,19 +2023,24 @@ void CpuMatrix::maxPoolForward(Matrix& inputMat,
     for (size_t c = 0; c < channels; ++c) {  // channel by channel
       for (size_t ph = 0; ph < outputH; ++ph) {
         int hstart = ph * strideH - paddingH;
-        int hend = std::min(hstart + sizeY, imgSizeH);
-        hstart = std::max(hstart, 0);
+        int hend = hstart + sizeY;
+        hstart = hstart < 0 ? 0 : hstart;
+        hend = hend < (int)imgSizeH ? hend : (int)imgSizeH;
         for (size_t pw = 0; pw < outputW; ++pw) {
           int wstart = pw * strideW - paddingW;
-          int wend = std::min(wstart + sizeX, imgSizeW);
-          wstart = std::max(wstart, 0);
+          int wend = wstart + sizeX;
+          wstart = wstart < 0 ? 0 : wstart;
+          wend = wend < (int)imgSizeW ? wend : (int)imgSizeW;
           if (maskData == NULL) {
+            real tmp = -(real)FLT_MAX;
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {
-                outData[ph * outputW + pw] = std::max(
-                    outData[ph * outputW + pw], inputData[h * imgSizeW + w]);
+                tmp = tmp < inputData[h * imgSizeW + w]
+                          ? inputData[h * imgSizeW + w]
+                          : tmp;
               }
             }
+            outData[ph * outputW + pw] = tmp;
           } else {
             for (int h = hstart; h < hend; ++h) {
               for (int w = wstart; w < wend; ++w) {

From ed0a564c909353c862bcb1533e41420fdd87eb9e Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 11 Jan 2018 13:42:40 +0800
Subject: [PATCH 002/102] Optimize GemmConvMobileFunction.

---
 paddle/function/GemmConvOp.cpp | 63 +++++++++++++++++++---------------
 paddle/function/Im2Col.h       | 53 ++++++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index cbdbf5335d..a9876cec2a 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -178,19 +178,22 @@ public:
     real* inputData = inputs[0].data<real>();
     real* filterData = inputs[1].data<real>();
     real* outputData = outputs[0].data<real>();
+    real* colData = NULL;
     bool needIm2col = isNeedIm2col(filter);
 
     TensorShape imShape =
         TensorShape({inputChannels / groups_, inputHeight, inputWidth});
-
     TensorShape colShape;
-    real* colData = NULL;
 
-    size_t colHeight = inputChannels / groups_ * filterHeight * filterWidth;
-    size_t colWidth = outputHeight * outputWidth;
-    // Max col matrix height 256, Max col matrix width 1024
-    size_t stepColHeight = std::min(colHeight, static_cast<size_t>(256));
-    size_t stepColWidth = std::min(colWidth, static_cast<size_t>(2048));
+    // Cap col matrix at 4096 columns and 1048576 (1M) elements per GEMM step.
+    size_t outputHeightSteps =
+        std::min(std::max(4096 / outputWidth, (size_t)1), outputHeight);
+    size_t maxColWidth = outputHeightSteps * outputWidth;
+    size_t channelSteps =
+        std::min(std::max(1048576 / maxColWidth / (filterHeight * filterWidth),
+                          (size_t)1),
+                 inputChannels / groups_);
+    size_t maxColHeight = channelSteps * filterHeight * filterWidth;
 
     if (needIm2col) {
       colShape = TensorShape({inputChannels / groups_,
@@ -199,7 +202,7 @@ public:
                               outputHeight,
                               outputWidth});
 
-      resizeBuffer<Device>(stepColHeight * stepColWidth * sizeof(real));
+      resizeBuffer<Device>(maxColHeight * maxColWidth * sizeof(real));
       colData = reinterpret_cast<real*>(memory_->getBuf());
     }
 
@@ -209,20 +212,24 @@ public:
         (outputChannels / groups_) * outputHeight * outputWidth;
     size_t filterOffset = filter.getElements() / groups_;
 
-    int nStride = colWidth;
-    int kStride = colHeight;
+    int nStride = outputHeight * outputWidth;
+    int kStride = inputChannels / groups_ * filterHeight * filterWidth;
     for (size_t i = 0; i < batchSize; i++) {
+      filterData = inputs[1].data<real>();
       for (size_t g = 0; g < groups_; g++) {
         if (needIm2col) {
           real beta_ = beta;
-          for (size_t colHeightStart = 0; colHeightStart < colHeight;
-               colHeightStart += stepColHeight) {
-            for (size_t colWidthStart = 0; colWidthStart < colWidth;
-                 colWidthStart += stepColWidth) {
-              int N = std::min(colWidth - colWidthStart, stepColWidth);
-              int K = std::min(colHeight - colHeightStart, stepColHeight);
+          for (size_t ic = 0; ic < inputChannels / groups_;
+               ic += channelSteps) {
+            int channels = std::min(inputChannels / groups_ - ic, channelSteps);
+            for (size_t oh = 0; oh < outputHeight; oh += outputHeightSteps) {
+              int height = std::min(outputHeight - oh, outputHeightSteps);
+
+              int M = outputChannels / groups_;
+              int N = height * outputWidth;
+              int K = channels * filterHeight * filterWidth;
               // im2col
-              im2col(inputData + g * inputOffset,
+              im2col(inputData,
                      imShape,
                      colData,
                      colShape,
@@ -232,13 +239,12 @@ public:
                      paddingW(),
                      dilationH(),
                      dilationW(),
-                     colHeightStart,
-                     K,
-                     colWidthStart,
+                     channels,
+                     oh,
+                     height,
                      N);
 
               // gemm
-              int M = outputChannels / groups_;
               BlasGemm<Device, real>::compute(
                   false,
                   false,
@@ -246,12 +252,12 @@ public:
                   N,
                   K,
                   1.0f,
-                  filterData + g * filterOffset + colHeightStart,
+                  filterData + ic * filterHeight * filterWidth,
                   kStride,
                   colData,
                   N,
                   beta_,
-                  outputData + g * outputOffset + colWidthStart,
+                  outputData + oh * outputWidth,
                   nStride);
             }
             beta_ = 1.0;
@@ -266,17 +272,18 @@ public:
                                           N,
                                           K,
                                           1.0f,
-                                          filterData + g * filterOffset,
+                                          filterData,
                                           K,
-                                          inputData + g * inputOffset,
+                                          inputData,
                                           N,
                                           beta,
-                                          outputData + g * outputOffset,
+                                          outputData,
                                           N);
         }
+        inputData += inputOffset;
+        outputData += outputOffset;
+        filterData += filterOffset;
       }
-      inputData += inputChannels * inputHeight * inputWidth;
-      outputData += outputChannels * outputHeight * outputWidth;
     }
 
     memory_.reset();
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 36a9bcf84e..361ba4c18a 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -98,6 +98,7 @@ public:
                   int dilationWidth = 1);
 };
 
+#if 0
 template <class T>
 class Im2ColMobileFunctor {
 public:
@@ -147,5 +148,57 @@ public:
     }
   }
 };
+#endif
+
+template <class T>
+class Im2ColMobileFunctor {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth,
+                  int dilationHeight,
+                  int dilationWidth,
+                  int inputChannels,
+                  int colOffset,
+                  int colOutputHeight,
+                  int colWidth) {
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputWidth = colShape[4];
+
+    for (int ic = 0; ic < inputChannels; ic++) {
+      for (int oh = 0; oh < colOutputHeight; oh++) {
+        T* dstData = colData + oh * outputWidth;
+        for (int fh = 0; fh < filterHeight; fh++) {
+          for (int fw = 0; fw < filterWidth; fw++) {
+            int imRowIdx = (oh + colOffset) * strideHeight + fh - paddingHeight;
+            if (imRowIdx < 0 || imRowIdx >= inputHeight) {
+              memset(dstData, 0, outputWidth * sizeof(T));
+            } else {
+              for (int ow = 0; ow < outputWidth; ow++) {
+                int imColIdx = ow * strideWidth + fw - paddingWidth;
+                if (imColIdx < 0 || imColIdx >= inputWidth) {
+                  dstData[ow] = T(0);
+                } else {
+                  dstData[ow] = imData[imRowIdx * inputWidth + imColIdx];
+                }
+              }
+            }
+            dstData += colWidth;
+          }
+        }
+      }
+      colData += filterHeight * filterWidth * colWidth;
+      imData += inputHeight * inputWidth;
+    }
+  }
+};
 
 }  // namespace paddle

From 784e59406c541def813e75db5fd11bb0361eccef Mon Sep 17 00:00:00 2001
From: hedaoyuan <hedaoyuan@github.com>
Date: Thu, 11 Jan 2018 14:02:57 +0800
Subject: [PATCH 003/102] Bug fix of Im2ColMobileFunctor.

---
 paddle/function/Im2Col.h       | 58 +++-------------------------------
 paddle/function/Im2ColTest.cpp |  6 ++--
 2 files changed, 7 insertions(+), 57 deletions(-)

diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
index 361ba4c18a..915119e291 100644
--- a/paddle/function/Im2Col.h
+++ b/paddle/function/Im2Col.h
@@ -98,58 +98,6 @@ public:
                   int dilationWidth = 1);
 };
 
-#if 0
-template <class T>
-class Im2ColMobileFunctor {
-public:
-  void operator()(const T* imData,
-                  const TensorShape& imShape,
-                  T* colData,
-                  const TensorShape& colShape,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int dilationHeight,
-                  int dilationWidth,
-                  int colHeightStart,
-                  int colHeightSize,
-                  int colWidthStart,
-                  int colWidthSize) {
-    int inputHeight = imShape[1];
-    int inputWidth = imShape[2];
-    int filterHeight = colShape[1];
-    int filterWidth = colShape[2];
-    int outputWidth = colShape[4];
-
-    for (int colh = 0; colh < colHeightSize; colh++) {
-      int wOffset = (colHeightStart + colh) % filterWidth;
-      int hOffset = ((colHeightStart + colh) / filterWidth) % filterHeight;
-      int c_im = (colHeightStart + colh) / filterWidth / filterHeight;
-
-      for (int colw = 0; colw < colWidthSize; colw++) {
-        int h = (colWidthStart + colw) / outputWidth;
-        int w = (colWidthStart + colw) % outputWidth;
-
-        int imRowIdx = h * strideHeight + hOffset * dilationHeight;
-        int imColIdx = w * strideWidth + wOffset * dilationWidth;
-        if ((imRowIdx - paddingHeight) < 0 ||
-            (imRowIdx - paddingHeight) >= inputHeight ||
-            (imColIdx - paddingWidth) < 0 ||
-            (imColIdx - paddingWidth) >= inputWidth) {
-          colData[colh * colWidthSize + colw] = static_cast<T>(0);
-        } else {
-          imRowIdx += c_im * inputHeight - paddingHeight;
-          imColIdx -= paddingWidth;
-          colData[colh * colWidthSize + colw] =
-              imData[imRowIdx * inputWidth + imColIdx];
-        }
-      }
-    }
-  }
-};
-#endif
-
 template <class T>
 class Im2ColMobileFunctor {
 public:
@@ -178,12 +126,14 @@ public:
         T* dstData = colData + oh * outputWidth;
         for (int fh = 0; fh < filterHeight; fh++) {
           for (int fw = 0; fw < filterWidth; fw++) {
-            int imRowIdx = (oh + colOffset) * strideHeight + fh - paddingHeight;
+            int imRowIdx = (oh + colOffset) * strideHeight +
+                           fh * dilationHeight - paddingHeight;
             if (imRowIdx < 0 || imRowIdx >= inputHeight) {
               memset(dstData, 0, outputWidth * sizeof(T));
             } else {
               for (int ow = 0; ow < outputWidth; ow++) {
-                int imColIdx = ow * strideWidth + fw - paddingWidth;
+                int imColIdx =
+                    ow * strideWidth + fw * dilationWidth - paddingWidth;
                 if (imColIdx < 0 || imColIdx >= inputWidth) {
                   dstData[ow] = T(0);
                 } else {
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index 3ba866dcdd..fe44a8bf79 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -202,10 +202,10 @@ void TestIm2ColMobileFunctor() {
                           padding,
                           dilation,
                           dilation,
+                          channels,
                           0,
-                          height,
-                          0,
-                          width);
+                          outputHeight,
+                          outputHeight * outputWidth);
 
                   autotest::TensorCheckEqual(*output1, *output2);
                 }

From 373f8ba036ff60d6781c9ec2717102336de89b0f Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 15 Jan 2018 21:03:13 +0800
Subject: [PATCH 004/102] add v2 dist benchmark vgg

---
 benchmark/cluster/v2/Dockerfile   |   4 +
 benchmark/cluster/v2/pserver.yaml |  64 +++++++++++++++
 benchmark/cluster/v2/reader.py    |  56 +++++++++++++
 benchmark/cluster/v2/trainer.yaml |  63 +++++++++++++++
 benchmark/cluster/v2/vgg16.py     | 125 ++++++++++++++++++++++++++++++
 5 files changed, 312 insertions(+)
 create mode 100644 benchmark/cluster/v2/Dockerfile
 create mode 100644 benchmark/cluster/v2/pserver.yaml
 create mode 100644 benchmark/cluster/v2/reader.py
 create mode 100644 benchmark/cluster/v2/trainer.yaml
 create mode 100644 benchmark/cluster/v2/vgg16.py

diff --git a/benchmark/cluster/v2/Dockerfile b/benchmark/cluster/v2/Dockerfile
new file mode 100644
index 0000000000..c52acd51a2
--- /dev/null
+++ b/benchmark/cluster/v2/Dockerfile
@@ -0,0 +1,4 @@
+FROM registry.baidu.com/paddlepaddle/rawjob
+RUN mkdir -p /workspace && mkdir -p /root/.cache/paddle/dataset/flowers/
+ADD vgg16.py reader.py /workspace/
+ADD 102flowers.tgz imagelabels.mat setid.mat /root/.cache/paddle/dataset/flowers/
diff --git a/benchmark/cluster/v2/pserver.yaml b/benchmark/cluster/v2/pserver.yaml
new file mode 100644
index 0000000000..ed1671bbbd
--- /dev/null
+++ b/benchmark/cluster/v2/pserver.yaml
@@ -0,0 +1,64 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "python train.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        command: ["paddle_k8s", "start_pserver"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
diff --git a/benchmark/cluster/v2/reader.py b/benchmark/cluster/v2/reader.py
new file mode 100644
index 0000000000..a5a2d54841
--- /dev/null
+++ b/benchmark/cluster/v2/reader.py
@@ -0,0 +1,56 @@
+import random
+from paddle.v2.image import load_and_transform
+import paddle.v2 as paddle
+from multiprocessing import cpu_count
+
+
+def train_mapper(sample):
+    '''
+    map image path to type needed by model input layer for the training set
+    '''
+    img, label = sample
+    img = paddle.image.load_image(img)
+    img = paddle.image.simple_transform(img, 256, 224, True)
+    return img.flatten().astype('float32'), label
+
+
+def test_mapper(sample):
+    '''
+    map image path to type needed by model input layer for the test set
+    '''
+    img, label = sample
+    img = paddle.image.load_image(img)
+    img = paddle.image.simple_transform(img, 256, 224, True)
+    return img.flatten().astype('float32'), label
+
+
+def train_reader(train_list, buffered_size=1024):
+    def reader():
+        with open(train_list, 'r') as f:
+            lines = [line.strip() for line in f]
+            for line in lines:
+                img_path, lab = line.strip().split('\t')
+                yield img_path, int(lab)
+
+    return paddle.reader.xmap_readers(train_mapper, reader,
+                                      cpu_count(), buffered_size)
+
+
+def test_reader(test_list, buffered_size=1024):
+    def reader():
+        with open(test_list, 'r') as f:
+            lines = [line.strip() for line in f]
+            for line in lines:
+                img_path, lab = line.strip().split('\t')
+                yield img_path, int(lab)
+
+    return paddle.reader.xmap_readers(test_mapper, reader,
+                                      cpu_count(), buffered_size)
+
+
+if __name__ == '__main__':
+    #for im in train_reader('train.list'):
+    #    print len(im[0])
+    #for im in test_reader('test.list'):
+    #    print len(im[0])
+    paddle.dataset.flowers.train()
diff --git a/benchmark/cluster/v2/trainer.yaml b/benchmark/cluster/v2/trainer.yaml
new file mode 100644
index 0000000000..33c95df365
--- /dev/null
+++ b/benchmark/cluster/v2/trainer.yaml
@@ -0,0 +1,63 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_trainer", "v2"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "cd /workspace && python /workspace/vgg16.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
diff --git a/benchmark/cluster/v2/vgg16.py b/benchmark/cluster/v2/vgg16.py
new file mode 100644
index 0000000000..699fc07628
--- /dev/null
+++ b/benchmark/cluster/v2/vgg16.py
@@ -0,0 +1,125 @@
+import gzip
+
+import paddle.v2.dataset.flowers as flowers
+import paddle.v2 as paddle
+import reader
+
+DATA_DIM = 3 * 224 * 224  # Use 3 * 331 * 331 or 3 * 299 * 299 for Inception-ResNet-v2.
+CLASS_DIM = 102
+BATCH_SIZE = 128
+
+
+def vgg(input, nums, class_dim):
+    def conv_block(input, num_filter, groups, num_channels=None):
+        return paddle.networks.img_conv_group(
+            input=input,
+            num_channels=num_channels,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            pool_type=paddle.pooling.Max())
+
+    assert len(nums) == 5
+    # the channel of input feature is 3
+    conv1 = conv_block(input, 64, nums[0], 3)
+    conv2 = conv_block(conv1, 128, nums[1])
+    conv3 = conv_block(conv2, 256, nums[2])
+    conv4 = conv_block(conv3, 512, nums[3])
+    conv5 = conv_block(conv4, 512, nums[4])
+
+    fc_dim = 4096
+    fc1 = paddle.layer.fc(input=conv5,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    fc2 = paddle.layer.fc(input=fc1,
+                          size=fc_dim,
+                          act=paddle.activation.Relu(),
+                          layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    out = paddle.layer.fc(input=fc2,
+                          size=class_dim,
+                          act=paddle.activation.Softmax())
+    return out
+
+
+def vgg13(input, class_dim):
+    nums = [2, 2, 2, 2, 2]
+    return vgg(input, nums, class_dim)
+
+
+def vgg16(input, class_dim):
+    nums = [2, 2, 3, 3, 3]
+    return vgg(input, nums, class_dim)
+
+
+def vgg19(input, class_dim):
+    nums = [2, 2, 4, 4, 4]
+    return vgg(input, nums, class_dim)
+
+
+def main():
+    paddle.init(use_gpu=True, trainer_count=1)
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(DATA_DIM))
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(CLASS_DIM))
+
+    extra_layers = None
+    learning_rate = 0.01
+    out = vgg16(image, class_dim=CLASS_DIM)
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # Create optimizer
+    optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 *
+                                                         BATCH_SIZE),
+        learning_rate=learning_rate / BATCH_SIZE,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=128000 * 35,
+        learning_rate_schedule="discexp", )
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            flowers.train(),
+            # To use other data, replace the above line with:
+            # reader.train_reader('train.list'),
+            buf_size=1000),
+        batch_size=BATCH_SIZE)
+    test_reader = paddle.batch(
+        flowers.valid(),
+        # To use other data, replace the above line with:
+        # reader.test_reader('val.list'),
+        batch_size=BATCH_SIZE)
+
+    # Create trainer
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 extra_layers=extra_layers,
+                                 is_local=False)
+
+    # End batch and end pass event handler
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+        if isinstance(event, paddle.event.EndPass):
+            with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
+                trainer.save_parameter_to_tar(f)
+
+            result = trainer.test(reader=test_reader)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    trainer.train(
+        reader=train_reader, num_passes=200, event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()

From bbff57e085675edefc27f6bdc34e8baac5b59a05 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Tue, 16 Jan 2018 08:52:06 +0800
Subject: [PATCH 005/102] update docker file

---
 benchmark/cluster/v2/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/cluster/v2/Dockerfile b/benchmark/cluster/v2/Dockerfile
index c52acd51a2..3377cf0100 100644
--- a/benchmark/cluster/v2/Dockerfile
+++ b/benchmark/cluster/v2/Dockerfile
@@ -1,4 +1,4 @@
-FROM registry.baidu.com/paddlepaddle/rawjob
+FROM paddlepaddle/paddlecloud-job
 RUN mkdir -p /workspace && mkdir -p /root/.cache/paddle/dataset/flowers/
 ADD vgg16.py reader.py /workspace/
 ADD 102flowers.tgz imagelabels.mat setid.mat /root/.cache/paddle/dataset/flowers/

From 9ad149a928e1c9916ffd421bf9e365045108c482 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Tue, 16 Jan 2018 08:54:21 +0800
Subject: [PATCH 006/102] fix copyright check

---
 benchmark/cluster/v2/reader.py | 14 ++++++++++++++
 benchmark/cluster/v2/vgg16.py  | 14 ++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/benchmark/cluster/v2/reader.py b/benchmark/cluster/v2/reader.py
index a5a2d54841..060bf2bda2 100644
--- a/benchmark/cluster/v2/reader.py
+++ b/benchmark/cluster/v2/reader.py
@@ -1,3 +1,17 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
 import random
 from paddle.v2.image import load_and_transform
 import paddle.v2 as paddle
diff --git a/benchmark/cluster/v2/vgg16.py b/benchmark/cluster/v2/vgg16.py
index 699fc07628..dc9573bd79 100644
--- a/benchmark/cluster/v2/vgg16.py
+++ b/benchmark/cluster/v2/vgg16.py
@@ -1,3 +1,17 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+
 import gzip
 
 import paddle.v2.dataset.flowers as flowers

From 311d159e11a004c11676a47a5e7945dfadc718b5 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Tue, 16 Jan 2018 11:02:14 +0800
Subject: [PATCH 007/102] add copyright for newly merged files

---
 benchmark/cluster/v2/Dockerfile                     |  3 ++-
 benchmark/cluster/v2/vgg16.py                       |  2 +-
 benchmark/tensorflow/image/googlenet_multi_gpu.py   | 13 +++++++++++++
 doc/getstarted/concepts/src/infer.py                | 13 +++++++++++++
 paddle/gserver/layers/MultiBoxLossLayer.h           | 13 +++++++++++++
 .../v2/fluid/tests/test_dynrnn_static_input.py      | 13 +++++++++++++
 6 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/benchmark/cluster/v2/Dockerfile b/benchmark/cluster/v2/Dockerfile
index 3377cf0100..32e68b6150 100644
--- a/benchmark/cluster/v2/Dockerfile
+++ b/benchmark/cluster/v2/Dockerfile
@@ -1,4 +1,5 @@
 FROM paddlepaddle/paddlecloud-job
 RUN mkdir -p /workspace && mkdir -p /root/.cache/paddle/dataset/flowers/
 ADD vgg16.py reader.py /workspace/
-ADD 102flowers.tgz imagelabels.mat setid.mat /root/.cache/paddle/dataset/flowers/
+COPY 102flowers.tgz imagelabels.mat setid.mat /root/.cache/paddle/dataset/flowers/
+
diff --git a/benchmark/cluster/v2/vgg16.py b/benchmark/cluster/v2/vgg16.py
index dc9573bd79..8644a547b3 100644
--- a/benchmark/cluster/v2/vgg16.py
+++ b/benchmark/cluster/v2/vgg16.py
@@ -74,7 +74,7 @@ def vgg19(input, class_dim):
 
 
 def main():
-    paddle.init(use_gpu=True, trainer_count=1)
+    paddle.init(use_gpu=False, trainer_count=1)
     image = paddle.layer.data(
         name="image", type=paddle.data_type.dense_vector(DATA_DIM))
     lbl = paddle.layer.data(
diff --git a/benchmark/tensorflow/image/googlenet_multi_gpu.py b/benchmark/tensorflow/image/googlenet_multi_gpu.py
index 31466faa37..44de3800a8 100644
--- a/benchmark/tensorflow/image/googlenet_multi_gpu.py
+++ b/benchmark/tensorflow/image/googlenet_multi_gpu.py
@@ -1,3 +1,16 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 from six.moves import xrange  # pylint: disable=redefined-builtin
 from datetime import datetime
 import math
diff --git a/doc/getstarted/concepts/src/infer.py b/doc/getstarted/concepts/src/infer.py
index 4cc58dfee0..ee71cd7a9a 100644
--- a/doc/getstarted/concepts/src/infer.py
+++ b/doc/getstarted/concepts/src/infer.py
@@ -1,3 +1,16 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import paddle.v2 as paddle
 import numpy as np
 
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h
index 9935da5644..40df312a25 100644
--- a/paddle/gserver/layers/MultiBoxLossLayer.h
+++ b/paddle/gserver/layers/MultiBoxLossLayer.h
@@ -1,3 +1,16 @@
+//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 /* copyright (c) 2016 paddlepaddle authors. all rights reserve.
 
 licensed under the apache license, version 2.0 (the "license");
diff --git a/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py b/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
index 9b138a6207..d6878f0b6d 100644
--- a/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
+++ b/python/paddle/v2/fluid/tests/test_dynrnn_static_input.py
@@ -1,3 +1,16 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
 import unittest
 import paddle.v2 as paddle
 import paddle.v2.fluid.core as core

From a0ac133987a925df1907f3804ccb3cbc32b763b7 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Tue, 16 Jan 2018 14:58:22 +0800
Subject: [PATCH 008/102] update job

---
 benchmark/cluster/v2/trainer.yaml | 6 ++++--
 benchmark/cluster/v2/vgg16.py     | 4 ++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/benchmark/cluster/v2/trainer.yaml b/benchmark/cluster/v2/trainer.yaml
index 33c95df365..a4958b2278 100644
--- a/benchmark/cluster/v2/trainer.yaml
+++ b/benchmark/cluster/v2/trainer.yaml
@@ -21,6 +21,8 @@ spec:
         env:
         - name: PADDLE_JOB_NAME
           value: vgg16job
+        - name: OMP_NUM_THREADS
+          value: "1"
         - name: TRAINERS
           value: "20"
         - name: PSERVERS
@@ -36,7 +38,7 @@ spec:
         - name: PADDLE_INIT_NICS
           value: "xgbe0"
         - name: PADDLE_INIT_TRAINER_COUNT
-          value: "1"
+          value: "2"
         - name: PADDLE_INIT_PORTS_NUM
           value: "1"
         - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
@@ -44,7 +46,7 @@ spec:
         - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
           value: "20"
         - name: PADDLE_INIT_NUM_PASSES
-          value: "1"
+          value: "2"
         - name: PADDLE_INIT_USE_GPU
           value: "0"
         - name: LD_LIBRARY_PATH
diff --git a/benchmark/cluster/v2/vgg16.py b/benchmark/cluster/v2/vgg16.py
index 8644a547b3..85502c38e4 100644
--- a/benchmark/cluster/v2/vgg16.py
+++ b/benchmark/cluster/v2/vgg16.py
@@ -74,14 +74,14 @@ def vgg19(input, class_dim):
 
 
 def main():
-    paddle.init(use_gpu=False, trainer_count=1)
+    paddle.init(use_gpu=False)
     image = paddle.layer.data(
         name="image", type=paddle.data_type.dense_vector(DATA_DIM))
     lbl = paddle.layer.data(
         name="label", type=paddle.data_type.integer_value(CLASS_DIM))
 
     extra_layers = None
-    learning_rate = 0.01
+    learning_rate = 1e-3
     out = vgg16(image, class_dim=CLASS_DIM)
     cost = paddle.layer.classification_cost(input=out, label=lbl)
 

From b315a408e915e49fe8ffe4cf66bddfc512348e9d Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Tue, 16 Jan 2018 15:42:49 +0800
Subject: [PATCH 009/102] update

---
 benchmark/cluster/v2/vgg16.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/cluster/v2/vgg16.py b/benchmark/cluster/v2/vgg16.py
index 85502c38e4..400dcf1b41 100644
--- a/benchmark/cluster/v2/vgg16.py
+++ b/benchmark/cluster/v2/vgg16.py
@@ -81,7 +81,7 @@ def main():
         name="label", type=paddle.data_type.integer_value(CLASS_DIM))
 
     extra_layers = None
-    learning_rate = 1e-3
+    learning_rate = 1e-3 / 20
     out = vgg16(image, class_dim=CLASS_DIM)
     cost = paddle.layer.classification_cost(input=out, label=lbl)
 

From 9f50195b9d6346e08a59e6656f55b0d22efb3d81 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Fri, 19 Jan 2018 11:15:55 +0800
Subject: [PATCH 010/102] update using cifar10

---
 benchmark/cluster/v2/Dockerfile |  8 ++++----
 benchmark/cluster/v2/reader.py  |  2 +-
 benchmark/cluster/v2/vgg16.py   | 22 +++++++++++++++-------
 3 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/benchmark/cluster/v2/Dockerfile b/benchmark/cluster/v2/Dockerfile
index 32e68b6150..ac85b1a7d0 100644
--- a/benchmark/cluster/v2/Dockerfile
+++ b/benchmark/cluster/v2/Dockerfile
@@ -1,5 +1,5 @@
 FROM paddlepaddle/paddlecloud-job
-RUN mkdir -p /workspace && mkdir -p /root/.cache/paddle/dataset/flowers/
-ADD vgg16.py reader.py /workspace/
-COPY 102flowers.tgz imagelabels.mat setid.mat /root/.cache/paddle/dataset/flowers/
-
+RUN mkdir -p /workspace
+ADD reader.py /workspace/
+RUN python /workspace/reader.py
+ADD vgg16.py /workspace/
diff --git a/benchmark/cluster/v2/reader.py b/benchmark/cluster/v2/reader.py
index 060bf2bda2..16ac2dbcef 100644
--- a/benchmark/cluster/v2/reader.py
+++ b/benchmark/cluster/v2/reader.py
@@ -67,4 +67,4 @@ if __name__ == '__main__':
     #    print len(im[0])
     #for im in train_reader('test.list'):
     #    print len(im[0])
-    paddle.dataset.flowers.train()
+    paddle.dataset.cifar.train10()
diff --git a/benchmark/cluster/v2/vgg16.py b/benchmark/cluster/v2/vgg16.py
index 8644a547b3..9189493276 100644
--- a/benchmark/cluster/v2/vgg16.py
+++ b/benchmark/cluster/v2/vgg16.py
@@ -14,13 +14,15 @@
 
 import gzip
 
-import paddle.v2.dataset.flowers as flowers
+import paddle.v2.dataset.cifar as cifar
 import paddle.v2 as paddle
 import reader
+import time
 
-DATA_DIM = 3 * 224 * 224  # Use 3 * 331 * 331 or 3 * 299 * 299 for Inception-ResNet-v2.
-CLASS_DIM = 102
+DATA_DIM = 3 * 32 * 32
+CLASS_DIM = 10
 BATCH_SIZE = 128
+ts = 0
 
 
 def vgg(input, nums, class_dim):
@@ -74,6 +76,7 @@ def vgg19(input, class_dim):
 
 
 def main():
+    global ts
     paddle.init(use_gpu=False, trainer_count=1)
     image = paddle.layer.data(
         name="image", type=paddle.data_type.dense_vector(DATA_DIM))
@@ -100,13 +103,13 @@ def main():
 
     train_reader = paddle.batch(
         paddle.reader.shuffle(
-            flowers.train(),
+            cifar.train10(),
             # To use other data, replace the above line with:
             # reader.train_reader('train.list'),
             buf_size=1000),
         batch_size=BATCH_SIZE)
     test_reader = paddle.batch(
-        flowers.valid(),
+        cifar.test10(),
         # To use other data, replace the above line with:
         # reader.test_reader('val.list'),
         batch_size=BATCH_SIZE)
@@ -120,10 +123,14 @@ def main():
 
     # End batch and end pass event handler
     def event_handler(event):
+        global ts
+        if isinstance(event, paddle.event.BeginIteration):
+            ts = time.time()
         if isinstance(event, paddle.event.EndIteration):
             if event.batch_id % 1 == 0:
-                print "\nPass %d, Batch %d, Cost %f, %s" % (
-                    event.pass_id, event.batch_id, event.cost, event.metrics)
+                print "\nPass %d, Batch %d, Cost %f, %s, spent: %f" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    time.time() - ts)
         if isinstance(event, paddle.event.EndPass):
             with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
                 trainer.save_parameter_to_tar(f)
@@ -137,3 +144,4 @@ def main():
 
 if __name__ == '__main__':
     main()
+

From c5a14ed4cdbaebb68490a28a914a094b62c35bcc Mon Sep 17 00:00:00 2001
From: wanghaox <wanghaox@gmail.com>
Date: Fri, 19 Jan 2018 11:31:58 +0800
Subject: [PATCH 011/102] add mine_hard_examples operator

---
 paddle/operators/mine_hard_examples_op.cc     | 184 ++++++++++++++++++
 paddle/operators/mine_hard_examples_op.h      | 148 ++++++++++++++
 .../fluid/tests/test_mine_hard_examples_op.py |  99 ++++++++++
 3 files changed, 431 insertions(+)
 create mode 100644 paddle/operators/mine_hard_examples_op.cc
 create mode 100755 paddle/operators/mine_hard_examples_op.h
 create mode 100755 python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py

diff --git a/paddle/operators/mine_hard_examples_op.cc b/paddle/operators/mine_hard_examples_op.cc
new file mode 100644
index 0000000000..75098d0bcd
--- /dev/null
+++ b/paddle/operators/mine_hard_examples_op.cc
@@ -0,0 +1,184 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/mine_hard_examples_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MineHardExamplesOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Shape and attribute validation for mine_hard_examples lives in InferShape.
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
+                   "Input(ClsLoss) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("MatchIndics"),
+        "Input(MatchIndics) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("MatchDis"),
+                   "Input(MatchDis) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("NegIndics"),
+        "Output(NegIndics) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("UpdatedMatchIndics"),
+        "Output(UpdatedMatchIndics) of MineHardExamplesOp should not be null.");
+    // Mandatory inputs; LocLoss is optional and is validated separately below.
+    auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
+    auto idx_dims = ctx->GetInputDim("MatchIndics");
+    auto dis_dims = ctx->GetInputDim("MatchDis");
+    // Every mandatory input is expected to be a rank-2 [N, Np] tensor.
+    PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
+                      "The shape of ClsLoss is [N, Np].");
+    PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
+                      "The shape of MatchIndics is [N, Np].");
+    PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
+                      "The shape of MatchDis is [N, Np].");
+    // LocLoss is dispensable; only validate it when it is actually fed.
+    if (ctx->HasInput("LocLoss")) {
+      auto loc_loss_dims = ctx->GetInputDim("LocLoss");
+      PADDLE_ENFORCE_EQ(loc_loss_dims.size(), 2UL,
+                        "The shape of LocLoss is [N, Np].");
+      PADDLE_ENFORCE_EQ(cls_loss_dims[0], loc_loss_dims[0],
+                        "Batch size of ClsLoss and LocLoss must be the same.");
+      PADDLE_ENFORCE_EQ(
+          cls_loss_dims[1], loc_loss_dims[1],
+          "Prior box number of ClsLoss and LocLoss must be the same.");
+    }
+    // MatchIndics must agree with ClsLoss in both dimensions.
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[0], idx_dims[0],
+        "Batch size of ClsLoss and MatchIndics must be the same.");
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], idx_dims[1],
+        "Prior box number of ClsLoss and MatchIndics must be the same.");
+    // MatchDis must agree with ClsLoss too; compare dis_dims on both axes.
+    PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
+                      "Batch size of ClsLoss and MatchDis must be the same.");
+    PADDLE_ENFORCE_EQ(
+        cls_loss_dims[1], dis_dims[1],
+        "Prior box number of ClsLoss and MatchDis must be the same.");
+    // GetMiningType maps unknown strings to kNone, which is rejected here.
+    auto mining_type =
+        GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));
+
+    PADDLE_ENFORCE_NE(mining_type, MiningType::kNone,
+                      "mining_type must be hard_example or max_negative");
+    // Each mining mode requires its own attributes to be strictly positive.
+    if (mining_type == MiningType::kMaxNegative) {
+      auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
+      auto neg_dis_threshold = ctx->Attrs().Get<float>("neg_dis_threshold");
+      PADDLE_ENFORCE_GT(
+          neg_pos_ratio, 0.0f,
+          "neg_pos_ratio must greater than zero in max_negative mode");
+      PADDLE_ENFORCE_GT(
+          neg_dis_threshold, 0.0f,
+          "neg_dis_threshold must greater than zero in max_negative mode");
+    } else if (mining_type == MiningType::kHardExample) {
+      auto sample_size = ctx->Attrs().Get<int>("sample_size");
+      PADDLE_ENFORCE_GT(
+          sample_size, 0,
+          "sample_size must greater than zero in hard_example mode");
+    }
+    // Same shape as MatchIndics; the NegIndics dim is not set here.
+    ctx->SetOutputDim("UpdatedMatchIndics", idx_dims);
+  }
+  // Kernel dtype follows ClsLoss; the device comes from the execution context.
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
+        ctx.device_context());
+  }
+};
+
+class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MineHardExamplesOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "ClsLoss",
+        "(Tensor, default Tensor<float>), The classification loss with shape "
+        "[N, Np], N is the batch size and Np is the number of prior box.");
+    AddInput("LocLoss",
+             "(Tensor, optional, default Tensor<float>), The localization loss "
+             "with shape [N, Np], N is the batch size and Np is the number of "
+             "prior box.")
+        .AsDispensable();
+    AddInput("MatchIndics",
+             "(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
+             "the batch size and Np is the number of prior box. "
+             "MatchIndics[i][j] equal -1 means box[j] does not match any "
+             "entity, otherwise means Box[j] is matched to row.");
+    AddInput("MatchDis",
+             "(Tensor, default Tensor<float>) Matched distance with shape [N, "
+             "Np], N is the batch size and Np is the number of prior box.");
+    AddAttr<float>("neg_pos_ratio",
+                   "(float) The ratio of the negative box to the positive "
+                   "box. Use only when mining_type is equal to max_negative.")
+        .SetDefault(1.0);
+    AddAttr<float>("neg_dis_threshold",
+                   "(float) The negative box dis value threshold. "
+                   "Use only when mining_type is equal to max_negative.")
+        .SetDefault(0.5);
+    AddAttr<int>("sample_size",
+                 "(int) The max sample size of negative box. Use only when "
+                 "mining_type is equal to hard_example.")
+        .SetDefault(0);
+    AddAttr<std::string>("mining_type",
+                         "(string) The mining algorithm name, the value is "
+                         "hard_example or max_negative.")
+        .SetDefault("max_negative")
+        .InEnum({"hard_example", "max_negative"});
+    // Outputs: mined negative indices (LoD) and the updated match indices.
+    AddOutput("NegIndics",
+              "(LoDTensor) The output of negative example indics, a lod tensor "
+              "with shape [Neg, 1]. The size of lod[0] is batch size, "
+              "and each element is the box index. "
+              "For example, the batch size is 2, the lod is [[0, 1, 2]], "
+              "the sample 0's box 1(MatchIndics[0][1]) is selected, "
+              "and sample 1's box 0 is selected. The output NegIndics is "
+              "[[1], [0]].");
+
+    AddOutput("UpdatedMatchIndics",
+              "(Tensor) The output of updated MatchIndics, a tensor with "
+              "shape [N, Np]. Only update when mining_type is equal to "
+              "hard_example. The input MatchIndics elements will be update to "
+              "-1 when it not in the highest loss list");
+    // Long-form operator description.
+    AddComment(R"DOC(
+Mine hard examples Operator.
+This operator implements hard example mining to select a subset of negative box indics.
+For each image, selects the box with highest losses. subject to the condition that the box cannot have
+an MatchDis > neg_dis_threshold when mining_type is equals max_negative. The selected number is 
+min(sample_size, max_negative_box_number) when mining_type is equals hard_example,
+or min(neg_pos_ratio * positive_box_number, max_negative_box_number) when mining_type is 
+equals max_negative, where the max_negative_box_number is the count of MatchIndics elements with value -1.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(mine_hard_examples, ops::MineHardExamplesOp,
+                             ops::MineHardExamplesOpMaker);
+// The CPU kernel is instantiated for float and double element types.
+REGISTER_OP_CPU_KERNEL(
+    mine_hard_examples,
+    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::MineHardExamplesKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/mine_hard_examples_op.h b/paddle/operators/mine_hard_examples_op.h
new file mode 100755
index 0000000000..0a652a60c5
--- /dev/null
+++ b/paddle/operators/mine_hard_examples_op.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+enum MiningType { kNone = 0, kMaxNegative, kHardExample };  // kNone: unknown
+
+template <typename T>  // T: payload type stored next to the float score
+bool SortScoreDescend(const std::pair<float, T>& pair1,
+                      const std::pair<float, T>& pair2) {
+  return pair2.first < pair1.first;  // true when pair1 has the higher score
+}
+
+inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
+                             const float match_dis,
+                             const float neg_dis_threshold) {
+  // hard_example mode considers every prior box, matched or not.
+  if (mining_type == MiningType::kHardExample) return true;
+  // Any mode other than max_negative (e.g. kNone) selects nothing.
+  if (mining_type != MiningType::kMaxNegative) return false;
+  // max_negative: unmatched prior whose match_dis is below the threshold.
+  const bool unmatched = (match_idx == -1);
+  return unmatched && match_dis < neg_dis_threshold;
+}
+
+// Maps the "mining_type" attribute string to MiningType; any other value
+// yields kNone. Marked inline: a non-inline function defined in a header
+// breaks the one-definition rule once a second source file includes it.
+// Takes a const reference to avoid copying the string on every call.
+inline MiningType GetMiningType(const std::string& str) {
+  if (str == "max_negative") return MiningType::kMaxNegative;
+  if (str == "hard_example") return MiningType::kHardExample;
+  return MiningType::kNone;
+}
+
+// CPU kernel: per sample, ranks eligible prior boxes by loss, keeps the top
+// ones and emits the selected unmatched (negative) box indices as a LoDTensor.
+template <typename DeviceContext, typename T>
+class MineHardExamplesKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
+    auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
+    auto* in_matched_indics = ctx.Input<framework::Tensor>("MatchIndics");
+    auto* in_match_dis = ctx.Input<framework::Tensor>("MatchDis");
+    float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
+    T neg_dis_threshold = static_cast<T>(ctx.Attr<float>("neg_dis_threshold"));
+    int sample_size = ctx.Attr<int>("sample_size");
+    MiningType mining_type =
+        GetMiningType(ctx.Attr<std::string>("mining_type"));
+
+    auto out_neg_indics = ctx.Output<framework::LoDTensor>("NegIndics");
+    auto out_match_indics = ctx.Output<framework::Tensor>("UpdatedMatchIndics");
+    // Start from a copy of the input matches; hard_example mode edits it below.
+    framework::Copy(*in_matched_indics, ctx.GetPlace(), out_match_indics);
+    int batch_size = in_matched_indics->dims()[0];
+    int prior_num = in_matched_indics->dims()[1];
+    auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indics);
+    auto match_indices_et =
+        framework::EigenMatrix<int>::From(*out_match_indics);
+    // Loss/distance tensors hold the kernel element type T, not always float.
+    auto match_dis = framework::EigenMatrix<T>::From(*in_match_dis);
+    auto cls_loss = framework::EigenMatrix<T>::From(*in_cls_loss);
+    // LocLoss is dispensable: never dereference it unless it was actually fed.
+    const T* loc_loss = in_loc_loss ? in_loc_loss->data<T>() : nullptr;
+    using LossIdx = std::pair<T, size_t>;  // (loss, prior box index)
+    std::vector<std::vector<int>> all_neg_indices;
+    int all_neg_num = 0;
+    for (int n = 0; n < batch_size; ++n) {
+      std::vector<LossIdx> loss_idx;
+      int neg_sel = 0;
+      for (int m = 0; m < prior_num; ++m) {
+        if (IsEligibleMining(mining_type, match_indices(n, m), match_dis(n, m),
+                             neg_dis_threshold)) {
+          T loss = cls_loss(n, m);
+          if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
+            loss += loc_loss[n * prior_num + m];  // row-major [N, Np] layout
+          }
+          loss_idx.push_back(std::make_pair(loss, m));
+          ++neg_sel;
+        }
+      }
+      if (mining_type == MiningType::kMaxNegative) {
+        int num_pos = 0;
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) != -1) ++num_pos;
+        }
+        neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
+      } else if (mining_type == MiningType::kHardExample) {
+        neg_sel = std::min(sample_size, neg_sel);
+      }
+      // Highest loss first; compare in T so double losses are not truncated.
+      std::sort(loss_idx.begin(), loss_idx.end(),
+                [](const LossIdx& a, const LossIdx& b) {
+                  return a.first > b.first;
+                });
+      std::set<int> sel_indices;
+      std::vector<int> neg_indices;
+      for (int k = 0; k < neg_sel; ++k) {  // do not shadow the sample index n
+        sel_indices.insert(loss_idx[k].second);
+      }
+      for (int m = 0; m < prior_num; ++m) {
+        if (match_indices(n, m) > -1) {
+          if (mining_type == MiningType::kHardExample &&
+              sel_indices.find(m) == sel_indices.end()) {
+            match_indices_et(n, m) = -1;
+          }
+        } else if (sel_indices.find(m) != sel_indices.end()) {
+          neg_indices.push_back(m);
+        }
+      }
+      all_neg_indices.push_back(neg_indices);
+      all_neg_num += neg_indices.size();
+    }
+
+    framework::LoD out_neg_indics_lod;
+    out_neg_indics_lod.resize(1);
+    int neg_offset = 0;
+    auto neg_data = out_neg_indics->mutable_data<int>(
+        framework::make_ddim({all_neg_num, 1}), ctx.GetPlace());
+    out_neg_indics_lod[0].push_back(neg_offset);
+    for (const auto& neg_indices : all_neg_indices) {
+      for (auto neg_idx : neg_indices) {
+        neg_data[neg_offset++] = neg_idx;
+      }
+      out_neg_indics_lod[0].push_back(neg_offset);
+    }
+    out_neg_indics->set_lod(out_neg_indics_lod);
+  }
+};
+}  // namespace operators
+
+}  // namespace paddle
diff --git a/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
new file mode 100755
index 0000000000..e7dd04740a
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
@@ -0,0 +1,99 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+class TestMineHardExamplesOp(OpTest):
+    def set_data(self):
+        self.init_test_data()
+        self.inputs = {
+            'ClsLoss': self.cls_loss,
+            'LocLoss': self.loc_loss,
+            'MatchIndics': self.match_indices,
+            'MatchDis': self.match_dis
+        }
+        # Keys must be attribute names registered by the op maker.
+        self.attrs = {
+            'neg_pos_ratio': self.neg_pos_ratio,
+            'neg_dis_threshold': self.neg_overlap,
+            'sample_size': self.sample_size,
+            'mining_type': self.mining_type
+        }
+
+        self.outputs = {
+            'NegIndics': (self.neg_indices, self.neg_indices_lod),
+            'UpdatedMatchIndics': self.updated_match_indices
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        return  # the op is registered without a gradient; nothing to check
+
+    def setUp(self):
+        self.op_type = "mine_hard_examples"
+        self.set_data()
+
+    def init_test_data(self):
+        self.neg_pos_ratio = 1.0
+        self.neg_overlap = 0.5
+        self.sample_size = 0
+        self.mining_type = "max_negative"
+        self.cls_loss = np.array([[0.1, 0.1, 0.3],
+                                  [0.3, 0.1, 0.1]]).astype('float32')
+        # Only added to the ranking loss by the kernel in hard_example mode.
+        self.loc_loss = np.array([[0.1, 0.2, 0.3],
+                                  [0.3, 0.4, 0.1]]).astype('float32')
+        # Compared against the 0.5 threshold for boxes whose match index is -1.
+        self.match_dis = np.array([[0.2, 0.4, 0.8],
+                                   [0.1, 0.9, 0.3]]).astype('float32')
+        # -1 marks a prior box that is not matched to any entity.
+        self.match_indices = np.array([[0, -1, -1],
+                                       [-1, 0, -1]]).astype('int32')
+        # max_negative mode leaves the match indices untouched.
+        self.updated_match_indices = self.match_indices
+        # One negative per sample: box 1 for sample 0, box 0 for sample 1.
+        self.neg_indices_lod = [[0, 1, 2]]
+        self.neg_indices = np.array([[1], [0]]).astype('int32')
+
+
+class TestMineHardExamplesOpHardExample(TestMineHardExamplesOp):
+    def init_test_data(self):
+        super(TestMineHardExamplesOpHardExample, self).init_test_data()
+        self.sample_size = 2
+        self.mining_type = "hard_example"
+        # Per-box ranking loss in this mode is ClsLoss + LocLoss.
+        self.cls_loss = np.array(
+            [[0.5, 0.1, 0.3], [0.3, 0.1, 0.1]], dtype='float32')
+
+        self.loc_loss = np.array(
+            [[0.2, 0.2, 0.3], [0.3, 0.1, 0.2]], dtype='float32')
+
+        self.match_indices = np.array([[0, -1, -1], [-1, 0, -1]], dtype='int32')
+        # Sample 1's matched box 1 is outside the top-2 losses, so it becomes -1.
+        self.updated_match_indices = np.array(
+            [[0, -1, -1], [-1, -1, -1]], dtype='int32')
+
+        # LoD offsets [0, 1, 3]: one negative for sample 0, two for sample 1.
+        self.neg_indices_lod = [[0, 1, 3]]
+        self.neg_indices = np.array([[2], [0], [2]], dtype='int32')
+
+
+if __name__ == '__main__':
+    unittest.main()

From 541b42e6fb7a5f4adaaad96251659e3bc9591b9d Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Fri, 19 Jan 2018 13:56:42 +0800
Subject: [PATCH 012/102] fix style

---
 benchmark/cluster/v2/trainer.yaml | 2 +-
 benchmark/cluster/v2/vgg16.py     | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/benchmark/cluster/v2/trainer.yaml b/benchmark/cluster/v2/trainer.yaml
index 33c95df365..75fffc64b0 100644
--- a/benchmark/cluster/v2/trainer.yaml
+++ b/benchmark/cluster/v2/trainer.yaml
@@ -28,7 +28,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "cd /workspace && python /workspace/vgg16.py"
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
diff --git a/benchmark/cluster/v2/vgg16.py b/benchmark/cluster/v2/vgg16.py
index 9189493276..59e3997d78 100644
--- a/benchmark/cluster/v2/vgg16.py
+++ b/benchmark/cluster/v2/vgg16.py
@@ -144,4 +144,3 @@ def main():
 
 if __name__ == '__main__':
     main()
-

From d3905fbc1e53dcb8ef5481860e44e9ab4a704e5d Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Fri, 19 Jan 2018 19:02:02 +0800
Subject: [PATCH 013/102] add fluid vgg16 dist test

---
 benchmark/cluster/vgg16/fluid/Dockerfile      |  12 +
 benchmark/cluster/vgg16/fluid/k8s_tools.py    |  78 ++++++
 benchmark/cluster/vgg16/fluid/paddle_k8s      | 200 ++++++++++++++
 benchmark/cluster/vgg16/fluid/pserver.yaml    |  72 +++++
 benchmark/cluster/vgg16/fluid/reader.py       |   2 +
 benchmark/cluster/vgg16/fluid/trainer.yaml    |  69 +++++
 benchmark/cluster/vgg16/fluid/vgg16.py        | 248 ++++++++++++++++++
 benchmark/cluster/{ => vgg16}/v2/Dockerfile   |   2 +
 benchmark/cluster/{ => vgg16}/v2/pserver.yaml |   0
 benchmark/cluster/{ => vgg16}/v2/reader.py    |   0
 benchmark/cluster/{ => vgg16}/v2/trainer.yaml |   0
 benchmark/cluster/{ => vgg16}/v2/vgg16.py     |   0
 12 files changed, 683 insertions(+)
 create mode 100644 benchmark/cluster/vgg16/fluid/Dockerfile
 create mode 100644 benchmark/cluster/vgg16/fluid/k8s_tools.py
 create mode 100755 benchmark/cluster/vgg16/fluid/paddle_k8s
 create mode 100644 benchmark/cluster/vgg16/fluid/pserver.yaml
 create mode 100644 benchmark/cluster/vgg16/fluid/reader.py
 create mode 100644 benchmark/cluster/vgg16/fluid/trainer.yaml
 create mode 100644 benchmark/cluster/vgg16/fluid/vgg16.py
 rename benchmark/cluster/{ => vgg16}/v2/Dockerfile (81%)
 rename benchmark/cluster/{ => vgg16}/v2/pserver.yaml (100%)
 rename benchmark/cluster/{ => vgg16}/v2/reader.py (100%)
 rename benchmark/cluster/{ => vgg16}/v2/trainer.yaml (100%)
 rename benchmark/cluster/{ => vgg16}/v2/vgg16.py (100%)

diff --git a/benchmark/cluster/vgg16/fluid/Dockerfile b/benchmark/cluster/vgg16/fluid/Dockerfile
new file mode 100644
index 0000000000..77cd17f2b9
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid/Dockerfile
@@ -0,0 +1,12 @@
+#FROM paddlepaddle/paddlecloud-job
+#RUN mkdir -p /workspace
+#ADD reader.py /workspace/
+#RUN python /workspace/reader.py
+FROM python:2.7.14
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl
+ADD paddle_k8s /usr/bin
+ADD k8s_tools.py /root
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev 
+
+ADD vgg16.py /workspace/
diff --git a/benchmark/cluster/vgg16/fluid/k8s_tools.py b/benchmark/cluster/vgg16/fluid/k8s_tools.py
new file mode 100644
index 0000000000..8a64dbd361
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid/k8s_tools.py
@@ -0,0 +1,78 @@
+#!/bin/env python
+import os
+import sys
+import time
+import socket
+from kubernetes import client, config
+PADDLE_JOB_NAME = os.getenv("PADDLE_JOB_NAME")
+NAMESPACE = os.getenv("NAMESPACE")
+PORT = os.getenv("PSERVER_PORT")
+if os.getenv("KUBERNETES_SERVICE_HOST", None):
+    config.load_incluster_config()
+else:
+    config.load_kube_config()
+v1 = client.CoreV1Api()
+
+
+def fetch_pods_info(label_selector):
+    """Return a (phase, pod_ip) tuple for each pod matching label_selector."""
+    # Only pods in NAMESPACE (read from the environment at import) are listed.
+    response = v1.list_namespaced_pod(
+        namespace=NAMESPACE, pretty=True, label_selector=label_selector)
+    return [(pod.status.phase, pod.status.pod_ip)
+            for pod in response.items]
+
+
+def wait_pods_running(label_selector, desired):
+    print "label selector: %s, desired: %s" % (label_selector, desired)
+    while True:
+        count = count_pods_by_phase(label_selector, 'Running')
+        # NOTE: pods may be scaled up, so more than `desired` is also accepted.
+        if count >= int(desired):
+            break
+        print 'current cnt: %d sleep for 5 seconds...' % count
+        time.sleep(5)
+
+def count_pods_by_phase(label_selector, phase):
+    pod_list = fetch_pods_info(label_selector)
+    filtered_pod_list = filter(lambda x: x[0] == phase, pod_list)
+    return len(filtered_pod_list)
+
+
+def fetch_pserver_ips():
+    label_selector = "paddle-job-pserver=%s" % PADDLE_JOB_NAME
+    pod_list = fetch_pods_info(label_selector)
+    pserver_ips = [item[1] for item in pod_list]
+    return ",".join(pserver_ips)
+
+def fetch_master_ip():
+    label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
+    pod_list = fetch_pods_info(label_selector)
+    master_ips = [item[1] for item in pod_list]
+    return master_ips[0]
+
+def fetch_trainer_id():
+    label_selector = "paddle-job=%s" % PADDLE_JOB_NAME
+    pod_list = fetch_pods_info(label_selector)
+    trainer_ips = sorted(item[1] for item in pod_list)
+    local_ip = socket.gethostbyname(socket.gethostname())
+    # The id is the rank of this host's IP among the sorted pod IPs, or None.
+    for rank, ip in enumerate(trainer_ips):
+        if ip == local_ip:
+            return rank
+    return None
+
+
+if __name__ == "__main__":
+    command = sys.argv[1]
+    if command == "fetch_pserver_ips":
+        print fetch_pserver_ips()
+    elif command == "fetch_trainer_id":
+        print fetch_trainer_id()
+    elif command == "fetch_master_ip":
+        print fetch_master_ip()
+    elif command == "count_pods_by_phase":
+        print count_pods_by_phase(sys.argv[2], sys.argv[3])
+    elif command == "wait_pods_running":
+        wait_pods_running(sys.argv[2], sys.argv[3])
+
diff --git a/benchmark/cluster/vgg16/fluid/paddle_k8s b/benchmark/cluster/vgg16/fluid/paddle_k8s
new file mode 100755
index 0000000000..8f1c5db717
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid/paddle_k8s
@@ -0,0 +1,200 @@
+#!/bin/bash
+start_pserver() {
+    # Launch `paddle pserver`, configured from the PADDLE_INIT_* variables.
+    stdbuf -oL paddle pserver --use_gpu=0 \
+      --port="$PADDLE_INIT_PORT" \
+      --ports_num="$PADDLE_INIT_PORTS_NUM" \
+      --ports_num_for_sparse="$PADDLE_INIT_PORTS_NUM_FOR_SPARSE" \
+      --nics="$PADDLE_INIT_NICS" \
+      --comment=paddle_process_k8s \
+      --num_gradient_servers="$PADDLE_INIT_NUM_GRADIENT_SERVERS"
+}
+
+start_new_pserver() {
+  # Block until the master pod runs, then use its IP as the etcd endpoint.
+  stdbuf -oL python /root/k8s_tools.py wait_pods_running "paddle-job-master=${PADDLE_JOB_NAME}" 1
+  export MASTER_IP="$(python /root/k8s_tools.py fetch_master_ip)"
+  stdbuf -oL /usr/bin/pserver -port="$PADDLE_INIT_PORT" \
+    -num-pservers="$PSERVERS" \
+    -log-level=debug \
+    -etcd-endpoint="http://$MASTER_IP:2379"
+}
+
+start_master() {
+  # NOTE(review): "-task-timout-dur" is kept as spelled; confirm against the master binary.
+  stdbuf -oL /usr/bin/master -port=8080 \
+    -chunk-per-task=1 \
+    -task-timout-dur=16s \
+    -endpoints=http://127.0.0.1:2379
+}
+
+check_failed_cnt() {  # $1: largest tolerated number of Failed trainer pods
+  max_failed=$1
+  failed_count=$(python /root/k8s_tools.py count_pods_by_phase "paddle-job=${PADDLE_JOB_NAME}" Failed)
+  if [ "${failed_count:-0}" -gt "$max_failed" ]; then  # no output counts as 0
+    stdbuf -oL echo "Failed trainer count beyond the threshold: $max_failed"
+    echo "Failed trainer count beyond the threshold: " $max_failed > /dev/termination-log
+    exit 0  # NOTE(review): exits 0 even though this is a failure path - confirm intended
+  fi
+}
+
+# Report the job's exit status $1: write a short reason to
+# /dev/termination-log for the statuses listed below, then exit with $1.
+check_trainer_ret() {
+  ret=$1
+  stdbuf -oL echo "job returned $ret...setting pod return message..."
+  stdbuf -oL echo "==============================="
+  # 128 + signal number: 134 = SIGABRT, 136 = SIGFPE, 139 = SIGSEGV.
+  case "$ret" in
+    136) echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log ;;
+    139) echo "Segmentation Fault" > /dev/termination-log ;;
+    1)   echo "General Error" > /dev/termination-log ;;
+    134) echo "Program Abort" > /dev/termination-log ;;
+  esac
+  # Any other status (including 0) leaves the termination log untouched.
+  stdbuf -oL echo "termination log written..."
+  exit $ret
+}
+
+start_fluid_process() {
+  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
+  if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
+    check_failed_cnt ${TRAINERS}
+    sleep 5
+    stdbuf -oL python /root/k8s_tools.py wait_pods_running  paddle-job-master=${PADDLE_JOB_NAME} 1
+    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
+  fi
+  export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
+  stdbuf -oL sh -c "${ENTRY}"
+  check_trainer_ret $?
+}
+
+start_new_trainer() {  # wait for pservers and master, then run ${ENTRY}
+  # FIXME(Yancey1989): use command-line interface to configure the max failed count
+  check_failed_cnt "${TRAINERS}"
+  stdbuf -oL python /root/k8s_tools.py wait_pods_running "paddle-job-pserver=${PADDLE_JOB_NAME}" "${PSERVERS}"
+  sleep 5
+  stdbuf -oL python /root/k8s_tools.py wait_pods_running "paddle-job-master=${PADDLE_JOB_NAME}" 1
+  export MASTER_IP="$(python /root/k8s_tools.py fetch_master_ip)"
+  export ETCD_IP="$MASTER_IP"
+
+  # NOTE: $TRAINER_PACKAGE may be large, so it is used in place, not copied.
+  export PYTHONPATH="$TRAINER_PACKAGE:$PYTHONPATH"
+  cd "$TRAINER_PACKAGE"
+  # NOTE(review): $1 ("version") is empty when called from the command dispatch.
+  stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
+  $PADDLE_INIT_NUM_GRADIENT_SERVERS, "version: " $1
+
+  stdbuf -oL sh -c "${ENTRY}"
+  check_trainer_ret $?
+}
+
+start_trainer() {
+    # Run a trainer for the API version in $1; v1 and v2 tolerate no failed trainer.
+    check_failed_cnt 0
+    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
+    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job=${PADDLE_JOB_NAME} ${TRAINERS}
+
+    export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
+    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
+    stdbuf -oL echo $PADDLE_INIT_TRAINER_ID > /trainer_id
+    # FIXME: /trainer_count simply holds PADDLE_INIT_NUM_GRADIENT_SERVERS
+    stdbuf -oL echo $PADDLE_INIT_NUM_GRADIENT_SERVERS > /trainer_count
+
+    # NOTE: $TRAINER_PACKAGE may be large, so it is used in place, not copied.
+    export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
+    cd $TRAINER_PACKAGE
+
+    stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
+    $PADDLE_INIT_NUM_GRADIENT_SERVERS, "trainer_id: " $PADDLE_INIT_TRAINER_ID, \
+    "version: " $1
+
+    # FIXME: once the new Golang PServer is used, wait for it through a Kubernetes
+    # healthz check; for now just sleep 20 seconds until the PServer is ready.
+    sleep 20
+
+    case "$1" in
+      "v1")
+        FILE_COUNT=$(wc -l $TRAIN_LIST | awk '{print $1}')
+        if [ $FILE_COUNT -le $PADDLE_INIT_NUM_GRADIENT_SERVERS ]; then
+          echo "file count less than trainers"
+          check_trainer_ret 0
+        fi
+        let lines_per_node="$FILE_COUNT / ($PADDLE_INIT_NUM_GRADIENT_SERVERS + 1)"
+        echo "spliting file to" $lines_per_node
+        cp $TRAIN_LIST /
+        cd /
+        split -l $lines_per_node -d -a 3 $TRAIN_LIST train.list
+        CURRENT_LIST=$(printf "train.list%03d" $PADDLE_INIT_TRAINER_ID)
+        # paddle v1 always reads /train.list on every node, so this node's shard is copied there.
+        echo "File for current node ${CURRENT_LIST}"
+        sleep 10
+        cp $CURRENT_LIST train.list
+
+        cd $TRAINER_PACKAGE
+
+        stdbuf -oL  paddle train \
+          --port=$PADDLE_INIT_PORT \
+          --nics=$PADDLE_INIT_NICS \
+          --ports_num=$PADDLE_INIT_PORTS_NUM \
+          --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
+          --num_passes=$PADDLE_INIT_NUM_PASSES \
+          --trainer_count=$PADDLE_INIT_TRAINER_COUNT \
+          --saving_period=1 \
+          --log_period=20 \
+          --local=0 \
+          --rdma_tcp=tcp \
+          --config=$TOPOLOGY \
+          --use_gpu=$PADDLE_INIT_USE_GPU \
+          --trainer_id=$PADDLE_INIT_TRAINER_ID \
+          --save_dir=$OUTPUT \
+          --pservers=$PADDLE_INIT_PSERVERS \
+          --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
+        # The paddle v1 API tolerates no failed trainer: exit with the training status.
+        check_trainer_ret $? 
+        ;;
+      "v2")
+        stdbuf -oL sh -c "${ENTRY}"
+        # The paddle v2 API tolerates no failed trainer: exit with the entry's status.
+        check_trainer_ret $? 
+        ;;
+      *)
+        ;;
+    esac
+}
+
+usage() {  # print the command summary
+    echo "usage: paddle_k8s [<args>]:"
+    echo "  start_trainer  [v1|v2]    Start a trainer process with v1 or v2 API"
+    echo "  start_pserver             Start a pserver process"
+    echo "  start_new_pserver         Start a new pserver process"
+    echo "  start_new_trainer         Start a new trainer process"
+}
+
+# Entry point: run the function selected by the first command-line argument.
+# start_master and start_fluid are accepted here although usage() omits them;
+# --help and any unrecognised argument print the usage text.
+case "$1" in
+    start_fluid)
+        start_fluid_process
+        ;;
+    start_master)
+        start_master
+        ;;
+    start_pserver)
+        start_pserver
+        ;;
+    start_new_pserver)
+        start_new_pserver
+        ;;
+    start_trainer)
+        start_trainer $2
+        ;;
+    start_new_trainer)
+        start_new_trainer
+        ;;
+    --help | *)
+        usage
+        ;;
+esac
+
diff --git a/benchmark/cluster/vgg16/fluid/pserver.yaml b/benchmark/cluster/vgg16/fluid/pserver.yaml
new file mode 100644
index 0000000000..47d2380d2e
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid/pserver.yaml
@@ -0,0 +1,72 @@
+apiVersion: extensions/v1beta1
+kind: ReplicaSet
+metadata:
+  name: vgg16job-pserver
+spec:
+  replicas: 10
+  template:
+    metadata:
+      labels:
+        paddle-job-pserver: vgg16job
+    spec:
+      hostNetwork: true
+      imagePullSecrets:
+      - name: job-registry-secret
+      containers:
+      - name: pserver
+        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
+        imagePullPolicy: Always
+        ports:
+        - name: jobport-30236
+          containerPort: 30236
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: MKL_NUM_THREADS
+          value: "1"
+        - name: TRAINING_ROLE
+          value: "PSERVER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        command: ["paddle_k8s", "start_fluid"]
+        resources:
+          requests:
+            memory: 10Gi
+            cpu: 4
+          limits:
+            memory: 10Gi
+            cpu: 4
diff --git a/benchmark/cluster/vgg16/fluid/reader.py b/benchmark/cluster/vgg16/fluid/reader.py
new file mode 100644
index 0000000000..c5161ddea2
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid/reader.py
@@ -0,0 +1,2 @@
+import paddle.v2 as paddle
+paddle.dataset.cifar.train10()
diff --git a/benchmark/cluster/vgg16/fluid/trainer.yaml b/benchmark/cluster/vgg16/fluid/trainer.yaml
new file mode 100644
index 0000000000..bada190764
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid/trainer.yaml
@@ -0,0 +1,69 @@
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: vgg16job-trainer
+spec:
+  parallelism: 20
+  completions: 20
+  template:
+    metadata:
+      labels:
+        paddle-job: vgg16job
+    spec:
+      imagePullSecrets:
+        - name: job-registry-secret
+      hostNetwork: true
+      containers:
+      - name: trainer
+        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
+        imagePullPolicy: Always
+        command: ["paddle_k8s", "start_trainer", "v2"]
+        env:
+        - name: PADDLE_JOB_NAME
+          value: vgg16job
+        - name: TRAINING_ROLE
+          value: "TRAINER"
+        - name: TRAINERS
+          value: "20"
+        - name: PSERVERS
+          value: "10"
+        - name: TOPOLOGY
+          value: ""
+        - name: ENTRY
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
+        - name: TRAINER_PACKAGE
+          value: "/workspace"
+        - name: PADDLE_INIT_PORT
+          value: "30236"
+        - name: PADDLE_INIT_NICS
+          value: "xgbe0"
+        - name: PADDLE_INIT_TRAINER_COUNT
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM
+          value: "1"
+        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
+          value: "1"
+        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
+          value: "20"
+        - name: PADDLE_INIT_NUM_PASSES
+          value: "1"
+        - name: PADDLE_INIT_USE_GPU
+          value: "0"
+        - name: LD_LIBRARY_PATH
+          value: "/usr/local/nvidia/lib64"
+        - name: NAMESPACE
+          valueFrom:
+            fieldRef:
+              fieldPath: "metadata.namespace"
+        - name: POD_IP
+          valueFrom:
+            fieldRef:
+              fieldPath: "status.podIP"
+        resources:
+          requests:
+            memory: 40Gi
+            cpu: 2
+          limits:
+            memory: 40Gi
+            cpu: 2
+      restartPolicy: Never
diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/fluid/vgg16.py
new file mode 100644
index 0000000000..0595a28784
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid/vgg16.py
@@ -0,0 +1,248 @@
+"""VGG16 benchmark in Fluid"""
+from __future__ import print_function
+
+import sys
+import time
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
+import argparse
+import functools
+import os
+
+def str2bool(v):
+    """Map a yes/no style string to a bool (used as an argparse `type`)."""
+    if v.lower() in ('yes', 'true', 't', 'y', '1'):
+        return True
+    if v.lower() not in ('no', 'false', 'f', 'n', '0'):
+        raise argparse.ArgumentTypeError('Boolean value expected.')
+    return False
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    '--batch_size', type=int, default=128, help="Batch size for training.")
+parser.add_argument(
+    '--learning_rate',
+    type=float,
+    default=1e-3,
+    help="Learning rate for training.")
+parser.add_argument('--num_passes', type=int, default=50, help="No. of passes.")
+parser.add_argument(
+    '--device',
+    type=str,
+    default='CPU',
+    choices=['CPU', 'GPU'],
+    help="The device type.")
+parser.add_argument(
+    '--data_format',
+    type=str,
+    default='NCHW',
+    choices=['NCHW', 'NHWC'],
+    help='The data order, now only support NCHW.')
+parser.add_argument(
+    '--data_set',
+    type=str,
+    default='cifar10',
+    choices=['cifar10', 'flowers'],
+    help='Optional dataset for benchmark.')
+parser.add_argument(
+    '--local',
+    type=str2bool,
+    default=True,
+    help='Whether to run as local mode.')
+args = parser.parse_args()
+
+
+def vgg16_bn_drop(input):
+    """Stack five conv groups, then dropout/fc/batch-norm/dropout/fc."""
+    def conv_block(data, num_filter, groups, dropouts):
+        return fluid.nets.img_conv_group(
+            input=data,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act='relu',
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type='max')
+
+    # (filter count, per-convolution dropout rates) of each group, in order.
+    group_specs = [(64, [0.3, 0]), (128, [0.4, 0]), (256, [0.4, 0.4, 0]),
+                   (512, [0.4, 0.4, 0]), (512, [0.4, 0.4, 0])]
+    features = input
+    for num_filter, dropouts in group_specs:
+        features = conv_block(features, num_filter, len(dropouts), dropouts)
+    hidden = fluid.layers.dropout(x=features, dropout_prob=0.5)
+    hidden = fluid.layers.fc(input=hidden, size=512, act=None)
+    hidden = fluid.layers.batch_norm(input=hidden, act='relu')
+    hidden = fluid.layers.dropout(x=hidden, dropout_prob=0.5)
+    return fluid.layers.fc(input=hidden, size=512, act=None)
+
+
+def main():
+    """Build the VGG16 program and run it locally or as a pserver/trainer."""
+    if args.data_set == "cifar10":
+        classdim = 10
+        if args.data_format == 'NCHW':
+            data_shape = [3, 32, 32]
+        else:
+            data_shape = [32, 32, 3]
+    else:
+        classdim = 102
+        if args.data_format == 'NCHW':
+            data_shape = [3, 224, 224]
+        else:
+            data_shape = [224, 224, 3]
+
+    # Input data
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    # Train program
+    net = vgg16_bn_drop(images)
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    # Evaluator
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+    # inference program
+    inference_program = fluid.default_main_program().clone()
+    with fluid.program_guard(inference_program):
+        test_target = accuracy.metrics + accuracy.states
+        inference_program = fluid.io.get_inference_program(test_target)
+
+    # Optimization
+    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
+    optimize_ops, params_grads = optimizer.minimize(avg_cost)
+
+    # Initialize executor
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    exe = fluid.Executor(place)
+
+    # Run the inference program over test_reader() and return accuracy.eval(exe).
+    def test(exe):
+        accuracy.reset(exe)
+        for batch_id, data in enumerate(test_reader()):
+            img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+                                    data)).astype("float32")
+            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+            y_data = y_data.reshape([-1, 1])
+
+            exe.run(inference_program,
+                    feed={"pixel": img_data,
+                          "label": y_data})
+
+        return accuracy.eval(exe)
+
+    def train_loop(exe, trainer_prog):
+        iters = 0
+        for pass_id in range(args.num_passes):
+            # train
+            start_time = time.time()
+            num_samples = 0
+            accuracy.reset(exe)
+            for batch_id, data in enumerate(train_reader()):
+                img_data = np.array(map(lambda x: x[0].reshape(data_shape),
+                                        data)).astype("float32")
+                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                y_data = y_data.reshape([-1, 1])
+
+                loss, acc = exe.run(trainer_prog,
+                                    feed={"pixel": img_data,
+                                        "label": y_data},
+                                    fetch_list=[avg_cost] + accuracy.metrics)
+                iters += 1
+                num_samples += len(data)
+                print(
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
+                    (pass_id, iters, loss, acc)
+                )  # The accuracy is the accumulation of batches, but not the current batch.
+
+            pass_elapsed = time.time() - start_time
+            pass_train_acc = accuracy.eval(exe)
+            pass_test_acc = test(exe)
+            print(
+                "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
+                % (pass_id, num_samples / pass_elapsed, pass_train_acc,
+                pass_test_acc))
+
+    if args.local:
+        # Parameter initialization
+        exe.run(fluid.default_startup_program())
+
+        # data reader
+        train_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.cifar.train10()
+                if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+                buf_size=5120),
+            batch_size=args.batch_size)
+        test_reader = paddle.batch(
+            paddle.dataset.cifar.test10()
+            if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+            batch_size=args.batch_size)
+        train_loop(exe, fluid.default_main_program())
+    else:
+        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # all pserver endpoints
+        eplist = []
+        for ip in pserver_ips.split(","):
+            eplist.append(':'.join([ip, "6174"]))
+        pserver_endpoints = ",".join(eplist)
+        print("pserver endpoints: ", pserver_endpoints)
+        trainers = int(os.getenv("TRAINERS"))  # total trainer count
+        pod_ip = os.getenv("POD_IP")  # this pod's IP; required only for a pserver
+        training_role = os.getenv("TRAINING_ROLE", "TRAINER")  # trainer/pserver
+        t = fluid.DistributeTranspiler()
+        t.transpile(
+            optimize_ops, params_grads, pservers=pserver_endpoints, trainers=trainers)
+
+        if training_role == "PSERVER":
+            if not pod_ip:
+                print("need env POD_IP")
+                exit(1)
+            current_endpoint = pod_ip + ":6174"  # current pserver endpoint
+            pserver_prog = t.get_pserver_program(current_endpoint)
+            pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+            print("starting server side startup")
+            exe.run(pserver_startup)
+            print("starting parameter server...")
+            exe.run(pserver_prog)
+        elif training_role == "TRAINER":
+            # Parameter initialization
+            exe.run(fluid.default_startup_program())
+
+            # data reader
+            train_reader = paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.cifar.train10()
+                    if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+                    buf_size=5120),
+                batch_size=args.batch_size)
+            test_reader = paddle.batch(
+                paddle.dataset.cifar.test10()
+                if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+                batch_size=args.batch_size)
+
+            trainer_prog = t.get_trainer_program()
+            feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
+            # TODO(typhoonzero): change trainer startup program to fetch parameters from pserver
+            exe.run(fluid.default_startup_program())
+            train_loop(exe, trainer_prog)
+        else:
+            print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
+
+
+def print_arguments():
+    """Print the parsed command-line arguments, sorted by name."""
+    print('-----------  Configuration Arguments -----------')
+    print('\n'.join('%s: %s' % kv for kv in sorted(vars(args).iteritems())))
+    print('------------------------------------------------')
+
+
+if __name__ == "__main__":
+    print_arguments()
+    main()
diff --git a/benchmark/cluster/v2/Dockerfile b/benchmark/cluster/vgg16/v2/Dockerfile
similarity index 81%
rename from benchmark/cluster/v2/Dockerfile
rename to benchmark/cluster/vgg16/v2/Dockerfile
index ac85b1a7d0..5f129a8e32 100644
--- a/benchmark/cluster/v2/Dockerfile
+++ b/benchmark/cluster/vgg16/v2/Dockerfile
@@ -3,3 +3,5 @@ RUN mkdir -p /workspace
 ADD reader.py /workspace/
 RUN python /workspace/reader.py
 ADD vgg16.py /workspace/
+
+ADD vgg16_fluid.py /workspace
diff --git a/benchmark/cluster/v2/pserver.yaml b/benchmark/cluster/vgg16/v2/pserver.yaml
similarity index 100%
rename from benchmark/cluster/v2/pserver.yaml
rename to benchmark/cluster/vgg16/v2/pserver.yaml
diff --git a/benchmark/cluster/v2/reader.py b/benchmark/cluster/vgg16/v2/reader.py
similarity index 100%
rename from benchmark/cluster/v2/reader.py
rename to benchmark/cluster/vgg16/v2/reader.py
diff --git a/benchmark/cluster/v2/trainer.yaml b/benchmark/cluster/vgg16/v2/trainer.yaml
similarity index 100%
rename from benchmark/cluster/v2/trainer.yaml
rename to benchmark/cluster/vgg16/v2/trainer.yaml
diff --git a/benchmark/cluster/v2/vgg16.py b/benchmark/cluster/vgg16/v2/vgg16.py
similarity index 100%
rename from benchmark/cluster/v2/vgg16.py
rename to benchmark/cluster/vgg16/v2/vgg16.py

From cb34f6a230bf51cc6cb0b8b2ef93b3e13ed3f516 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 22 Jan 2018 14:45:06 +0800
Subject: [PATCH 014/102] update fluid vgg16 and add readme

---
 benchmark/cluster/vgg16/fluid/Dockerfile   |  7 +++++--
 benchmark/cluster/vgg16/fluid/README.md    | 15 +++++++++++++++
 benchmark/cluster/vgg16/fluid/paddle_k8s   |  1 -
 benchmark/cluster/vgg16/fluid/pserver.yaml |  2 +-
 benchmark/cluster/vgg16/fluid/trainer.yaml |  4 ++--
 benchmark/cluster/vgg16/fluid/vgg16.py     |  6 ++++--
 6 files changed, 27 insertions(+), 8 deletions(-)
 create mode 100644 benchmark/cluster/vgg16/fluid/README.md

diff --git a/benchmark/cluster/vgg16/fluid/Dockerfile b/benchmark/cluster/vgg16/fluid/Dockerfile
index 77cd17f2b9..711076b09e 100644
--- a/benchmark/cluster/vgg16/fluid/Dockerfile
+++ b/benchmark/cluster/vgg16/fluid/Dockerfile
@@ -3,10 +3,13 @@
 #ADD reader.py /workspace/
 #RUN python /workspace/reader.py
 FROM python:2.7.14
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
 ADD paddle_k8s /usr/bin
 ADD k8s_tools.py /root
 RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev 
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl
+ENV LD_LIBRARY_PATH=/usr/local/lib
+ADD reader.py /workspace/
+RUN python /workspace/reader.py
 
 ADD vgg16.py /workspace/
diff --git a/benchmark/cluster/vgg16/fluid/README.md b/benchmark/cluster/vgg16/fluid/README.md
new file mode 100644
index 0000000000..63a460f7a6
--- /dev/null
+++ b/benchmark/cluster/vgg16/fluid/README.md
@@ -0,0 +1,15 @@
+# Fluid distributed training perf test
+
+## Steps to get started
+
+1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to the current directory.
+1. Run `docker build -t [image:tag] .` to build the Docker image and run `docker push [image:tag]` to push the image to a repository so that Kubernetes can find it.
+1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
+
+Check the logs for the distributed training progress and analyze the performance.
+
+## Enable verbos logs
+
+Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail.
\ No newline at end of file
diff --git a/benchmark/cluster/vgg16/fluid/paddle_k8s b/benchmark/cluster/vgg16/fluid/paddle_k8s
index 8f1c5db717..af5f35b3ec 100755
--- a/benchmark/cluster/vgg16/fluid/paddle_k8s
+++ b/benchmark/cluster/vgg16/fluid/paddle_k8s
@@ -61,7 +61,6 @@ start_fluid_process() {
   if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
     check_failed_cnt ${TRAINERS}
     sleep 5
-    stdbuf -oL python /root/k8s_tools.py wait_pods_running  paddle-job-master=${PADDLE_JOB_NAME} 1
     export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
   fi
   export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
diff --git a/benchmark/cluster/vgg16/fluid/pserver.yaml b/benchmark/cluster/vgg16/fluid/pserver.yaml
index 47d2380d2e..e1a58260af 100644
--- a/benchmark/cluster/vgg16/fluid/pserver.yaml
+++ b/benchmark/cluster/vgg16/fluid/pserver.yaml
@@ -33,7 +33,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
+          value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
diff --git a/benchmark/cluster/vgg16/fluid/trainer.yaml b/benchmark/cluster/vgg16/fluid/trainer.yaml
index bada190764..c8e26d4b51 100644
--- a/benchmark/cluster/vgg16/fluid/trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid/trainer.yaml
@@ -17,7 +17,7 @@ spec:
       - name: trainer
         image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
         imagePullPolicy: Always
-        command: ["paddle_k8s", "start_trainer", "v2"]
+        command: ["paddle_k8s", "start_fluid"]
         env:
         - name: PADDLE_JOB_NAME
           value: vgg16job
@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
+          value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/fluid/vgg16.py
index 0595a28784..a973f9d2a6 100644
--- a/benchmark/cluster/vgg16/fluid/vgg16.py
+++ b/benchmark/cluster/vgg16/fluid/vgg16.py
@@ -140,12 +140,14 @@ def main():
 
     def train_loop(exe, trainer_prog):
         iters = 0
+        ts = time.time()
         for pass_id in range(args.num_passes):
             # train
             start_time = time.time()
             num_samples = 0
             accuracy.reset(exe)
             for batch_id, data in enumerate(train_reader()):
+                ts = time.time()
                 img_data = np.array(map(lambda x: x[0].reshape(data_shape),
                                         data)).astype("float32")
                 y_data = np.array(map(lambda x: x[1], data)).astype("int64")
@@ -158,8 +160,8 @@ def main():
                 iters += 1
                 num_samples += len(data)
                 print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f" %
-                    (pass_id, iters, loss, acc)
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" %
+                    (pass_id, iters, loss, acc, time.time() - ts)
                 )  # The accuracy is the accumulation of batches, but not the current batch.
 
             pass_elapsed = time.time() - start_time

From b38452dffaa766311450fd79fb0432b63899545d Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 22 Jan 2018 17:01:01 +0800
Subject: [PATCH 015/102] fix styles

---
 benchmark/cluster/vgg16/fluid/README.md    |  3 +-
 benchmark/cluster/vgg16/fluid/k8s_tools.py | 18 ++++++-
 benchmark/cluster/vgg16/fluid/reader.py    | 14 ++++++
 benchmark/cluster/vgg16/fluid/vgg16.py     | 58 +++++++++++++++-------
 4 files changed, 73 insertions(+), 20 deletions(-)

diff --git a/benchmark/cluster/vgg16/fluid/README.md b/benchmark/cluster/vgg16/fluid/README.md
index 63a460f7a6..02b17dceb9 100644
--- a/benchmark/cluster/vgg16/fluid/README.md
+++ b/benchmark/cluster/vgg16/fluid/README.md
@@ -12,4 +12,5 @@ Check the logs for the distributed training progress and analyze the performance
 
 ## Enable verbos logs
 
-Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail.
\ No newline at end of file
+Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail.
+
diff --git a/benchmark/cluster/vgg16/fluid/k8s_tools.py b/benchmark/cluster/vgg16/fluid/k8s_tools.py
index 8a64dbd361..4bee96a7a8 100644
--- a/benchmark/cluster/vgg16/fluid/k8s_tools.py
+++ b/benchmark/cluster/vgg16/fluid/k8s_tools.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 #!/bin/env python
 import os
 import sys
@@ -33,6 +47,7 @@ def wait_pods_running(label_selector, desired):
         print 'current cnt: %d sleep for 5 seconds...' % count
         time.sleep(5)
 
+
 def count_pods_by_phase(label_selector, phase):
     pod_list = fetch_pods_info(label_selector)
     filtered_pod_list = filter(lambda x: x[0] == phase, pod_list)
@@ -45,12 +60,14 @@ def fetch_pserver_ips():
     pserver_ips = [item[1] for item in pod_list]
     return ",".join(pserver_ips)
 
+
 def fetch_master_ip():
     label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
     pod_list = fetch_pods_info(label_selector)
     master_ips = [item[1] for item in pod_list]
     return master_ips[0]
 
+
 def fetch_trainer_id():
     label_selector = "paddle-job=%s" % PADDLE_JOB_NAME
     pod_list = fetch_pods_info(label_selector)
@@ -75,4 +92,3 @@ if __name__ == "__main__":
         print count_pods_by_phase(sys.argv[2], sys.argv[3])
     elif command == "wait_pods_running":
         wait_pods_running(sys.argv[2], sys.argv[3])
-
diff --git a/benchmark/cluster/vgg16/fluid/reader.py b/benchmark/cluster/vgg16/fluid/reader.py
index c5161ddea2..3e20f830fc 100644
--- a/benchmark/cluster/vgg16/fluid/reader.py
+++ b/benchmark/cluster/vgg16/fluid/reader.py
@@ -1,2 +1,16 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle.v2 as paddle
 paddle.dataset.cifar.train10()
diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/fluid/vgg16.py
index a973f9d2a6..3c7b5bf2f1 100644
--- a/benchmark/cluster/vgg16/fluid/vgg16.py
+++ b/benchmark/cluster/vgg16/fluid/vgg16.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+#     http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 """VGG16 benchmark in Fluid"""
 from __future__ import print_function
 
@@ -11,6 +25,7 @@ import argparse
 import functools
 import os
 
+
 def str2bool(v):
     if v.lower() in ('yes', 'true', 't', 'y', '1'):
         return True
@@ -19,6 +34,7 @@ def str2bool(v):
     else:
         raise argparse.ArgumentTypeError('Boolean value expected.')
 
+
 parser = argparse.ArgumentParser(description=__doc__)
 parser.add_argument(
     '--batch_size', type=int, default=128, help="Batch size for training.")
@@ -122,7 +138,6 @@ def main():
     place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
     exe = fluid.Executor(place)
 
-
     # test
     def test(exe):
         accuracy.reset(exe)
@@ -148,20 +163,21 @@ def main():
             accuracy.reset(exe)
             for batch_id, data in enumerate(train_reader()):
                 ts = time.time()
-                img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                        data)).astype("float32")
+                img_data = np.array(
+                    map(lambda x: x[0].reshape(data_shape), data)).astype(
+                        "float32")
                 y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                 y_data = y_data.reshape([-1, 1])
 
                 loss, acc = exe.run(trainer_prog,
                                     feed={"pixel": img_data,
-                                        "label": y_data},
+                                          "label": y_data},
                                     fetch_list=[avg_cost] + accuracy.metrics)
                 iters += 1
                 num_samples += len(data)
                 print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f" %
-                    (pass_id, iters, loss, acc, time.time() - ts)
+                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+                    % (pass_id, iters, loss, acc, time.time() - ts)
                 )  # The accuracy is the accumulation of batches, but not the current batch.
 
             pass_elapsed = time.time() - start_time
@@ -170,7 +186,7 @@ def main():
             print(
                 "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
                 % (pass_id, num_samples / pass_elapsed, pass_train_acc,
-                pass_test_acc))
+                   pass_test_acc))
 
     if args.local:
         # Parameter initialization
@@ -179,8 +195,8 @@ def main():
         # data reader
         train_reader = paddle.batch(
             paddle.reader.shuffle(
-                paddle.dataset.cifar.train10()
-                if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+                paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+                else paddle.dataset.flowers.train(),
                 buf_size=5120),
             batch_size=args.batch_size)
         test_reader = paddle.batch(
@@ -196,19 +212,25 @@ def main():
         pserver_endpoints = ",".join(eplist)
         print("pserver endpoints: ", pserver_endpoints)
         trainers = int(os.getenv("TRAINERS"))  # total trainer count
-        current_endpoint = os.getenv("POD_IP") + ":6174"  # current pserver endpoint
-        training_role = os.getenv("TRAINING_ROLE",
-                                "TRAINER")  # get the training role: trainer/pserver
+        current_endpoint = os.getenv(
+            "POD_IP") + ":6174"  # current pserver endpoint
+        training_role = os.getenv(
+            "TRAINING_ROLE",
+            "TRAINER")  # get the training role: trainer/pserver
         t = fluid.DistributeTranspiler()
         t.transpile(
-            optimize_ops, params_grads, pservers=pserver_endpoints, trainers=trainers)
+            optimize_ops,
+            params_grads,
+            pservers=pserver_endpoints,
+            trainers=trainers)
 
         if training_role == "PSERVER":
             if not current_endpoint:
                 print("need env SERVER_ENDPOINT")
                 exit(1)
             pserver_prog = t.get_pserver_program(current_endpoint)
-            pserver_startup = t.get_startup_program(current_endpoint, pserver_prog)
+            pserver_startup = t.get_startup_program(current_endpoint,
+                                                    pserver_prog)
             print("starting server side startup")
             exe.run(pserver_startup)
             print("starting parameter server...")
@@ -220,13 +242,13 @@ def main():
             # data reader
             train_reader = paddle.batch(
                 paddle.reader.shuffle(
-                    paddle.dataset.cifar.train10()
-                    if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
+                    paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
+                    else paddle.dataset.flowers.train(),
                     buf_size=5120),
                 batch_size=args.batch_size)
             test_reader = paddle.batch(
-                paddle.dataset.cifar.test10()
-                if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
+                paddle.dataset.cifar.test10() if args.data_set == 'cifar10' else
+                paddle.dataset.flowers.test(),
                 batch_size=args.batch_size)
 
             trainer_prog = t.get_trainer_program()

From 900e911f4223e654c20a68a2db1404dadccfb953 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 22 Jan 2018 18:54:54 +0800
Subject: [PATCH 016/102] fix style check

---
 benchmark/cluster/vgg16/fluid/README.md | 1 -
 benchmark/cluster/vgg16/fluid/vgg16.py  | 1 -
 benchmark/cluster/vgg16/v2/pserver.yaml | 6 +++---
 benchmark/cluster/vgg16/v2/trainer.yaml | 6 +++---
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/benchmark/cluster/vgg16/fluid/README.md b/benchmark/cluster/vgg16/fluid/README.md
index 02b17dceb9..71a3a934d2 100644
--- a/benchmark/cluster/vgg16/fluid/README.md
+++ b/benchmark/cluster/vgg16/fluid/README.md
@@ -13,4 +13,3 @@ Check the logs for the distributed training progress and analyze the performance
 ## Enable verbos logs
 
 Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail.
-
diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/fluid/vgg16.py
index 3c7b5bf2f1..88d6d79cc0 100644
--- a/benchmark/cluster/vgg16/fluid/vgg16.py
+++ b/benchmark/cluster/vgg16/fluid/vgg16.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """VGG16 benchmark in Fluid"""
 from __future__ import print_function
 
diff --git a/benchmark/cluster/vgg16/v2/pserver.yaml b/benchmark/cluster/vgg16/v2/pserver.yaml
index ed1671bbbd..943675e147 100644
--- a/benchmark/cluster/vgg16/v2/pserver.yaml
+++ b/benchmark/cluster/vgg16/v2/pserver.yaml
@@ -1,13 +1,13 @@
 apiVersion: extensions/v1beta1
 kind: ReplicaSet
 metadata:
-  name: vgg16job-pserver
+  name: vgg16v2job-pserver
 spec:
   replicas: 10
   template:
     metadata:
       labels:
-        paddle-job-pserver: vgg16job
+        paddle-job-pserver: vgg16v2job
     spec:
       hostNetwork: true
       imagePullSecrets:
@@ -21,7 +21,7 @@ spec:
           containerPort: 30236
         env:
         - name: PADDLE_JOB_NAME
-          value: vgg16job
+          value: vgg16v2job
         - name: TRAINERS
           value: "20"
         - name: PSERVERS
diff --git a/benchmark/cluster/vgg16/v2/trainer.yaml b/benchmark/cluster/vgg16/v2/trainer.yaml
index 75fffc64b0..200b6dc304 100644
--- a/benchmark/cluster/vgg16/v2/trainer.yaml
+++ b/benchmark/cluster/vgg16/v2/trainer.yaml
@@ -1,14 +1,14 @@
 apiVersion: batch/v1
 kind: Job
 metadata:
-  name: vgg16job-trainer
+  name: vgg16v2job-trainer
 spec:
   parallelism: 20
   completions: 20
   template:
     metadata:
       labels:
-        paddle-job: vgg16job
+        paddle-job: vgg16v2job
     spec:
       imagePullSecrets:
         - name: job-registry-secret
@@ -20,7 +20,7 @@ spec:
         command: ["paddle_k8s", "start_trainer", "v2"]
         env:
         - name: PADDLE_JOB_NAME
-          value: vgg16job
+          value: vgg16v2job
         - name: TRAINERS
           value: "20"
         - name: PSERVERS

From 3772d27dfbf83b22333b7cc0eacfb3acd805c036 Mon Sep 17 00:00:00 2001
From: zlx <zlx_hg@163.com>
Date: Mon, 22 Jan 2018 21:09:41 +0800
Subject: [PATCH 017/102] add depthwise conv forward

---
 paddle/operators/conv_op.cc             |   7 +
 paddle/operators/conv_op.cu.cc          |   5 +
 paddle/operators/conv_op.h              |  30 ++
 paddle/operators/math/depthwise_conv.cu | 347 ++++++++++++++++++++++++
 paddle/operators/math/depthwise_conv.h  |  57 ++++
 5 files changed, 446 insertions(+)
 create mode 100644 paddle/operators/math/depthwise_conv.cu
 create mode 100644 paddle/operators/math/depthwise_conv.h

diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index d6882b275b..55a78efea1 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -318,9 +318,16 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 namespace ops = paddle::operators;
 REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
             ops::ConvOpGrad);
+REGISTER_OP(depthwiseConv, ops::ConvOp, ops::Conv2DOpMaker,
+            depthwiseConv_grad, ops::ConvOpGrad);  // grad type must be unique
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
+// No CPU DepthwiseConvFunctor is defined, so the CPU path uses the GEMM kernel.
+REGISTER_OP_CPU_KERNEL(
+    depthwiseConv,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 4f942444f3..4c7a345784 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -16,6 +16,11 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 
+REGISTER_OP_CUDA_KERNEL(
+    depthwiseConv,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
+
 REGISTER_OP_CUDA_KERNEL(
     conv2d, ops::GemmConvKernel<paddle::platform::CUDADeviceContext, float>,
     ops::GemmConvKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
index 5a8933e791..ca61f1c6e6 100644
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/depthwise_conv.h"
 #include "paddle/operators/math/im2col.h"
 #include "paddle/operators/math/math_function.h"
 #include "paddle/operators/math/vol2col.h"
@@ -350,5 +351,34 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
     }
   }
 };
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvKernel : public framework::OpKernel<T> {
+ public:
+  // Forward pass: reads "Input"/"Filter", writes "Output" via the functor.
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    // Copy the Tensor handle so the Resize below does not modify the
+    // variable stored in the Scope.
+    Tensor filter = *context.Input<Tensor>("Filter");
+    Tensor* output = context.Output<Tensor>("Output");
+    output->mutable_data<T>(context.GetPlace());
+    // "dilations" is not read: the functor takes no dilation argument.
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    // Kernel height/width must be read before the filter is flattened.
+    // NOTE(review): assumes a 4-D filter laid out as (out_c, in_c, kh, kw).
+    std::vector<int> ksize = {static_cast<int>(filter.dims()[2]),
+                              static_cast<int>(filter.dims()[3])};
+    framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                           filter.numel() / filter.dims()[0]};
+    filter.Resize(filter_matrix_shape);
+
+    math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    depthwiseConv(dev_ctx, *input, filter, ksize, strides, paddings, output);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
new file mode 100644
index 0000000000..16a0037ab1
--- /dev/null
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -0,0 +1,347 @@
+/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/depthwise_conv.h"
+#include "paddle/operators/math/pooling.h"
+#include "paddle/platform/cuda_helper.h"
+namespace paddle {
+namespace operators {
+namespace math {
+
+// CUDA kernel to compute the depthwise convolution forward pass
+template <typename T>
+__global__ void KernelDepthwiseConv(
+    const int nthreads, const T* const input_data, const T* const filter_data,
+    const int batch_size, const int output_channels, const int output_height,
+    const int output_width, const int input_channels, const int input_height,
+    const int input_width, const int filter_multiplier, const int filter_height,
+    const int filter_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width, T* const output_data) {
+  // Each thread produces one output element; `tid` is its flat position in
+  // the (batch, out_channel, out_row, out_col) output.
+  const int tid =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (tid >= nthreads) return;  // threads past the last output element
+
+  // Unflatten tid.
+  const int out_col = tid % output_width;
+  const int out_row = (tid / output_width) % output_height;
+  const int out_ch = (tid / output_height / output_width) % output_channels;
+  const int n = tid / output_channels / output_height / output_width;
+
+  // Every input channel feeds filter_multiplier consecutive output channels.
+  const int in_ch = out_ch / filter_multiplier;
+  const T* const taps = filter_data + out_ch * filter_height * filter_width;
+  const int plane = n * input_channels + in_ch;
+
+  // Input coordinates under the first and last filter tap.
+  const int row0 = out_row * stride_height - padding_height;
+  const int col0 = out_col * stride_width - padding_width;
+  const int row1 = row0 + filter_height - 1;
+  const int col1 = col0 + filter_width - 1;
+  const bool interior = (row0 >= 0) && (row1 < input_height) &&
+                        (col0 >= 0) && (col1 < input_width);
+
+  T acc = 0;
+  if (interior) {
+    // Window fully inside the image: no per-tap bounds test needed.
+    for (int fy = 0; fy < filter_height; ++fy) {
+      const int row = row0 + fy;
+      for (int fx = 0; fx < filter_width; ++fx) {
+        const int col = col0 + fx;
+        acc += taps[fy * filter_width + fx] *
+               input_data[(plane * input_height + row) * input_width + col];
+      }
+    }
+  } else {
+    // Window overlaps the padding: taps outside the image contribute zero.
+    for (int fy = 0; fy < filter_height; ++fy) {
+      const int row = row0 + fy;
+      if (row < 0 || row >= input_height) continue;
+      for (int fx = 0; fx < filter_width; ++fx) {
+        const int col = col0 + fx;
+        if (col < 0 || col >= input_width) continue;
+        acc += taps[fy * filter_width + fx] *
+               input_data[(plane * input_height + row) * input_width + col];
+      }
+    }
+  }
+  output_data[tid] = acc;
+}
+/*
+// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
+template <typename T>
+__global__ void KernelDepthwiseConvInputGrad(const int nthreads,
+                                      const T* const top_diff,
+                                      const T* const weight_data,
+                                      const int num,
+                                      const int outputChannels,
+                                      const int outputHeight,
+                                      const int outputWidth,
+                                      const int inputChannels,
+                                      const int inputHeight,
+                                      const int inputWidth,
+                                      const int filterMultiplier,
+                                      const int filterHeight,
+                                      const int filterWidth,
+                                      const int strideH,
+                                      const int strideW,
+                                      const int paddingH,
+                                      const int paddingW,
+                                      T* const bottom_diff) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int batch = index / inputChannels / inputHeight / inputWidth;
+    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
+    const int h_in = (index / inputWidth) % inputHeight;
+    const int w_in = index % inputWidth;
+
+    const int c_out_start = c_in * filterMultiplier;
+
+    int h_out_start = (h_in - filterHeight + paddingH + strideH) / strideH;
+    h_out_start = 0 > h_out_start ? 0 : h_out_start;
+    int h_out_end = (h_in + paddingH) / strideH;
+    h_out_end = outputHeight - 1 < h_out_end ? outputHeight - 1 : h_out_end;
+    int w_out_start = (w_in - filterWidth + paddingW + strideW) / strideW;
+    w_out_start = 0 > w_out_start ? 0 : w_out_start;
+    int w_out_end = (w_in + paddingW) / strideW;
+    w_out_end = outputWidth - 1 < w_out_end ? outputWidth - 1 : w_out_end;
+
+    T value = 0;
+
+    for (int c_out = c_out_start; c_out < c_out_start + filterMultiplier;
+         c_out++) {
+      for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
+        const int filter_h = h_in + paddingH - h_out * strideH;
+        for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
+          const int filter_w = w_in + paddingW - w_out * strideW;
+          const int filter_offset = c_out * filterHeight * filterWidth +
+                                    filter_h * filterWidth + filter_w;
+          const int top_diff_offset =
+              ((batch * outputChannels + c_out) * outputHeight + h_out) *
+                  outputWidth +
+              w_out;
+          value += top_diff[top_diff_offset] * weight_data[filter_offset];
+        }
+      }
+    }
+    bottom_diff[index] += value;
+  }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
+template <typename T>
+__global__ void KernelDepthwiseConvFilterGrad(const int num_i,
+                                       const int nthreads,
+                                       const T* const top_diff,
+                                       const T* const inputData,
+                                       const int num,
+                                       const int outputChannels,
+                                       const int outputHeight,
+                                       const int outputWidth,
+                                       const int inputChannels,
+                                       const int inputHeight,
+                                       const int inputWidth,
+                                       const int filterMultiplier,
+                                       const int filterHeight,
+                                       const int filterWidth,
+                                       const int strideH,
+                                       const int strideW,
+                                       const int paddingH,
+                                       const int paddingW,
+                                       T* const buffer_data) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int h_out = (index / outputWidth) % outputHeight;
+    const int w_out = index % outputWidth;
+    const int kh =
+        (index / filterWidth / outputHeight / outputWidth) % filterHeight;
+    const int kw = (index / outputHeight / outputWidth) % filterWidth;
+    const int h_in = -paddingH + h_out * strideH + kh;
+    const int w_in = -paddingW + w_out * strideW + kw;
+    if ((h_in >= 0) && (h_in < inputHeight) && (w_in >= 0) &&
+        (w_in < inputWidth)) {
+      const int c_out =
+          index / (filterHeight * filterWidth * outputHeight * outputWidth);
+      const int c_in = c_out / filterMultiplier;
+      const int batch = num_i;
+      const int top_offset =
+          ((batch * outputChannels + c_out) * outputHeight + h_out) *
+              outputWidth + w_out;
+      const int bottom_offset =
+          ((batch * inputChannels + c_in) * inputHeight + h_in) * inputWidth +
+          w_in;
+      buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
+    } else {
+      buffer_data[index] = 0;
+    }
+  }
+}
+*/
+
+/*
+ * All tensors are in NCHW format.
+ * ksize, strides and paddings each hold two elements, giving the height
+ * and the width value, respectively.
+ */
+template <typename T>
+class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& filter, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    // Raw data pointers of the three tensors.
+    const T* input_data = input.data<T>();
+    const T* filter_data = filter.data<T>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    // One thread per output element, grouped into blocks of 1024 threads.
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+    // Run on the context's own stream; filter_multiplier = out_c / in_c.
+    KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, filter_data, batch_size, output_channels,
+        output_height, output_width, input_channels, input_height, input_width,
+        output_channels / input_channels, ksize_height, ksize_width,
+        stride_height, stride_width, padding_height, padding_width,
+        output_data);
+  }
+};
+
+/*
+
+template <typename T>
+class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, PoolProcess, T>
+{
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process, framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * input_channels * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool2DGrad<PoolProcess, T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        pool_process, input_grad_data);
+  }
+};
+
+template <typename T>
+class DepthwiseConvdFilterGradFunctor<platform::CUDADeviceContext, T> {
+ public:
+  void operator()(const platform::CUDADeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+
+    int nthreads = batch_size * output_channels * output_height * output_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelMaxPool2DGrad<T><<<grid, threads, 0, context.stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_channels,
+        input_height, input_width, output_height, output_width, ksize_height,
+        ksize_width, stride_height, stride_width, padding_height, padding_width,
+        input_grad_data);
+  }
+};
+*/
+
+// Explicit instantiations for the element types registered in conv_op.cu.cc.
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, float>;
+template class DepthwiseConvFunctor<platform::CUDADeviceContext, double>;
+
+/*
+template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<float>,
+                                 float>;
+
+template class DepthwiseConvFunctor<platform::CUDADeviceContext,
+                             paddle::operators::math::MaxPool<double>, double>;
+template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
+                                 paddle::operators::math::MaxPoolGrad<double>,
+                                 double>;
+*/
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/depthwise_conv.h b/paddle/operators/math/depthwise_conv.h
new file mode 100644
index 0000000000..2e48fe5912
--- /dev/null
+++ b/paddle/operators/math/depthwise_conv.h
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+// Forward depthwise conv. operator() is declared only; see depthwise_conv.cu.
+template <typename DeviceContext, typename T>
+class DepthwiseConvFunctor {
+ public:
+  void operator()(const DeviceContext& context, const framework::Tensor& input,
+                  const framework::Tensor& filter, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* output);
+};
+
+/*
+template <typename DeviceContext, typename T>
+class DepthwiseConvInputGradFunctor {
+public:
+  void operator()(const DeviceContext& context,
+                  const framework::Tensor& filter,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* input_grad);
+};
+
+template <typename DeviceContext, typename T>
+class DepthwiseConvFilterGradFunctor {
+public:
+  void operator()(const DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  framework::Tensor* filter_grad);
+};
+*/
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle

From ca636eedb9813cbd99400d5be60e86bae709dc56 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Tue, 23 Jan 2018 11:45:42 +0800
Subject: [PATCH 018/102] remove libwarpctc.so in core.so and
 libpaddle_fluid.so

---
 cmake/generic.cmake | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 585db019d5..147de8b242 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -186,6 +186,11 @@ function(cc_library TARGET_NAME)
       add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
     endif()
     if (cc_library_DEPS)
+      # Don't need link libwarpctc.so
+      if ("${cc_library_DEPS};" MATCHES "warpctc;")
+        list(REMOVE_ITEM cc_library_DEPS warpctc)
+        add_dependencies(${TARGET_NAME} warpctc)
+      endif()
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
       target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     endif()

From 06db70384397a4d5b61cd7493ebab9b06faf3244 Mon Sep 17 00:00:00 2001
From: xzl <zlx_hg@163.com>
Date: Tue, 23 Jan 2018 14:22:01 +0800
Subject: [PATCH 019/102] ../../../../../paddle/api

---
 paddle/operators/CMakeLists.txt         |  3 ++-
 paddle/operators/conv_op.cc             | 11 ++++++++---
 paddle/operators/conv_op.h              |  7 ++-----
 paddle/operators/math/CMakeLists.txt    |  1 +
 paddle/operators/math/depthwise_conv.cu | 18 ++++++------------
 5 files changed, 19 insertions(+), 21 deletions(-)

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 6745a8da17..fa2f8caacf 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -155,7 +155,8 @@ op_library(parallel_do_op DEPS executor)
 
 # Regist multiple Kernel to pybind
 if (WITH_GPU)
-op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
+op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
+    vol2col depthwise_conv)
 op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
 op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
   conv_transpose_cudnn_op.cu.cc DEPS vol2col)
diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index 55a78efea1..a53b11615c 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -318,15 +318,20 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
 namespace ops = paddle::operators;
 REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
             ops::ConvOpGrad);
-REGISTER_OP(depthwiseConv, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
+REGISTER_OP(depthwiseConv, ops::ConvOp, ops::Conv2DOpMaker, depthwiseConv_grad,
             ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
 REGISTER_OP_CPU_KERNEL(
     depthwiseConv,
-    ops::DepthwiseConvKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::DepthwiseConvKernel<paddle::platform::CPUDeviceContext, double>);
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_CPU_KERNEL(
+    depthwiseConv_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
 
 REGISTER_OP_CPU_KERNEL(
     conv2d, ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
index ca61f1c6e6..a9138dbf93 100644
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -364,18 +364,15 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
     Tensor* output = context.Output<Tensor>("Output");
     output->mutable_data<T>(context.GetPlace());
 
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
 
-    framework::DDim filter_matrix_shape = {filter.dims()[0],
-                                           filter.numel() / filter.dims()[0]};
-    filter.Resize(filter_matrix_shape);
-
     math::DepthwiseConvFunctor<DeviceContext, T> depthwiseConv;
 
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    depthwiseConv(dev_ctx, input, filter, filter_shape_vec, strides, paddings,
+    depthwiseConv(dev_ctx, *input, filter, ksize, strides, paddings,
                   output);
   }
 };
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index c607704efa..6fb1531236 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -8,6 +8,7 @@ if(WITH_GPU)
     nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
     nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
     nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
+    nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
     nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
     nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
index 16a0037ab1..aee052d379 100644
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/math/pooling.h"
+#include "paddle/operators/math/depthwise_conv.h"
 #include "paddle/platform/cuda_helper.h"
 
 namespace paddle {
@@ -195,7 +195,7 @@ __global__ void KernelDepthwiseConvFilterGrad(const int num_i,
  * Ksize, strides, paddings are two elements. These two elements represent
  * height and width, respectively.
  */
-template <typename T>
+template <class T>
 class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
@@ -226,7 +226,7 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
     dim3 threads(1024, 1);
     dim3 grid(blocks, 1);
 
-    KernelDepthwiseConv<T><<<grid, threads, 0, STREAM_DEFAULT>>>(
+    KernelDepthwiseConv<T><<<grid, threads, 0, context.stream()>>>(
         nthreads, input_data, filter_data, batch_size, output_channels,
         output_height, output_width, input_channels, input_height, input_width,
         output_channels / input_channels, ksize_height, ksize_width,
@@ -236,7 +236,6 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
 };
 
 /*
-
 template <typename T>
 class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, PoolProcess, T>
 {
@@ -254,8 +253,7 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, PoolProcess, T>
     const int output_height = output.dims()[2];
     const int output_width = output.dims()[3];
     const int ksize_height = ksize[0];
-    const int ksize_width = ksize[1];
-    const int stride_height = strides[0];
+    const int ksize_width = ksize[1]; const int stride_height = strides[0];
     const int stride_width = strides[1];
     const int padding_height = paddings[0];
     const int padding_width = paddings[1];
@@ -321,24 +319,20 @@ class DepthwiseConvdFilterGradFunctor<platform::CUDADeviceContext, T> {
 */
 
 template class DepthwiseConvFunctor<platform::CUDADeviceContext,
-                                    paddle::operators::math::MaxPool<float>,
                                     float>;
+template class DepthwiseConvFunctor<platform::CUDADeviceContext,
+                                    double>;
 
 /*
 template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
                                  float>;
 template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<float>,
                                  float>;
 
 template class DepthwiseConvFunctor<platform::CUDADeviceContext,
-                             paddle::operators::math::MaxPool<double>, double>;
 template class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
                                  double>;
 template class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext,
-                                 paddle::operators::math::MaxPoolGrad<double>,
                                  double>;
 */
 

From 70142ae65eb234b804fd7b96a953f1f6ea2aff90 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Tue, 23 Jan 2018 18:33:08 +0800
Subject: [PATCH 020/102] update dist benchmark to one image

---
 .../cluster/vgg16/{fluid => }/Dockerfile      |  2 +-
 benchmark/cluster/vgg16/README.md             | 58 +++++++++++++++
 benchmark/cluster/vgg16/fluid/README.md       | 15 ----
 .../pserver.yaml => fluid_pserver.yaml}       |  6 +-
 .../trainer.yaml => fluid_trainer.yaml}       |  6 +-
 .../cluster/vgg16/{fluid => }/k8s_tools.py    |  0
 .../cluster/vgg16/{fluid => }/paddle_k8s      |  0
 benchmark/cluster/vgg16/{fluid => }/reader.py |  0
 benchmark/cluster/vgg16/v2/Dockerfile         |  7 --
 benchmark/cluster/vgg16/v2/reader.py          | 70 -------------------
 .../{v2/pserver.yaml => v2_pserver.yaml}      |  4 +-
 .../{v2/trainer.yaml => v2_trainer.yaml}      |  8 ++-
 .../vgg16/{fluid/vgg16.py => vgg16_fluid.py}  |  0
 .../vgg16/{v2/vgg16.py => vgg16_v2.py}        | 21 ++++--
 14 files changed, 86 insertions(+), 111 deletions(-)
 rename benchmark/cluster/vgg16/{fluid => }/Dockerfile (91%)
 create mode 100644 benchmark/cluster/vgg16/README.md
 delete mode 100644 benchmark/cluster/vgg16/fluid/README.md
 rename benchmark/cluster/vgg16/{fluid/pserver.yaml => fluid_pserver.yaml} (89%)
 rename benchmark/cluster/vgg16/{fluid/trainer.yaml => fluid_trainer.yaml} (87%)
 rename benchmark/cluster/vgg16/{fluid => }/k8s_tools.py (100%)
 rename benchmark/cluster/vgg16/{fluid => }/paddle_k8s (100%)
 rename benchmark/cluster/vgg16/{fluid => }/reader.py (100%)
 delete mode 100644 benchmark/cluster/vgg16/v2/Dockerfile
 delete mode 100644 benchmark/cluster/vgg16/v2/reader.py
 rename benchmark/cluster/vgg16/{v2/pserver.yaml => v2_pserver.yaml} (92%)
 rename benchmark/cluster/vgg16/{v2/trainer.yaml => v2_trainer.yaml} (88%)
 rename benchmark/cluster/vgg16/{fluid/vgg16.py => vgg16_fluid.py} (100%)
 rename benchmark/cluster/vgg16/{v2/vgg16.py => vgg16_v2.py} (90%)

diff --git a/benchmark/cluster/vgg16/fluid/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
similarity index 91%
rename from benchmark/cluster/vgg16/fluid/Dockerfile
rename to benchmark/cluster/vgg16/Dockerfile
index 711076b09e..dfaffb8c21 100644
--- a/benchmark/cluster/vgg16/fluid/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -12,4 +12,4 @@ ENV LD_LIBRARY_PATH=/usr/local/lib
 ADD reader.py /workspace/
 RUN python /workspace/reader.py
 
-ADD vgg16.py /workspace/
+ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
new file mode 100644
index 0000000000..18128e5276
--- /dev/null
+++ b/benchmark/cluster/vgg16/README.md
@@ -0,0 +1,58 @@
+# Performance for distributed vgg16
+
+## Test Result
+
+### Single node single thread
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | 16.74 | - |
+| PaddlePaddle v2 | - | - | 17.60 | - |
+| TensorFlow | - | - | - | - |
+
+### different batch size
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | 247.40 | - | - |
+| PaddlePaddle v2 | - | - | 256.14 | - |
+| TensorFlow | - | - | - | - |
+
+### different pserver number
+
+- Trainer Count: 100
+- Batch Size: 64
+- Metrics: mini-batch / sec
+
+| PServer Count | 10 | 20 | 40 | 60 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Accelerate rate
+
+| Trainer Count | 20 | 40 | 80 | 100 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+
+## Steps to run the performance test
+
+1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
+1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
+1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to a repository so Kubernetes can find it.
+1. Run `kubectl create -f fluid_pserver.yaml && kubectl create -f fluid_trainer.yaml` (or the `v2_pserver.yaml` / `v2_trainer.yaml` pair for the v2 job) to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
+1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
+
+Check the logs for the distributed training progress and analyze the performance.
+
+## Enable verbose logs
+
+Edit the pserver and trainer YAML files (e.g. `fluid_pserver.yaml` and `fluid_trainer.yaml`) and add the environment variable `GLOG_v=3` to see what happened in detail.
diff --git a/benchmark/cluster/vgg16/fluid/README.md b/benchmark/cluster/vgg16/fluid/README.md
deleted file mode 100644
index 71a3a934d2..0000000000
--- a/benchmark/cluster/vgg16/fluid/README.md
+++ /dev/null
@@ -1,15 +0,0 @@
-# Fluid distributed training perf test
-
-## Steps to get started
-
-1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
-1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
-1. Run `docker build -t [image:tag] .` to build the docker image and run `docker push [image:tag]` to push the image to reponsitory so kubernetes can find it.
-1. Run `kubectl create -f pserver.yaml && kubectl create -f trainer.yaml` to start the job on your kubernetes cluster (you must configure the `kubectl` client before this step).
-1. Run `kubectl get po` to get running pods, and run `kubectl logs [podID]` to fetch the pod log of pservers and trainers.
-
-Check the logs for the distributed training progress and analyze the performance.
-
-## Enable verbos logs
-
-Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail.
diff --git a/benchmark/cluster/vgg16/fluid/pserver.yaml b/benchmark/cluster/vgg16/fluid_pserver.yaml
similarity index 89%
rename from benchmark/cluster/vgg16/fluid/pserver.yaml
rename to benchmark/cluster/vgg16/fluid_pserver.yaml
index e1a58260af..ee8b0763b6 100644
--- a/benchmark/cluster/vgg16/fluid/pserver.yaml
+++ b/benchmark/cluster/vgg16/fluid_pserver.yaml
@@ -14,7 +14,7 @@ spec:
       - name: job-registry-secret
       containers:
       - name: pserver
-        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
         imagePullPolicy: Always
         ports:
         - name: jobport-30236
@@ -33,7 +33,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
@@ -53,7 +53,7 @@ spec:
         - name: PADDLE_INIT_USE_GPU
           value: "0"
         - name: LD_LIBRARY_PATH
-          value: "/usr/local/nvidia/lib64"
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
         - name: NAMESPACE
           valueFrom:
             fieldRef:
diff --git a/benchmark/cluster/vgg16/fluid/trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
similarity index 87%
rename from benchmark/cluster/vgg16/fluid/trainer.yaml
rename to benchmark/cluster/vgg16/fluid_trainer.yaml
index c8e26d4b51..0a0ed25ebe 100644
--- a/benchmark/cluster/vgg16/fluid/trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -15,7 +15,7 @@ spec:
       hostNetwork: true
       containers:
       - name: trainer
-        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16_fluid"
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
         imagePullPolicy: Always
         command: ["paddle_k8s", "start_fluid"]
         env:
@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "cd /workspace && LD_LIBRARY_PATH=/usr/local/lib MKL_NUM_THREADS=1 python /workspace/vgg16.py --local 0"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
@@ -50,7 +50,7 @@ spec:
         - name: PADDLE_INIT_USE_GPU
           value: "0"
         - name: LD_LIBRARY_PATH
-          value: "/usr/local/nvidia/lib64"
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
         - name: NAMESPACE
           valueFrom:
             fieldRef:
diff --git a/benchmark/cluster/vgg16/fluid/k8s_tools.py b/benchmark/cluster/vgg16/k8s_tools.py
similarity index 100%
rename from benchmark/cluster/vgg16/fluid/k8s_tools.py
rename to benchmark/cluster/vgg16/k8s_tools.py
diff --git a/benchmark/cluster/vgg16/fluid/paddle_k8s b/benchmark/cluster/vgg16/paddle_k8s
similarity index 100%
rename from benchmark/cluster/vgg16/fluid/paddle_k8s
rename to benchmark/cluster/vgg16/paddle_k8s
diff --git a/benchmark/cluster/vgg16/fluid/reader.py b/benchmark/cluster/vgg16/reader.py
similarity index 100%
rename from benchmark/cluster/vgg16/fluid/reader.py
rename to benchmark/cluster/vgg16/reader.py
diff --git a/benchmark/cluster/vgg16/v2/Dockerfile b/benchmark/cluster/vgg16/v2/Dockerfile
deleted file mode 100644
index 5f129a8e32..0000000000
--- a/benchmark/cluster/vgg16/v2/Dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM paddlepaddle/paddlecloud-job
-RUN mkdir -p /workspace
-ADD reader.py /workspace/
-RUN python /workspace/reader.py
-ADD vgg16.py /workspace/
-
-ADD vgg16_fluid.py /workspace
diff --git a/benchmark/cluster/vgg16/v2/reader.py b/benchmark/cluster/vgg16/v2/reader.py
deleted file mode 100644
index 16ac2dbcef..0000000000
--- a/benchmark/cluster/vgg16/v2/reader.py
+++ /dev/null
@@ -1,70 +0,0 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-#
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
-#
-#    http://www.apache.org/licenses/LICENSE-2.0
-#
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
-
-import random
-from paddle.v2.image import load_and_transform
-import paddle.v2 as paddle
-from multiprocessing import cpu_count
-
-
-def train_mapper(sample):
-    '''
-    map image path to type needed by model input layer for the training set
-    '''
-    img, label = sample
-    img = paddle.image.load_image(img)
-    img = paddle.image.simple_transform(img, 256, 224, True)
-    return img.flatten().astype('float32'), label
-
-
-def test_mapper(sample):
-    '''
-    map image path to type needed by model input layer for the test set
-    '''
-    img, label = sample
-    img = paddle.image.load_image(img)
-    img = paddle.image.simple_transform(img, 256, 224, True)
-    return img.flatten().astype('float32'), label
-
-
-def train_reader(train_list, buffered_size=1024):
-    def reader():
-        with open(train_list, 'r') as f:
-            lines = [line.strip() for line in f]
-            for line in lines:
-                img_path, lab = line.strip().split('\t')
-                yield img_path, int(lab)
-
-    return paddle.reader.xmap_readers(train_mapper, reader,
-                                      cpu_count(), buffered_size)
-
-
-def test_reader(test_list, buffered_size=1024):
-    def reader():
-        with open(test_list, 'r') as f:
-            lines = [line.strip() for line in f]
-            for line in lines:
-                img_path, lab = line.strip().split('\t')
-                yield img_path, int(lab)
-
-    return paddle.reader.xmap_readers(test_mapper, reader,
-                                      cpu_count(), buffered_size)
-
-
-if __name__ == '__main__':
-    #for im in train_reader('train.list'):
-    #    print len(im[0])
-    #for im in train_reader('test.list'):
-    #    print len(im[0])
-    paddle.dataset.cifar.train10()
diff --git a/benchmark/cluster/vgg16/v2/pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
similarity index 92%
rename from benchmark/cluster/vgg16/v2/pserver.yaml
rename to benchmark/cluster/vgg16/v2_pserver.yaml
index 943675e147..dd1271e0cf 100644
--- a/benchmark/cluster/vgg16/v2/pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@@ -14,7 +14,7 @@ spec:
       - name: job-registry-secret
       containers:
       - name: pserver
-        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
         imagePullPolicy: Always
         ports:
         - name: jobport-30236
@@ -49,7 +49,7 @@ spec:
         - name: PADDLE_INIT_USE_GPU
           value: "0"
         - name: LD_LIBRARY_PATH
-          value: "/usr/local/nvidia/lib64"
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
         - name: NAMESPACE
           valueFrom:
             fieldRef:
diff --git a/benchmark/cluster/vgg16/v2/trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
similarity index 88%
rename from benchmark/cluster/vgg16/v2/trainer.yaml
rename to benchmark/cluster/vgg16/v2_trainer.yaml
index 3288fbae26..9d52e231f0 100644
--- a/benchmark/cluster/vgg16/v2/trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -15,12 +15,14 @@ spec:
       hostNetwork: true
       containers:
       - name: trainer
-        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
+        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
         imagePullPolicy: Always
         command: ["paddle_k8s", "start_trainer", "v2"]
         env:
         - name: PADDLE_JOB_NAME
           value: vgg16v2job
+        - name: BATCH_SIZE
+          value: "128"
         - name: TRAINERS
           value: "20"
         - name: PSERVERS
@@ -28,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
@@ -48,7 +50,7 @@ spec:
         - name: PADDLE_INIT_USE_GPU
           value: "0"
         - name: LD_LIBRARY_PATH
-          value: "/usr/local/nvidia/lib64"
+          value: "/usr/local/lib:/usr/local/nvidia/lib64"
         - name: NAMESPACE
           valueFrom:
             fieldRef:
diff --git a/benchmark/cluster/vgg16/fluid/vgg16.py b/benchmark/cluster/vgg16/vgg16_fluid.py
similarity index 100%
rename from benchmark/cluster/vgg16/fluid/vgg16.py
rename to benchmark/cluster/vgg16/vgg16_fluid.py
diff --git a/benchmark/cluster/vgg16/v2/vgg16.py b/benchmark/cluster/vgg16/vgg16_v2.py
similarity index 90%
rename from benchmark/cluster/vgg16/v2/vgg16.py
rename to benchmark/cluster/vgg16/vgg16_v2.py
index 0ffa9703b7..284dbec48d 100644
--- a/benchmark/cluster/vgg16/v2/vgg16.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -16,12 +16,17 @@ import gzip
 
 import paddle.v2.dataset.cifar as cifar
 import paddle.v2 as paddle
-import reader
 import time
+import os
 
 DATA_DIM = 3 * 32 * 32
 CLASS_DIM = 10
-BATCH_SIZE = 128
+BATCH_SIZE = os.getenv("BATCH_SIZE")
+if BATCH_SIZE:
+    BATCH_SIZE = int(BATCH_SIZE)
+else:
+    BATCH_SIZE = 128
+NODE_COUNT = int(os.getenv("TRAINERS"))
 ts = 0
 
 
@@ -84,7 +89,8 @@ def main():
         name="label", type=paddle.data_type.integer_value(CLASS_DIM))
 
     extra_layers = None
-    learning_rate = 1e-3 / 20
+    # NOTE: for v2 distributed training need averaging updates.
+    learning_rate = 1e-3 / NODE_COUNT
     out = vgg16(image, class_dim=CLASS_DIM)
     cost = paddle.layer.classification_cost(input=out, label=lbl)
 
@@ -123,7 +129,9 @@ def main():
 
     # End batch and end pass event handler
     def event_handler(event):
-        global ts
+        global ts, ts_pass
+        if isinstance(event, paddle.event.BeginPass):
+            ts_pass = time.time()
         if isinstance(event, paddle.event.BeginIteration):
             ts = time.time()
         if isinstance(event, paddle.event.EndIteration):
@@ -132,9 +140,8 @@ def main():
                     event.pass_id, event.batch_id, event.cost, event.metrics,
                     time.time() - ts)
         if isinstance(event, paddle.event.EndPass):
-            with gzip.open('params_pass_%d.tar.gz' % event.pass_id, 'w') as f:
-                trainer.save_parameter_to_tar(f)
-
+            print "Pass %d end, spent: %f" % (event.pass_id,
+                                              time.time() - ts_pass)
             result = trainer.test(reader=test_reader)
             print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
 

From 72eccb238ebea3bb0509da7bf6e9ff18db088763 Mon Sep 17 00:00:00 2001
From: gaoyuan <gaoyuan18@baidu.com>
Date: Sun, 28 Jan 2018 15:38:54 +0800
Subject: [PATCH 021/102] add box coder op

---
 paddle/operators/box_coder_op.cc              | 106 ++++++++++++
 paddle/operators/box_coder_op.cu              | 145 ++++++++++++++++
 paddle/operators/box_coder_op.h               | 163 ++++++++++++++++++
 .../v2/fluid/tests/test_box_coder_op.py       | 117 +++++++++++++
 4 files changed, 531 insertions(+)
 create mode 100644 paddle/operators/box_coder_op.cc
 create mode 100644 paddle/operators/box_coder_op.cu
 create mode 100644 paddle/operators/box_coder_op.h
 create mode 100644 python/paddle/v2/fluid/tests/test_box_coder_op.py

diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc
new file mode 100644
index 0000000000..0cb20a4182
--- /dev/null
+++ b/paddle/operators/box_coder_op.cc
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/box_coder_op.h"
+
+namespace paddle {
+namespace operators {
+
+class BoxCoderOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+                   "Input(PriorBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
+                   "Input(PriorBoxVar) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
+                   "Input(TargetBox) of BoxCoderOp should not be null.");
+    // Each input is checked below to be a 2-D tensor with 4 columns.
+    auto prior_box_dims = ctx->GetInputDim("PriorBox");
+    auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
+    auto target_box_dims = ctx->GetInputDim("TargetBox");
+
+    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2UL,
+                      "The shape of PriorBox is [N, 4]");
+    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4UL,
+                      "The shape of PriorBox is [N, 4]");
+    PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 2UL,
+                      "The shape of PriorBoxVar is [N, 4]");
+    PADDLE_ENFORCE_EQ(prior_box_var_dims[1], 4UL,
+                      "The shape of PriorBoxVar is [N, 4]");
+    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2UL,
+                      "The shape of TargetBox is [M, 4]");
+    PADDLE_ENFORCE_EQ(target_box_dims[1], 4UL,
+                      "The shape of TargetBox is [M, 4]");
+    // Result unused; presumably rejects a bad code_type -- TODO confirm.
+    GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
+    // OutputBox is [N, M, 4]: one row per (target box, prior box) pair.
+    ctx->SetOutputDim("OutputBox", framework::make_ddim(
+        {target_box_dims[0], prior_box_dims[0], 4}));
+  }
+};
+
+class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BoxCoderOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Throughout: M = number of prior boxes, N = number of target boxes.
+    AddInput(
+        "PriorBox",
+        "(Tensor, default Tensor<float>) "
+        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
+        "each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the anchor box, "
+        "if the input is image feature map, they are close to the origin "
+        "of the coordinate system. [xmax, ymax] is the right bottom "
+        "coordinate of the anchor box.");
+    AddInput("PriorBoxVar",
+             "(Tensor, default Tensor<float>) PriorBoxVar is a 2-D Tensor "
+             "with shape [M, 4] holds M groups of variance.");
+    AddInput(
+        "TargetBox",
+        "(LoDTensor or Tensor) this input is a 2-D LoDTensor with shape "
+        "[N, 4], each box is represented as [xmin, ymin, xmax, ymax], "
+        "[xmin, ymin] is the left top coordinate of the box if the input "
+        "is image feature map, they are close to the origin of the coordinate "
+        "system. [xmax, ymax] is the right bottom coordinate of the box. "
+        "This tensor can contain LoD information to represent a batch "
+        "of inputs. One instance of this batch can contain different "
+        "numbers of entities.");
+    AddAttr<std::string>("code_type",
+                         "(string, default encode_center_size) "
+                         "the code type used with the target box")
+        .SetDefault("encode_center_size")
+        .InEnum({"encode_center_size", "decode_center_size"});
+    AddOutput(
+        "OutputBox",
+        "(Tensor, default Tensor<float>) "
+        "The output of box_coder_op, a tensor with shape [N, M, 4] "
+        "representing the result of N target boxes encoded/decoded with "
+        "M Prior boxes and variances.");
+
+    AddComment(R"DOC(
+Bounding Box Coder Operator.
+Encode/Decode the priorbox information with the target bounding box.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(box_coder, ops::BoxCoderOp, ops::BoxCoderOpMaker);
+REGISTER_OP_CPU_KERNEL(box_coder, ops::BoxCoderKernel<float>,
+                       ops::BoxCoderKernel<double>);
diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu
new file mode 100644
index 0000000000..4055ded1f8
--- /dev/null
+++ b/paddle/operators/box_coder_op.cu
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/box_coder_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using platform::PADDLE_CUDA_NUM_THREADS;
+
+template <typename T>  // encode: one thread per (target, prior) box pair
+__global__ void EncodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, int row,
+                                       int col, T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < row * col) {  // threads past row * col do nothing
+    const int row_idx = idx / col;  // indexes target_box_data
+    const int col_idx = idx % col;  // indexes prior_box_data / variances
+    T prior_box_width =
+        prior_box_data[col_idx * 4 + 2] - prior_box_data[col_idx * 4];
+    T prior_box_height =
+        prior_box_data[col_idx * 4 + 3] - prior_box_data[col_idx * 4 + 1];
+    T prior_box_center_x =
+        (prior_box_data[col_idx * 4 + 2] + prior_box_data[col_idx * 4]) / 2;
+    T prior_box_center_y =
+        (prior_box_data[col_idx * 4 + 3] + prior_box_data[col_idx * 4 + 1]) / 2;
+    // Same corner-to-center/size conversion for the target box.
+    T target_box_center_x =
+        (target_box_data[row_idx * 4 + 2] + target_box_data[row_idx * 4]) / 2;
+    T target_box_center_y =
+        (target_box_data[row_idx * 4 + 3] + target_box_data[row_idx * 4 + 1]) /
+        2;
+    T target_box_width =
+        target_box_data[row_idx * 4 + 2] - target_box_data[row_idx * 4];
+    T target_box_height =
+        target_box_data[row_idx * 4 + 3] - target_box_data[row_idx * 4 + 1];
+    // Center offsets: scaled by prior size and variance; sizes: log ratio.
+    output[idx * 4] = (target_box_center_x - prior_box_center_x) /
+                      prior_box_width / prior_box_var_data[col_idx * 4];
+    output[idx * 4 + 1] = (target_box_center_y - prior_box_center_y) /
+                          prior_box_height /
+                          prior_box_var_data[col_idx * 4 + 1];
+    output[idx * 4 + 2] = log(fabs(target_box_width / prior_box_width)) /
+                          prior_box_var_data[col_idx * 4 + 2];
+    output[idx * 4 + 3] = log(fabs(target_box_height / prior_box_height)) /
+                          prior_box_var_data[col_idx * 4 + 3];
+  }
+}
+
+template <typename T>  // decode: one thread per (target, prior) box pair
+__global__ void DecodeCenterSizeKernel(const T* prior_box_data,
+                                       const T* prior_box_var_data,
+                                       const T* target_box_data, int row,
+                                       int col, T* output) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < row * col) {  // threads past row * col do nothing
+    const int row_idx = idx / col;  // indexes target_box_data
+    const int col_idx = idx % col;  // indexes prior_box_data / variances
+    T prior_box_width =
+        prior_box_data[col_idx * 4 + 2] - prior_box_data[col_idx * 4];
+    T prior_box_height =
+        prior_box_data[col_idx * 4 + 3] - prior_box_data[col_idx * 4 + 1];
+    T prior_box_center_x =
+        (prior_box_data[col_idx * 4 + 2] + prior_box_data[col_idx * 4]) / 2;
+    T prior_box_center_y =
+        (prior_box_data[col_idx * 4 + 3] + prior_box_data[col_idx * 4 + 1]) / 2;
+    // target_box_data holds encoded values here: exp() for sizes, scale+shift
+    T target_box_width = exp(prior_box_var_data[col_idx * 4 + 2] *
+                             target_box_data[row_idx * 4 + 2]) *
+                         prior_box_width;
+    T target_box_height = exp(prior_box_var_data[col_idx * 4 + 3] *
+                              target_box_data[row_idx * 4 + 3]) *
+                          prior_box_height;
+    T target_box_center_x = prior_box_var_data[col_idx * 4] *
+                                target_box_data[row_idx * 4] * prior_box_width +
+                            prior_box_center_x;
+    T target_box_center_y = prior_box_var_data[col_idx * 4 + 1] *
+                                target_box_data[row_idx * 4 + 1] *
+                                prior_box_height +
+                            prior_box_center_y;
+    // Emit the decoded box as corners: center -/+ half of the size.
+    output[idx * 4] = target_box_center_x - target_box_width / 2;
+    output[idx * 4 + 1] = target_box_center_y - target_box_height / 2;
+    output[idx * 4 + 2] = target_box_center_x + target_box_width / 2;
+    output[idx * 4 + 3] = target_box_center_y + target_box_height / 2;
+  }
+}
+
+template <typename T>  // CUDA kernel class registered for box_coder
+class BoxCoderCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<Tensor>("OutputBox");
+    // LoD on TargetBox is optional, but at most one level is accepted.
+    if (target_box->lod().size()) {
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    auto row = target_box->dims()[0];  // number of target boxes
+    auto col = prior_box->dims()[0];   // number of prior boxes
+    int block = 512;                   // threads per block
+    int grid = (row * col + block - 1) / block;  // ceil(row * col / block)
+    auto& device_ctx = context.cuda_device_context();
+
+    const T* prior_box_data = prior_box->data<T>();
+    const T* prior_box_var_data = prior_box_var->data<T>();
+    const T* target_box_data = target_box->data<T>();
+    // Output holds one 4-value box per (target, prior) pair.
+    output_box->mutable_data<T>({row, col, 4}, context.GetPlace());
+    T* output = output_box->data<T>();
+    // Dispatch on "code_type"; both kernels launch on device_ctx.stream().
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col,
+          output);
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
+          prior_box_data, prior_box_var_data, target_box_data, row, col,
+          output);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // shorthand used by the macro below
+REGISTER_OP_CUDA_KERNEL(box_coder, ops::BoxCoderCUDAKernel<float>,
+                        ops::BoxCoderCUDAKernel<double>);  // float and double
diff --git a/paddle/operators/box_coder_op.h b/paddle/operators/box_coder_op.h
new file mode 100644
index 0000000000..3865da40c3
--- /dev/null
+++ b/paddle/operators/box_coder_op.h
@@ -0,0 +1,163 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
+// Maps a "code_type" string onto BoxCodeType; unknown strings PADDLE_THROW.
+inline BoxCodeType GetBoxCodeType(const std::string& type) {
+  const bool wants_encode = (type == "encode_center_size");
+  if (wants_encode) return BoxCodeType::kEncodeCenterSize;
+  const bool wants_decode = (type == "decode_center_size");
+  if (wants_decode) return BoxCodeType::kDecodeCenterSize;
+  // Neither known spelling matched.
+  PADDLE_THROW("Not support type %s.", type);
+}
+
+template <typename T>  // CPU kernel class registered for box_coder
+class BoxCoderKernel : public framework::OpKernel<T> {
+ public:
+  void EncodeCenterSize(const Tensor& target_box, const Tensor& prior_box,
+                        const Tensor& prior_box_var, T* output) const {
+    PADDLE_ENFORCE_EQ(target_box.dims().size(), 2,
+                      "The rank of target_box must be 2.");
+    PADDLE_ENFORCE_EQ(prior_box.dims().size(), 2,
+                      "The rank of prior_box must be 2.");
+    PADDLE_ENFORCE_EQ(prior_box_var.dims().size(), 2,
+                      "The rank of prior_box_var must be 2.");
+    PADDLE_ENFORCE_EQ(prior_box.dims()[0], prior_box_var.dims()[0],
+                      "The dims of prior_box must equal to prior_box_var.");
+    // row = number of target boxes, col = number of prior boxes.
+    int64_t row = target_box.dims()[0];
+    int64_t col = prior_box.dims()[0];
+    auto* target_box_data = target_box.data<T>();
+    auto* prior_box_data = prior_box.data<T>();
+    auto* prior_box_var_data = prior_box_var.data<T>();
+    // Every (i, j) pair writes 4 values starting at output[(i * col + j) * 4].
+    for (int64_t i = 0; i < row; ++i) {
+      for (int64_t j = 0; j < col; ++j) {
+        T prior_box_width = prior_box_data[j * 4 + 2] - prior_box_data[j * 4];
+        T prior_box_height =
+            prior_box_data[j * 4 + 3] - prior_box_data[j * 4 + 1];
+        T prior_box_center_x =
+            (prior_box_data[j * 4 + 2] + prior_box_data[j * 4]) / 2;
+        T prior_box_center_y =
+            (prior_box_data[j * 4 + 3] + prior_box_data[j * 4 + 1]) / 2;
+        // Same corner-to-center/size conversion for target box i.
+        T target_box_center_x =
+            (target_box_data[i * 4 + 2] + target_box_data[i * 4]) / 2;
+        T target_box_center_y =
+            (target_box_data[i * 4 + 3] + target_box_data[i * 4 + 1]) / 2;
+        T target_box_width =
+            target_box_data[i * 4 + 2] - target_box_data[i * 4];
+        T target_box_height =
+            target_box_data[i * 4 + 3] - target_box_data[i * 4 + 1];
+        // Center offsets: scaled by prior size and variance; sizes: log ratio.
+        size_t offset = i * col * 4 + j * 4;
+        output[offset] = (target_box_center_x - prior_box_center_x) /
+                         prior_box_width / prior_box_var_data[j * 4];
+        output[offset + 1] = (target_box_center_y - prior_box_center_y) /
+                             prior_box_height / prior_box_var_data[j * 4 + 1];
+        output[offset + 2] =
+            std::log(std::fabs(target_box_width / prior_box_width)) /
+            prior_box_var_data[j * 4 + 2];
+        output[offset + 3] =
+            std::log(std::fabs(target_box_height / prior_box_height)) /
+            prior_box_var_data[j * 4 + 3];
+      }
+    }
+  }
+  void DecodeCenterSize(const Tensor& target_box, const Tensor& prior_box,
+                        const Tensor& prior_box_var, T* output) const {
+    PADDLE_ENFORCE_EQ(target_box.dims().size(), 2,
+                      "The rank of target_box must be 2.");
+    PADDLE_ENFORCE_EQ(prior_box.dims().size(), 2,
+                      "The rank of prior_box must be 2.");
+    PADDLE_ENFORCE_EQ(prior_box_var.dims().size(), 2,
+                      "The rank of prior_box_var must be 2.");
+    PADDLE_ENFORCE_EQ(prior_box.dims()[0], prior_box_var.dims()[0],
+                      "The dims of prior_box must equal to prior_box_var.");
+    // Here target_box holds encoded values that are mapped back to corners.
+    int64_t row = target_box.dims()[0];
+    int64_t col = prior_box.dims()[0];
+
+    auto* target_box_data = target_box.data<T>();
+    auto* prior_box_data = prior_box.data<T>();
+    auto* prior_box_var_data = prior_box_var.data<T>();
+
+    for (int64_t i = 0; i < row; ++i) {
+      for (int64_t j = 0; j < col; ++j) {
+        T prior_box_width = prior_box_data[j * 4 + 2] - prior_box_data[j * 4];
+        T prior_box_height =
+            prior_box_data[j * 4 + 3] - prior_box_data[j * 4 + 1];
+        T prior_box_center_x =
+            (prior_box_data[j * 4 + 2] + prior_box_data[j * 4]) / 2;
+        T prior_box_center_y =
+            (prior_box_data[j * 4 + 3] + prior_box_data[j * 4 + 1]) / 2;
+        // Decoded center (scale + shift) and size (exp) for pair (i, j).
+        T target_box_center_x = prior_box_var_data[j * 4] *
+                                    target_box_data[i * 4] * prior_box_width +
+                                prior_box_center_x;
+        T target_box_center_y = prior_box_var_data[j * 4 + 1] *
+                                    target_box_data[i * 4 + 1] *
+                                    prior_box_height +
+                                prior_box_center_y;
+        T target_box_width = std::exp(prior_box_var_data[j * 4 + 2] *
+                                      target_box_data[i * 4 + 2]) *
+                             prior_box_width;
+        T target_box_height = std::exp(prior_box_var_data[j * 4 + 3] *
+                                       target_box_data[i * 4 + 3]) *
+                              prior_box_height;
+        // Emit the decoded box as corners: center -/+ half of the size.
+        size_t offset = i * col * 4 + j * 4;
+        output[offset] = target_box_center_x - target_box_width / 2;
+        output[offset + 1] = target_box_center_y - target_box_height / 2;
+        output[offset + 2] = target_box_center_x + target_box_width / 2;
+        output[offset + 3] = target_box_center_y + target_box_height / 2;
+      }
+    }
+  }
+  // Entry point: allocates OutputBox and dispatches on "code_type".
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* prior_box = context.Input<framework::Tensor>("PriorBox");
+    auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
+    auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
+    auto* output_box = context.Output<Tensor>("OutputBox");
+    // LoD on TargetBox is optional, but at most one level is accepted.
+    if (target_box->lod().size()) {
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
+                        "Only support 1 level of LoD.");
+    }
+    auto row = target_box->dims()[0];
+    auto col = prior_box->dims()[0];
+    // Output holds one 4-value box per (target, prior) pair.
+    output_box->mutable_data<T>({row, col, 4}, context.GetPlace());
+
+    auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
+    T* output = output_box->data<T>();
+    if (code_type == BoxCodeType::kEncodeCenterSize) {
+      EncodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+    } else if (code_type == BoxCodeType::kDecodeCenterSize) {
+      DecodeCenterSize(*target_box, *prior_box, *prior_box_var, output);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/v2/fluid/tests/test_box_coder_op.py b/python/paddle/v2/fluid/tests/test_box_coder_op.py
new file mode 100644
index 0000000000..fcf5da01ce
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_box_coder_op.py
@@ -0,0 +1,117 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+import sys
+import math
+from op_test import OpTest
+
+
+def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
+    prior_box_x = (prior_box[:, 2] + prior_box[:, 0]) / 2
+    prior_box_y = (prior_box[:, 3] + prior_box[:, 1]) / 2
+    prior_box_width = (prior_box[:, 2] - prior_box[:, 0])
+    prior_box_height = (prior_box[:, 3] - prior_box[:, 1])
+    # Any code_type other than the two below leaves output_box untouched.
+    if (code_type == "EncodeCenterSize"):
+        target_box_x = (target_box[:, 2] + target_box[:, 0]) / 2
+        target_box_y = (target_box[:, 3] + target_box[:, 1]) / 2
+        target_box_width = (target_box[:, 2] - target_box[:, 0])
+        target_box_height = (target_box[:, 3] - target_box[:, 1])
+        # Row i: target box i encoded against every prior box at once.
+        for i in range(target_box.shape[0]):
+            output_box[i,:,0] = (target_box_x[i] - prior_box_x) / prior_box_width / \
+                                prior_box_var[:,0]
+            output_box[i,:,1] = (target_box_y[i] - prior_box_y) / prior_box_height / \
+                                prior_box_var[:,1]
+            output_box[i,:,2] = np.log(np.fabs(target_box_width[i] / prior_box_width)) / \
+                                prior_box_var[:,2]
+            output_box[i,:,3] = np.log(np.fabs(target_box_height[i] / prior_box_height)) / \
+                                prior_box_var[:,3]
+    # Decoding reads each target_box row as four encoded values.
+    elif (code_type == "DecodeCenterSize"):
+        for i in range(target_box.shape[0]):
+            target_box_x = prior_box_var[:,0] * target_box[i][0] * \
+                           prior_box_width[:] + prior_box_x[:]
+            target_box_y = prior_box_var[:,1] * target_box[i][1] * \
+                           prior_box_height[:] + prior_box_y[:]
+            target_box_width = np.exp(prior_box_var[:,2] * target_box[i][2]) * \
+                               prior_box_width[:]
+            target_box_height = np.exp(prior_box_var[:,3] * target_box[i][3]) * \
+                                prior_box_height[:]
+            output_box[i, :, 0] = target_box_x - target_box_width / 2
+            output_box[i, :, 1] = target_box_y - target_box_height / 2
+            output_box[i, :, 2] = target_box_x + target_box_width / 2
+            output_box[i, :, 3] = target_box_y + target_box_height / 2
+
+
+def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):
+    n = target_box.shape[0]  # total number of target boxes
+    m = prior_box.shape[0]  # number of prior boxes
+    output_box = np.zeros((n, m, 4), dtype=np.float32)
+    for i in range(len(lod) - 1):  # consecutive lod entries bound one slice
+        box_coder(target_box[lod[i]:lod[i + 1], :], prior_box, prior_box_var,
+                  output_box[lod[i]:lod[i + 1], :, :], code_type)
+    return output_box
+
+
+class TestBoxCoderOp(OpTest):  # decode_center_size, TargetBox without LoD
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[0, 20]]
+        prior_box = np.random.random((10, 4)).astype('float32')
+        prior_box_var = np.random.random((10, 4)).astype('float32')
+        target_box = np.random.random((20, 4)).astype('float32')
+        code_type = "DecodeCenterSize"
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type)
+        # lod is used only by the reference above; TargetBox is fed without it.
+        self.inputs = {
+            'PriorBox': prior_box,
+            'PriorBoxVar': prior_box_var,
+            'TargetBox': target_box,
+        }
+        self.attrs = {'code_type': 'decode_center_size'}
+        self.outputs = {'OutputBox': output_box}
+
+
+class TestBoxCoderOpWithLoD(OpTest):  # encode_center_size, TargetBox with LoD
+    def test_check_output(self):
+        self.check_output()
+
+    def setUp(self):
+        self.op_type = "box_coder"
+        lod = [[0, 4, 12, 20]]
+        prior_box = np.random.random((10, 4)).astype('float32')
+        prior_box_var = np.random.random((10, 4)).astype('float32')
+        target_box = np.random.random((20, 4)).astype('float32')
+        code_type = "EncodeCenterSize"
+        output_box = batch_box_coder(prior_box, prior_box_var, target_box,
+                                     lod[0], code_type)
+        # TargetBox is fed together with its LoD (three slices of the 20 rows).
+        self.inputs = {
+            'PriorBox': prior_box,
+            'PriorBoxVar': prior_box_var,
+            'TargetBox': (target_box, lod),
+        }
+        self.attrs = {'code_type': 'encode_center_size'}
+        self.outputs = {'OutputBox': output_box}
+
+
+if __name__ == '__main__':
+    unittest.main()

From bc6c4dbb9fa06ade3bc6da36fbd6265cec163be4 Mon Sep 17 00:00:00 2001
From: Yuan Gao <yuan.gao.gavin@gmail.com>
Date: Sun, 28 Jan 2018 16:00:55 +0800
Subject: [PATCH 022/102] Update box_coder_op.cc

---
 paddle/operators/box_coder_op.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc
index 0cb20a4182..7d0d28aaba 100644
--- a/paddle/operators/box_coder_op.cc
+++ b/paddle/operators/box_coder_op.cc
@@ -32,15 +32,15 @@ class BoxCoderOp : public framework::OperatorWithKernel {
     auto target_box_dims = ctx->GetInputDim("TargetBox");
 
     PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2UL,
-                      "The shape of PriorBox is [N, 4]");
+                      "The rank of Input of PriorBox must be 2");
     PADDLE_ENFORCE_EQ(prior_box_dims[1], 4UL,
                       "The shape of PriorBox is [N, 4]");
     PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 2UL,
-                      "The shape of PriorBoxVar is [N, 4]");
+                      "The rank of Input of PriorBoxVar must be 2");
     PADDLE_ENFORCE_EQ(prior_box_var_dims[1], 4UL,
                       "The shape of PriorBoxVar is [N, 4]");
     PADDLE_ENFORCE_EQ(target_box_dims.size(), 2UL,
-                      "The shape of TargetBox is [M, 4]");
+                      "The rank of Input of TargetBox must be 2");
     PADDLE_ENFORCE_EQ(target_box_dims[1], 4UL,
                       "The shape of TargetBox is [M, 4]");
 

From bd64719a2f012af82dcac731179a998764d432b9 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Mon, 29 Jan 2018 20:42:29 +0800
Subject: [PATCH 023/102] update for today

---
 benchmark/cluster/vgg16/README.md          | 44 +++++++++++++++-------
 benchmark/cluster/vgg16/fluid_trainer.yaml |  2 +-
 benchmark/cluster/vgg16/v2_trainer.yaml    |  2 +-
 benchmark/cluster/vgg16/vgg16_fluid.py     | 39 ++++++++++---------
 benchmark/cluster/vgg16/vgg16_v2.py        |  1 +
 5 files changed, 54 insertions(+), 34 deletions(-)

diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index 18128e5276..c1e85a2c40 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -2,41 +2,57 @@
 
 ## Test Result
 
-### Single node single thread
+### Hardware Information
+
+- CPU: Intel(R) Xeon(R) CPU E5-2620 v4 @ 2.10GHz
+- CPU MHz: 2101.000
+- Cache size: 20480 KB
+
+### Single Node Single Thread
+
+- PServer Count: 10
+- Trainer Count: 20
+- Metrics: samples / sec
 
 | Batch Size | 32 | 64 | 128 | 256 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | 16.74 | - |
-| PaddlePaddle v2 | - | - | 17.60 | - |
+| PaddlePaddle Fluid | 15.44 | 16.32 | 16.74 | 16.79 |
+| PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
 | TensorFlow | - | - | - | - |
 
 ### different batch size
 
 - PServer Count: 10
 - Trainer Count: 20
+- Per trainer CPU Core: 1
 - Metrics: samples / sec
 
 | Batch Size | 32 | 64 | 128 | 256 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | 247.40 | - | - |
-| PaddlePaddle v2 | - | - | 256.14 | - |
+| PaddlePaddle Fluid | 190.20 | 222.15 | 247.40 | 258.18 |
+| PaddlePaddle v2 | 170.96 | 233.71 | 256.14 | 329.23 |
 | TensorFlow | - | - | - | - |
 
-### different pserver number
 
-- Trainer Count: 100
-- Batch Size: 64
-- Metrics: mini-batch / sec
+### Accelerate rate
 
-| PServer Count | 10 | 20 | 40 | 60 |
+- PServer Count: 20
+- Batch Size: 128
+- Metrics: samples / sec
+
+| Trainer Count | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
+| PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
+| PaddlePaddle v2 | 356.28 | - | - | 1041.99 |
 | TensorFlow | - | - | - | - |
 
-### Accelerate rate
+### different pserver number
 
-| Trainer Counter | 20 | 40 | 80 | 100 |
+- Trainer Count: 100
+- Batch Size: 128
+- Metrics: mini-batch / sec
+
+| PServer Count | 10 | 20 | 40 | 60 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | - | - | - | - |
 | PaddlePaddle v2 | - | - | - | - |
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
index 0a0ed25ebe..2f6a87ab02 100644
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
index 9d52e231f0..997bbc81c9 100644
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -22,7 +22,7 @@ spec:
         - name: PADDLE_JOB_NAME
           value: vgg16v2job
         - name: BATCH_SIZE
-          value: "128"
+          value: "256"
         - name: TRAINERS
           value: "20"
         - name: PSERVERS
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 88d6d79cc0..51a01af672 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -20,6 +20,7 @@ import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
+import paddle.v2.fluid.profiler as profiler
 import argparse
 import functools
 import os
@@ -160,24 +161,25 @@ def main():
             start_time = time.time()
             num_samples = 0
             accuracy.reset(exe)
-            for batch_id, data in enumerate(train_reader()):
-                ts = time.time()
-                img_data = np.array(
-                    map(lambda x: x[0].reshape(data_shape), data)).astype(
-                        "float32")
-                y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-                y_data = y_data.reshape([-1, 1])
-
-                loss, acc = exe.run(trainer_prog,
-                                    feed={"pixel": img_data,
-                                          "label": y_data},
-                                    fetch_list=[avg_cost] + accuracy.metrics)
-                iters += 1
-                num_samples += len(data)
-                print(
-                    "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
-                    % (pass_id, iters, loss, acc, time.time() - ts)
-                )  # The accuracy is the accumulation of batches, but not the current batch.
+            with profiler.profiler("CPU", 'total') as prof:
+                for batch_id, data in enumerate(train_reader()):
+                    ts = time.time()
+                    img_data = np.array(
+                        map(lambda x: x[0].reshape(data_shape), data)).astype(
+                            "float32")
+                    y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                    y_data = y_data.reshape([-1, 1])
+
+                    loss, acc = exe.run(trainer_prog,
+                                        feed={"pixel": img_data,
+                                            "label": y_data},
+                                        fetch_list=[avg_cost] + accuracy.metrics)
+                    iters += 1
+                    num_samples += len(data)
+                    print(
+                        "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, spent %f"
+                        % (pass_id, iters, loss, acc, time.time() - ts)
+                    )  # The accuracy is the accumulation of batches, but not the current batch.
 
             pass_elapsed = time.time() - start_time
             pass_train_acc = accuracy.eval(exe)
@@ -211,6 +213,7 @@ def main():
         pserver_endpoints = ",".join(eplist)
         print("pserver endpoints: ", pserver_endpoints)
         trainers = int(os.getenv("TRAINERS"))  # total trainer count
+        print("trainers total: ", trainers)
         current_endpoint = os.getenv(
             "POD_IP") + ":6174"  # current pserver endpoint
         training_role = os.getenv(
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
index 284dbec48d..81ddeb0332 100644
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -26,6 +26,7 @@ if BATCH_SIZE:
     BATCH_SIZE = int(BATCH_SIZE)
 else:
     BATCH_SIZE = 128
+print "batch_size", BATCH_SIZE
 NODE_COUNT = int(os.getenv("TRAINERS"))
 ts = 0
 

From 912a4f2511ad118d7a989cbe4e7f634503670e34 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Mon, 29 Jan 2018 23:49:56 +0800
Subject: [PATCH 024/102] Add multi-class non-maximum suppression operator.

---
 paddle/operators/multiclass_nms_op.cc         | 353 ++++++++++++++++++
 .../v2/fluid/tests/test_bipartite_match_op.py |   2 +-
 .../v2/fluid/tests/test_multiclass_nms_op.py  | 199 ++++++++++
 3 files changed, 553 insertions(+), 1 deletion(-)
 create mode 100644 paddle/operators/multiclass_nms_op.cc
 create mode 100644 python/paddle/v2/fluid/tests/test_multiclass_nms_op.py

diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc
new file mode 100644
index 0000000000..19c5b7efd6
--- /dev/null
+++ b/paddle/operators/multiclass_nms_op.cc
@@ -0,0 +1,353 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+constexpr int64_t kOutputDim = 6;
+constexpr int64_t kBBoxSize = 4;
+
+class MulticlassNMSOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Bboxes"),
+                   "Input(Bboxes) of MulticlassNMS should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Scores"),
+                   "Input(Scores) of MulticlassNMS should not be null.");
+
+    auto box_dims = ctx->GetInputDim("Bboxes");
+    auto score_dims = ctx->GetInputDim("Scores");
+
+    PADDLE_ENFORCE_EQ(box_dims.size(), 3,
+                      "The rank of Input(Bboxes) must be 3.");
+    PADDLE_ENFORCE_EQ(score_dims.size(), 3,
+                      "The rank of Input(Scores) must be 3.");
+    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[0]);
+    PADDLE_ENFORCE_EQ(box_dims[2], 4);
+    PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2]);
+
+    // Here the box_dims[0] is not the real dimension of output.
+    // It will be rewritten in the computing kernel.
+    ctx->SetOutputDim("Out", {box_dims[0], 6});
+  }
+};
+
+template <class T>
+bool SortScorePairDescend(const std::pair<float, T>& pair1,
+                          const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+template <class T>
+static inline void GetMaxScoreIndex(
+    const std::vector<T>& scores, const T threshold, int top_k,
+    std::vector<std::pair<T, int>>* sorted_indices) {
+  for (size_t i = 0; i < scores.size(); ++i) {
+    if (scores[i] > threshold) {
+      sorted_indices->push_back(std::make_pair(scores[i], i));
+    }
+  }
+  // Sort the score pair according to the scores in descending order
+  std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
+                   SortScorePairDescend<int>);
+  // Keep top_k scores if needed.
+  if (top_k > -1 && top_k < sorted_indices->size()) {
+    sorted_indices->resize(top_k);
+  }
+}
+
+template <class T>
+T BBoxArea(const T* box, const bool normalized) {
+  if (box[2] < box[0] || box[3] < box[1]) {
+    // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
+    return T(0.);
+  } else {
+    const T w = box[2] - box[0];
+    const T h = box[3] - box[1];
+    if (normalized) {
+      return w * h;
+    } else {
+      // If bbox is not within range [0, 1].
+      return (w + 1) * (h + 1);
+    }
+  }
+}
+
+template <class T>
+static inline T JaccardOverlap(const T* box1, const T* box2,
+                               const bool normalized) {
+  if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] ||
+      box2[3] < box1[1]) {
+    return static_cast<T>(0.);
+  } else {
+    const T inter_xmin = std::max(box1[0], box2[0]);
+    const T inter_ymin = std::max(box1[1], box2[1]);
+    const T inter_xmax = std::min(box1[2], box2[2]);
+    const T inter_ymax = std::min(box1[3], box2[3]);
+    const T inter_w = inter_xmax - inter_xmin;
+    const T inter_h = inter_ymax - inter_ymin;
+    const T inter_area = inter_w * inter_h;
+    const T bbox1_area = BBoxArea<T>(box1, normalized);
+    const T bbox2_area = BBoxArea<T>(box2, normalized);
+    return inter_area / (bbox1_area + bbox2_area - inter_area);
+  }
+}
+
+template <typename T>
+class MulticlassNMSKernel : public framework::OpKernel<T> {
+ public:
+  void NMSFast(const Tensor& bbox, const Tensor& scores,
+               const T score_threshold, const T nms_threshold, const T eta,
+               const int64_t top_k, std::vector<int>* selected_indices) const {
+    // The total boxes for each instance.
+    int64_t num_boxes = bbox.dims()[0];
+    // 4: [xmin ymin xmax ymax]
+    int64_t box_size = bbox.dims()[1];
+
+    std::vector<T> scores_data(num_boxes);
+    std::copy_n(scores.data<T>(), num_boxes, scores_data.begin());
+    std::vector<std::pair<T, int>> sorted_indices;
+    GetMaxScoreIndex(scores_data, score_threshold, top_k, &sorted_indices);
+
+    selected_indices->clear();
+    T adaptive_threshold = nms_threshold;
+    const T* bbox_data = bbox.data<T>();
+
+    while (sorted_indices.size() != 0) {
+      const int idx = sorted_indices.front().second;
+      bool keep = true;
+      for (int k = 0; k < selected_indices->size(); ++k) {
+        if (keep) {
+          const int kept_idx = (*selected_indices)[k];
+          T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
+                                        bbox_data + kept_idx * box_size, true);
+          keep = overlap <= adaptive_threshold;
+        } else {
+          break;
+        }
+      }
+      if (keep) {
+        selected_indices->push_back(idx);
+      }
+      sorted_indices.erase(sorted_indices.begin());
+      if (keep && eta < 1 && adaptive_threshold > 0.5) {
+        adaptive_threshold *= eta;
+      }
+    }
+  }
+
+  void MulticlassNMS(const framework::ExecutionContext& ctx,
+                     const Tensor& scores, const Tensor& bboxes,
+                     std::map<int, std::vector<int>>* indices,
+                     int* num_nmsed_out) const {
+    int64_t background_label = ctx.Attr<int64_t>("background_label");
+    int64_t nms_top_k = ctx.Attr<int64_t>("nms_top_k");
+    int64_t keep_top_k = ctx.Attr<int64_t>("keep_top_k");
+    T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
+    T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
+    T score_threshold = static_cast<T>(ctx.Attr<float>("confidence_threshold"));
+
+    int64_t class_num = scores.dims()[0];
+    int64_t predict_dim = scores.dims()[1];
+    int num_det = 0;
+    for (int64_t c = 0; c < class_num; ++c) {
+      if (c == background_label) continue;
+      Tensor score = scores.Slice(c, c + 1);
+      NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
+              &((*indices)[c]));
+      num_det += indices[c].size();
+    }
+
+    *num_nmsed_out = num_det;
+    const T* scores_data = scores.data<T>();
+    if (keep_top_k > -1 && num_det > keep_top_k) {
+      std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
+      for (const auto& it : *indices) {
+        int label = it.first;
+        const T* sdata = scores_data + label * predict_dim;
+        const std::vector<int>& label_indices = it.second;
+        for (int j = 0; j < label_indices.size(); ++j) {
+          int idx = label_indices[j];
+          PADDLE_ENFORCE_LT(idx, predict_dim);
+          score_index_pairs.push_back(
+              std::make_pair(sdata[idx], std::make_pair(label, idx)));
+        }
+      }
+      // Keep top k results per image.
+      std::sort(score_index_pairs.begin(), score_index_pairs.end(),
+                SortScorePairDescend<std::pair<int, int>>);
+      score_index_pairs.resize(keep_top_k);
+
+      // Store the new indices.
+      std::map<int, std::vector<int>> new_indices;
+      for (int j = 0; j < score_index_pairs.size(); ++j) {
+        int label = score_index_pairs[j].second.first;
+        int idx = score_index_pairs[j].second.second;
+        new_indices[label].push_back(idx);
+      }
+      new_indices.swap(*indices);
+      *num_nmsed_out = keep_top_k;
+    }
+  }
+
+  void MulticlassOutput(const Tensor& scores, const Tensor& bboxes,
+                        std::map<int, std::vector<int>>& selected_indices,
+                        Tensor* outs) const {
+    int predict_dim = scores.dims()[1];
+    auto* scores_data = scores.data<T>();
+    auto* bboxes_data = bboxes.data<T>();
+    auto* odata = outs->data<T>();
+
+    int count = 0;
+    for (const auto& it : selected_indices) {
+      int label = it.first;
+      const T* sdata = scores_data + label * predict_dim;
+      std::vector<int> indices = it.second;
+      for (int j = 0; j < indices.size(); ++j) {
+        int idx = indices[j];
+        const T* bdata = bboxes_data + idx * kBBoxSize;
+        odata[count * kOutputDim] = label;           // label
+        odata[count * kOutputDim + 1] = sdata[idx];  // score
+        odata[count * kOutputDim + 2] = bdata[0];    // xmin
+        odata[count * kOutputDim + 3] = bdata[1];    // ymin
+        odata[count * kOutputDim + 4] = bdata[2];    // xmax
+        odata[count * kOutputDim + 5] = bdata[3];    // ymax
+      }
+      count++;
+    }
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* boxes = ctx.Input<Tensor>("Bboxes");
+    auto* scores = ctx.Input<Tensor>("Scores");
+    auto* outs = ctx.Output<LoDTensor>("Out");
+
+    auto box_dims = boxes->dims();
+    auto score_dims = scores->dims();
+
+    int64_t batch_size = box_dims[0];
+    int64_t class_num = score_dims[1];
+    int64_t predict_dim = score_dims[2];
+
+    std::vector<std::map<int, std::vector<int>>> all_indices;
+    std::vector<size_t> batch_starts = {0};
+    for (int64_t i = 0; i < batch_size; ++i) {
+      Tensor ins_score = scores->Slice(i, i + 1);
+      ins_score.Resize({class_num, predict_dim});
+      std::map<int, std::vector<int>> indices;
+      int num_nmsed_out = 0;
+      MulticlassNMS(ctx, ins_score, *boxes, &indices, &num_nmsed_out);
+      all_indices.push_back(indices);
+      batch_starts.push_back(batch_starts.back() + num_nmsed_out);
+    }
+
+    int num_kept = batch_starts.back();
+    if (num_kept == 0) {
+      outs->Resize({0, 0});
+    } else {
+      outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
+      for (int64_t i = 0; i < batch_size; ++i) {
+        Tensor ins_score = scores->Slice(i, i + 1);
+        ins_score.Resize({class_num, predict_dim});
+        int64_t s = batch_starts[i];
+        int64_t e = batch_starts[i + 1];
+        if (e > s) {
+          Tensor out = outs->Slice(s, e);
+          MulticlassOutput(ins_score, *boxes, all_indices[i], &out);
+        }
+      }
+    }
+
+    framework::LoD lod;
+    lod.emplace_back(batch_starts);
+
+    outs->set_lod(lod);
+  }
+};
+
+class MulticlassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MulticlassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Bboxes",
+             "(Tensor) A 2-D Tensor with shape [M, 4] represents the location "
+             "predictions with M bboxes. 4 is the number of "
+             "each location coordinates.");
+    AddOutput("Scores",
+              "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
+              "confidence predictions. N is the batch size, C is the class "
+              "number, M is number of predictions for each class, which is "
+              "the same with Bboxes.");
+    AddAttr<int64_t>(
+        "background_label",
+        "(int64_t, defalut: 0) "
+        "The index of background label, the background label will be ignored.")
+        .SetDefault(0);
+    AddAttr<float>("nms_threshold",
+                   "(float, defalut: 0.3) "
+                   "The threshold to be used in nms.")
+        .SetDefault(0.3);
+    AddAttr<int64_t>("nms_top_k",
+                     "(int64_t) "
+                     " .");
+    AddAttr<float>("nms_eta",
+                   "(float) "
+                   "The parameter for adaptive nms.")
+        .SetDefault(1.0);
+    AddAttr<int64_t>("keep_top_k",
+                     "(int64_t) "
+                     ".");
+    AddAttr<float>("confidence_threshold",
+                   "(float) "
+                   ".");
+    AddOutput("Out",
+              "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
+              "detections. Each row has 6 values: "
+              "[label, confidence, xmin, ymin, xmax, ymax], No is the total "
+              "number of detections in this mini-batch. For each instance, "
+              "the offsets in first dimension are called LoD, the number of "
+              "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
+              "no detected bbox.");
+    AddComment(R"DOC(
+This operators is to do multi-class non maximum suppression (nms) on a batched
+of boxes and scores.
+
+This op greedily selects a subset of detection bounding boxes, pruning
+away boxes that have high IOU (intersection over union) overlap (> thresh)
+with already selected boxes.  It operates independently for each class for
+which scores are provided (via the scores field of the input box_list),
+pruning boxes with score less than a provided threshold prior to
+applying NMS.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(multiclass_nms, ops::MulticlassNMSOp,
+                  ops::MulticlassNMSOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MulticlassNMSKernel<float>,
+                       ops::MulticlassNMSKernel<double>);
diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
index 7413829897..c35fb20b10 100644
--- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
+++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
@@ -62,7 +62,7 @@ def batch_bipartite_match(distance, lod):
     return match_indices, match_dist
 
 
-class TestBipartiteMatchOpForWithLoD(OpTest):
+class TestBipartiteMatchOpWithLoD(OpTest):
     def setUp(self):
         self.op_type = 'bipartite_match'
         lod = [[0, 5, 11, 23]]
diff --git a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
new file mode 100644
index 0000000000..60c6488f84
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
@@ -0,0 +1,199 @@
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+#Licensed under the Apache License, Version 2.0 (the "License");
+#you may not use this file except in compliance with the License.
+#You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+#Unless required by applicable law or agreed to in writing, software
+#distributed under the License is distributed on an "AS IS" BASIS,
+#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#See the License for the specific language governing permissions and
+#limitations under the License.
+import unittest
+import numpy as np
+import copy
+from op_test import OpTest
+
+
+def iou(box_a, box_b):
+    """Apply intersection-over-union overlap between box_a and box_b
+    """
+    xmin_a = min(box_a[0], box_a[2])
+    ymin_a = min(box_a[1], box_a[3])
+    xmax_a = max(box_a[0], box_a[2])
+    ymax_a = max(box_a[1], box_a[3])
+
+    xmin_b = min(box_b[0], box_b[2])
+    ymin_b = min(box_b[1], box_b[3])
+    xmax_b = max(box_b[0], box_b[2])
+    ymax_b = max(box_b[1], box_b[3])
+
+    area_a = (ymax_a - ymin_a) * (xmax_a - xmin_a)
+    area_b = (ymax_b - ymin_b) * (xmax_b - xmin_b)
+    if area_a <= 0 and area_b <= 0:
+        return 0.0
+
+    xa = max(xmin_a, xmin_b)
+    ya = max(ymin_a, ymin_b)
+    xb = min(xmax_a, xmax_b)
+    yb = min(ymax_a, ymax_b)
+
+    inter_area = max(xb - xa, 0.0) * max(yb - ya, 0.0)
+
+    box_a_area = (box_a[2] - box_a[0]) * (box_a[3] - box_a[1])
+    box_b_area = (box_b[2] - box_b[0]) * (box_b[3] - box_b[1])
+
+    iou_ratio = inter_area / (area_a + area_b - inter_area)
+
+    return iou_ratio
+
+
+def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
+    """Apply non-maximum suppression at test time to avoid detecting too many
+    overlapping bounding boxes for a given object.
+    Args:
+        boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
+        scores: (tensor) The class predscores for the img, Shape:[num_priors].
+        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
+        top_k: (int) The Maximum number of box preds to consider.
+    Return:
+        The indices of the kept boxes with respect to num_priors.
+    """
+    all_scores = copy.deepcopy(scores)
+    all_scores = all_scores.flatten()
+    selected_indices = np.argwhere(all_scores > score_threshold)
+    selected_indices = selected_indices.flatten()
+    all_scores = all_scores[selected_indices]
+
+    sorted_indices = np.argsort(-all_scores, axis=0)
+    sorted_scores = all_scores[sorted_indices]
+    if top_k < -1 and top_k < sorted_indices.shape[0]:
+        sorted_indices = sorted_indices[:top_k]
+        sorted_scores = sorted_scores[:top_k]
+
+    selected_indices = []
+    adaptive_threshold = nms_threshold
+    for i in range(sorted_scores.shape[0]):
+        idx = sorted_indices[i]
+        keep = True
+        for k in range(len(selected_indices)):
+            if keep:
+                kept_idx = selected_indices[k]
+                overlap = iou(boxes[idx], boxes[kept_idx])
+                keep = overlap <= adaptive_threshold
+            else:
+                break
+        if keep:
+            selected_indices.append(idx)
+        if keep and eta < 1 and adaptive_threshold > 0.5:
+            adaptive_threshold *= eta
+    return selected_indices
+
+
+def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
+                   nms_top_k, keep_top_k):
+    class_num = scores.shape[0]
+    priorbox_num = scores.shape[1]
+
+    selected_indices = []
+    num_det = 0
+    for c in range(class_num):
+        if c == background: continue
+        indices = nms(boxes, scores[c], score_threshold, nms_threshold,
+                      nms_top_k)
+        selected_indices.append((c, indices))
+        num_det += len(indices)
+
+    if keep_top_k > -1 and num_det > keep_top_k:
+        score_index = []
+        for c, indices in selected_indices:
+            for idx in indices:
+                score_index.append((scores[c][idx], c, idx))
+
+        sorted_score_index = sorted(
+            score_index, key=lambda tup: tup[0], reverse=True)
+        sorted_score_index = sorted_score_index[:keep_top_k]
+        selected_indices = []
+        for s, c, idx in sorted_score_index:
+            selected_indices.append((c, idx))
+
+    return selected_indices
+
+
+def batched_multiclass_nms(boxes, scores, background, score_threshold,
+                           nms_threshold, nms_top_k, keep_top_k):
+    batch_size = scores.shape[0]
+
+    det_outs = []
+    lod = [0]
+    for n in range(batch_size):
+        nmsed_outs = multiclass_nms(boxes, scores[n], background,
+                                    score_threshold, nms_threshold, nms_top_k,
+                                    keep_top_k)
+        lod.append(lod[-1] + len(nmsed_outs))
+        if len(nmsed_outs) == 0: continue
+        for c, indices in nmsed_outs:
+            for idx in indices:
+                xmin, ymin, xmax, ymax = boxes[idx][:]
+                det_outs.append(
+                    (c, scores[n][c][idx], c, xmin, ymin, xmax, ymax))
+    return det_outs, lod
+
+
+class TestMulticlassNMSOp(OpTest):
+    def setUp(self):
+        self.op_type = 'multiclass_nms'
+        N = 7
+        M = 1230
+        C = 21
+        BOX_SIZE = 4
+        background = 0
+        nms_threshold = 0.3
+        nms_top_k = 400
+        keep_top_k = 200
+        score_threshold = 0.01
+
+        scores = np.random.random((N, C, M)).astype('float32')
+        boxes = np.random.random((M, BOX_SIZE)).astype('float32')
+        boxes[:, 0:2] = boxes[:, 0:2] * 0.5
+        boxes[:, 2:4] = boxes[:, 0:2] * 0.5 + 0.5
+
+        nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background,
+                                                 score_threshold, nms_threshold,
+                                                 nms_top_k, keep_top_k)
+        self.inputs = {'Bboxes': boxes, 'Scores': scores}
+        self.outputs = {'Out': (nmsed_outs, [lod])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestIOU(unittest.TestCase):
+    def test_iou(self):
+        box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')
+        box2 = np.array([3.0, 4.0, 6.0, 8.0]).astype('float32')
+
+        expt_output = np.array([2.0 / 16.0]).astype('float32')
+        calc_output = np.array([iou(box1, box2)]).astype('float32')
+        self.assertTrue(np.allclose(calc_output, expt_output))
+
+
+if __name__ == '__main__':
+    unittest.main()
+    # N = 7
+    # M = 8
+    # C = 5
+    # BOX_SIZE = 4
+    # background = 0
+    # nms_threshold = 0.3
+    # nms_top_k = 400
+    # keep_top_k = 200
+    # score_threshold = 0.5
+
+    # scores = np.random.random((N, C, M)).astype('float32')
+    # boxes = np.random.random((M, BOX_SIZE)).astype('float32')
+    # boxes[:, 0 : 2] = boxes[:, 0 : 2] * 0.5
+    # boxes[:, 2 : 4] = boxes[:, 0 : 2] * 0.5 + 0.5
+    # print nmsed_outs, lod

From 2731fd96606b18411b485269e36fd44ae8909650 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Tue, 30 Jan 2018 00:19:28 +0800
Subject: [PATCH 025/102] Update doc for multiclass_nms_op.

---
 paddle/operators/multiclass_nms_op.cc | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc
index 19c5b7efd6..5da553a6cc 100644
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/operators/multiclass_nms_op.cc
@@ -37,13 +37,12 @@ class MulticlassNMSOp : public framework::OperatorWithKernel {
     auto box_dims = ctx->GetInputDim("Bboxes");
     auto score_dims = ctx->GetInputDim("Scores");
 
-    PADDLE_ENFORCE_EQ(box_dims.size(), 3,
+    PADDLE_ENFORCE_EQ(box_dims.size(), 2,
                       "The rank of Input(Bboxes) must be 3.");
     PADDLE_ENFORCE_EQ(score_dims.size(), 3,
                       "The rank of Input(Scores) must be 3.");
-    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[0]);
     PADDLE_ENFORCE_EQ(box_dims[2], 4);
-    PADDLE_ENFORCE_EQ(box_dims[1], score_dims[2]);
+    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2]);
 
     // Here the box_dims[0] is not the real dimension of output.
     // It will be rewritten in the computing kernel.
@@ -308,17 +307,19 @@ class MulticlassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault(0.3);
     AddAttr<int64_t>("nms_top_k",
                      "(int64_t) "
-                     " .");
+                     "Maximum number of results to be kept.");
     AddAttr<float>("nms_eta",
                    "(float) "
                    "The parameter for adaptive nms.")
         .SetDefault(1.0);
     AddAttr<int64_t>("keep_top_k",
                      "(int64_t) "
-                     ".");
+                     "Number of total bboxes to be kept per image after nms "
+                     "step. -1 means keeping all bboxes after nms step.");
     AddAttr<float>("confidence_threshold",
                    "(float) "
-                   ".");
+                   "Only consider detections whose confidences are larger than "
+                   "a threshold. If not provided, consider all boxes.");
     AddOutput("Out",
               "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
               "detections. Each row has 6 values: "
@@ -328,15 +329,14 @@ class MulticlassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
               "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
               "no detected bbox.");
     AddComment(R"DOC(
-This operators is to do multi-class non maximum suppression (nms) on a batched
+This operators is to do multi-class non maximum suppression (NMS) on a batched
 of boxes and scores.
 
 This op greedily selects a subset of detection bounding boxes, pruning
 away boxes that have high IOU (intersection over union) overlap (> thresh)
 with already selected boxes.  It operates independently for each class for
-which scores are provided (via the scores field of the input box_list),
-pruning boxes with score less than a provided threshold prior to
-applying NMS.
+which scores are provided, pruning boxes with score less than a provided
+threshold prior to applying NMS.
 
 )DOC");
   }

From 35dec3d7228e2f924ccc6549a420604110640337 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Tue, 30 Jan 2018 17:59:48 +0800
Subject: [PATCH 026/102] Fix bug in unit test.

---
 paddle/operators/multiclass_nms_op.cc         | 84 +++++++++++--------
 .../v2/fluid/tests/test_multiclass_nms_op.py  | 61 +++++++-------
 2 files changed, 82 insertions(+), 63 deletions(-)

diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc
index 5da553a6cc..93c8b5216f 100644
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/operators/multiclass_nms_op.cc
@@ -41,13 +41,22 @@ class MulticlassNMSOp : public framework::OperatorWithKernel {
                       "The rank of Input(Bboxes) must be 3.");
     PADDLE_ENFORCE_EQ(score_dims.size(), 3,
                       "The rank of Input(Scores) must be 3.");
-    PADDLE_ENFORCE_EQ(box_dims[2], 4);
+    PADDLE_ENFORCE_EQ(box_dims[1], 4);
     PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2]);
 
     // Here the box_dims[0] is not the real dimension of output.
     // It will be rewritten in the computing kernel.
     ctx->SetOutputDim("Out", {box_dims[0], 6});
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(
+            ctx.Input<framework::LoDTensor>("Scores")->type()),
+        ctx.device_context());
+  }
 };
 
 template <class T>
@@ -158,12 +167,12 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
                      const Tensor& scores, const Tensor& bboxes,
                      std::map<int, std::vector<int>>* indices,
                      int* num_nmsed_out) const {
-    int64_t background_label = ctx.Attr<int64_t>("background_label");
-    int64_t nms_top_k = ctx.Attr<int64_t>("nms_top_k");
-    int64_t keep_top_k = ctx.Attr<int64_t>("keep_top_k");
+    int64_t background_label = ctx.Attr<int>("background_label");
+    int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
+    int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
     T nms_threshold = static_cast<T>(ctx.Attr<float>("nms_threshold"));
     T nms_eta = static_cast<T>(ctx.Attr<float>("nms_eta"));
-    T score_threshold = static_cast<T>(ctx.Attr<float>("confidence_threshold"));
+    T score_threshold = static_cast<T>(ctx.Attr<float>("score_threshold"));
 
     int64_t class_num = scores.dims()[0];
     int64_t predict_dim = scores.dims()[1];
@@ -173,7 +182,7 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
       Tensor score = scores.Slice(c, c + 1);
       NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
               &((*indices)[c]));
-      num_det += indices[c].size();
+      num_det += (*indices)[c].size();
     }
 
     *num_nmsed_out = num_det;
@@ -230,8 +239,8 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
         odata[count * kOutputDim + 3] = bdata[1];    // ymin
         odata[count * kOutputDim + 4] = bdata[2];    // xmax
         odata[count * kOutputDim + 5] = bdata[3];    // ymax
+        count++;
       }
-      count++;
     }
   }
 
@@ -240,10 +249,9 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
     auto* scores = ctx.Input<Tensor>("Scores");
     auto* outs = ctx.Output<LoDTensor>("Out");
 
-    auto box_dims = boxes->dims();
     auto score_dims = scores->dims();
 
-    int64_t batch_size = box_dims[0];
+    int64_t batch_size = score_dims[0];
     int64_t class_num = score_dims[1];
     int64_t predict_dim = score_dims[2];
 
@@ -291,35 +299,37 @@ class MulticlassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
              "(Tensor) A 2-D Tensor with shape [M, 4] represents the location "
              "predictions with M bboxes. 4 is the number of "
              "each location coordinates.");
-    AddOutput("Scores",
-              "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
-              "confidence predictions. N is the batch size, C is the class "
-              "number, M is number of predictions for each class, which is "
-              "the same with Bboxes.");
-    AddAttr<int64_t>(
+    AddInput("Scores",
+             "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
+             "confidence predictions. N is the batch size, C is the class "
+             "number, M is number of predictions for each class, which is "
+             "the same with Bboxes.");
+    AddAttr<int>(
         "background_label",
         "(int64_t, defalut: 0) "
         "The index of background label, the background label will be ignored.")
         .SetDefault(0);
+    AddAttr<float>("score_threshold",
+                   "(float) "
+                   "Only consider detections whose confidences are larger than "
+                   "a threshold. If not provided, consider all boxes.");
+    AddAttr<int>("nms_top_k",
+                 "(int64_t) "
+                 "Maximum number of detections to be kept according to the "
+                 "confidences aftern the filtering detections based on "
+                 "score_threshold");
     AddAttr<float>("nms_threshold",
                    "(float, defalut: 0.3) "
-                   "The threshold to be used in nms.")
+                   "The threshold to be used in NMS.")
         .SetDefault(0.3);
-    AddAttr<int64_t>("nms_top_k",
-                     "(int64_t) "
-                     "Maximum number of results to be kept.");
     AddAttr<float>("nms_eta",
                    "(float) "
-                   "The parameter for adaptive nms.")
+                   "The parameter for adaptive NMS.")
         .SetDefault(1.0);
-    AddAttr<int64_t>("keep_top_k",
-                     "(int64_t) "
-                     "Number of total bboxes to be kept per image after nms "
-                     "step. -1 means keeping all bboxes after nms step.");
-    AddAttr<float>("confidence_threshold",
-                   "(float) "
-                   "Only consider detections whose confidences are larger than "
-                   "a threshold. If not provided, consider all boxes.");
+    AddAttr<int>("keep_top_k",
+                 "(int64_t) "
+                 "Number of total bboxes to be kept per image after NMS "
+                 "step. -1 means keeping all bboxes after NMS step.");
     AddOutput("Out",
               "(LoDTensor) A 2-D LoDTensor with shape [No, 6] represents the "
               "detections. Each row has 6 values: "
@@ -329,15 +339,21 @@ class MulticlassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
               "offset is N + 1, if LoD[i + 1] - LoD[i] == 0, means there is "
               "no detected bbox.");
     AddComment(R"DOC(
-This operators is to do multi-class non maximum suppression (NMS) on a batched
+This operator is to do multi-class non maximum suppression (NMS) on a batched
 of boxes and scores.
 
-This op greedily selects a subset of detection bounding boxes, pruning
-away boxes that have high IOU (intersection over union) overlap (> thresh)
-with already selected boxes.  It operates independently for each class for
-which scores are provided, pruning boxes with score less than a provided
-threshold prior to applying NMS.
+In the NMS step, this operator greedily selects a subset of detection bounding
+boxes whose scores are larger than score_threshold, if this threshold is
+provided, then keeps the nms_top_k highest confidence scores if nms_top_k
+is larger than -1. Then this operator prunes away boxes that have high IOU
+(intersection over union) overlap with already selected boxes by adaptive
+threshold NMS based on parameters of nms_threshold and nms_eta.
+
+Aftern NMS step, only at most keep_top_k number of total bboxes are to be kept
+per image if keep_top_k is larger than -1.
 
+This operator support multi-class and batched inputs. It applying NMS
+independently for each class.
 )DOC");
   }
 };
diff --git a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
index 60c6488f84..b619c52e55 100644
--- a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
+++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
@@ -69,7 +69,7 @@ def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
 
     sorted_indices = np.argsort(-all_scores, axis=0)
     sorted_scores = all_scores[sorted_indices]
-    if top_k < -1 and top_k < sorted_indices.shape[0]:
+    if top_k > -1 and top_k < sorted_indices.shape[0]:
         sorted_indices = sorted_indices[:top_k]
         sorted_scores = sorted_scores[:top_k]
 
@@ -82,7 +82,7 @@ def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
             if keep:
                 kept_idx = selected_indices[k]
                 overlap = iou(boxes[idx], boxes[kept_idx])
-                keep = overlap <= adaptive_threshold
+                keep = True if overlap <= adaptive_threshold else False
             else:
                 break
         if keep:
@@ -103,14 +103,14 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
         if c == background: continue
         indices = nms(boxes, scores[c], score_threshold, nms_threshold,
                       nms_top_k)
-        selected_indices.append((c, indices))
+        for idx in indices:
+            selected_indices.append((c, idx))
         num_det += len(indices)
 
     if keep_top_k > -1 and num_det > keep_top_k:
         score_index = []
-        for c, indices in selected_indices:
-            for idx in indices:
-                score_index.append((scores[c][idx], c, idx))
+        for c, idx in selected_indices:
+            score_index.append((scores[c][idx], c, idx))
 
         sorted_score_index = sorted(
             score_index, key=lambda tup: tup[0], reverse=True)
@@ -134,19 +134,16 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
                                     keep_top_k)
         lod.append(lod[-1] + len(nmsed_outs))
         if len(nmsed_outs) == 0: continue
-        for c, indices in nmsed_outs:
-            for idx in indices:
-                xmin, ymin, xmax, ymax = boxes[idx][:]
-                det_outs.append(
-                    (c, scores[n][c][idx], c, xmin, ymin, xmax, ymax))
+        for c, idx in nmsed_outs:
+            xmin, ymin, xmax, ymax = boxes[idx][:]
+            det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
     return det_outs, lod
 
 
 class TestMulticlassNMSOp(OpTest):
     def setUp(self):
-        self.op_type = 'multiclass_nms'
         N = 7
-        M = 1230
+        M = 1240
         C = 21
         BOX_SIZE = 4
         background = 0
@@ -155,7 +152,17 @@ class TestMulticlassNMSOp(OpTest):
         keep_top_k = 200
         score_threshold = 0.01
 
-        scores = np.random.random((N, C, M)).astype('float32')
+        scores = np.random.random((N * M, C)).astype('float32')
+
+        def softmax(x):
+            shiftx = x - np.max(x).clip(-64.)
+            exps = np.exp(shiftx)
+            return exps / np.sum(exps)
+
+        scores = np.apply_along_axis(softmax, 1, scores)
+        scores = np.reshape(scores, (N, M, C))
+        scores = np.transpose(scores, (0, 2, 1))
+
         boxes = np.random.random((M, BOX_SIZE)).astype('float32')
         boxes[:, 0:2] = boxes[:, 0:2] * 0.5
         boxes[:, 2:4] = boxes[:, 0:2] * 0.5 + 0.5
@@ -163,8 +170,19 @@ class TestMulticlassNMSOp(OpTest):
         nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background,
                                                  score_threshold, nms_threshold,
                                                  nms_top_k, keep_top_k)
+        nmsed_outs = np.array(nmsed_outs).astype('float32')
+
+        self.op_type = 'multiclass_nms'
         self.inputs = {'Bboxes': boxes, 'Scores': scores}
         self.outputs = {'Out': (nmsed_outs, [lod])}
+        self.attrs = {
+            'background_label': 0,
+            'nms_threshold': nms_threshold,
+            'nms_top_k': nms_top_k,
+            'keep_top_k': keep_top_k,
+            'score_threshold': score_threshold,
+            'nms_eta': 1.0,
+        }
 
     def test_check_output(self):
         self.check_output()
@@ -182,18 +200,3 @@ class TestIOU(unittest.TestCase):
 
 if __name__ == '__main__':
     unittest.main()
-    # N = 7
-    # M = 8
-    # C = 5
-    # BOX_SIZE = 4
-    # background = 0
-    # nms_threshold = 0.3
-    # nms_top_k = 400
-    # keep_top_k = 200
-    # score_threshold = 0.5
-
-    # scores = np.random.random((N, C, M)).astype('float32')
-    # boxes = np.random.random((M, BOX_SIZE)).astype('float32')
-    # boxes[:, 0 : 2] = boxes[:, 0 : 2] * 0.5
-    # boxes[:, 2 : 4] = boxes[:, 0 : 2] * 0.5 + 0.5
-    # print nmsed_outs, lod

From 6e17babe49a7fdeb4f345c83d347f217d05e7e77 Mon Sep 17 00:00:00 2001
From: xzl <zlx_hg@163.com>
Date: Tue, 30 Jan 2018 19:05:53 +0800
Subject: [PATCH 027/102] More efficient, add check on python side

---
 paddle/operators/CMakeLists.txt         |  1 -
 paddle/operators/math/depthwise_conv.cu | 52 ++++++++++++-------------
 python/paddle/v2/fluid/layers/nn.py     |  3 +-
 3 files changed, 26 insertions(+), 30 deletions(-)

diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 8b442af45b..f7d600414f 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -159,7 +159,6 @@ if (WITH_GPU)
 op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS
     vol2col depthwise_conv)
 
-# op_library(conv_op SRCS conv_op.cc conv_op.cu.cc conv_cudnn_op.cu.cc DEPS vol2col)
 op_library(edit_distance_op SRCS edit_distance_op.cc edit_distance_op.cu DEPS math_function)
 op_library(pool_op SRCS pool_op.cc pool_op.cu.cc pool_cudnn_op.cu.cc DEPS pooling)
 op_library(conv_transpose_op SRCS conv_transpose_op.cc conv_transpose_op.cu.cc
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
index 23e26e8827..4aa38151e6 100644
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -46,16 +46,18 @@ __global__ void KernelDepthwiseConv(
         -padding_height + h_out * stride_height + filter_height - 1;
     const int w_in_end =
         -padding_width + w_out * stride_width + filter_width - 1;
+
+    const int in_offset =
+        ((batch * input_channels + c_in) * input_height) * input_width;
+
     if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
         (w_in_end < input_width)) {
       for (int kh = 0; kh < filter_height; ++kh) {
         for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
-          const int offset =
-              ((batch * input_channels + c_in) * input_height + h_in) *
-                  input_width +
-              w_in;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
+          const int offset = in_offset + h_in * input_width + w_in;
+
           value += (*weight) * input_data[offset];
           ++weight;
         }
@@ -63,14 +65,11 @@ __global__ void KernelDepthwiseConv(
     } else {
       for (int kh = 0; kh < filter_height; ++kh) {
         for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
           if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
               (w_in < input_width)) {
-            const int offset =
-                ((batch * input_channels + c_in) * input_height + h_in) *
-                    input_width +
-                w_in;
+            const int offset = in_offset + h_in * input_width + w_in;
             value += (*weight) * input_data[offset];
           }
           ++weight;
@@ -159,36 +158,33 @@ __global__ void KernelDepthwiseConvFilterGrad(
     const int h_in_end =
         -padding_height + h_out * stride_height + filter_height;
     const int w_in_end = -padding_width + w_out * stride_width + filter_width;
+    const int in_offset =
+        (batch * input_channels + c_in) * input_height * input_width;
+
+    T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
+
     if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
         (w_in_end < input_width)) {
       for (int kw = 0; kw < filter_width; kw++) {
         for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
-          const int offset =
-              ((batch * input_channels + c_in) * input_height + h_in) *
-                  input_width +
-              w_in;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
+          const int offset = in_offset + h_in * input_width + w_in;
           const T diff_temp = output_grad_data[index] * input_data[offset];
-          T* addr = filter_grad_data + c_out * filter_height * filter_width +
-                    kh * filter_width + kw;
+          T* addr = addr_offset + kh * filter_width + kw;
           paddle::platform::CudaAtomicAdd(addr, diff_temp);
         }
       }
     } else {
       for (int kw = 0; kw < filter_width; kw++) {
         for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = -padding_height + h_out * stride_height + kh;
-          const int w_in = -padding_width + w_out * stride_width + kw;
+          const int h_in = h_in_start + kh;
+          const int w_in = w_in_start + kw;
           if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
               (w_in < input_width)) {
-            const int offset =
-                ((batch * input_channels + c_in) * input_height + h_in) *
-                    input_width +
-                w_in;
+            const int offset = in_offset + h_in * input_width + w_in;
             const T diff_temp = output_grad_data[index] * input_data[offset];
-            T* addr = filter_grad_data + c_out * filter_height * filter_width +
-                      kh * filter_width + kw;
+            T* addr = addr_offset + kh * filter_width + kw;
             paddle::platform::CudaAtomicAdd(addr, diff_temp);
           }
         }
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 40c7ec5866..a047cc4eec 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1013,7 +1013,8 @@ def conv2d(input,
     num_channels = input.shape[1]
 
     l_type = 'conv2d'
-    if num_channels == groups and not use_cudnn:
+    if (num_channels == groups and num_filters % num_channels == 0 and
+            not use_cudnn):
         l_type = 'depthwise_conv'
 
     helper = LayerHelper(l_type, **locals())

From 537886408863f68d7863e8245d746d2c15ef55dd Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Tue, 30 Jan 2018 21:30:16 +0800
Subject: [PATCH 028/102] Fix the output order and add more unit test cases.

---
 paddle/operators/multiclass_nms_op.cc         | 16 +++--
 .../v2/fluid/tests/test_multiclass_nms_op.py  | 68 +++++++++++++------
 2 files changed, 57 insertions(+), 27 deletions(-)

diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc
index 93c8b5216f..4689306d24 100644
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/operators/multiclass_nms_op.cc
@@ -201,8 +201,8 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
         }
       }
       // Keep top k results per image.
-      std::sort(score_index_pairs.begin(), score_index_pairs.end(),
-                SortScorePairDescend<std::pair<int, int>>);
+      std::stable_sort(score_index_pairs.begin(), score_index_pairs.end(),
+                       SortScorePairDescend<std::pair<int, int>>);
       score_index_pairs.resize(keep_top_k);
 
       // Store the new indices.
@@ -269,7 +269,8 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
 
     int num_kept = batch_starts.back();
     if (num_kept == 0) {
-      outs->Resize({0, 0});
+      T* od = outs->mutable_data<T>({1}, ctx.GetPlace());
+      od[0] = -1;
     } else {
       outs->mutable_data<T>({num_kept, kOutputDim}, ctx.GetPlace());
       for (int64_t i = 0; i < batch_size; ++i) {
@@ -349,11 +350,16 @@ is larger than -1. Then this operator pruns away boxes that have high IOU
 (intersection over union) overlap with already selected boxes by adaptive
 threshold NMS based on parameters of nms_threshold and nms_eta.
 
-Aftern NMS step, only at most keep_top_k number of total bboxes are to be kept
+After the NMS step, at most keep_top_k number of total bboxes are to be kept
 per image if keep_top_k is larger than -1.
 
 This operator support multi-class and batched inputs. It applying NMS
-independently for each class.
+independently for each class. The output is a 2-D LoDTensor. For each
+image, the offsets in the first dimension of the LoDTensor are called LoD; the
+number of offsets is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i]
+== 0, there is no detected bbox for this image. If there are no detected boxes
+for any image, all the elements in LoD are 0, and Out contains only one
+value, which is -1.
 )DOC");
   }
 };
diff --git a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
index b619c52e55..3097b8388c 100644
--- a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
+++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
@@ -56,8 +56,12 @@ def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
     Args:
         boxes: (tensor) The location preds for the img, Shape: [num_priors,4].
         scores: (tensor) The class predscores for the img, Shape:[num_priors].
-        overlap: (float) The overlap thresh for suppressing unnecessary boxes.
-        top_k: (int) The Maximum number of box preds to consider.
+        score_threshold: (float) The confidence thresh for filtering low
+            confidence boxes.
+        nms_threshold: (float) The overlap thresh for suppressing unnecessary
+            boxes.
+        top_k: (int) The maximum number of box preds to consider.
+        eta: (float) The parameter for adaptive NMS.
     Return:
         The indices of the kept boxes with respect to num_priors.
     """
@@ -67,7 +71,7 @@ def nms(boxes, scores, score_threshold, nms_threshold, top_k=200, eta=1.0):
     selected_indices = selected_indices.flatten()
     all_scores = all_scores[selected_indices]
 
-    sorted_indices = np.argsort(-all_scores, axis=0)
+    sorted_indices = np.argsort(-all_scores, axis=0, kind='mergesort')
     sorted_scores = all_scores[sorted_indices]
     if top_k > -1 and top_k < sorted_indices.shape[0]:
         sorted_indices = sorted_indices[:top_k]
@@ -97,29 +101,33 @@ def multiclass_nms(boxes, scores, background, score_threshold, nms_threshold,
     class_num = scores.shape[0]
     priorbox_num = scores.shape[1]
 
-    selected_indices = []
+    selected_indices = {}
     num_det = 0
     for c in range(class_num):
         if c == background: continue
         indices = nms(boxes, scores[c], score_threshold, nms_threshold,
                       nms_top_k)
-        for idx in indices:
-            selected_indices.append((c, idx))
+        selected_indices[c] = indices
         num_det += len(indices)
 
     if keep_top_k > -1 and num_det > keep_top_k:
         score_index = []
-        for c, idx in selected_indices:
-            score_index.append((scores[c][idx], c, idx))
+        for c, indices in selected_indices.iteritems():
+            for idx in indices:
+                score_index.append((scores[c][idx], c, idx))
 
         sorted_score_index = sorted(
             score_index, key=lambda tup: tup[0], reverse=True)
         sorted_score_index = sorted_score_index[:keep_top_k]
-        selected_indices = []
+        selected_indices = {}
+
+        for _, c, _ in sorted_score_index:
+            selected_indices[c] = []
         for s, c, idx in sorted_score_index:
-            selected_indices.append((c, idx))
+            selected_indices[c].append(idx)
+        num_det = keep_top_k
 
-    return selected_indices
+    return selected_indices, num_det
 
 
 def batched_multiclass_nms(boxes, scores, background, score_threshold,
@@ -129,28 +137,36 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold,
     det_outs = []
     lod = [0]
     for n in range(batch_size):
-        nmsed_outs = multiclass_nms(boxes, scores[n], background,
-                                    score_threshold, nms_threshold, nms_top_k,
-                                    keep_top_k)
-        lod.append(lod[-1] + len(nmsed_outs))
-        if len(nmsed_outs) == 0: continue
-        for c, idx in nmsed_outs:
-            xmin, ymin, xmax, ymax = boxes[idx][:]
-            det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
+        nmsed_outs, nmsed_num = multiclass_nms(boxes, scores[n], background,
+                                               score_threshold, nms_threshold,
+                                               nms_top_k, keep_top_k)
+        lod.append(lod[-1] + nmsed_num)
+        if nmsed_num == 0: continue
+
+        for c, indices in nmsed_outs.iteritems():
+            for idx in indices:
+                xmin, ymin, xmax, ymax = boxes[idx][:]
+                det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax])
+
     return det_outs, lod
 
 
 class TestMulticlassNMSOp(OpTest):
+    def set_argument(self):
+        self.score_threshold = 0.01
+
     def setUp(self):
+        self.set_argument()
         N = 7
-        M = 1240
+        M = 1200
         C = 21
         BOX_SIZE = 4
+
         background = 0
         nms_threshold = 0.3
         nms_top_k = 400
         keep_top_k = 200
-        score_threshold = 0.01
+        score_threshold = self.score_threshold
 
         scores = np.random.random((N * M, C)).astype('float32')
 
@@ -165,11 +181,12 @@ class TestMulticlassNMSOp(OpTest):
 
         boxes = np.random.random((M, BOX_SIZE)).astype('float32')
         boxes[:, 0:2] = boxes[:, 0:2] * 0.5
-        boxes[:, 2:4] = boxes[:, 0:2] * 0.5 + 0.5
+        boxes[:, 2:4] = boxes[:, 2:4] * 0.5 + 0.5
 
         nmsed_outs, lod = batched_multiclass_nms(boxes, scores, background,
                                                  score_threshold, nms_threshold,
                                                  nms_top_k, keep_top_k)
+        nmsed_outs = [-1] if not nmsed_outs else nmsed_outs
         nmsed_outs = np.array(nmsed_outs).astype('float32')
 
         self.op_type = 'multiclass_nms'
@@ -188,6 +205,13 @@ class TestMulticlassNMSOp(OpTest):
         self.check_output()
 
 
+class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp):
+    def set_argument(self):
+        # Here set 2.0 to test the case there is no outputs.
+        # In practical use, 0.0 < score_threshold < 1.0 
+        self.score_threshold = 2.0
+
+
 class TestIOU(unittest.TestCase):
     def test_iou(self):
         box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32')

From 419e4c49d07afbbb9abd3c323ce66794410b4ed8 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Wed, 31 Jan 2018 06:38:17 +0000
Subject: [PATCH 029/102] modify some

---
 benchmark/cluster/vgg16/v2_pserver.yaml | 4 ++--
 benchmark/cluster/vgg16/v2_trainer.yaml | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
index dd1271e0cf..857e2ff455 100644
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@@ -23,13 +23,13 @@ spec:
         - name: PADDLE_JOB_NAME
           value: vgg16v2job
         - name: TRAINERS
-          value: "20"
+          value: "60"
         - name: PSERVERS
           value: "10"
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "python train.py"
+          value: "python -u train.py"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
index 997bbc81c9..be0f741b34 100644
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -3,8 +3,8 @@ kind: Job
 metadata:
   name: vgg16v2job-trainer
 spec:
-  parallelism: 20
-  completions: 20
+  parallelism: 60
+  completions: 60
   template:
     metadata:
       labels:
@@ -24,13 +24,13 @@ spec:
         - name: BATCH_SIZE
           value: "256"
         - name: TRAINERS
-          value: "20"
+          value: "60"
         - name: PSERVERS
           value: "10"
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
+          value: "cd /workspace && MKL_NUM_THREADS=1 python -u /workspace/vgg16_v2.py"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT

From b148f065a8d88c944c354eaea0e31a3da5fde99c Mon Sep 17 00:00:00 2001
From: Yang Yu <yuyang18@baidu.com>
Date: Wed, 31 Jan 2018 13:31:41 +0800
Subject: [PATCH 030/102] Make Fit a line a normal unittest

---
 .../v2/fluid/tests/book/test_fit_a_line.py    | 88 +++++++++++++------
 1 file changed, 59 insertions(+), 29 deletions(-)

diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
index 0b954c60b6..27f34b1733 100644
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -12,44 +12,74 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import contextlib
+import unittest
 
-x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 
-y_predict = fluid.layers.fc(input=x, size=1, act=None)
+def main(use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
 
-y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
 
-cost = fluid.layers.square_error_cost(input=y_predict, label=y)
-avg_cost = fluid.layers.mean(x=cost)
+    y_predict = fluid.layers.fc(input=x, size=1, act=None)
 
-sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-sgd_optimizer.minimize(avg_cost)
+    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
 
-BATCH_SIZE = 20
+    cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+    avg_cost = fluid.layers.mean(x=cost)
 
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.uci_housing.train(), buf_size=500),
-    batch_size=BATCH_SIZE)
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+    sgd_optimizer.minimize(avg_cost)
 
-place = fluid.CPUPlace()
-feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
-exe = fluid.Executor(place)
+    BATCH_SIZE = 20
 
-exe.run(fluid.default_startup_program())
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.uci_housing.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
 
-PASS_NUM = 100
-for pass_id in range(PASS_NUM):
-    fluid.io.save_persistables(exe, "./fit_a_line.model/")
-    fluid.io.load_persistables(exe, "./fit_a_line.model/")
-    for data in train_reader():
-        avg_loss_value, = exe.run(fluid.default_main_program(),
-                                  feed=feeder.feed(data),
-                                  fetch_list=[avg_cost])
-        print(avg_loss_value)
-        if avg_loss_value[0] < 10.0:
-            exit(0)  # if avg cost less than 10.0, we think our code is good.
-exit(1)
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+    exe = fluid.Executor(place)
+
+    exe.run(fluid.default_startup_program())
+
+    PASS_NUM = 100
+    for pass_id in range(PASS_NUM):
+        fluid.io.save_persistables(exe, "./fit_a_line.model/")
+        fluid.io.load_persistables(exe, "./fit_a_line.model/")
+        for data in train_reader():
+            avg_loss_value, = exe.run(fluid.default_main_program(),
+                                      feed=feeder.feed(data),
+                                      fetch_list=[avg_cost])
+            print(avg_loss_value)
+            if avg_loss_value[0] < 10.0:
+                return
+    raise AssertionError("Fit a line cost is too large, {0:2.2}".format(
+        avg_loss_value[0]))
+
+
+class TestFitALine(unittest.TestCase):
+    def test_cpu(self):
+        with self.program_scope_guard():
+            main(use_cuda=False)
+
+    def test_cuda(self):
+        with self.program_scope_guard():
+            main(use_cuda=True)
+
+    @contextlib.contextmanager
+    def program_scope_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()

From 1b1f305babc3c91d0761814306df0004620be309 Mon Sep 17 00:00:00 2001
From: Yang Yu <yuyang18@baidu.com>
Date: Wed, 31 Jan 2018 13:07:19 +0800
Subject: [PATCH 031/102] Make image_classification as a normal python unittest

---
 .../paddle/v2/fluid/tests/book/CMakeLists.txt |   4 +-
 .../book/test_image_classification_train.py   | 143 +++++++++++-------
 2 files changed, 89 insertions(+), 58 deletions(-)

diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
index dda02c03fd..a870478db8 100644
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
@@ -1,9 +1,7 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits)
-py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
-py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
+list(REMOVE_ITEM TEST_OPS test_recognize_digits)
 py_test(test_recognize_digits_mlp_cpu
   SRCS test_recognize_digits.py
   ARGS mlp)
diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
index 30582a21d0..a4168d16db 100644
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -14,10 +14,10 @@
 
 from __future__ import print_function
 
-import sys
-
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
+import unittest
+import contextlib
 
 
 def resnet_cifar10(input, depth=32):
@@ -89,56 +89,89 @@ def vgg16_bn_drop(input):
     return fc2
 
 
-classdim = 10
-data_shape = [3, 32, 32]
-
-images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
-label = fluid.layers.data(name='label', shape=[1], dtype='int64')
-
-net_type = "vgg"
-if len(sys.argv) >= 2:
-    net_type = sys.argv[1]
-
-if net_type == "vgg":
-    print("train vgg net")
-    net = vgg16_bn_drop(images)
-elif net_type == "resnet":
-    print("train resnet")
-    net = resnet_cifar10(images, 32)
-else:
-    raise ValueError("%s network is not supported" % net_type)
-
-predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
-cost = fluid.layers.cross_entropy(input=predict, label=label)
-avg_cost = fluid.layers.mean(x=cost)
-
-optimizer = fluid.optimizer.Adam(learning_rate=0.001)
-opts = optimizer.minimize(avg_cost)
-
-accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
-
-BATCH_SIZE = 128
-PASS_NUM = 1
-
-train_reader = paddle.batch(
-    paddle.reader.shuffle(
-        paddle.dataset.cifar.train10(), buf_size=128 * 10),
-    batch_size=BATCH_SIZE)
-
-place = fluid.CPUPlace()
-exe = fluid.Executor(place)
-feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
-exe.run(fluid.default_startup_program())
-
-for pass_id in range(PASS_NUM):
-    accuracy.reset(exe)
-    for data in train_reader():
-        loss, acc = exe.run(fluid.default_main_program(),
-                            feed=feeder.feed(data),
-                            fetch_list=[avg_cost] + accuracy.metrics)
-        pass_acc = accuracy.eval(exe)
-        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
-            pass_acc))
-        # this model is slow, so if we can train two mini batch, we think it works properly.
-        exit(0)
-exit(1)
+def main(net_type, use_cuda):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+
+    classdim = 10
+    data_shape = [3, 32, 32]
+
+    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+
+    if net_type == "vgg":
+        print("train vgg net")
+        net = vgg16_bn_drop(images)
+    elif net_type == "resnet":
+        print("train resnet")
+        net = resnet_cifar10(images, 32)
+    else:
+        raise ValueError("%s network is not supported" % net_type)
+
+    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
+    cost = fluid.layers.cross_entropy(input=predict, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
+    optimizer.minimize(avg_cost)
+
+    accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
+
+    BATCH_SIZE = 128
+    PASS_NUM = 1
+
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.cifar.train10(), buf_size=128 * 10),
+        batch_size=BATCH_SIZE)
+
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
+    exe.run(fluid.default_startup_program())
+
+    loss = 0.0
+    for pass_id in range(PASS_NUM):
+        accuracy.reset(exe)
+        for data in train_reader():
+            loss, acc = exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
+                pass_acc))
+            return
+
+    raise AssertionError(
+        "Image classification loss is too large, {0:2.2}".format(loss))
+
+
+class TestImageClassification(unittest.TestCase):
+    def test_vgg_cuda(self):
+        with self.scope_prog_guard():
+            main('vgg', use_cuda=True)
+
+    def test_resnet_cuda(self):
+        with self.scope_prog_guard():
+            main('resnet', use_cuda=True)
+
+    def test_vgg_cpu(self):
+        with self.scope_prog_guard():
+            main('vgg', use_cuda=False)
+
+    def test_resnet_cpu(self):
+        with self.scope_prog_guard():
+            main('resnet', use_cuda=False)
+
+    @contextlib.contextmanager
+    def scope_prog_guard(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                yield
+
+
+if __name__ == '__main__':
+    unittest.main()

From 38b8b7f6acb51e62b97a62e3215d39b0d6f7553b Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Wed, 31 Jan 2018 09:09:32 +0000
Subject: [PATCH 032/102] add results

---
 benchmark/cluster/README.md       | 10 +++++-----
 benchmark/cluster/vgg16/README.md |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
index b619613ea7..c2be886b34 100644
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@@ -44,14 +44,14 @@
 
 ### Measure the Performance for Different PServer Count
 
-- Trainer Count: 100
-- Batch Size: 64
+- Trainer Count: 60
+- Batch Size: 128
 - Metrics: mini-batch / sec
 
-| PServer Count | 10 | 20 | 40 | 60 |
+| PServer Count | 3 | 6 | 10 | 20 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
+| PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 412.2 | 368.4 | 346.8 | 283.2 |
 | TensorFlow | - | - | - | - |
 
 ### Measure Parallel Efficiency By Increasing Trainer Count
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index c1e85a2c40..333e14250b 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -48,14 +48,14 @@
 
 ### different pserver number
 
-- Trainer Count: 100
+- Trainer Count: 60
 - Batch Size: 128
 - Metrics: mini-batch / sec
 
-| PServer Count | 10 | 20 | 40 | 60 |
+| PServer Count | 3 | 6 |10 | 20 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | - | - | - | - |
-| PaddlePaddle v2 | - | - | - | - |
+| PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle v2 | 412.2 | 368.4 | 346.8 | 283.2 |
 | TensorFlow | - | - | - | - |
 
 

From cfbbb9841d3ab9f6736cd7e02273fe8dc7a1df39 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Wed, 31 Jan 2018 09:18:35 +0000
Subject: [PATCH 033/102] clean code

---
 benchmark/cluster/README.md             | 10 +++++-----
 benchmark/cluster/vgg16/v2_pserver.yaml |  2 +-
 benchmark/cluster/vgg16/v2_trainer.yaml |  6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
index c2be886b34..b619613ea7 100644
--- a/benchmark/cluster/README.md
+++ b/benchmark/cluster/README.md
@@ -44,14 +44,14 @@
 
 ### Measure the Performance for Different PServer Count
 
-- Trainer Count: 60
-- Batch Size: 128
+- Trainer Count: 100
+- Batch Size: 64
 - Metrics: mini-batch / sec
 
-| PServer Count | 3 | 6 | 10 | 20 |
+| PServer Count | 10 | 20 | 40 | 60 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 412.2 | 368.4 | 346.8 | 283.2 |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
 | TensorFlow | - | - | - | - |
 
 ### Measure Parallel Efficiency By Increasing Trainer Count
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
index 857e2ff455..935cf0be3c 100644
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@@ -23,7 +23,7 @@ spec:
         - name: PADDLE_JOB_NAME
           value: vgg16v2job
         - name: TRAINERS
-          value: "60"
+          value: "20"
         - name: PSERVERS
           value: "10"
         - name: TOPOLOGY
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
index be0f741b34..5189009f3e 100644
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -3,8 +3,8 @@ kind: Job
 metadata:
   name: vgg16v2job-trainer
 spec:
-  parallelism: 60
-  completions: 60
+  parallelism: 20
+  completions: 20
   template:
     metadata:
       labels:
@@ -24,7 +24,7 @@ spec:
         - name: BATCH_SIZE
           value: "256"
         - name: TRAINERS
-          value: "60"
+          value: "20"
         - name: PSERVERS
           value: "10"
         - name: TOPOLOGY

From f9db5629873b117c226b15858f128dd2c1f9fd16 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Wed, 31 Jan 2018 20:09:08 +0800
Subject: [PATCH 034/102] update results

---
 benchmark/cluster/vgg16/Dockerfile         |  16 +-
 benchmark/cluster/vgg16/README.md          |   2 +-
 benchmark/cluster/vgg16/fluid_trainer.yaml |   2 +-
 benchmark/cluster/vgg16/k8s_tools.py       |  94 ----------
 benchmark/cluster/vgg16/paddle_k8s         | 199 ---------------------
 benchmark/cluster/vgg16/reader.py          |  16 --
 benchmark/cluster/vgg16/v2_trainer.yaml    |   2 +-
 benchmark/cluster/vgg16/vgg16_v2.py        |   2 +-
 8 files changed, 11 insertions(+), 322 deletions(-)
 delete mode 100644 benchmark/cluster/vgg16/k8s_tools.py
 delete mode 100755 benchmark/cluster/vgg16/paddle_k8s
 delete mode 100644 benchmark/cluster/vgg16/reader.py

diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
index dfaffb8c21..c34f7e8fcf 100644
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -1,15 +1,13 @@
-#FROM paddlepaddle/paddlecloud-job
-#RUN mkdir -p /workspace
-#ADD reader.py /workspace/
-#RUN python /workspace/reader.py
 FROM python:2.7.14
-ADD paddle_k8s /usr/bin
-ADD k8s_tools.py /root
-RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev 
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev && \
+chmod +x /usr/bin/paddle_k8s
+# NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
+#       so we must build one with distribute support to install in this image.
 ADD *.whl /
 RUN pip install /*.whl && rm -f /*.whl
 ENV LD_LIBRARY_PATH=/usr/local/lib
-ADD reader.py /workspace/
-RUN python /workspace/reader.py
+RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
 
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index c1e85a2c40..0c404e60a8 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -43,7 +43,7 @@
 | Trainer Counter | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
-| PaddlePaddle v2 | 356.28 | - | - | 1041.99 |
+| PaddlePaddle v2 (need more tests) | 356.28 | 785.39 | 853.30 | 1041.99 |
 | TensorFlow | - | - | - | - |
 
 ### different pserver number
diff --git a/benchmark/cluster/vgg16/fluid_trainer.yaml b/benchmark/cluster/vgg16/fluid_trainer.yaml
index 2f6a87ab02..0a0ed25ebe 100644
--- a/benchmark/cluster/vgg16/fluid_trainer.yaml
+++ b/benchmark/cluster/vgg16/fluid_trainer.yaml
@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 256"
+          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0 --batch_size 128"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
diff --git a/benchmark/cluster/vgg16/k8s_tools.py b/benchmark/cluster/vgg16/k8s_tools.py
deleted file mode 100644
index 4bee96a7a8..0000000000
--- a/benchmark/cluster/vgg16/k8s_tools.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#!/bin/env python
-import os
-import sys
-import time
-import socket
-from kubernetes import client, config
-PADDLE_JOB_NAME = os.getenv("PADDLE_JOB_NAME")
-NAMESPACE = os.getenv("NAMESPACE")
-PORT = os.getenv("PSERVER_PORT")
-if os.getenv("KUBERNETES_SERVICE_HOST", None):
-    config.load_incluster_config()
-else:
-    config.load_kube_config()
-v1 = client.CoreV1Api()
-
-
-def fetch_pods_info(label_selector):
-    api_response = v1.list_namespaced_pod(
-        namespace=NAMESPACE, pretty=True, label_selector=label_selector)
-    pod_list = []
-    for item in api_response.items:
-        pod_list.append((item.status.phase, item.status.pod_ip))
-    return pod_list
-
-
-def wait_pods_running(label_selector, desired):
-    print "label selector: %s, desired: %s" % (label_selector, desired)
-    while True:
-        count = count_pods_by_phase(label_selector, 'Running')
-        # NOTE: pods may be scaled.
-        if count >= int(desired):
-            break
-        print 'current cnt: %d sleep for 5 seconds...' % count
-        time.sleep(5)
-
-
-def count_pods_by_phase(label_selector, phase):
-    pod_list = fetch_pods_info(label_selector)
-    filtered_pod_list = filter(lambda x: x[0] == phase, pod_list)
-    return len(filtered_pod_list)
-
-
-def fetch_pserver_ips():
-    label_selector = "paddle-job-pserver=%s" % PADDLE_JOB_NAME
-    pod_list = fetch_pods_info(label_selector)
-    pserver_ips = [item[1] for item in pod_list]
-    return ",".join(pserver_ips)
-
-
-def fetch_master_ip():
-    label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
-    pod_list = fetch_pods_info(label_selector)
-    master_ips = [item[1] for item in pod_list]
-    return master_ips[0]
-
-
-def fetch_trainer_id():
-    label_selector = "paddle-job=%s" % PADDLE_JOB_NAME
-    pod_list = fetch_pods_info(label_selector)
-    trainer_ips = [item[1] for item in pod_list]
-    trainer_ips.sort()
-    local_ip = socket.gethostbyname(socket.gethostname())
-    for i in xrange(len(trainer_ips)):
-        if trainer_ips[i] == local_ip:
-            return i
-    return None
-
-
-if __name__ == "__main__":
-    command = sys.argv[1]
-    if command == "fetch_pserver_ips":
-        print fetch_pserver_ips()
-    elif command == "fetch_trainer_id":
-        print fetch_trainer_id()
-    elif command == "fetch_master_ip":
-        print fetch_master_ip()
-    elif command == "count_pods_by_phase":
-        print count_pods_by_phase(sys.argv[2], sys.argv[3])
-    elif command == "wait_pods_running":
-        wait_pods_running(sys.argv[2], sys.argv[3])
diff --git a/benchmark/cluster/vgg16/paddle_k8s b/benchmark/cluster/vgg16/paddle_k8s
deleted file mode 100755
index af5f35b3ec..0000000000
--- a/benchmark/cluster/vgg16/paddle_k8s
+++ /dev/null
@@ -1,199 +0,0 @@
-#!/bin/bash
-start_pserver() {
-    stdbuf -oL paddle pserver \
-      --use_gpu=0 \
-      --port=$PADDLE_INIT_PORT \
-      --ports_num=$PADDLE_INIT_PORTS_NUM \
-      --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
-      --nics=$PADDLE_INIT_NICS \
-      --comment=paddle_process_k8s \
-      --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
-}
-
-start_new_pserver() {
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running  paddle-job-master=${PADDLE_JOB_NAME} 1
-  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
-  stdbuf -oL /usr/bin/pserver \
-    -port=$PADDLE_INIT_PORT \
-    -num-pservers=$PSERVERS \
-    -log-level=debug \
-    -etcd-endpoint=http://$MASTER_IP:2379
-}
-
-start_master() {
-  stdbuf -oL /usr/bin/master \
-  -port=8080 \
-  -chunk-per-task=1\
-  -task-timout-dur=16s\
-  -endpoints=http://127.0.0.1:2379
-}
-
-check_failed_cnt() {
-  max_failed=$1
-  failed_count=$(python /root/k8s_tools.py count_pods_by_phase paddle-job=${PADDLE_JOB_NAME} Failed) 
-  if [ $failed_count -gt $max_failed ]; then
-    stdbuf -oL echo "Failed trainer count beyond the threadhold: "$max_failed
-    echo "Failed trainer count beyond the threshold: " $max_failed > /dev/termination-log 
-    exit 0
-  fi
-}
-
-check_trainer_ret() {
-  ret=$1
-  stdbuf -oL echo "job returned $ret...setting pod return message..."
-  stdbuf -oL echo "==============================="
-
-  if [ $ret -eq 136 ] ; then
-    echo "Error Arithmetic Operation(Floating Point Exception)" > /dev/termination-log
-  elif [ $ret -eq 139 ] ; then
-    echo "Segmentation Fault" > /dev/termination-log
-  elif [ $ret -eq 1 ] ; then
-    echo "General Error" > /dev/termination-log
-  elif [ $ret -eq 134 ] ; then
-    echo "Program Abort" > /dev/termination-log
-  fi
-  stdbuf -oL echo "termination log wroted..."
-  exit $ret
-}
-
-start_fluid_process() {
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
-  if [ "${TRAINING_ROLE}" == "TRAINER" ]; then
-    check_failed_cnt ${TRAINERS}
-    sleep 5
-    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
-  fi
-  export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
-  stdbuf -oL sh -c "${ENTRY}"
-  check_trainer_ret $?
-}
-
-start_new_trainer() {
-  # FIXME(Yancey1989): use command-line interface to configure the max failed count
-  check_failed_cnt ${TRAINERS}
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
-  sleep 5
-  stdbuf -oL python /root/k8s_tools.py wait_pods_running  paddle-job-master=${PADDLE_JOB_NAME} 1
-  export MASTER_IP=$(python /root/k8s_tools.py fetch_master_ip)
-  export ETCD_IP="$MASTER_IP"
-
-  # NOTE: $TRAINER_PACKAGE may be large, do not copy
-  export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
-  cd $TRAINER_PACKAGE
-
-  stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
-  $PADDLE_INIT_NUM_GRADIENT_SERVERS, "version: " $1 
-
-  stdbuf -oL sh -c "${ENTRY}"
-  check_trainer_ret $?
-}
-
-start_trainer() {
-    # paddle v1 and V2 distributed training does not allow any trainer failed. 
-    check_failed_cnt 0
-    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job-pserver=${PADDLE_JOB_NAME} ${PSERVERS}
-    stdbuf -oL python /root/k8s_tools.py wait_pods_running paddle-job=${PADDLE_JOB_NAME} ${TRAINERS}
-
-    export PADDLE_INIT_PSERVERS=$(python /root/k8s_tools.py fetch_pserver_ips)
-    export PADDLE_INIT_TRAINER_ID=$(python /root/k8s_tools.py fetch_trainer_id)
-    stdbuf -oL echo $PADDLE_INIT_TRAINER_ID > /trainer_id
-    # FIXME: /trainer_count = PADDLE_INIT_NUM_GRADIENT_SERVERS
-    stdbuf -oL echo $PADDLE_INIT_NUM_GRADIENT_SERVERS > /trainer_count
-
-    # NOTE: $TRAINER_PACKAGE may be large, do not copy
-    export PYTHONPATH=$TRAINER_PACKAGE:$PYTHONPATH
-    cd $TRAINER_PACKAGE
-
-    stdbuf -oL echo "Starting training job: " $TRAINER_PACKAGE, "num_gradient_servers:" \
-    $PADDLE_INIT_NUM_GRADIENT_SERVERS, "trainer_id: " $PADDLE_INIT_TRAINER_ID, \
-    "version: " $1
-
-    # FIXME: If we use the new PServer by Golang, add Kubernetes healthz
-    # to wait PServer process get ready.Now only sleep 20 seconds.
-    sleep 20
-
-    case "$1" in
-      "v1")
-        FILE_COUNT=$(wc -l $TRAIN_LIST | awk '{print $1}')
-        if [ $FILE_COUNT -le $PADDLE_INIT_NUM_GRADIENT_SERVERS ]; then
-          echo "file count less than trainers"
-          check_trainer_ret 0
-        fi
-        let lines_per_node="$FILE_COUNT / ($PADDLE_INIT_NUM_GRADIENT_SERVERS + 1)"
-        echo "spliting file to" $lines_per_node
-        cp $TRAIN_LIST /
-        cd /
-        split -l $lines_per_node -d -a 3 $TRAIN_LIST train.list
-        CURRENT_LIST=$(printf "train.list%03d" $PADDLE_INIT_TRAINER_ID)
-        # always use /train.list for paddle v1 for each node.
-        echo "File for current node ${CURRENT_LIST}"
-        sleep 10
-        cp $CURRENT_LIST train.list
-
-        cd $TRAINER_PACKAGE
-
-        stdbuf -oL  paddle train \
-          --port=$PADDLE_INIT_PORT \
-          --nics=$PADDLE_INIT_NICS \
-          --ports_num=$PADDLE_INIT_PORTS_NUM \
-          --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE \
-          --num_passes=$PADDLE_INIT_NUM_PASSES \
-          --trainer_count=$PADDLE_INIT_TRAINER_COUNT \
-          --saving_period=1 \
-          --log_period=20 \
-          --local=0 \
-          --rdma_tcp=tcp \
-          --config=$TOPOLOGY \
-          --use_gpu=$PADDLE_INIT_USE_GPU \
-          --trainer_id=$PADDLE_INIT_TRAINER_ID \
-          --save_dir=$OUTPUT \
-          --pservers=$PADDLE_INIT_PSERVERS \
-          --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS
-        # paddle v1 API does not allow any trainer failed.
-        check_trainer_ret $? 
-        ;;
-      "v2")
-        stdbuf -oL sh -c "${ENTRY}"
-        # paddle v2 API does not allow any trainer failed.
-        check_trainer_ret $? 
-        ;;
-      *)
-        ;;
-    esac
-}
-
-usage() {
-    echo "usage: paddle_k8s [<args>]:"
-    echo "  start_trainer  [v1|v2]    Start a trainer process with v1 or v2 API"
-    echo "  start_pserver             Start a pserver process"
-    echo "  start_new_pserver         Start a new pserver process"
-    echo "  start_new_trainer         Start a new triner process"
-}
-
-case "$1" in
-    start_pserver)
-        start_pserver
-        ;;
-    start_trainer)
-        start_trainer $2
-        ;;
-    start_new_trainer)
-        start_new_trainer
-        ;;
-    start_new_pserver)
-        start_new_pserver
-        ;;
-    start_master)
-        start_master
-        ;;
-    start_fluid)
-        start_fluid_process
-        ;;
-    --help)
-        usage
-        ;;
-    *)
-        usage
-        ;;
-esac
-
diff --git a/benchmark/cluster/vgg16/reader.py b/benchmark/cluster/vgg16/reader.py
deleted file mode 100644
index 3e20f830fc..0000000000
--- a/benchmark/cluster/vgg16/reader.py
+++ /dev/null
@@ -1,16 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-# 
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-# 
-#     http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import paddle.v2 as paddle
-paddle.dataset.cifar.train10()
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
index 997bbc81c9..12c8964066 100644
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -38,7 +38,7 @@ spec:
         - name: PADDLE_INIT_NICS
           value: "xgbe0"
         - name: PADDLE_INIT_TRAINER_COUNT
-          value: "2"
+          value: "1"
         - name: PADDLE_INIT_PORTS_NUM
           value: "1"
         - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
diff --git a/benchmark/cluster/vgg16/vgg16_v2.py b/benchmark/cluster/vgg16/vgg16_v2.py
index 81ddeb0332..6ac6b3c332 100644
--- a/benchmark/cluster/vgg16/vgg16_v2.py
+++ b/benchmark/cluster/vgg16/vgg16_v2.py
@@ -51,7 +51,7 @@ def vgg(input, nums, class_dim):
     conv4 = conv_block(conv3, 512, nums[3])
     conv5 = conv_block(conv4, 512, nums[4])
 
-    fc_dim = 4096
+    fc_dim = 512
     fc1 = paddle.layer.fc(input=conv5,
                           size=fc_dim,
                           act=paddle.activation.Relu(),

From 58bfaea8afcc2b30c5f73a5c52f1cafc6a8682f2 Mon Sep 17 00:00:00 2001
From: gaoyuan <gaoyuan18@baidu.com>
Date: Wed, 31 Jan 2018 21:32:22 +0800
Subject: [PATCH 035/102] update according to the code review

---
 paddle/operators/box_coder_op.cc | 49 ++++++++++++--------
 paddle/operators/box_coder_op.cu |  4 +-
 paddle/operators/box_coder_op.h  | 79 +++++++++++++-------------------
 3 files changed, 66 insertions(+), 66 deletions(-)

diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc
index 0cb20a4182..41123f9b6e 100644
--- a/paddle/operators/box_coder_op.cc
+++ b/paddle/operators/box_coder_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -31,23 +31,21 @@ class BoxCoderOp : public framework::OperatorWithKernel {
     auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
     auto target_box_dims = ctx->GetInputDim("TargetBox");
 
-    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2UL,
-                      "The shape of PriorBox is [N, 4]");
-    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4UL,
-                      "The shape of PriorBox is [N, 4]");
-    PADDLE_ENFORCE_EQ(prior_box_var_dims.size(), 2UL,
-                      "The shape of PriorBoxVar is [N, 4]");
-    PADDLE_ENFORCE_EQ(prior_box_var_dims[1], 4UL,
-                      "The shape of PriorBoxVar is [N, 4]");
-    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2UL,
-                      "The shape of TargetBox is [M, 4]");
-    PADDLE_ENFORCE_EQ(target_box_dims[1], 4UL,
+    PADDLE_ENFORCE_EQ(prior_box_dims.size(), 2,
+                      "The rank of Input of PriorBoxVar must be 2");
+    PADDLE_ENFORCE_EQ(prior_box_dims[1], 4, "The shape of PriorBox is [N, 4]");
+    PADDLE_ENFORCE_EQ(prior_box_dims, prior_box_var_dims);
+    PADDLE_ENFORCE_EQ(target_box_dims.size(), 2,
+                      "The rank of Input of TargetBox must be 2");
+    PADDLE_ENFORCE_EQ(target_box_dims[1], 4,
                       "The shape of TargetBox is [M, 4]");
 
     GetBoxCodeType(ctx->Attrs().Get<std::string>("code_type"));
 
-    ctx->SetOutputDim("OutputBox", framework::make_ddim({target_box_dims[0],
-                                                         target_box_dims[1]}));
+    ctx->SetOutputDim(
+        "OutputBox",
+        framework::make_ddim({target_box_dims[0], prior_box_dims[0], 4}));
+    ctx->ShareLoD("TargetBox", /*->*/ "OutputBox");
   }
 };
 
@@ -58,7 +56,7 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput(
         "PriorBox",
         "(Tensor, default Tensor<float>) "
-        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds N boxes, "
+        "Box list PriorBox is a 2-D Tensor with shape [M, 4] holds M boxes, "
         "each box is represented as [xmin, ymin, xmax, ymax], "
         "[xmin, ymin] is the left top coordinate of the anchor box, "
         "if the input is image feature map, they are close to the origin "
@@ -66,7 +64,7 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
         "coordinate of the anchor box.");
     AddInput("PriorBoxVar",
              "(Tensor, default Tensor<float>) "
-             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds N group "
+             "PriorBoxVar is a 2-D Tensor with shape [M, 4] holds M group "
              "of variance.");
     AddInput(
         "TargetBox",
@@ -85,14 +83,29 @@ class BoxCoderOpMaker : public framework::OpProtoAndCheckerMaker {
         .InEnum({"encode_center_size", "decode_center_size"});
     AddOutput(
         "OutputBox",
-        "(Tensor, default Tensor<float>)"
+        "(LoDTensor or Tensor) "
         "(Tensor) The output of box_coder_op, a tensor with shape [N, M, 4] "
         "representing the result of N target boxes encoded/decoded with "
         "M Prior boxes and variances.");
 
     AddComment(R"DOC(
 Bounding Box Coder Operator.
-Encode/Decode the priorbox information with the target bounding box.
+Encode/Decode the target bounding box with the priorbox information.
+The Encoding schema described below:
+ox = (tx - px) / pw / pxv
+oy = (ty - py) / ph / pyv
+ow = log(abs(tw / pw)) / pwv 
+oh = log(abs(th / ph)) / phv 
+The Decoding schema described below:
+ox = (pw * pxv * tx + px) - tw / 2
+oy = (ph * pyv * ty + py) - th / 2
+ow = exp(pwv * tw) * pw + tw / 2
+oh = exp(phv * th) * ph + th / 2
+where tx, ty, tw, th denote the target box's center coordinates, width and
+height respectively. Similarly, px, py, pw, ph denote the priorbox's(anchor)
+center coordinates, width and height. pxv, pyv, pwv, phv denote the variance
+of the priorbox and ox, oy, ow, oh denote the encoded/decoded coordinates,
+width and height.
 )DOC");
   }
 };
diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu
index 4055ded1f8..9e2ea8cc67 100644
--- a/paddle/operators/box_coder_op.cu
+++ b/paddle/operators/box_coder_op.cu
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -108,7 +108,7 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     auto* output_box = context.Output<Tensor>("OutputBox");
 
     if (target_box->lod().size()) {
-      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,
+      PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
                         "Only support 1 level of LoD.");
     }
     auto row = target_box->dims()[0];
diff --git a/paddle/operators/box_coder_op.h b/paddle/operators/box_coder_op.h
index 3865da40c3..d1c9a40459 100644
--- a/paddle/operators/box_coder_op.h
+++ b/paddle/operators/box_coder_op.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
@@ -35,67 +35,52 @@ class BoxCoderKernel : public framework::OpKernel<T> {
  public:
   void EncodeCenterSize(const Tensor& target_box, const Tensor& prior_box,
                         const Tensor& prior_box_var, T* output) const {
-    PADDLE_ENFORCE_EQ(target_box.dims().size(), 2,
-                      "The rank of target_box must be 2.");
-    PADDLE_ENFORCE_EQ(prior_box.dims().size(), 2,
-                      "The rank of prior_box must be 2.");
-    PADDLE_ENFORCE_EQ(prior_box_var.dims().size(), 2,
-                      "The rank of prior_box_var must be 2.");
-    PADDLE_ENFORCE_EQ(prior_box.dims()[0], prior_box_var.dims()[0],
-                      "The dims of prior_box must equal to prior_box_var.");
-
     int64_t row = target_box.dims()[0];
     int64_t col = prior_box.dims()[0];
+    int64_t len = prior_box.dims()[1];
     auto* target_box_data = target_box.data<T>();
     auto* prior_box_data = prior_box.data<T>();
     auto* prior_box_var_data = prior_box_var.data<T>();
 
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
-        T prior_box_width = prior_box_data[j * 4 + 2] - prior_box_data[j * 4];
+        T prior_box_width =
+            prior_box_data[j * len + 2] - prior_box_data[j * len];
         T prior_box_height =
-            prior_box_data[j * 4 + 3] - prior_box_data[j * 4 + 1];
+            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
         T prior_box_center_x =
-            (prior_box_data[j * 4 + 2] + prior_box_data[j * 4]) / 2;
+            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
         T prior_box_center_y =
-            (prior_box_data[j * 4 + 3] + prior_box_data[j * 4 + 1]) / 2;
+            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
 
         T target_box_center_x =
-            (target_box_data[i * 4 + 2] + target_box_data[i * 4]) / 2;
+            (target_box_data[i * len + 2] + target_box_data[i * len]) / 2;
         T target_box_center_y =
-            (target_box_data[i * 4 + 3] + target_box_data[i * 4 + 1]) / 2;
+            (target_box_data[i * len + 3] + target_box_data[i * len + 1]) / 2;
         T target_box_width =
-            target_box_data[i * 4 + 2] - target_box_data[i * 4];
+            target_box_data[i * len + 2] - target_box_data[i * len];
         T target_box_height =
-            target_box_data[i * 4 + 3] - target_box_data[i * 4 + 1];
+            target_box_data[i * len + 3] - target_box_data[i * len + 1];
 
-        size_t offset = i * col * 4 + j * 4;
+        size_t offset = i * col * len + j * len;
         output[offset] = (target_box_center_x - prior_box_center_x) /
-                         prior_box_width / prior_box_var_data[j * 4];
+                         prior_box_width / prior_box_var_data[j * len];
         output[offset + 1] = (target_box_center_y - prior_box_center_y) /
-                             prior_box_height / prior_box_var_data[j * 4 + 1];
+                             prior_box_height / prior_box_var_data[j * len + 1];
         output[offset + 2] =
             std::log(std::fabs(target_box_width / prior_box_width)) /
-            prior_box_var_data[j * 4 + 2];
+            prior_box_var_data[j * len + 2];
         output[offset + 3] =
             std::log(std::fabs(target_box_height / prior_box_height)) /
-            prior_box_var_data[j * 4 + 3];
+            prior_box_var_data[j * len + 3];
       }
     }
   }
   void DecodeCenterSize(const Tensor& target_box, const Tensor& prior_box,
                         const Tensor& prior_box_var, T* output) const {
-    PADDLE_ENFORCE_EQ(target_box.dims().size(), 2,
-                      "The rank of target_box must be 2.");
-    PADDLE_ENFORCE_EQ(prior_box.dims().size(), 2,
-                      "The rank of prior_box must be 2.");
-    PADDLE_ENFORCE_EQ(prior_box_var.dims().size(), 2,
-                      "The rank of prior_box_var must be 2.");
-    PADDLE_ENFORCE_EQ(prior_box.dims()[0], prior_box_var.dims()[0],
-                      "The dims of prior_box must equal to prior_box_var.");
-
     int64_t row = target_box.dims()[0];
     int64_t col = prior_box.dims()[0];
+    int64_t len = prior_box.dims()[1];
 
     auto* target_box_data = target_box.data<T>();
     auto* prior_box_data = prior_box.data<T>();
@@ -103,29 +88,30 @@ class BoxCoderKernel : public framework::OpKernel<T> {
 
     for (int64_t i = 0; i < row; ++i) {
       for (int64_t j = 0; j < col; ++j) {
-        T prior_box_width = prior_box_data[j * 4 + 2] - prior_box_data[j * 4];
+        T prior_box_width =
+            prior_box_data[j * len + 2] - prior_box_data[j * len];
         T prior_box_height =
-            prior_box_data[j * 4 + 3] - prior_box_data[j * 4 + 1];
+            prior_box_data[j * len + 3] - prior_box_data[j * len + 1];
         T prior_box_center_x =
-            (prior_box_data[j * 4 + 2] + prior_box_data[j * 4]) / 2;
+            (prior_box_data[j * len + 2] + prior_box_data[j * len]) / 2;
         T prior_box_center_y =
-            (prior_box_data[j * 4 + 3] + prior_box_data[j * 4 + 1]) / 2;
+            (prior_box_data[j * len + 3] + prior_box_data[j * len + 1]) / 2;
 
-        T target_box_center_x = prior_box_var_data[j * 4] *
-                                    target_box_data[i * 4] * prior_box_width +
+        T target_box_center_x = prior_box_var_data[j * len] *
+                                    target_box_data[i * len] * prior_box_width +
                                 prior_box_center_x;
-        T target_box_center_y = prior_box_var_data[j * 4 + 1] *
-                                    target_box_data[i * 4 + 1] *
+        T target_box_center_y = prior_box_var_data[j * len + 1] *
+                                    target_box_data[i * len + 1] *
                                     prior_box_height +
                                 prior_box_center_y;
-        T target_box_width = std::exp(prior_box_var_data[j * 4 + 2] *
-                                      target_box_data[i * 4 + 2]) *
+        T target_box_width = std::exp(prior_box_var_data[j * len + 2] *
+                                      target_box_data[i * len + 2]) *
                              prior_box_width;
-        T target_box_height = std::exp(prior_box_var_data[j * 4 + 3] *
-                                       target_box_data[i * 4 + 3]) *
+        T target_box_height = std::exp(prior_box_var_data[j * len + 3] *
+                                       target_box_data[i * len + 3]) *
                               prior_box_height;
 
-        size_t offset = i * col * 4 + j * 4;
+        size_t offset = i * col * len + j * len;
         output[offset] = target_box_center_x - target_box_width / 2;
         output[offset + 1] = target_box_center_y - target_box_height / 2;
         output[offset + 2] = target_box_center_x + target_box_width / 2;
@@ -146,8 +132,9 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     }
     auto row = target_box->dims()[0];
     auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
 
-    output_box->mutable_data<T>({row, col, 4}, context.GetPlace());
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
 
     auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
     T* output = output_box->data<T>();

From c3e89f308a0cd0d694a1e4fed51dbeef92a156bb Mon Sep 17 00:00:00 2001
From: gaoyuan <gaoyuan18@baidu.com>
Date: Wed, 31 Jan 2018 22:28:49 +0800
Subject: [PATCH 036/102] update according to the code review

---
 paddle/operators/box_coder_op.cu              |  2 -
 .../v2/fluid/tests/test_box_coder_op.py       | 72 +++++++++++--------
 2 files changed, 41 insertions(+), 33 deletions(-)

diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu
index 9e2ea8cc67..f2ea592f8e 100644
--- a/paddle/operators/box_coder_op.cu
+++ b/paddle/operators/box_coder_op.cu
@@ -15,8 +15,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using platform::PADDLE_CUDA_NUM_THREADS;
-
 template <typename T>
 __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
                                        const T* prior_box_var_data,
diff --git a/python/paddle/v2/fluid/tests/test_box_coder_op.py b/python/paddle/v2/fluid/tests/test_box_coder_op.py
index fcf5da01ce..0dc18476fd 100644
--- a/python/paddle/v2/fluid/tests/test_box_coder_op.py
+++ b/python/paddle/v2/fluid/tests/test_box_coder_op.py
@@ -20,41 +20,51 @@ from op_test import OpTest
 
 
 def box_coder(target_box, prior_box, prior_box_var, output_box, code_type):
-    prior_box_x = (prior_box[:, 2] + prior_box[:, 0]) / 2
-    prior_box_y = (prior_box[:, 3] + prior_box[:, 1]) / 2
-    prior_box_width = (prior_box[:, 2] - prior_box[:, 0])
-    prior_box_height = (prior_box[:, 3] - prior_box[:, 1])
+    prior_box_x = (
+        (prior_box[:, 2] + prior_box[:, 0]) / 2).reshape(1, prior_box.shape[0])
+    prior_box_y = (
+        (prior_box[:, 3] + prior_box[:, 1]) / 2).reshape(1, prior_box.shape[0])
+    prior_box_width = (
+        (prior_box[:, 2] - prior_box[:, 0])).reshape(1, prior_box.shape[0])
+    prior_box_height = (
+        (prior_box[:, 3] - prior_box[:, 1])).reshape(1, prior_box.shape[0])
+    prior_box_var = prior_box_var.reshape(1, prior_box_var.shape[0],
+                                          prior_box_var.shape[1])
 
     if (code_type == "EncodeCenterSize"):
-        target_box_x = (target_box[:, 2] + target_box[:, 0]) / 2
-        target_box_y = (target_box[:, 3] + target_box[:, 1]) / 2
-        target_box_width = (target_box[:, 2] - target_box[:, 0])
-        target_box_height = (target_box[:, 3] - target_box[:, 1])
-
-        for i in range(target_box.shape[0]):
-            output_box[i,:,0] = (target_box_x[i] - prior_box_x) / prior_box_width / \
-                                prior_box_var[:,0]
-            output_box[i,:,1] = (target_box_y[i] - prior_box_y) / prior_box_height / \
-                                prior_box_var[:,1]
-            output_box[i,:,2] = np.log(np.fabs(target_box_width[i] / prior_box_width)) / \
-                                prior_box_var[:,2]
-            output_box[i,:,3] = np.log(np.fabs(target_box_height[i] / prior_box_height)) / \
-                                prior_box_var[:,3]
+        target_box_x = ((target_box[:, 2] + target_box[:, 0]) / 2).reshape(
+            target_box.shape[0], 1)
+        target_box_y = ((target_box[:, 3] + target_box[:, 1]) / 2).reshape(
+            target_box.shape[0], 1)
+        target_box_width = ((target_box[:, 2] - target_box[:, 0])).reshape(
+            target_box.shape[0], 1)
+        target_box_height = ((target_box[:, 3] - target_box[:, 1])).reshape(
+            target_box.shape[0], 1)
+
+        output_box[:,:,0] = (target_box_x - prior_box_x) / prior_box_width / \
+                prior_box_var[:,:,0]
+        output_box[:,:,1] = (target_box_y - prior_box_y) / prior_box_height / \
+                prior_box_var[:,:,1]
+        output_box[:,:,2] = np.log(np.fabs(target_box_width / prior_box_width)) / \
+                prior_box_var[:,:,2]
+        output_box[:,:,3] = np.log(np.fabs(target_box_height / prior_box_height)) / \
+                prior_box_var[:,:,3]
 
     elif (code_type == "DecodeCenterSize"):
-        for i in range(target_box.shape[0]):
-            target_box_x = prior_box_var[:,0] * target_box[i][0] * \
-                           prior_box_width[:] + prior_box_x[:]
-            target_box_y = prior_box_var[:,1] * target_box[i][1] * \
-                           prior_box_height[:] + prior_box_y[:]
-            target_box_width = np.exp(prior_box_var[:,2] * target_box[i][2]) * \
-                               prior_box_width[:]
-            target_box_height = np.exp(prior_box_var[:,3] * target_box[i][3]) * \
-                                prior_box_height[:]
-            output_box[i, :, 0] = target_box_x - target_box_width / 2
-            output_box[i, :, 1] = target_box_y - target_box_height / 2
-            output_box[i, :, 2] = target_box_x + target_box_width / 2
-            output_box[i, :, 3] = target_box_y + target_box_height / 2
+        target_box = target_box.reshape(target_box.shape[0], 1,
+                                        target_box.shape[1])
+        target_box_x = prior_box_var[:,:,0] * target_box[:,:,0] * \
+                       prior_box_width + prior_box_x
+        target_box_y = prior_box_var[:,:,1] * target_box[:,:,1] * \
+                       prior_box_height + prior_box_y
+        target_box_width = np.exp(prior_box_var[:,:,2] * target_box[:,:,2]) * \
+                           prior_box_width
+        target_box_height = np.exp(prior_box_var[:,:,3] * target_box[:,:,3]) * \
+                            prior_box_height
+        output_box[:, :, 0] = target_box_x - target_box_width / 2
+        output_box[:, :, 1] = target_box_y - target_box_height / 2
+        output_box[:, :, 2] = target_box_x + target_box_width / 2
+        output_box[:, :, 3] = target_box_y + target_box_height / 2
 
 
 def batch_box_coder(prior_box, prior_box_var, target_box, lod, code_type):

From e14272bbb355e9330e075e63f78cab348cc402b7 Mon Sep 17 00:00:00 2001
From: gaoyuan <gaoyuan18@baidu.com>
Date: Wed, 31 Jan 2018 22:46:23 +0800
Subject: [PATCH 037/102] update according to the code review

---
 paddle/operators/box_coder_op.cu | 93 +++++++++++++++++---------------
 1 file changed, 50 insertions(+), 43 deletions(-)

diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu
index f2ea592f8e..883cc54305 100644
--- a/paddle/operators/box_coder_op.cu
+++ b/paddle/operators/box_coder_op.cu
@@ -18,79 +18,85 @@ namespace operators {
 template <typename T>
 __global__ void EncodeCenterSizeKernel(const T* prior_box_data,
                                        const T* prior_box_var_data,
-                                       const T* target_box_data, int row,
-                                       int col, T* output) {
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < row * col) {
     const int row_idx = idx / col;
     const int col_idx = idx % col;
     T prior_box_width =
-        prior_box_data[col_idx * 4 + 2] - prior_box_data[col_idx * 4];
+        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
     T prior_box_height =
-        prior_box_data[col_idx * 4 + 3] - prior_box_data[col_idx * 4 + 1];
+        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
     T prior_box_center_x =
-        (prior_box_data[col_idx * 4 + 2] + prior_box_data[col_idx * 4]) / 2;
-    T prior_box_center_y =
-        (prior_box_data[col_idx * 4 + 3] + prior_box_data[col_idx * 4 + 1]) / 2;
+        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
+    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
+                            prior_box_data[col_idx * len + 1]) /
+                           2;
 
     T target_box_center_x =
-        (target_box_data[row_idx * 4 + 2] + target_box_data[row_idx * 4]) / 2;
-    T target_box_center_y =
-        (target_box_data[row_idx * 4 + 3] + target_box_data[row_idx * 4 + 1]) /
+        (target_box_data[row_idx * len + 2] + target_box_data[row_idx * len]) /
         2;
+    T target_box_center_y = (target_box_data[row_idx * len + 3] +
+                             target_box_data[row_idx * len + 1]) /
+                            2;
     T target_box_width =
-        target_box_data[row_idx * 4 + 2] - target_box_data[row_idx * 4];
+        target_box_data[row_idx * len + 2] - target_box_data[row_idx * len];
     T target_box_height =
-        target_box_data[row_idx * 4 + 3] - target_box_data[row_idx * 4 + 1];
+        target_box_data[row_idx * len + 3] - target_box_data[row_idx * len + 1];
 
-    output[idx * 4] = (target_box_center_x - prior_box_center_x) /
-                      prior_box_width / prior_box_var_data[col_idx * 4];
-    output[idx * 4 + 1] = (target_box_center_y - prior_box_center_y) /
-                          prior_box_height /
-                          prior_box_var_data[col_idx * 4 + 1];
-    output[idx * 4 + 2] = log(fabs(target_box_width / prior_box_width)) /
-                          prior_box_var_data[col_idx * 4 + 2];
-    output[idx * 4 + 3] = log(fabs(target_box_height / prior_box_height)) /
-                          prior_box_var_data[col_idx * 4 + 3];
+    output[idx * len] = (target_box_center_x - prior_box_center_x) /
+                        prior_box_width / prior_box_var_data[col_idx * len];
+    output[idx * len + 1] = (target_box_center_y - prior_box_center_y) /
+                            prior_box_height /
+                            prior_box_var_data[col_idx * len + 1];
+    output[idx * len + 2] = log(fabs(target_box_width / prior_box_width)) /
+                            prior_box_var_data[col_idx * len + 2];
+    output[idx * len + 3] = log(fabs(target_box_height / prior_box_height)) /
+                            prior_box_var_data[col_idx * len + 3];
   }
 }
 
 template <typename T>
 __global__ void DecodeCenterSizeKernel(const T* prior_box_data,
                                        const T* prior_box_var_data,
-                                       const T* target_box_data, int row,
-                                       int col, T* output) {
+                                       const T* target_box_data, const int row,
+                                       const int col, const int len,
+                                       T* output) {
   const int idx = threadIdx.x + blockIdx.x * blockDim.x;
   if (idx < row * col) {
     const int row_idx = idx / col;
     const int col_idx = idx % col;
     T prior_box_width =
-        prior_box_data[col_idx * 4 + 2] - prior_box_data[col_idx * 4];
+        prior_box_data[col_idx * len + 2] - prior_box_data[col_idx * len];
     T prior_box_height =
-        prior_box_data[col_idx * 4 + 3] - prior_box_data[col_idx * 4 + 1];
+        prior_box_data[col_idx * len + 3] - prior_box_data[col_idx * len + 1];
     T prior_box_center_x =
-        (prior_box_data[col_idx * 4 + 2] + prior_box_data[col_idx * 4]) / 2;
-    T prior_box_center_y =
-        (prior_box_data[col_idx * 4 + 3] + prior_box_data[col_idx * 4 + 1]) / 2;
+        (prior_box_data[col_idx * len + 2] + prior_box_data[col_idx * len]) / 2;
+    T prior_box_center_y = (prior_box_data[col_idx * len + 3] +
+                            prior_box_data[col_idx * len + 1]) /
+                           2;
 
-    T target_box_width = exp(prior_box_var_data[col_idx * 4 + 2] *
-                             target_box_data[row_idx * 4 + 2]) *
+    T target_box_width = exp(prior_box_var_data[col_idx * len + 2] *
+                             target_box_data[row_idx * len + 2]) *
                          prior_box_width;
-    T target_box_height = exp(prior_box_var_data[col_idx * 4 + 3] *
-                              target_box_data[row_idx * 4 + 3]) *
+    T target_box_height = exp(prior_box_var_data[col_idx * len + 3] *
+                              target_box_data[row_idx * len + 3]) *
                           prior_box_height;
-    T target_box_center_x = prior_box_var_data[col_idx * 4] *
-                                target_box_data[row_idx * 4] * prior_box_width +
+    T target_box_center_x = prior_box_var_data[col_idx * len] *
+                                target_box_data[row_idx * len] *
+                                prior_box_width +
                             prior_box_center_x;
-    T target_box_center_y = prior_box_var_data[col_idx * 4 + 1] *
-                                target_box_data[row_idx * 4 + 1] *
+    T target_box_center_y = prior_box_var_data[col_idx * len + 1] *
+                                target_box_data[row_idx * len + 1] *
                                 prior_box_height +
                             prior_box_center_y;
 
-    output[idx * 4] = target_box_center_x - target_box_width / 2;
-    output[idx * 4 + 1] = target_box_center_y - target_box_height / 2;
-    output[idx * 4 + 2] = target_box_center_x + target_box_width / 2;
-    output[idx * 4 + 3] = target_box_center_y + target_box_height / 2;
+    output[idx * len] = target_box_center_x - target_box_width / 2;
+    output[idx * len + 1] = target_box_center_y - target_box_height / 2;
+    output[idx * len + 2] = target_box_center_x + target_box_width / 2;
+    output[idx * len + 3] = target_box_center_y + target_box_height / 2;
   }
 }
 
@@ -111,6 +117,7 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     }
     auto row = target_box->dims()[0];
     auto col = prior_box->dims()[0];
+    auto len = prior_box->dims()[1];
     int block = 512;
     int grid = (row * col + block - 1) / block;
     auto& device_ctx = context.cuda_device_context();
@@ -119,17 +126,17 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     const T* prior_box_var_data = prior_box_var->data<T>();
     const T* target_box_data = target_box->data<T>();
 
-    output_box->mutable_data<T>({row, col, 4}, context.GetPlace());
+    output_box->mutable_data<T>({row, col, len}, context.GetPlace());
     T* output = output_box->data<T>();
 
     auto code_type = GetBoxCodeType(context.Attr<std::string>("code_type"));
     if (code_type == BoxCodeType::kEncodeCenterSize) {
       EncodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
-          prior_box_data, prior_box_var_data, target_box_data, row, col,
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
           output);
     } else if (code_type == BoxCodeType::kDecodeCenterSize) {
       DecodeCenterSizeKernel<T><<<grid, block, 0, device_ctx.stream()>>>(
-          prior_box_data, prior_box_var_data, target_box_data, row, col,
+          prior_box_data, prior_box_var_data, target_box_data, row, col, len,
           output);
     }
   }

From ff5570c12605b5b983cb9308ae507e1bb214143c Mon Sep 17 00:00:00 2001
From: wanghaox <wanghaox@gmail.com>
Date: Thu, 1 Feb 2018 11:29:26 +0800
Subject: [PATCH 038/102] update mine_hard_examples_op

---
 paddle/operators/mine_hard_examples_op.cc     | 234 ++++++++++++++----
 paddle/operators/mine_hard_examples_op.h      | 148 -----------
 .../fluid/tests/test_mine_hard_examples_op.py |  29 +--
 3 files changed, 202 insertions(+), 209 deletions(-)
 delete mode 100755 paddle/operators/mine_hard_examples_op.h

diff --git a/paddle/operators/mine_hard_examples_op.cc b/paddle/operators/mine_hard_examples_op.cc
index 75098d0bcd..603368f93c 100644
--- a/paddle/operators/mine_hard_examples_op.cc
+++ b/paddle/operators/mine_hard_examples_op.cc
@@ -12,41 +12,178 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/operators/mine_hard_examples_op.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
+enum MiningType { kNone = 0, kMaxNegative, kHardExample };
+
+template <typename T>
+bool SortScoreDescend(const std::pair<float, T>& pair1,
+                      const std::pair<float, T>& pair2) {
+  return pair1.first > pair2.first;
+}
+
+inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
+                             const float match_dist,
+                             const float neg_dist_threshold) {
+  if (mining_type == MiningType::kMaxNegative) {
+    return match_idx == -1 && match_dist < neg_dist_threshold;
+  } else if (mining_type == MiningType::kHardExample) {
+    return true;
+  } else {
+    return false;
+  }
+}
+
+MiningType GetMiningType(std::string str) {
+  if (str == "max_negative") {
+    return MiningType::kMaxNegative;
+  } else if (str == "hard_example") {
+    return MiningType::kHardExample;
+  } else {
+    return MiningType::kNone;
+  }
+}
+
+template <typename DeviceContext, typename T>
+class MineHardExamplesKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
+    auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
+    auto* in_matched_indices = ctx.Input<framework::Tensor>("MatchIndices");
+    auto* in_match_dist = ctx.Input<framework::Tensor>("MatchDist");
+    float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
+    T neg_dist_threshold =
+        static_cast<T>(ctx.Attr<float>("neg_dist_threshold"));
+    int sample_size = ctx.Attr<int>("sample_size");
+    MiningType mining_type =
+        GetMiningType(ctx.Attr<std::string>("mining_type"));
+
+    auto out_neg_indices = ctx.Output<framework::LoDTensor>("NegIndices");
+    auto out_match_indices =
+        ctx.Output<framework::Tensor>("UpdatedMatchIndices");
+
+    framework::Copy(*in_matched_indices, ctx.GetPlace(), out_match_indices);
+
+    int batch_size = in_matched_indices->dims()[0];
+    int prior_num = in_matched_indices->dims()[1];
+
+    auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indices);
+
+    auto match_indices_et =
+        framework::EigenMatrix<int>::From(*out_match_indices);
+
+    auto match_dist = framework::EigenMatrix<T>::From(*in_match_dist);
+
+    const T* cls_loss = in_cls_loss->data<T>();
+    const T* loc_loss = nullptr;
+    if (in_loc_loss) {
+      loc_loss = in_loc_loss->data<T>();
+    }
+
+    std::vector<std::vector<int>> all_neg_indices;
+    std::vector<size_t> batch_starts = {0};
+    for (int n = 0; n < batch_size; ++n) {
+      std::vector<std::pair<T, size_t>> loss_idx;
+      int neg_sel = 0;
+      for (int m = 0; m < prior_num; ++m) {
+        if (IsEligibleMining(mining_type, match_indices(n, m), match_dist(n, m),
+                             neg_dist_threshold)) {
+          T loss = cls_loss[n * prior_num + m];
+          if (mining_type == MiningType::kHardExample && loc_loss != nullptr) {
+            loss = cls_loss[n * prior_num + m] + loc_loss[n * prior_num + m];
+          }
+          loss_idx.push_back(std::make_pair(loss, m));
+          ++neg_sel;
+        }
+      }
+
+      if (mining_type == MiningType::kMaxNegative) {
+        int num_pos = 0;
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) != -1) ++num_pos;
+        }
+        neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
+      } else if (mining_type == MiningType::kHardExample) {
+        neg_sel = std::min(sample_size, neg_sel);
+      }
+
+      std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<int>);
+      std::set<int> sel_indices;
+      std::vector<int> neg_indices;
+      std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
+                     std::inserter(sel_indices, sel_indices.begin()),
+                     [](std::pair<T, size_t> l) -> int {
+                       return static_cast<int>(l.second);
+                     });
+
+      for (int m = 0; m < prior_num; ++m) {
+        if (match_indices(n, m) > -1) {
+          if (mining_type == MiningType::kHardExample &&
+              sel_indices.find(m) == sel_indices.end()) {
+            match_indices_et(n, m) = -1;
+          }
+        } else {
+          if (sel_indices.find(m) != sel_indices.end()) {
+            neg_indices.push_back(m);
+          }
+        }
+      }
+      all_neg_indices.push_back(neg_indices);
+      batch_starts.push_back(batch_starts.back() + neg_indices.size());
+    }
+
+    framework::LoD out_neg_indices_lod;
+    out_neg_indices_lod.emplace_back(batch_starts);
+    int neg_offset = 0;
+    auto neg_data = out_neg_indices->mutable_data<int>(
+        framework::make_ddim({static_cast<int>(batch_starts.back()), 1}),
+        ctx.GetPlace());
+
+    for (auto neg_indices : all_neg_indices) {
+      std::copy(neg_indices.begin(), neg_indices.end(), neg_data + neg_offset);
+      neg_offset += neg_indices.size();
+    }
+    out_neg_indices->set_lod(out_neg_indices_lod);
+    return;
+  }
+};
+
 class MineHardExamplesOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("ClsLoss"),
                    "Input(ClsLoss) of MineHardExamplesOp should not be null.");
     PADDLE_ENFORCE(
-        ctx->HasInput("MatchIndics"),
-        "Input(MatchIndics) of MineHardExamplesOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("MatchDis"),
-                   "Input(MatchDis) of MineHardExamplesOp should not be null.");
+        ctx->HasInput("MatchIndices"),
+        "Input(MatchIndices) of MineHardExamplesOp should not be null.");
     PADDLE_ENFORCE(
-        ctx->HasOutput("NegIndics"),
-        "Output(NegIndics) of MineHardExamplesOp should not be null.");
+        ctx->HasInput("MatchDist"),
+        "Input(MatchDist) of MineHardExamplesOp should not be null.");
     PADDLE_ENFORCE(
-        ctx->HasOutput("UpdatedMatchIndics"),
-        "Output(UpdatedMatchIndics) of MineHardExamplesOp should not be null.");
+        ctx->HasOutput("NegIndices"),
+        "Output(NegIndices) of MineHardExamplesOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("UpdatedMatchIndices"),
+                   "Output(UpdatedMatchIndices) of MineHardExamplesOp should "
+                   "not be null.");
 
     auto cls_loss_dims = ctx->GetInputDim("ClsLoss");
-    auto idx_dims = ctx->GetInputDim("MatchIndics");
-    auto dis_dims = ctx->GetInputDim("MatchDis");
+    auto idx_dims = ctx->GetInputDim("MatchIndices");
+    auto dis_dims = ctx->GetInputDim("MatchDist");
 
     PADDLE_ENFORCE_EQ(cls_loss_dims.size(), 2UL,
                       "The shape of ClsLoss is [N, Np].");
     PADDLE_ENFORCE_EQ(idx_dims.size(), 2UL,
-                      "The shape of MatchIndics is [N, Np].");
+                      "The shape of MatchIndices is [N, Np].");
     PADDLE_ENFORCE_EQ(dis_dims.size(), 2UL,
-                      "The shape of MatchDis is [N, Np].");
+                      "The shape of MatchDist is [N, Np].");
 
     if (ctx->HasInput("LocLoss")) {
       auto loc_loss_dims = ctx->GetInputDim("LocLoss");
@@ -61,16 +198,16 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
 
     PADDLE_ENFORCE_EQ(
         cls_loss_dims[0], idx_dims[0],
-        "Batch size of ClsLoss and MatchIndics must be the same.");
+        "Batch size of ClsLoss and MatchIndices must be the same.");
     PADDLE_ENFORCE_EQ(
         cls_loss_dims[1], idx_dims[1],
-        "Prior box number of ClsLoss and MatchIndics must be the same.");
+        "Prior box number of ClsLoss and MatchIndices must be the same.");
 
     PADDLE_ENFORCE_EQ(cls_loss_dims[0], dis_dims[0],
-                      "Batch size of ClsLoss and MatchDis must be the same.");
+                      "Batch size of ClsLoss and MatchDist must be the same.");
     PADDLE_ENFORCE_EQ(
         cls_loss_dims[1], idx_dims[1],
-        "Prior box number of ClsLoss and MatchDis must be the same.");
+        "Prior box number of ClsLoss and MatchDist must be the same.");
 
     auto mining_type =
         GetMiningType(ctx->Attrs().Get<std::string>("mining_type"));
@@ -80,13 +217,13 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
 
     if (mining_type == MiningType::kMaxNegative) {
       auto neg_pos_ratio = ctx->Attrs().Get<float>("neg_pos_ratio");
-      auto neg_dis_threshold = ctx->Attrs().Get<float>("neg_dis_threshold");
+      auto neg_dist_threshold = ctx->Attrs().Get<float>("neg_dist_threshold");
       PADDLE_ENFORCE_GT(
           neg_pos_ratio, 0.0f,
           "neg_pos_ratio must greater than zero in max_negative mode");
       PADDLE_ENFORCE_GT(
-          neg_dis_threshold, 0.0f,
-          "neg_dis_threshold must greater than zero in max_negative mode");
+          neg_dist_threshold, 0.0f,
+          "neg_dist_threshold must greater than zero in max_negative mode");
     } else if (mining_type == MiningType::kHardExample) {
       auto sample_size = ctx->Attrs().Get<int>("sample_size");
       PADDLE_ENFORCE_GT(
@@ -94,12 +231,12 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
           "sample_size must greater than zero in hard_example mode");
     }
 
-    ctx->SetOutputDim("UpdatedMatchIndics", idx_dims);
+    ctx->SetOutputDim("UpdatedMatchIndices", idx_dims);
   }
 
  protected:
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
+      const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("ClsLoss")->type()),
         ctx.device_context());
@@ -108,30 +245,31 @@ class MineHardExamplesOp : public framework::OperatorWithKernel {
 
 class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MineHardExamplesOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  MineHardExamplesOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput(
         "ClsLoss",
-        "(Tensor, default Tensor<float>), The classification loss wit shape "
+        "(Tensor, default Tensor<float>), The classification loss with shape "
         "[N, Np], N is the batch size and Np is the number of prior box.");
     AddInput("LocLoss",
              "(Tensor, optional, default Tensor<float>), The localization loss "
              "wit shape [N, Np], N is the batch size and Np is the number of "
              "prior box.")
         .AsDispensable();
-    AddInput("MatchIndics",
+    AddInput("MatchIndices",
              "(Tensor, Tensor<int>), Matched indices with shape [N, Np], N is "
              "the batch size and Np is the number of prior box. "
-             "MatchIndics[i][j] equal -1 means box[j] does not match any "
-             "entity, otherwise means Box[j] is matched to row.");
-    AddInput("MatchDis",
+             "MatchIndices[i][j] equal -1 means the j-th prior box in i-th "
+             "instance does not match any entity, otherwise means it is "
+             "matched to row.");
+    AddInput("MatchDist",
              "(Tensor, default Tensor<float>) Matched indices with shape [N, "
              "Np], N is the batch size and Np is the number of prior box.");
     AddAttr<float>("neg_pos_ratio",
                    "(float) The ratio of the negative box to the positive "
                    "box. Use only when mining_type is equal to max_negative.")
         .SetDefault(1.0);
-    AddAttr<float>("neg_dis_threshold",
+    AddAttr<float>("neg_dist_threshold",
                    "(float) The negative box dis value threshold. "
                    "Use only when mining_type is equal to max_negative.")
         .SetDefault(0.5);
@@ -145,29 +283,31 @@ class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault("max_negative")
         .InEnum({"hard_example", "max_negative"});
 
-    AddOutput("NegIndics",
-              "(LoDTensor) The output of negative example indics.a lod tensor "
-              "with shape [Neg, 1]. The size of lod[0] is batch size, "
-              "and each element is the box index. "
-              "For example, the batch size is 2, the lod is [[0, 1, 2]], "
-              "the sample 0's box 1(MatchIndics[0][1]) is selected, "
-              "and sample 1's box 0 is selected. The output NegIndics is "
-              "[[1], [0]].");
-
-    AddOutput("UpdatedMatchIndics",
-              "(Tensor) The output of updated MatchIndics, a tensor with "
-              "shape [N, M]. Only update when mining_type is equal to "
-              "hard_example. The input MatchIndics elements will be update to "
-              "-1 when it not in the highest loss list");
+    AddOutput(
+        "NegIndices",
+        "(LoDTensor<int>) The output of negative example indices. a LoDTensor "
+        "with shape [Neg, 1]. The size of lod[0] minus 1 is batch size, "
+        "and each element is the prior box index. "
+        "For example, the batch size is 2, the lod is [[0, 1, 2]], "
+        "the sample 0's box 1(MatchIndices[0][1]) is selected, "
+        "and sample 1's box 0 is selected. The output NegIndices is "
+        "[[1], [0]].");
+
+    AddOutput("UpdatedMatchIndices",
+              "(Tensor<int>) The output of updated MatchIndices, a tensor with "
+              "shape [N, Np]. Only update when mining_type is equal to "
+              "hard_example. The input MatchIndices elements will be update to "
+              "-1 when it is not in the candidate high loss list of negative "
+              "examples.");
 
     AddComment(R"DOC(
 Mine hard examples Operator.
-This operator implements hard example mining to select a subset of negative box indics.
+This operator implements hard example mining to select a subset of negative box indices.
 For each image, selects the box with highest losses. subject to the condition that the box cannot have
-an MatchDis > neg_dis_threshold when mining_type is equals max_negative. The selected number is 
+an Matcht > neg_dist_threshold when mining_type is equals max_negative. The selected number is 
 min(sample_size, max_negative_box_number) when mining_type is equals hard_example,
 or min(neg_pos_ratio * positive_box_number, max_negative_box_number) when mining_type is 
-equals max_negative, where the max_negative_box_number is the count of MatchIndics elements with value -1.
+equals max_negative, where the max_negative_box_number is the count of MatchIndices elements with value -1.
 )DOC");
   }
 };
diff --git a/paddle/operators/mine_hard_examples_op.h b/paddle/operators/mine_hard_examples_op.h
deleted file mode 100755
index 0a652a60c5..0000000000
--- a/paddle/operators/mine_hard_examples_op.h
+++ /dev/null
@@ -1,148 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-enum MiningType { kNone = 0, kMaxNegative, kHardExample };
-
-template <typename T>
-bool SortScoreDescend(const std::pair<float, T>& pair1,
-                      const std::pair<float, T>& pair2) {
-  return pair1.first > pair2.first;
-}
-
-inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
-                             const float match_dis,
-                             const float neg_dis_threshold) {
-  if (mining_type == MiningType::kMaxNegative) {
-    return match_idx == -1 && match_dis < neg_dis_threshold;
-  } else if (mining_type == MiningType::kHardExample) {
-    return true;
-  } else {
-    return false;
-  }
-}
-
-MiningType GetMiningType(std::string str) {
-  if (str == "max_negative") {
-    return MiningType::kMaxNegative;
-  } else if (str == "hard_example") {
-    return MiningType::kHardExample;
-  } else {
-    return MiningType::kNone;
-  }
-}
-
-template <typename DeviceContext, typename T>
-class MineHardExamplesKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* in_cls_loss = ctx.Input<framework::Tensor>("ClsLoss");
-    auto* in_loc_loss = ctx.Input<framework::Tensor>("LocLoss");
-    auto* in_matched_indics = ctx.Input<framework::Tensor>("MatchIndics");
-    auto* in_match_dis = ctx.Input<framework::Tensor>("MatchDis");
-    float neg_pos_ratio = ctx.Attr<float>("neg_pos_ratio");
-    T neg_dis_threshold = static_cast<T>(ctx.Attr<float>("neg_dis_threshold"));
-    int sample_size = ctx.Attr<int>("sample_size");
-    MiningType mining_type =
-        GetMiningType(ctx.Attr<std::string>("mining_type"));
-
-    auto out_neg_indics = ctx.Output<framework::LoDTensor>("NegIndics");
-    auto out_match_indics = ctx.Output<framework::Tensor>("UpdatedMatchIndics");
-
-    framework::Copy(*in_matched_indics, ctx.GetPlace(), out_match_indics);
-
-    int batch_size = in_matched_indics->dims()[0];
-    int prior_num = in_matched_indics->dims()[1];
-
-    auto match_indices = framework::EigenMatrix<int>::From(*in_matched_indics);
-
-    auto match_indices_et =
-        framework::EigenMatrix<int>::From(*out_match_indics);
-
-    auto match_dis = framework::EigenMatrix<float>::From(*in_match_dis);
-    auto cls_loss = framework::EigenMatrix<float>::From(*in_cls_loss);
-    auto loc_loss = framework::EigenMatrix<float>::From(*in_loc_loss);
-
-    std::vector<std::vector<int>> all_neg_indices;
-    int all_neg_num = 0;
-    for (int n = 0; n < batch_size; ++n) {
-      std::vector<std::pair<float, size_t>> loss_idx;
-      int neg_sel = 0;
-      for (int m = 0; m < prior_num; ++m) {
-        if (IsEligibleMining(mining_type, match_indices(n, m), match_dis(n, m),
-                             neg_dis_threshold)) {
-          T loss = cls_loss(n, m);
-          if (mining_type == MiningType::kHardExample) {
-            loss = cls_loss(n, m) + loc_loss(n, m);
-          }
-          loss_idx.push_back(std::make_pair(loss, m));
-          ++neg_sel;
-        }
-      }
-      if (mining_type == MiningType::kMaxNegative) {
-        int num_pos = 0;
-        for (int m = 0; m < prior_num; ++m) {
-          if (match_indices(n, m) != -1) ++num_pos;
-        }
-        neg_sel = std::min(static_cast<int>(num_pos * neg_pos_ratio), neg_sel);
-      } else if (mining_type == MiningType::kHardExample) {
-        neg_sel = std::min(sample_size, neg_sel);
-      }
-      std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<int>);
-      std::set<int> sel_indices;
-      std::vector<int> neg_indices;
-      for (int n = 0; n < neg_sel; ++n) {
-        sel_indices.insert(loss_idx[n].second);
-      }
-
-      for (int m = 0; m < prior_num; ++m) {
-        if (match_indices(n, m) > -1) {
-          if (mining_type == MiningType::kHardExample &&
-              sel_indices.find(m) == sel_indices.end()) {
-            match_indices_et(n, m) = -1;
-          }
-        } else {
-          if (sel_indices.find(m) != sel_indices.end()) {
-            neg_indices.push_back(m);
-          }
-        }
-      }
-      all_neg_indices.push_back(neg_indices);
-      all_neg_num += neg_indices.size();
-    }
-
-    framework::LoD out_neg_indics_lod;
-    out_neg_indics_lod.resize(1);
-    int neg_offset = 0;
-    auto neg_data = out_neg_indics->mutable_data<int>(
-        framework::make_ddim({all_neg_num, 1}), ctx.GetPlace());
-    out_neg_indics_lod[0].push_back(neg_offset);
-    for (auto neg_indices : all_neg_indices) {
-      for (auto neg_idx : neg_indices) {
-        neg_data[neg_offset++] = neg_idx;
-      }
-      out_neg_indics_lod[0].push_back(neg_offset);
-    }
-    out_neg_indics->set_lod(out_neg_indics_lod);
-    return;
-  }
-};
-}  // namespace operators
-
-}  // namespace paddle
diff --git a/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
index e7dd04740a..c27573c3d6 100755
--- a/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
+++ b/python/paddle/v2/fluid/tests/test_mine_hard_examples_op.py
@@ -1,16 +1,17 @@
-#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
-#    http://www.apache.org/licenses/LICENSE-2.0
+#     http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import unittest
 import numpy as np
 import sys
@@ -24,8 +25,8 @@ class TestMineHardExamplesOp(OpTest):
         self.inputs = {
             'ClsLoss': self.cls_loss,
             'LocLoss': self.loc_loss,
-            'MatchIndics': self.match_indices,
-            'MatchDis': self.match_dis
+            'MatchIndices': self.match_indices,
+            'MatchDist': self.match_dis
         }
 
         self.attrs = {
@@ -36,8 +37,8 @@ class TestMineHardExamplesOp(OpTest):
         }
 
         self.outputs = {
-            'NegIndics': (self.neg_indices, self.neg_indices_lod),
-            'UpdatedMatchIndics': self.updated_match_indices
+            'NegIndices': (self.neg_indices, self.neg_indices_lod),
+            'UpdatedMatchIndices': self.updated_match_indices
         }
 
     def test_check_output(self):

From d6edfd0ce30615c7c15332ed07dbd9dd5996e430 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Thu, 1 Feb 2018 12:47:01 +0800
Subject: [PATCH 039/102] update points

---
 benchmark/cluster/vgg16/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index 0c404e60a8..b999a51809 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -42,8 +42,8 @@
 
 | Trainer Counter | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 291.06 | 518.80 | 836.26 | 1019.29 |
-| PaddlePaddle v2 (need more tests) | 356.28 | 785.39 | 853.30 | 1041.99 |
+| PaddlePaddle Fluid | 263.29 | 518.80 | 836.26 | 1019.29 |
+| PaddlePaddle v2 (need more tests) | 326.85 | 534.58 | 853.30 | 1041.99 |
 | TensorFlow | - | - | - | - |
 
 ### different pserver number

From 355ecaf38b2aab763428e47faa798b10f45f7c69 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Thu, 1 Feb 2018 14:15:41 +0800
Subject: [PATCH 040/102] fix style check

---
 benchmark/cluster/vgg16/vgg16_fluid.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 51a01af672..87a151db21 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -170,10 +170,11 @@ def main():
                     y_data = np.array(map(lambda x: x[1], data)).astype("int64")
                     y_data = y_data.reshape([-1, 1])
 
-                    loss, acc = exe.run(trainer_prog,
-                                        feed={"pixel": img_data,
-                                            "label": y_data},
-                                        fetch_list=[avg_cost] + accuracy.metrics)
+                    loss, acc = exe.run(
+                        trainer_prog,
+                        feed={"pixel": img_data,
+                              "label": y_data},
+                        fetch_list=[avg_cost] + accuracy.metrics)
                     iters += 1
                     num_samples += len(data)
                     print(

From b7fbb91f069bfda8658ac7341111a53615b7903f Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Thu, 1 Feb 2018 14:36:19 +0800
Subject: [PATCH 041/102] follow comments

---
 benchmark/cluster/vgg16/README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index b999a51809..69a242e305 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -20,7 +20,7 @@
 | PaddlePaddle v2 | 15.97 | 17.04 | 17.60 | 17.83 |
 | TensorFlow | - | - | - | - |
 
-### different batch size
+### Different Batch Size
 
 - PServer Count: 10
 - Trainer Count: 20
@@ -34,7 +34,7 @@
 | TensorFlow | - | - | - | - |
 
 
-### Accelerate rate
+### Accelerate Rate
 
 - Pserver Count: 20
 - Batch Size: 128
@@ -42,11 +42,11 @@
 
 | Trainer Counter | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 263.29 | 518.80 | 836.26 | 1019.29 |
-| PaddlePaddle v2 (need more tests) | 326.85 | 534.58 | 853.30 | 1041.99 |
+| PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
+| PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
 | TensorFlow | - | - | - | - |
 
-### different pserver number
+### Different Pserver Number
 
 - Trainer Count: 100
 - Batch Size: 128

From d11e7b434f1272ecff05156de70eabaca26cc1f1 Mon Sep 17 00:00:00 2001
From: Yang Yu <yuyang18@baidu.com>
Date: Thu, 1 Feb 2018 15:00:52 +0800
Subject: [PATCH 042/102] Make recognize digits as a normal python unittest

---
 paddle/inference/tests/book/CMakeLists.txt    |  2 +-
 python/paddle/v2/fluid/tests/book/.gitignore  |  1 +
 .../paddle/v2/fluid/tests/book/CMakeLists.txt | 26 +------
 .../fluid/tests/book/test_recognize_digits.py | 67 +++++++++++++++----
 4 files changed, 56 insertions(+), 40 deletions(-)
 create mode 100644 python/paddle/v2/fluid/tests/book/.gitignore

diff --git a/paddle/inference/tests/book/CMakeLists.txt b/paddle/inference/tests/book/CMakeLists.txt
index d3798fb8fd..0e987eb024 100644
--- a/paddle/inference/tests/book/CMakeLists.txt
+++ b/paddle/inference/tests/book/CMakeLists.txt
@@ -4,4 +4,4 @@ cc_test(test_inference_recognize_digits_mlp
     DEPS ARCHIVE_START paddle_fluid ARCHIVE_END
     ARGS --dirname=${PYTHON_TESTS_DIR}/book/recognize_digits_mlp.inference.model)
 set_tests_properties(test_inference_recognize_digits_mlp
-    PROPERTIES DEPENDS test_recognize_digits_mlp_cpu)
+    PROPERTIES DEPENDS test_recognize_digits)
diff --git a/python/paddle/v2/fluid/tests/book/.gitignore b/python/paddle/v2/fluid/tests/book/.gitignore
new file mode 100644
index 0000000000..f0b574b939
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book/.gitignore
@@ -0,0 +1 @@
+recognize_digits_*.inference.model
diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
index dda02c03fd..a35abe3e0c 100644
--- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt
@@ -1,33 +1,9 @@
 file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
-list(REMOVE_ITEM TEST_OPS test_image_classification_train test_recognize_digits)
+list(REMOVE_ITEM TEST_OPS test_image_classification_train)
 py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet)
 py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg)
-py_test(test_recognize_digits_mlp_cpu
-  SRCS test_recognize_digits.py
-  ARGS mlp)
-py_test(test_recognize_digits_mlp_cuda
-  SRCS test_recognize_digits.py
-  ARGS mlp --use_cuda)
-py_test(test_recognize_digits_conv_cpu
-  SRCS test_recognize_digits.py
-  ARGS conv)
-py_test(test_recognize_digits_conv_cuda
-  SRCS test_recognize_digits.py
-  ARGS conv --use_cuda)
-py_test(test_recognize_digits_mlp_cpu_parallel
-  SRCS test_recognize_digits.py
-  ARGS mlp --parallel)
-py_test(test_recognize_digits_mlp_cuda_parallel
-  SRCS test_recognize_digits.py
-  ARGS mlp --use_cuda --parallel)
-py_test(test_recognize_digits_conv_cpu_parallel
-  SRCS test_recognize_digits.py
-  ARGS conv --parallel)
-py_test(test_recognize_digits_conv_cuda_parallel
-  SRCS test_recognize_digits.py
-  ARGS conv --use_cuda --parallel)
 
 # default test
 foreach(src ${TEST_OPS})
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
index b4b6020f58..b8f55c813b 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -17,6 +17,7 @@ import paddle.v2.fluid as fluid
 import paddle.v2 as paddle
 import sys
 import numpy
+import unittest
 
 
 def parse_arg():
@@ -74,18 +75,18 @@ def conv_net(img, label):
     return loss_net(conv_pool_2, label)
 
 
-def train(args, save_dirname=None):
-    print("recognize digits with args: {0}".format(" ".join(sys.argv[1:])))
-
+def train(nn_type, use_cuda, parallel, save_dirname):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
     img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
     label = fluid.layers.data(name='label', shape=[1], dtype='int64')
 
-    if args.nn_type == 'mlp':
+    if nn_type == 'mlp':
         net_conf = mlp
     else:
         net_conf = conv_net
 
-    if args.parallel:
+    if parallel:
         places = fluid.layers.get_places()
         pd = fluid.layers.ParallelDo(places)
         with pd.do():
@@ -107,7 +108,7 @@ def train(args, save_dirname=None):
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
     optimizer.minimize(avg_loss)
 
-    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
@@ -147,13 +148,14 @@ def train(args, save_dirname=None):
                         'PassID {0:1}, BatchID {1:04}, Test Loss {2:2.2}, Acc {3:2.2}'.
                         format(pass_id, batch_id + 1,
                                float(avg_loss_val), float(acc_val)))
+    raise AssertionError("Loss of recognize digits is too large")
 
 
-def infer(args, save_dirname=None):
+def infer(use_cuda, save_dirname=None):
     if save_dirname is None:
         return
 
-    place = fluid.CUDAPlace(0) if args.use_cuda else fluid.CPUPlace()
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     exe = fluid.Executor(place)
 
     # Use fluid.io.load_inference_model to obtain the inference program desc,
@@ -174,11 +176,48 @@ def infer(args, save_dirname=None):
     print("infer results: ", results[0])
 
 
-if __name__ == '__main__':
-    args = parse_arg()
-    if not args.use_cuda and not args.parallel:
-        save_dirname = "recognize_digits_" + args.nn_type + ".inference.model"
+def main(use_cuda, parallel, nn_type):
+    if not use_cuda and not parallel:
+        save_dirname = "recognize_digits_" + nn_type + ".inference.model"
     else:
         save_dirname = None
-    train(args, save_dirname)
-    infer(args, save_dirname)
+
+    train(
+        nn_type=nn_type,
+        use_cuda=use_cuda,
+        parallel=parallel,
+        save_dirname=save_dirname)
+    infer(use_cuda=use_cuda, save_dirname=save_dirname)
+
+
+class TestRecognizeDigits(unittest.TestCase):
+    pass
+
+
+def inject_test_method(use_cuda, parallel, nn_type):
+    def __impl__(self):
+        prog = fluid.Program()
+        startup_prog = fluid.Program()
+        scope = fluid.core.Scope()
+        with fluid.scope_guard(scope):
+            with fluid.program_guard(prog, startup_prog):
+                main(use_cuda, parallel, nn_type)
+
+    fn = 'test_{0}_{1}_{2}'.format(nn_type, 'cuda'
+                                   if use_cuda else 'cpu', 'parallel'
+                                   if parallel else 'normal')
+
+    setattr(TestRecognizeDigits, fn, __impl__)
+
+
+def inject_all_tests():
+    for use_cuda in (False, True):
+        for parallel in (False, True):
+            for nn_type in ('mlp', 'conv'):
+                inject_test_method(use_cuda, parallel, nn_type)
+
+
+inject_all_tests()
+
+if __name__ == '__main__':
+    unittest.main()

From c98b40e4783a9222674c280c957837b1255c2844 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 1 Feb 2018 16:06:40 +0800
Subject: [PATCH 043/102] clean code

---
 benchmark/cluster/vgg16/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index 333e14250b..725ce59025 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -55,7 +55,7 @@
 | PServer Count | 3 | 6 |10 | 20 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | 412.2 | 368.4 | 346.8 | 283.2 |
+| PaddlePaddle v2 | - | - | 729.7 | - |
 | TensorFlow | - | - | - | - |
 
 

From 3b87080a4e3cf37bc119fa31511812e26d854e86 Mon Sep 17 00:00:00 2001
From: Yang Yu <yuyang18@baidu.com>
Date: Thu, 1 Feb 2018 16:21:31 +0800
Subject: [PATCH 044/102] Make NMT as normal python unittests

---
 paddle/operators/while_op.cc                  |   2 +
 python/paddle/v2/fluid/layers/tensor.py       |   4 +-
 .../tests/book/test_machine_translation.py    | 103 ++++++++++++++----
 3 files changed, 85 insertions(+), 24 deletions(-)

diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 2fdd25dbbe..6ffbc19517 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -53,6 +53,8 @@ class WhileOp : public framework::OperatorBase {
     auto step_scopes =
         scope.FindVar(Output(kStepScopes))->GetMutable<StepScopeVar>();
 
+    PADDLE_ENFORCE(platform::is_cpu_place(cond.place()),
+                   "Condition of while op must in CPU memory.");
     while (cond.data<bool>()[0]) {
       auto &current_scope = scope.NewScope();
       step_scopes->push_back(&current_scope);
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index c435c5206d..8460af2a08 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -295,7 +295,7 @@ def fill_constant_batch_size_like(input,
     return out
 
 
-def ones(shape, dtype):
+def ones(shape, dtype, force_cpu=False):
     """
     **ones**
 
@@ -319,7 +319,7 @@ def ones(shape, dtype):
     return fill_constant(value=1.0, **locals())
 
 
-def zeros(shape, dtype):
+def zeros(shape, dtype, force_cpu=False):
     """
     **zeros**
 
diff --git a/python/paddle/v2/fluid/tests/book/test_machine_translation.py b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
index 82b760d693..5716ddd3dd 100644
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
@@ -11,21 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import contextlib
 
 import numpy as np
 import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as pd
 from paddle.v2.fluid.executor import Executor
+import unittest
 
 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
-src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
 hidden_dim = 32
 word_dim = 16
-IS_SPARSE = True
 batch_size = 2
 max_length = 8
 topk_size = 50
@@ -34,10 +33,8 @@ beam_size = 2
 
 decoder_size = hidden_dim
 
-place = core.CPUPlace()
 
-
-def encoder():
+def encoder(is_sparse):
     # encoder
     src_word_id = pd.data(
         name="src_word_id", shape=[1], dtype='int64', lod_level=1)
@@ -45,7 +42,7 @@ def encoder():
         input=src_word_id,
         size=[dict_size, word_dim],
         dtype='float32',
-        is_sparse=IS_SPARSE,
+        is_sparse=is_sparse,
         param_attr=fluid.ParamAttr(name='vemb'))
 
     fc1 = pd.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
@@ -54,7 +51,7 @@ def encoder():
     return encoder_out
 
 
-def decoder_train(context):
+def decoder_train(context, is_sparse):
     # decoder
     trg_language_word = pd.data(
         name="target_language_word", shape=[1], dtype='int64', lod_level=1)
@@ -62,7 +59,7 @@ def decoder_train(context):
         input=trg_language_word,
         size=[dict_size, word_dim],
         dtype='float32',
-        is_sparse=IS_SPARSE,
+        is_sparse=is_sparse,
         param_attr=fluid.ParamAttr(name='vemb'))
 
     rnn = pd.DynamicRNN()
@@ -82,10 +79,10 @@ def decoder_train(context):
     return rnn()
 
 
-def decoder_decode(context):
+def decoder_decode(context, is_sparse):
     init_state = context
     array_len = pd.fill_constant(shape=[1], dtype='int64', value=max_length)
-    counter = pd.zeros(shape=[1], dtype='int64')
+    counter = pd.zeros(shape=[1], dtype='int64', force_cpu=True)
 
     # fill the first element with init_state
     state_array = pd.create_array('float32')
@@ -117,7 +114,7 @@ def decoder_decode(context):
             input=pre_ids,
             size=[dict_size, word_dim],
             dtype='float32',
-            is_sparse=IS_SPARSE)
+            is_sparse=is_sparse)
 
         # use rnn unit to update rnn
         current_state = pd.fc(input=[pre_ids_emb, pre_state_expanded],
@@ -150,7 +147,7 @@ def decoder_decode(context):
 
 
 def set_init_lod(data, lod, place):
-    res = core.LoDTensor()
+    res = fluid.LoDTensor()
     res.set(data, place)
     res.set_lod(lod)
     return res
@@ -165,15 +162,19 @@ def to_lodtensor(data, place):
         lod.append(cur_len)
     flattened_data = np.concatenate(data, axis=0).astype("int64")
     flattened_data = flattened_data.reshape([len(flattened_data), 1])
-    res = core.LoDTensor()
+    res = fluid.LoDTensor()
     res.set(flattened_data, place)
     res.set_lod([lod])
     return res
 
 
-def train_main():
-    context = encoder()
-    rnn_out = decoder_train(context)
+def train_main(use_cuda, is_sparse):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder(is_sparse)
+    rnn_out = decoder_train(context, is_sparse)
     label = pd.data(
         name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
     cost = pd.cross_entropy(input=rnn_out, label=label)
@@ -212,9 +213,13 @@ def train_main():
             batch_id += 1
 
 
-def decode_main():
-    context = encoder()
-    translation_ids, translation_scores = decoder_decode(context)
+def decode_main(use_cuda, is_sparse):
+    if use_cuda and not fluid.core.is_compiled_with_cuda():
+        return
+    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+
+    context = encoder(is_sparse)
+    translation_ids, translation_scores = decoder_decode(context, is_sparse)
 
     exe = Executor(place)
     exe.run(framework.default_startup_program())
@@ -250,6 +255,60 @@ def decode_main():
         break
 
 
+class TestMachineTranslation(unittest.TestCase):
+    pass
+
+
+@contextlib.contextmanager
+def scope_prog_guard():
+    prog = fluid.Program()
+    startup_prog = fluid.Program()
+    scope = fluid.core.Scope()
+    with fluid.scope_guard(scope):
+        with fluid.program_guard(prog, startup_prog):
+            yield
+
+
+def inject_test_train(use_cuda, is_sparse):
+    f_name = 'test_{0}_{1}_train'.format('cuda' if use_cuda else 'cpu', 'sparse'
+                                         if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            train_main(use_cuda, is_sparse)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+def inject_test_decode(use_cuda, is_sparse, decorator=None):
+    f_name = 'test_{0}_{1}_decode'.format('cuda'
+                                          if use_cuda else 'cpu', 'sparse'
+                                          if is_sparse else 'dense')
+
+    def f(*args):
+        with scope_prog_guard():
+            decode_main(use_cuda, is_sparse)
+
+    if decorator is not None:
+        f = decorator(f)
+
+    setattr(TestMachineTranslation, f_name, f)
+
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+        inject_test_train(_use_cuda_, _is_sparse_)
+
+for _use_cuda_ in (False, True):
+    for _is_sparse_ in (False, True):
+
+        _decorator_ = None
+        if _use_cuda_:
+            _decorator_ = unittest.skip(
+                reason='Beam Search does not support CUDA!')
+
+        inject_test_decode(
+            is_sparse=_is_sparse_, use_cuda=_use_cuda_, decorator=_decorator_)
+
 if __name__ == '__main__':
-    # train_main()
-    decode_main()
+    unittest.main()

From 5530212defd0afd81e202f9e90a499823daf797f Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 1 Feb 2018 16:33:03 +0800
Subject: [PATCH 045/102] add others

---
 benchmark/cluster/vgg16/README.md       | 4 +++-
 benchmark/cluster/vgg16/v2_pserver.yaml | 2 +-
 benchmark/cluster/vgg16/v2_trainer.yaml | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index 725ce59025..b0bdc0288f 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -48,6 +48,8 @@
 
 ### different pserver number
 
+*The performance gap between Fuild and v2 comes from the network interference.*
+
 - Trainer Count: 60
 - Batch Size: 128
 - Metrics: mini-batch / sec
@@ -55,7 +57,7 @@
 | PServer Count | 3 | 6 |10 | 20 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 |
-| PaddlePaddle v2 | - | - | 729.7 | - |
+| PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
 | TensorFlow | - | - | - | - |
 
 
diff --git a/benchmark/cluster/vgg16/v2_pserver.yaml b/benchmark/cluster/vgg16/v2_pserver.yaml
index 935cf0be3c..dd1271e0cf 100644
--- a/benchmark/cluster/vgg16/v2_pserver.yaml
+++ b/benchmark/cluster/vgg16/v2_pserver.yaml
@@ -29,7 +29,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "python -u train.py"
+          value: "python train.py"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT
diff --git a/benchmark/cluster/vgg16/v2_trainer.yaml b/benchmark/cluster/vgg16/v2_trainer.yaml
index 5189009f3e..997bbc81c9 100644
--- a/benchmark/cluster/vgg16/v2_trainer.yaml
+++ b/benchmark/cluster/vgg16/v2_trainer.yaml
@@ -30,7 +30,7 @@ spec:
         - name: TOPOLOGY
           value: ""
         - name: ENTRY
-          value: "cd /workspace && MKL_NUM_THREADS=1 python -u /workspace/vgg16_v2.py"
+          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
         - name: TRAINER_PACKAGE
           value: "/workspace"
         - name: PADDLE_INIT_PORT

From ccef94a376aed4bc8576597f05fc8b00e37ab999 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 1 Feb 2018 16:42:39 +0800
Subject: [PATCH 046/102] add comments

---
 benchmark/cluster/vgg16/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index b0bdc0288f..0d525e9522 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -48,18 +48,18 @@
 
 ### different pserver number
 
-*The performance gap between Fuild and v2 comes from the network interference.*
-
 - Trainer Count: 60
 - Batch Size: 128
 - Metrics: mini-batch / sec
 
 | PServer Count | 3 | 6 |10 | 20 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle Fluid(should fixed in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
 | PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
 | TensorFlow | - | - | - | - |
 
+*The performance gap between Fluid and v2 comes from the network interference.*
+
 
 ## Steps to run the performance test
 

From 00b9aed0060acd983dce1d3cd1db8a859ec21219 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 1 Feb 2018 16:54:53 +0800
Subject: [PATCH 047/102] fix typo

---
 benchmark/cluster/vgg16/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index 0d525e9522..27eb265ce4 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -54,7 +54,7 @@
 
 | PServer Count | 3 | 6 |10 | 20 |
 | -- | -- | -- | -- | -- |
-| PaddlePaddle Fluid(should fixed in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
+| PaddlePaddle Fluid(should fix in next PR) | 589.1 | 592.6 | 656.4 | 655.8 |
 | PaddlePaddle v2 | 593.4 | 791.3 | 729.7 | 821.7 |
 | TensorFlow | - | - | - | - |
 

From 1830e2a01da528dc03ebba334bad9f418074b770 Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 1 Feb 2018 17:43:03 +0800
Subject: [PATCH 048/102] fix bugs

---
 doc/howto/usage/cluster/cluster_train_cn.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index c2fc86687d..0f3db59607 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -92,11 +92,11 @@ paddle.init(
 参数说明
 
 - use_gpu： **可选，默认False**，是否启用GPU训练
-- trainer_count：**必选，默认1**，当前训练任务trainer总个数
+- trainer_count：**必选，默认1**，当前trainer的线程数目
 - port：**必选，默认7164**，连接到pserver的端口
 - ports_num：**必选，默认1**，连接到pserver的端口个数
 - ports_num_for_sparse：**必选，默认0**，和pserver之间用于稀疏类型参数通信的端口个数
-- num_gradient_servers：**必选，默认1**，当前训练任务pserver总数
+- num_gradient_servers：**必选，默认1**，当前训练任务trainer总数
 - trainer_id：**必选，默认0**，每个trainer的唯一ID，从0开始的整数
 - pservers：**必选，默认127.0.0.1**，当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开
 

From 2fb280c9f2648d43499839d448e589e71b2b20b0 Mon Sep 17 00:00:00 2001
From: kexinzhao <kexin.zhao.paddle@gmail.com>
Date: Thu, 1 Feb 2018 01:52:51 -0800
Subject: [PATCH 049/102] Revise python save load api using new load/save op
 (#7995)

* initial commit

* add get_parameters method

* add get_parameters method

* small fix

* address comments

* address comments

* address comments

* fix
---
 python/paddle/v2/fluid/framework.py |   3 +-
 python/paddle/v2/fluid/io.py        | 141 +++++++++++++++++++---------
 2 files changed, 97 insertions(+), 47 deletions(-)

diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index ae98e299a4..7f5187d299 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -489,7 +489,8 @@ class Operator(object):
         no_kernel_op_set = {
             'feed', 'fetch', 'save', 'load', 'recurrent',
             'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
-            'recv', 'listen_and_serv', 'parallel_do'
+            'recv', 'listen_and_serv', 'parallel_do', 'save_combine',
+            'load_combine'
         }
         if type not in no_kernel_op_set:
             self.desc.infer_var_type(self.block.desc)
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index d56ec45c53..613dc20b6e 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -46,6 +46,9 @@ def is_parameter(var):
 
 
 def is_persistable(var):
+    if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \
+       var.desc.type() == core.VarDesc.VarType.FETCH_LIST:
+        return False
     return var.persistable
 
 
@@ -60,7 +63,12 @@ def _clone_var_in_block_(block, var):
         persistable=True)
 
 
-def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
+def save_vars(executor,
+              dirname,
+              main_program=None,
+              vars=None,
+              predicate=None,
+              save_file_name=None):
     """
     Save variables to directory by executor.
 
@@ -69,9 +77,12 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
     :param main_program: program. If vars is None, then filter all variables in this
     program which fit `predicate`. Default default_main_program.
     :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the variables will be saved.
-    :param vars: variables need to be saved. If specify vars, program & predicate
+    as a bool. If it returns true, the corresponding input variable will be saved.
+    :param vars: variables need to be saved. If vars is specified, program & predicate
     will be ignored
+    :param save_file_name: The name of a single file that all vars are saved to. 
+    If it is None, save variables to separate files.
+
     :return: None
     """
     if vars is None:
@@ -83,21 +94,39 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
         save_vars(
             executor,
             dirname=dirname,
-            vars=filter(predicate, main_program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()),
+            save_file_name=save_file_name)
     else:
         save_program = Program()
         save_block = save_program.global_block()
+
+        save_var_map = {}
         for each_var in vars:
             new_var = _clone_var_in_block_(save_block, each_var)
+            if save_file_name is None:
+                save_block.append_op(
+                    type='save',
+                    inputs={'X': [new_var]},
+                    outputs={},
+                    attrs={'file_path': os.path.join(dirname, new_var.name)})
+            else:
+                save_var_map[new_var.name] = new_var
+
+        if save_file_name is not None:
+            save_var_list = []
+            for name in sorted(save_var_map.keys()):
+                save_var_list.append(save_var_map[name])
+
             save_block.append_op(
-                type='save',
-                inputs={'X': [new_var]},
+                type='save_combine',
+                inputs={'X': save_var_list},
                 outputs={},
-                attrs={'file_path': os.path.join(dirname, new_var.name)})
+                attrs={'file_path': os.path.join(dirname, save_file_name)})
+
         executor.run(save_program)
 
 
-def save_params(executor, dirname, main_program=None):
+def save_params(executor, dirname, main_program=None, save_file_name=None):
     """
     Save all parameters to directory with executor.
     """
@@ -106,10 +135,12 @@ def save_params(executor, dirname, main_program=None):
         dirname=dirname,
         main_program=main_program,
         vars=None,
-        predicate=is_parameter)
+        predicate=is_parameter,
+        save_file_name=save_file_name)
 
 
-def save_persistables(executor, dirname, main_program=None):
+def save_persistables(executor, dirname, main_program=None,
+                      save_file_name=None):
     """
     Save all persistables to directory with executor.
     """
@@ -118,21 +149,30 @@ def save_persistables(executor, dirname, main_program=None):
         dirname=dirname,
         main_program=main_program,
         vars=None,
-        predicate=is_persistable)
+        predicate=is_persistable,
+        save_file_name=save_file_name)
 
 
-def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
+def load_vars(executor,
+              dirname,
+              main_program=None,
+              vars=None,
+              predicate=None,
+              load_file_name=None):
     """
     Load variables from directory by executor.
 
-    :param executor: executor that save variable
+    :param executor: executor that load variable
     :param dirname: directory path
     :param main_program: program. If vars is None, then filter all variables in this
     program which fit `predicate`. Default default_main_program().
     :param predicate: The Predicate describes a callable that returns a variable
-    as a bool. If it returns true, the variables will be loaded.
-    :param vars: variables need to be loaded. If specify vars, program &
+    as a bool. If it returns true, the corresponding input variable will be loaded.
+    :param vars: variables need to be loaded. If vars is specified, program &
     predicate will be ignored
+    :param load_file_name: The name of the single file that all vars are loaded from.   
+    If it is None, load variables from separate files.
+
     :return: None
     """
     if vars is None:
@@ -144,23 +184,40 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
         load_vars(
             executor,
             dirname=dirname,
-            vars=filter(predicate, main_program.list_vars()))
+            vars=filter(predicate, main_program.list_vars()),
+            load_file_name=load_file_name)
     else:
         load_prog = Program()
         load_block = load_prog.global_block()
+
+        load_var_map = {}
         for each_var in vars:
             assert isinstance(each_var, Variable)
             new_var = _clone_var_in_block_(load_block, each_var)
+            if load_file_name is None:
+                load_block.append_op(
+                    type='load',
+                    inputs={},
+                    outputs={'Out': [new_var]},
+                    attrs={'file_path': os.path.join(dirname, new_var.name)})
+            else:
+                load_var_map[new_var.name] = new_var
+
+        if load_file_name is not None:
+            load_var_list = []
+            for name in sorted(load_var_map.keys()):
+                load_var_list.append(load_var_map[name])
+
             load_block.append_op(
-                type='load',
+                type='load_combine',
                 inputs={},
-                outputs={"Out": [new_var]},
-                attrs={'file_path': os.path.join(dirname, new_var.name)})
+                outputs={"Out": load_var_list},
+                attrs={'file_path': os.path.join(dirname, load_file_name)})
 
         executor.run(load_prog)
 
 
-def load_params(executor, dirname, main_program=None):
+def load_params(executor, dirname, main_program=None, load_file_name=None):
     """
     load all parameters from directory by executor.
     """
@@ -168,10 +225,12 @@ def load_params(executor, dirname, main_program=None):
         executor,
         dirname=dirname,
         main_program=main_program,
-        predicate=is_parameter)
+        predicate=is_parameter,
+        load_file_name=load_file_name)
 
 
-def load_persistables(executor, dirname, main_program=None):
+def load_persistables(executor, dirname, main_program=None,
+                      load_file_name=None):
     """
     load all persistables from directory by executor.
     """
@@ -179,7 +238,8 @@ def load_persistables(executor, dirname, main_program=None):
         executor,
         dirname=dirname,
         main_program=main_program,
-        predicate=is_persistable)
+        predicate=is_persistable,
+        load_file_name=load_file_name)
 
 
 def get_inference_program(target_vars, main_program=None):
@@ -238,7 +298,8 @@ def save_inference_model(dirname,
                          feeded_var_names,
                          target_vars,
                          executor,
-                         main_program=None):
+                         main_program=None,
+                         save_file_name=None):
     """
     Build a model especially for inference,
     and save it to directory by the executor.
@@ -249,6 +310,8 @@ def save_inference_model(dirname,
     :param executor: executor that save inference model
     :param main_program: original program, which will be pruned to build the inference model.
             Default default_main_program().
+    :param save_file_name: The name of a single file that all parameters are saved to. 
+    If it is None, save parameters to separate files.
 
     :return: None
     """
@@ -283,25 +346,7 @@ def save_inference_model(dirname,
     with open(model_file_name, "wb") as f:
         f.write(inference_program.desc.serialize_to_string())
 
-    save_params(executor, dirname, main_program)
-
-
-def load_persistables_if_exist(executor, dirname, main_program=None):
-    filenames = next(os.walk(dirname))[2]
-    filenames = set(filenames)
-
-    def _is_presistable_and_exist_(var):
-        if not is_persistable(var):
-            return False
-        else:
-            return var.name in filenames
-
-    load_vars(
-        executor,
-        dirname,
-        main_program=main_program,
-        vars=None,
-        predicate=_is_presistable_and_exist_)
+    save_persistables(executor, dirname, inference_program, save_file_name)
 
 
 def get_feed_targets_names(program):
@@ -322,13 +367,15 @@ def get_fetch_targets_names(program):
     return fetch_targets_names
 
 
-def load_inference_model(dirname, executor):
+def load_inference_model(dirname, executor, load_file_name=None):
     """
     Load inference model from a directory
 
     :param dirname: directory path
     :param executor: executor that load inference model
-
+    :param load_file_name: The name of the single file that all parameters are loaded from.   
+    If it is None, load parameters from separate files.
+    
     :return: [program, feed_target_names, fetch_targets]
              program: program especially for inference.
              feed_target_names: Names of variables that need to feed data
@@ -342,7 +389,7 @@ def load_inference_model(dirname, executor):
         program_desc_str = f.read()
 
     program = Program.parse_from_string(program_desc_str)
-    load_persistables_if_exist(executor, dirname, program)
+    load_persistables(executor, dirname, program, load_file_name)
 
     feed_target_names = get_feed_targets_names(program)
     fetch_target_names = get_fetch_targets_names(program)
@@ -359,6 +406,7 @@ def get_parameter_value(para, executor):
 
     :param executor: executor for retrieving the value
     :param para: the given parameter
+
     :return: the LoDTensor for the parameter
     """
     assert is_parameter(para)
@@ -377,6 +425,7 @@ def get_parameter_value_by_name(name, executor, program=None):
     :param name: the name of the parameter
     :param program: the program where the variable is found
             Default default_main_program().
+
     :return: the LoDTensor for the variable
     """
     if program is None:

From d2caf777ae0260d10ce3dfe9249d3ccf53a50641 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Thu, 1 Feb 2018 18:15:18 +0800
Subject: [PATCH 050/102] set FLAGS_warpctc_dir to pass the test_warpctc_op
 unit test

---
 cmake/generic.cmake                         | 4 ++--
 paddle/testing/paddle_gtest_main.cc         | 5 +++--
 python/paddle/v2/fluid/__init__.py          | 4 +++-
 python/paddle/v2/fluid/tests/CMakeLists.txt | 2 ++
 4 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index e10c0ecf68..33ef6860e1 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -470,10 +470,10 @@ function(py_test TARGET_NAME)
   if(WITH_TESTING)
     set(options "")
     set(oneValueArgs "")
-    set(multiValueArgs SRCS DEPS ARGS)
+    set(multiValueArgs SRCS DEPS ARGS ENVS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python ${py_test_ENVS}
              ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
index a2f21e37e4..fd8c4a69da 100644
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -27,9 +27,10 @@ int main(int argc, char** argv) {
   }
 #ifdef PADDLE_WITH_CUDA
   new_argv.push_back(
-      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
+      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory,"
+             "warpctc_dir"));
 #else
-  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory,warpctc_dir"));
 #endif
   int new_argc = static_cast<int>(new_argv.size());
   char** new_argv_address = new_argv.data();
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index f52346c3b5..3ee58393c7 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -76,7 +76,9 @@ def __bootstrap__():
 
     os.environ['OMP_NUM_THREADS'] = str(num_threads)
 
-    read_env_flags = ['use_pinned_memory', 'check_nan_inf', 'benchmark']
+    read_env_flags = [
+        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir'
+    ]
     if core.is_compiled_with_cuda():
         read_env_flags += ['fraction_of_gpu_memory_to_use']
     core.init_gflags([sys.argv[0]] +
diff --git a/python/paddle/v2/fluid/tests/CMakeLists.txt b/python/paddle/v2/fluid/tests/CMakeLists.txt
index 628ce60b40..26a80abcb5 100644
--- a/python/paddle/v2/fluid/tests/CMakeLists.txt
+++ b/python/paddle/v2/fluid/tests/CMakeLists.txt
@@ -5,9 +5,11 @@ if(NOT WITH_DISTRIBUTE)
     list(REMOVE_ITEM TEST_OPS test_recv_op)
 endif(NOT WITH_DISTRIBUTE)
 
+list(REMOVE_ITEM TEST_OPS test_warpctc_op)
 foreach(src ${TEST_OPS})
     py_test(${src} SRCS ${src}.py)
 endforeach()
+py_test(test_warpctc_op SRCS test_warpctc_op.py ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
 
 add_subdirectory(book)
 add_subdirectory(book_distribute)

From 7c2d32b849a54e68492ae652eefa74f91cab6501 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Thu, 1 Feb 2018 18:50:30 +0800
Subject: [PATCH 051/102] update dockerfile

---
 benchmark/cluster/vgg16/Dockerfile     | 17 ++++++++++-------
 benchmark/cluster/vgg16/README.md      |  4 ++--
 benchmark/cluster/vgg16/vgg16_fluid.py |  7 ++++++-
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
index c34f7e8fcf..54d1b09a0f 100644
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -1,13 +1,16 @@
 FROM python:2.7.14
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
-ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
-RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev && \
-chmod +x /usr/bin/paddle_k8s
+RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
 # NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
 #       so we must build one with distribute support to install in this image.
-ADD *.whl /
-RUN pip install /*.whl && rm -f /*.whl
-ENV LD_LIBRARY_PATH=/usr/local/lib
+RUN pip install paddlepaddle
 RUN sh -c 'echo "import paddle.v2 as paddle\npaddle.dataset.cifar.train10()" | python'
+RUN pip uninstall -y paddlepaddle
 
+# below lines may change a lot for debugging
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/paddle_k8s /usr/bin
+ADD https://raw.githubusercontent.com/PaddlePaddle/cloud/develop/docker/k8s_tools.py /root
+ADD *.whl /
+RUN pip install /*.whl && rm -f /*.whl && \
+chmod +x /usr/bin/paddle_k8s
+ENV LD_LIBRARY_PATH=/usr/local/lib
 ADD vgg16_fluid.py vgg16_v2.py /workspace/
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index 69a242e305..6d309217f8 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -40,13 +40,13 @@
 - Batch Size: 128
 - Metrics: samples / sec
 
-| Trainer Counter | 20 | 40 | 80 | 100 |
+| Trainer Count | 20 | 40 | 80 | 100 |
 | -- | -- | -- | -- | -- |
 | PaddlePaddle Fluid | 263.29 (78.64%) | 518.80 (77.47%) | 836.26 (62.44%) | 1019.29 (60.89%) |
 | PaddlePaddle v2 (need more tests) | 326.85 (92.85%) | 534.58 (75.93%) | 853.30 (60.60%) | 1041.99 (59.20%) |
 | TensorFlow | - | - | - | - |
 
-### Different Pserver Number
+### Different Pserver Count
 
 - Trainer Count: 100
 - Batch Size: 128
diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index 87a151db21..e89b96e4a6 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -50,6 +50,11 @@ parser.add_argument(
     default='CPU',
     choices=['CPU', 'GPU'],
     help="The device type.")
+parser.add_argument(
+    '--device_id',
+    type=int,
+    default=0,
+    help="The device id.")
 parser.add_argument(
     '--data_format',
     type=str,
@@ -135,7 +140,7 @@ def main():
     optimize_ops, params_grads = optimizer.minimize(avg_cost)
 
     # Initialize executor
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(args.device_id)
     exe = fluid.Executor(place)
 
     # test

From b26a5b5d044c0bfc7bdfbc803ea604449d6e575a Mon Sep 17 00:00:00 2001
From: gongweibao <weibao.gong@gmail.com>
Date: Thu, 1 Feb 2018 19:37:12 +0800
Subject: [PATCH 052/102] fix trainer_count and num_gradient_servers docs (en)

---
 doc/howto/usage/cluster/cluster_train_en.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index 28cd1fa790..f9424f8f1a 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -95,11 +95,11 @@ paddle.init(
 Parameter Description
 
 - use_gpu: **optional, default False**, set to "True" to enable GPU training.
-- trainer_count: **required, default 1**, total count of trainers in the training job.
+- trainer_count: **required, default 1**, number of threads in current trainer.
 - port: **required, default 7164**, port to connect to parameter server.
 - ports_num: **required, default 1**, number of ports for communication.
 - ports_num_for_sparse: **required, default 0**, number of ports for sparse type caculation.
-- num_gradient_servers: **required, default 1**, total number of gradient server.
+- num_gradient_servers: **required, default 1**, number of trainers in current job.
 - trainer_id: **required, default 0**, ID for every trainer, start from 0.
 - pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
 

From 52df85f4db27b0f4e9adeb6ffc7ca398473c4ba0 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Thu, 1 Feb 2018 20:35:41 +0800
Subject: [PATCH 053/102] fix style

---
 benchmark/cluster/vgg16/vgg16_fluid.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py
index e89b96e4a6..499e06ec42 100644
--- a/benchmark/cluster/vgg16/vgg16_fluid.py
+++ b/benchmark/cluster/vgg16/vgg16_fluid.py
@@ -50,11 +50,7 @@ parser.add_argument(
     default='CPU',
     choices=['CPU', 'GPU'],
     help="The device type.")
-parser.add_argument(
-    '--device_id',
-    type=int,
-    default=0,
-    help="The device id.")
+parser.add_argument('--device_id', type=int, default=0, help="The device id.")
 parser.add_argument(
     '--data_format',
     type=str,
@@ -140,7 +136,8 @@ def main():
     optimize_ops, params_grads = optimizer.minimize(avg_cost)
 
     # Initialize executor
-    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(args.device_id)
+    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(
+        args.device_id)
     exe = fluid.Executor(place)
 
     # test

From 47ebe435a79ab836649ba11c635129c8a6664ea1 Mon Sep 17 00:00:00 2001
From: dzhwinter <dongzhihong01@baidu.com>
Date: Thu, 1 Feb 2018 20:41:54 +0800
Subject: [PATCH 054/102] Fix/vector (#8045)

* "clean code"

* "clean code"
---
 paddle/framework/mixed_vector.h | 77 +++++++++++++--------------------
 1 file changed, 29 insertions(+), 48 deletions(-)

diff --git a/paddle/framework/mixed_vector.h b/paddle/framework/mixed_vector.h
index 0e0e239586..85caac8dcd 100644
--- a/paddle/framework/mixed_vector.h
+++ b/paddle/framework/mixed_vector.h
@@ -34,18 +34,6 @@ namespace framework {
 
 template <typename T>
 class Vector : public std::vector<T> {
- public:
-  /* NOTE(dzhwinter):
-   * Data always store and modified on Host.
-   * If the data is modified when use cuda_data interface,
-   * You need to call the CopyFromCUDA explicitly to synchronize data.
-   *
-   */
-  enum class kDataPosition {
-    kDataOnHost = 0,
-    kDataOnDevice = 1,
-  };
-
  public:
   using std::vector<T>::vector;
 
@@ -55,11 +43,12 @@ class Vector : public std::vector<T> {
   virtual ~Vector() {
 #ifdef PADDLE_WITH_CUDA
     if (cuda_ptr_ != nullptr) {
-      memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
+      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
     }
 #endif
   }
 
+  /* Get device vector */
   T *cuda_data() {
     CopyToCUDA();
     PADDLE_ENFORCE_NOT_NULL(
@@ -67,81 +56,73 @@ class Vector : public std::vector<T> {
     return static_cast<T *>(cuda_ptr_);
   }
 
+  /* Get host vector */
   T *data() { return std::vector<T>::data(); }
-
   const T *data() const { return std::vector<T>::data(); }
 
+  /* Synchronize host vector to device vector */
   void CopyToCUDA();
-
+  /* Synchronize device vector to host vector */
   void CopyFromCUDA();
-
+  /* Switch device vector location */
   void CopyToPeer(platform::Place);
 
  private:
   void *cuda_ptr_ = nullptr;
-  size_t cuda_size_ = 0;
-  /*The DataPosition is unused now,
-    if we want support random access from cpu and cuda,
-    we need to overload all the vector method */
-
-  kDataPosition position_ = kDataPosition::kDataOnHost;
+  size_t cuda_size_ = 0;  // device vector numel
   platform::CUDAPlace place_;
 };
 
 template <typename T>
 void Vector<T>::CopyToCUDA() {
 #ifdef PADDLE_WITH_CUDA
-  if (cuda_ptr_ == nullptr) {
+  if (cuda_size_ < this->size()) {
+    if (cuda_ptr_ != nullptr) {
+      memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
+    }
     cuda_ptr_ =
         memory::Alloc<platform::CUDAPlace>(place_, this->size() * sizeof(T));
   }
+  cuda_size_ = this->size();
   platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto *cuda_ctx = pool.GetByPlace(place_);
-
-  memory::Copy(place_, static_cast<void *>(cuda_ptr_), platform::CPUPlace(),
+  auto *ctx = pool.GetByPlace(place_);
+  memory::Copy(place_, cuda_ptr_, platform::CPUPlace(),
                static_cast<const void *>(this->data()),
-               this->size() * sizeof(T), cuda_ctx->stream());
-  cuda_ctx->Wait();
-
-  cuda_size_ = this->size();
+               this->size() * sizeof(T), ctx->stream());
+  ctx->Wait();
 #endif
 }
 
 template <typename T>
 void Vector<T>::CopyFromCUDA() {
 #ifdef PADDLE_WITH_CUDA
-  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-  auto *cuda_ctx = pool.GetByPlace(place_);
   if (cuda_ptr_ == nullptr) {
-    LOG(WARNING) << "No uncommited cuda data.";
+    LOG(WARNING) << "No uncommitted cuda data.";
     return;
   }
   this->resize(cuda_size_);
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  auto *ctx = pool.GetByPlace(place_);
   memory::Copy(platform::CPUPlace(), static_cast<void *>(this->data()), place_,
                static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
-               cuda_ctx->stream());
-  cuda_ctx->Wait();
-
+               ctx->stream());
+  ctx->Wait();
 #endif
 }
 
 template <typename T>
 void Vector<T>::CopyToPeer(platform::Place peer_place) {
-  if (platform::is_cpu_place(peer_place)) {
-    return;
-  }
 #ifdef PADDLE_WITH_CUDA
-  auto *cuda_ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
-  void *peer_cuda_ptr_ = memory::Alloc<platform::CUDAPlace>(
+  auto *ctx = platform::DeviceContextPool::Instance().GetByPlace(place_);
+  void *peer_cuda_ptr = memory::Alloc<platform::CUDAPlace>(
       boost::get<platform::CUDAPlace>(peer_place), this->size() * sizeof(T));
-  memory::Copy(boost::get<platform::CUDAPlace>(peer_place),
-               static_cast<void *>(peer_cuda_ptr_), place_,
-               static_cast<const void *>(cuda_ptr_), this->size() * sizeof(T),
-               cuda_ctx->stream());
-  cuda_ctx->Wait();
-  memory::Free<platform::CUDAPlace>(place_, static_cast<void *>(cuda_ptr_));
+  memory::Copy(boost::get<platform::CUDAPlace>(peer_place), peer_cuda_ptr,
+               place_, cuda_ptr_, this->size() * sizeof(T), ctx->stream());
+  ctx->Wait();
+
+  memory::Free<platform::CUDAPlace>(place_, cuda_ptr_);
   place_ = boost::get<platform::CUDAPlace>(peer_place);
-  cuda_ptr_ = peer_cuda_ptr_;
+  cuda_ptr_ = peer_cuda_ptr;
 #endif
 }
 

From 84ded49d6632aec9733bbbcd242c539029711cd8 Mon Sep 17 00:00:00 2001
From: xzl <zlx_hg@163.com>
Date: Thu, 1 Feb 2018 23:46:43 +0800
Subject: [PATCH 055/102] fix comments

---
 paddle/operators/conv_op.h              |  3 +++
 paddle/operators/math/depthwise_conv.cu | 11 +++++++----
 paddle/operators/math/depthwise_conv.h  | 11 +++++++----
 3 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/paddle/operators/conv_op.h b/paddle/operators/conv_op.h
index 5b47eefb83..3c1d0e9c1c 100644
--- a/paddle/operators/conv_op.h
+++ b/paddle/operators/conv_op.h
@@ -361,6 +361,9 @@ class DepthwiseConvKernel : public framework::OpKernel<T> {
     Tensor* output = context.Output<Tensor>("Output");
     output->mutable_data<T>(context.GetPlace());
 
+    PADDLE_ENFORCE_EQ(
+        output->dims()[1] % input->dims()[1], 0,
+        "The output channels must be a multiple of the input channels");
     std::vector<int> strides = context.Attr<std::vector<int>>("strides");
     std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
     std::vector<int> dilations = context.Attr<std::vector<int>>("dilations");
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
index 4aa38151e6..b9b958c92b 100644
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -203,8 +203,9 @@ class DepthwiseConvFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
-                  const framework::Tensor& filter, std::vector<int>& strides,
-                  std::vector<int>& paddings, framework::Tensor* output) {
+                  const framework::Tensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
     const int input_height = input.dims()[2];
@@ -244,7 +245,8 @@ class DepthwiseConvInputGradFunctor<platform::CUDADeviceContext, T> {
                   const framework::Tensor& input,
                   const framework::Tensor& filter,
                   const framework::Tensor& output_grad,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
@@ -284,7 +286,8 @@ class DepthwiseConvFilterGradFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input,
                   const framework::Tensor& output_grad,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* filter_grad) {
     const int batch_size = input.dims()[0];
     const int input_channels = input.dims()[1];
diff --git a/paddle/operators/math/depthwise_conv.h b/paddle/operators/math/depthwise_conv.h
index 34eecca7b6..4708920bb4 100644
--- a/paddle/operators/math/depthwise_conv.h
+++ b/paddle/operators/math/depthwise_conv.h
@@ -29,8 +29,9 @@ template <typename DeviceContext, typename T>
 class DepthwiseConvFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& filter, std::vector<int>& strides,
-                  std::vector<int>& paddings, framework::Tensor* output);
+                  const framework::Tensor& filter,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, framework::Tensor* output);
 };
 
 template <typename DeviceContext, typename T>
@@ -39,7 +40,8 @@ class DepthwiseConvInputGradFunctor {
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& filter,
                   const framework::Tensor& output_grad,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* input_grad);
 };
 
@@ -48,7 +50,8 @@ class DepthwiseConvFilterGradFunctor {
  public:
   void operator()(const DeviceContext& context, const framework::Tensor& input,
                   const framework::Tensor& output_grad,
-                  std::vector<int>& strides, std::vector<int>& paddings,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings,
                   framework::Tensor* filter_grad);
 };
 

From 6d8bc1378bddc16c713c2ddabc0f9579aa1ab325 Mon Sep 17 00:00:00 2001
From: Abhinav Arora <abhinavarora28@gmail.com>
Date: Thu, 1 Feb 2018 14:33:23 -0800
Subject: [PATCH 056/102] Adding an initial implementation for the unbuffered
 channel (#7984)

* Adding an initial implementation for the unbuffered channel

* Including atomic header

* update comment

* Adding the closed attribute

* Add comments

* Updated locking mechanism

* Add simple unbuffered test

* Enhance unit test and fix bug

* Add details
---
 paddle/framework/channel_test.cc              | 21 ++++
 paddle/framework/details/unbuffered_channel.h | 98 +++++++++++++++++--
 2 files changed, 112 insertions(+), 7 deletions(-)

diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index 1510fb8abf..2efa086f00 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -78,3 +78,24 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
   t.join();
   delete ch;
 }
+
+TEST(Channel, SimpleUnbufferedChannelTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  std::thread t([&]() {
+    for (int i = 0; i < 5; i++) {
+      ch->Send(&i);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    int recv;
+    ch->Receive(&recv);
+    EXPECT_EQ(recv, i);
+  }
+
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 10U);
+  delete ch;
+}
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
index cc2d2e587e..0dc5afd7e5 100644
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include <atomic>
 #include <condition_variable>
-#include <deque>
 #include <mutex>
 
 #include "paddle/framework/channel.h"
@@ -36,20 +36,104 @@ class UnBuffered : public paddle::framework::Channel<T> {
   virtual ~UnBuffered();
 
  private:
-  UnBuffered() {}
+  std::mutex mu_ch_;
+  // Mutex for readers and writers who are waiting for other reader
+  // and writer to complete execution
+  std::recursive_mutex mu_read_, mu_write_;
+  // reader_found_ is set true when a reader is ready to accept data
+  // writer_found_ is set true when a writer is ready to send data
+  // A transaction occurs only when both are true
+  std::atomic<bool> reader_found_{false}, writer_found_{false};
+  std::condition_variable cv_channel_;
+  std::condition_variable_any cv_reader_, cv_writer_;
+  T* item{nullptr};
+  std::atomic<bool> closed_{false};
+
+  UnBuffered() : closed_(false) {}
+
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };
 
+// This function implements the concept of how data should
+// be sent from a writer to a reader.
+template <typename T>
+void UnBuffered<T>::Send(T* data) {
+  // Prevent other writers from entering
+  std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
+  writer_found_ = true;
+  std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
+  // If writer comes first, it should wait till a reader arrives
+  cv_writer_.wait(cv_lock,
+                  [this]() { return reader_found_ == true || closed_; });
+  cv_reader_.notify_one();
+  if (!closed_) {
+    std::unique_lock<std::mutex> channel_lock(mu_ch_);
+    item = data;
+    channel_lock.unlock();
+    cv_channel_.notify_one();
+    channel_lock.lock();
+    cv_channel_.wait(channel_lock,
+                     [this]() { return item == nullptr || closed_; });
+  }
+  writer_found_ = false;
+}
+
+// This function implements the concept of how
+// data that was sent by a writer is read from a reader.
 template <typename T>
-void UnBuffered<T>::Send(T* channel_element) {}
+void UnBuffered<T>::Receive(T* data) {
+  // Prevent other readers from entering
+  std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
+  reader_found_ = true;
+  std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
+  // If reader comes first, it should wait till a writer arrives
+  cv_reader_.wait(cv_lock,
+                  [this]() { return writer_found_ == true || closed_; });
+  cv_writer_.notify_one();
+  if (!closed_) {
+    std::unique_lock<std::mutex> lock_ch{mu_ch_};
+    // Reader should wait for the writer to first write its data
+    cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
+    if (!closed_) {
+      *data = std::move(*item);
+      item = nullptr;
+      lock_ch.unlock();
+    }
+    cv_channel_.notify_one();
+  }
+  reader_found_ = false;
+}
 
+// This function implements the sequence of events
+// that take place once the channel is closed.
 template <typename T>
-void UnBuffered<T>::Receive(T*) {}
+void UnBuffered<T>::Close() {
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  NotifyAllParticipants(&lock);
+}
 
+// This function implements the sequence of events
+// that are executed once the object of an UnBuffered
+// channel is destroyed.
 template <typename T>
-void UnBuffered<T>::Close() {}
+UnBuffered<T>::~UnBuffered() {
+  std::unique_lock<std::mutex> lock(mu_ch_);
+  item = nullptr;
+  closed_ = true;
+  NotifyAllParticipants(&lock);
+}
 
+// This function notifies all the readers, writers and
+// the channel condition variables.
 template <typename T>
-UnBuffered<T>::~UnBuffered() {}
+void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
+  lock->unlock();
+  cv_writer_.notify_all();
+  cv_channel_.notify_all();
+  cv_reader_.notify_all();
+}
 
 }  // namespace details
 }  // namespace framework

From 148d35feb8fbd6c0cee7c66268d316e4274b8407 Mon Sep 17 00:00:00 2001
From: kavyasrinet <kavyasrinet@baidu.com>
Date: Thu, 1 Feb 2018 16:10:45 -0800
Subject: [PATCH 057/102] Add unit test with less receivers, more senders for
 unbuffered channel. (#8060)

* Add unit test with fewer receivers, more senders

* Fixed the check
---
 paddle/framework/channel_test.cc | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index 2efa086f00..020f806380 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -99,3 +99,29 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
   EXPECT_EQ(sum_send, 10U);
   delete ch;
 }
+
+TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  // Send should block after three iterations
+  // since we only have three receivers.
+  std::thread t([&]() {
+    // Try to send more number of times
+    // than receivers
+    for (int i = 0; i < 4; i++) {
+      ch->Send(&i);
+      sum_send += i;
+    }
+  });
+  for (int i = 0; i < 3; i++) {
+    int recv;
+    ch->Receive(&recv);
+    EXPECT_EQ(recv, i);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait 0.5 sec
+  EXPECT_EQ(sum_send, 3U);
+
+  CloseChannel(ch);
+  t.join();
+  delete ch;
+}

From f3415ec55e1daf437080d5ee2febb18b6bcb3a09 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Thu, 1 Feb 2018 21:53:16 +0800
Subject: [PATCH 058/102] Follow comments.

---
 paddle/operators/bipartite_match_op.cc        |  18 ++-
 paddle/operators/multiclass_nms_op.cc         | 104 ++++++++++--------
 .../v2/fluid/tests/test_bipartite_match_op.py |   4 +-
 .../v2/fluid/tests/test_multiclass_nms_op.py  |   2 +-
 4 files changed, 72 insertions(+), 56 deletions(-)

diff --git a/paddle/operators/bipartite_match_op.cc b/paddle/operators/bipartite_match_op.cc
index 83c8778fe4..1e6fa2091d 100644
--- a/paddle/operators/bipartite_match_op.cc
+++ b/paddle/operators/bipartite_match_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -28,12 +28,18 @@ class BipartiteMatchOp : public framework::OperatorWithKernel {
   void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("DistMat"),
                    "Input(DistMat) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchIndices"),
+        "Output(ColToRowMatchIndices) of BipartiteMatch should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("ColToRowMatchDist"),
+        "Output(ColToRowMatchDist) of BipartiteMatch should not be null.");
 
     auto dims = ctx->GetInputDim("DistMat");
     PADDLE_ENFORCE_EQ(dims.size(), 2, "The rank of Input(DistMat) must be 2.");
 
     ctx->SetOutputDim("ColToRowMatchIndices", dims);
-    ctx->SetOutputDim("ColToRowMatchDis", dims);
+    ctx->SetOutputDim("ColToRowMatchDist", dims);
   }
 };
 
@@ -91,7 +97,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& context) const override {
     auto* dist_mat = context.Input<LoDTensor>("DistMat");
     auto* match_indices = context.Output<Tensor>("ColToRowMatchIndices");
-    auto* match_dist = context.Output<Tensor>("ColToRowMatchDis");
+    auto* match_dist = context.Output<Tensor>("ColToRowMatchDist");
 
     auto& dev_ctx = context.device_context<platform::CPUDeviceContext>();
 
@@ -148,13 +154,13 @@ class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
               "Otherwise, it means B[j] is matched to row "
               "ColToRowMatchIndices[i][j] in i-th instance. The row number of "
               "i-th instance is saved in ColToRowMatchIndices[i][j].");
-    AddOutput("ColToRowMatchDis",
+    AddOutput("ColToRowMatchDist",
               "(Tensor) A 2-D Tensor with shape [N, M] in float type. "
               "N is batch size. If ColToRowMatchIndices[i][j] is -1, "
-              "ColToRowMatchDis[i][j] is also -1.0. Otherwise, assumed "
+              "ColToRowMatchDist[i][j] is also -1.0. Otherwise, assumed "
               "ColToRowMatchIndices[i][j] = d, and the row offsets of each "
               "instance are called LoD. Then "
-              "ColToRowMatchDis[i][j] = DistMat[d+LoD[i]][j]");
+              "ColToRowMatchDist[i][j] = DistMat[d+LoD[i]][j]");
     AddComment(R"DOC(
 This operator is a greedy bipartite matching algorithm, which is used to
 obtain the matching with the maximum distance based on the input
diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc
index 4689306d24..cb38e9fa20 100644
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/operators/multiclass_nms_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -24,25 +24,33 @@ using LoDTensor = framework::LoDTensor;
 constexpr int64_t kOutputDim = 6;
 constexpr int64_t kBBoxSize = 4;
 
-class MulticlassNMSOp : public framework::OperatorWithKernel {
+class MultiClassNMSOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Bboxes"),
-                   "Input(Bboxes) of MulticlassNMS should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("BBoxes"),
+                   "Input(BBoxes) of MultiClassNMS should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Scores"),
-                   "Input(Scores) of MulticlassNMS should not be null.");
+                   "Input(Scores) of MultiClassNMS should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of MultiClassNMS should not be null.");
 
-    auto box_dims = ctx->GetInputDim("Bboxes");
+    auto box_dims = ctx->GetInputDim("BBoxes");
     auto score_dims = ctx->GetInputDim("Scores");
 
     PADDLE_ENFORCE_EQ(box_dims.size(), 2,
-                      "The rank of Input(Bboxes) must be 3.");
+                      "The rank of Input(BBoxes) must be 2.");
     PADDLE_ENFORCE_EQ(score_dims.size(), 3,
                       "The rank of Input(Scores) must be 3.");
-    PADDLE_ENFORCE_EQ(box_dims[1], 4);
-    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2]);
+    PADDLE_ENFORCE_EQ(box_dims[1], 4,
+                      "The 2nd dimension of Input(BBoxes) must be 4, "
+                      "represents the layout of coordinate "
+                      "[xmin, ymin, xmax, ymax]");
+    PADDLE_ENFORCE_EQ(box_dims[0], score_dims[2],
+                      "The 1st dimensiong of Input(BBoxes) must be equal to "
+                      "3rd dimension of Input(Scores), which represents the "
+                      "predicted bboxes.");
 
     // Here the box_dims[0] is not the real dimension of output.
     // It will be rewritten in the computing kernel.
@@ -86,15 +94,16 @@ static inline void GetMaxScoreIndex(
 template <class T>
 T BBoxArea(const T* box, const bool normalized) {
   if (box[2] < box[0] || box[3] < box[1]) {
-    // If bbox is invalid (e.g. xmax < xmin or ymax < ymin), return 0.
-    return T(0.);
+    // If coordinate values are is invalid
+    // (e.g. xmax < xmin or ymax < ymin), return 0.
+    return static_cast<T>(0.);
   } else {
     const T w = box[2] - box[0];
     const T h = box[3] - box[1];
     if (normalized) {
       return w * h;
     } else {
-      // If bbox is not within range [0, 1].
+      // If coordinate values are not within range [0, 1].
       return (w + 1) * (h + 1);
     }
   }
@@ -121,7 +130,7 @@ static inline T JaccardOverlap(const T* box1, const T* box2,
 }
 
 template <typename T>
-class MulticlassNMSKernel : public framework::OpKernel<T> {
+class MultiClassNMSKernel : public framework::OpKernel<T> {
  public:
   void NMSFast(const Tensor& bbox, const Tensor& scores,
                const T score_threshold, const T nms_threshold, const T eta,
@@ -163,10 +172,10 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
     }
   }
 
-  void MulticlassNMS(const framework::ExecutionContext& ctx,
+  void MultiClassNMS(const framework::ExecutionContext& ctx,
                      const Tensor& scores, const Tensor& bboxes,
-                     std::map<int, std::vector<int>>* indices,
-                     int* num_nmsed_out) const {
+                     std::map<int, std::vector<int>>& indices,
+                     int& num_nmsed_out) const {
     int64_t background_label = ctx.Attr<int>("background_label");
     int64_t nms_top_k = ctx.Attr<int>("nms_top_k");
     int64_t keep_top_k = ctx.Attr<int>("keep_top_k");
@@ -181,15 +190,15 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
       if (c == background_label) continue;
       Tensor score = scores.Slice(c, c + 1);
       NMSFast(bboxes, score, score_threshold, nms_threshold, nms_eta, nms_top_k,
-              &((*indices)[c]));
-      num_det += (*indices)[c].size();
+              &(indices[c]));
+      num_det += indices[c].size();
     }
 
-    *num_nmsed_out = num_det;
+    num_nmsed_out = num_det;
     const T* scores_data = scores.data<T>();
     if (keep_top_k > -1 && num_det > keep_top_k) {
       std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-      for (const auto& it : *indices) {
+      for (const auto& it : indices) {
         int label = it.first;
         const T* sdata = scores_data + label * predict_dim;
         const std::vector<int>& label_indices = it.second;
@@ -212,12 +221,12 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
         int idx = score_index_pairs[j].second.second;
         new_indices[label].push_back(idx);
       }
-      new_indices.swap(*indices);
-      *num_nmsed_out = keep_top_k;
+      new_indices.swap(indices);
+      num_nmsed_out = keep_top_k;
     }
   }
 
-  void MulticlassOutput(const Tensor& scores, const Tensor& bboxes,
+  void MultiClassOutput(const Tensor& scores, const Tensor& bboxes,
                         std::map<int, std::vector<int>>& selected_indices,
                         Tensor* outs) const {
     int predict_dim = scores.dims()[1];
@@ -229,23 +238,21 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
     for (const auto& it : selected_indices) {
       int label = it.first;
       const T* sdata = scores_data + label * predict_dim;
-      std::vector<int> indices = it.second;
+      const std::vector<int>& indices = it.second;
       for (int j = 0; j < indices.size(); ++j) {
         int idx = indices[j];
         const T* bdata = bboxes_data + idx * kBBoxSize;
         odata[count * kOutputDim] = label;           // label
         odata[count * kOutputDim + 1] = sdata[idx];  // score
-        odata[count * kOutputDim + 2] = bdata[0];    // xmin
-        odata[count * kOutputDim + 3] = bdata[1];    // ymin
-        odata[count * kOutputDim + 4] = bdata[2];    // xmax
-        odata[count * kOutputDim + 5] = bdata[3];    // ymax
+        // xmin, ymin, xmax, ymax
+        std::memcpy(odata + count * kOutputDim + 2, bdata, 4 * sizeof(T));
         count++;
       }
     }
   }
 
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto* boxes = ctx.Input<Tensor>("Bboxes");
+    auto* boxes = ctx.Input<Tensor>("BBoxes");
     auto* scores = ctx.Input<Tensor>("Scores");
     auto* outs = ctx.Output<LoDTensor>("Out");
 
@@ -262,7 +269,7 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
       ins_score.Resize({class_num, predict_dim});
       std::map<int, std::vector<int>> indices;
       int num_nmsed_out = 0;
-      MulticlassNMS(ctx, ins_score, *boxes, &indices, &num_nmsed_out);
+      MultiClassNMS(ctx, ins_score, *boxes, indices, num_nmsed_out);
       all_indices.push_back(indices);
       batch_starts.push_back(batch_starts.back() + num_nmsed_out);
     }
@@ -280,7 +287,7 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
         int64_t e = batch_starts[i + 1];
         if (e > s) {
           Tensor out = outs->Slice(s, e);
-          MulticlassOutput(ins_score, *boxes, all_indices[i], &out);
+          MultiClassOutput(ins_score, *boxes, all_indices[i], &out);
         }
       }
     }
@@ -292,28 +299,31 @@ class MulticlassNMSKernel : public framework::OpKernel<T> {
   }
 };
 
-class MulticlassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
+class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  MulticlassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+  MultiClassNMSOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("Bboxes",
-             "(Tensor) A 2-D Tensor with shape [M, 4] represents the location "
-             "predictions with M bboxes. 4 is the number of "
-             "each location coordinates.");
+    AddInput("BBoxes",
+             "(Tensor) A 2-D Tensor with shape [M, 4] represents the "
+             "predicted locations of M bounding bboxes. Each bounding box "
+             "has four coordinate values and the layout is "
+             "[xmin, ymin, xmax, ymax].");
     AddInput("Scores",
              "(Tensor) A 3-D Tensor with shape [N, C, M] represents the "
-             "confidence predictions. N is the batch size, C is the class "
-             "number, M is number of predictions for each class, which is "
-             "the same with Bboxes.");
+             "predicted confidence predictions. N is the batch size, C is the "
+             "class number, M is number of bounding boxes. For each category "
+             "there are total M scores which corresponding M bounding boxes. "
+             " Please note, M is equal to the 1st dimension of BBoxes. ");
     AddAttr<int>(
         "background_label",
         "(int64_t, defalut: 0) "
-        "The index of background label, the background label will be ignored.")
+        "The index of background label, the background label will be ignored. "
+        "If set to -1, then all categories will be considered.")
         .SetDefault(0);
     AddAttr<float>("score_threshold",
                    "(float) "
-                   "Only consider detections whose confidences are larger than "
-                   "a threshold. If not provided, consider all boxes.");
+                   "Threshold to filter out bounding boxes with low "
+                   "confidence score. If not provided, consider all boxes.");
     AddAttr<int>("nms_top_k",
                  "(int64_t) "
                  "Maximum number of detections to be kept according to the "
@@ -368,8 +378,8 @@ value which is -1.
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OPERATOR(multiclass_nms, ops::MulticlassNMSOp,
-                  ops::MulticlassNMSOpMaker,
+REGISTER_OPERATOR(multiclass_nms, ops::MultiClassNMSOp,
+                  ops::MultiClassNMSOpMaker,
                   paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MulticlassNMSKernel<float>,
-                       ops::MulticlassNMSKernel<double>);
+REGISTER_OP_CPU_KERNEL(multiclass_nms, ops::MultiClassNMSKernel<float>,
+                       ops::MultiClassNMSKernel<double>);
diff --git a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
index c35fb20b10..4943bbb338 100644
--- a/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
+++ b/python/paddle/v2/fluid/tests/test_bipartite_match_op.py
@@ -72,7 +72,7 @@ class TestBipartiteMatchOpWithLoD(OpTest):
         self.inputs = {'DistMat': (dist, lod)}
         self.outputs = {
             'ColToRowMatchIndices': (match_indices),
-            'ColToRowMatchDis': (match_dist),
+            'ColToRowMatchDist': (match_dist),
         }
 
     def test_check_output(self):
@@ -89,7 +89,7 @@ class TestBipartiteMatchOpWithoutLoD(OpTest):
         self.inputs = {'DistMat': dist}
         self.outputs = {
             'ColToRowMatchIndices': match_indices,
-            'ColToRowMatchDis': match_dist,
+            'ColToRowMatchDist': match_dist,
         }
 
     def test_check_output(self):
diff --git a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
index 3097b8388c..3b80d2359b 100644
--- a/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
+++ b/python/paddle/v2/fluid/tests/test_multiclass_nms_op.py
@@ -190,7 +190,7 @@ class TestMulticlassNMSOp(OpTest):
         nmsed_outs = np.array(nmsed_outs).astype('float32')
 
         self.op_type = 'multiclass_nms'
-        self.inputs = {'Bboxes': boxes, 'Scores': scores}
+        self.inputs = {'BBoxes': boxes, 'Scores': scores}
         self.outputs = {'Out': (nmsed_outs, [lod])}
         self.attrs = {
             'background_label': 0,

From 4673a24bdad55f0d135107dd18de451f5a10dab3 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Fri, 2 Feb 2018 09:42:08 +0800
Subject: [PATCH 059/102] Add softmax into Python API.

---
 python/paddle/v2/fluid/layers/ops.py        | 1 +
 python/paddle/v2/fluid/tests/test_layers.py | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index ee3172c7b8..c701e79ad2 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -59,6 +59,7 @@ __all__ = [
     'elementwise_pow',
     'clip',
     'clip_by_norm',
+    'softmax',
     'sequence_softmax',
 ] + __activations__
 
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 3f54e28def..aea43c2517 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -223,6 +223,14 @@ class TestBook(unittest.TestCase):
             self.assertIsNotNone(layers.sequence_softmax(x=seq))
         print(str(program))
 
+    def test_softmax(self):
+        program = Program()
+        with program_guard(program):
+            data = layers.data(name='data', shape=[10], dtype='float32')
+            hid = layers.fc(input=data, size=20)
+            self.assertIsNotNone(layers.softmax(x=hid))
+        print(str(program))
+
     def test_get_places(self):
         program = Program()
         with program_guard(program):

From 6695a204cd739a000ea1d647143d5145c0e6974f Mon Sep 17 00:00:00 2001
From: xuwei06 <xuwei06@baidu.com>
Date: Wed, 10 Jan 2018 14:38:15 -0800
Subject: [PATCH 060/102] helper functions fetch_var and get_var

fetch_var for getting the values of a variable with given name
get_var for getting the Variable with given name
---
 python/paddle/v2/fluid/executor.py            | 48 ++++++++++++++-----
 python/paddle/v2/fluid/framework.py           | 20 ++++++++
 python/paddle/v2/fluid/layers/tensor.py       |  8 ++--
 .../paddle/v2/fluid/tests/test_fetch_var.py   | 23 +++++++++
 4 files changed, 85 insertions(+), 14 deletions(-)
 create mode 100644 python/paddle/v2/fluid/tests/test_fetch_var.py

diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index 9f48815b8b..af69ce2abc 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -17,7 +17,9 @@ import contextlib
 from framework import Program, default_main_program
 from . import core
 
-__all__ = ['Executor', 'global_scope', 'scope_guard', 'switch_scope']
+__all__ = [
+    'Executor', 'global_scope', 'scope_guard', 'switch_scope', 'fetch_var'
+]
 
 g_scope = core.Scope()
 
@@ -80,12 +82,12 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
     Args:
         block: a block instance (typically global block of a program)
         feed_targets: a dictionary of {feed_target_name: feed_target_data}
-        feed_holder_name: the name of the variable that holds the data of 
-            all feed targets. The type of this feed_holder variable is 
+        feed_holder_name: the name of the variable that holds the data of
+            all feed targets. The type of this feed_holder variable is
             FEED_MINIBATCH, which is essentially vector<LoDTensor>.
 
     Returns:
-        A boolean value that indicates whether a block has feed operators 
+        A boolean value that indicates whether a block has feed operators
         that match the info contained in feed_targets and feed_holder_name.
     """
 
@@ -108,7 +110,7 @@ def has_feed_operators(block, feed_targets, feed_holder_name):
 
 def has_fetch_operators(block, fetch_targets, fetch_holder_name):
     """ Check whether the block already has fetch operators.
-    
+
     Return false if the block does not have any fetch operators.
     If some fetch operators have been appended to the block, check that
     the info contained in these fetch operators matches the fetch_targets
@@ -118,13 +120,13 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
     Args:
         block: a block instance (typically global block of a program)
         fetch_targets: a dictionary of {fetch_target_name: fetch_target_data}
-        fetch_holder_name: the name of the variable that holds the data of 
-            all fetch targets. The type of this fetch_holder variable is 
-            FETCH_LIST, which is essentially vector<LoDTensor>.    
+        fetch_holder_name: the name of the variable that holds the data of
+            all fetch targets. The type of this fetch_holder variable is
+            FETCH_LIST, which is essentially vector<LoDTensor>.
 
-    Return:    
-        A boolean value that indicates whether a block has fetch operators 
-        that match the info contained in fetch_targets and fetch_holder_name.     
+    Return:
+        A boolean value that indicates whether a block has fetch operators
+        that match the info contained in fetch_targets and fetch_holder_name.
     """
 
     fetch_count = 0
@@ -146,6 +148,30 @@ def has_fetch_operators(block, fetch_targets, fetch_holder_name):
     return fetch_count > 0
 
 
+def fetch_var(name, scope=None, return_numpy=True):
+    """
+    Fetch the value of the variable with the given name from the given scope
+    Args:
+        name(str): name of the variable
+        scope(core.Scope|None): scope object.
+            If None, global_scope() will be used.
+        return_numpy(bool): whether convert the tensor to numpy.ndarray
+    Returns:
+       LodTensor|numpy.ndarray
+    """
+    assert isinstance(name, str)
+    if scope is None:
+        scope = global_scope()
+    assert isinstance(scope, core.Scope)
+
+    var = global_scope().find_var(name)
+    assert var is not None, "Cannot find '%s' in scope." % name
+    tensor = var.get_tensor()
+    if return_numpy:
+        tensor = as_numpy(tensor)
+    return tensor
+
+
 class Executor(object):
     def __init__(self, places):
         if not isinstance(places, list) and not isinstance(places, tuple):
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 7f5187d299..7fcd19b215 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -31,6 +31,7 @@ __all__ = [
     'program_guard',
     'switch_startup_program',
     'switch_main_program',
+    'get_var',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -1124,3 +1125,22 @@ def program_guard(main_program, startup_program=None):
     switch_main_program(main_program)
     if startup_program is not None:
         switch_startup_program(startup_program)
+
+
+def get_var(name, program=None):
+    """
+    Get a variable by name from the global block of a program
+    Args:
+        name(str): name of the variable
+        program(Program|None): program object.
+             If None, default_global_program() will be used.
+
+    Returns:
+        Variable
+    """
+    if program is None:
+        program = default_main_program()
+    assert isinstance(name, str)
+    assert isinstance(name, Program)
+
+    return program.global_block().var(name)
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index c435c5206d..27067d458d 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -35,13 +35,15 @@ __all__ = [
 ]
 
 
-def create_tensor(dtype, name=None):
+def create_tensor(dtype, name=None, persistable=False):
     helper = LayerHelper("create_tensor", **locals())
-    return helper.create_variable(name=helper.name, dtype=dtype)
+    return helper.create_variable(
+        name=helper.name, dtype=dtype, persistable=persistable)
 
 
 def create_parameter(shape,
                      dtype,
+                     name=None,
                      attr=None,
                      is_bias=False,
                      default_initializer=None):
@@ -62,7 +64,7 @@ def create_parameter(shape,
     """
     helper = LayerHelper("create_parameter", **locals())
     if attr is None:
-        attr = ParamAttr()
+        attr = ParamAttr(name=name)
     return helper.create_parameter(attr, shape, dtype, is_bias,
                                    default_initializer)
 
diff --git a/python/paddle/v2/fluid/tests/test_fetch_var.py b/python/paddle/v2/fluid/tests/test_fetch_var.py
new file mode 100644
index 0000000000..670ab54f51
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_fetch_var.py
@@ -0,0 +1,23 @@
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import op_test
+import numpy
+import unittest
+
+
+class TestFetchVar(op_test.OpTest):
+    def test_fetch_var(self):
+        val = numpy.array([1, 3, 5]).astype(numpy.int32)
+        x = layers.create_tensor(dtype="int32", persistable=True, name="x")
+        layers.assign(input=val, output=x)
+        exe = fluid.Executor(fluid.CPUPlace())
+        exe.run(fluid.default_main_program(), feed={}, fetch_list=[])
+        fetched_x = fluid.fetch_var("x")
+        self.assertTrue(
+            numpy.array_equal(fetched_x, val),
+            "fetch_x=%s val=%s" % (fetched_x, val))
+        self.assertEqual(fetched_x.dtype, val.dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()

From 7208190701d9a3c6d1e4dc507940f5d89d12024f Mon Sep 17 00:00:00 2001
From: xuwei06 <xuwei06@baidu.com>
Date: Fri, 12 Jan 2018 09:27:38 -0800
Subject: [PATCH 061/102] More informative comment and error message for
 fetch_var()

---
 python/paddle/v2/fluid/executor.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py
index af69ce2abc..0eddcc3a5a 100644
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
@@ -152,8 +152,10 @@ def fetch_var(name, scope=None, return_numpy=True):
     """
     Fetch the value of the variable with the given name from the given scope
     Args:
-        name(str): name of the variable
-        scope(core.Scope|None): scope object.
+        name(str): name of the variable. Typically, only persistable variables
+            can be found in the scope used for running the program.
+        scope(core.Scope|None): scope object. It should be the scope where
+            you pass to Executor.run() when running your program.
             If None, global_scope() will be used.
         return_numpy(bool): whether convert the tensor to numpy.ndarray
     Returns:
@@ -165,7 +167,10 @@ def fetch_var(name, scope=None, return_numpy=True):
     assert isinstance(scope, core.Scope)
 
     var = global_scope().find_var(name)
-    assert var is not None, "Cannot find '%s' in scope." % name
+    assert var is not None, (
+        "Cannot find " + name + " in scope. Perhaps you need to make the"
+        " variable persistable by using var.persistable = True in your"
+        " program.")
     tensor = var.get_tensor()
     if return_numpy:
         tensor = as_numpy(tensor)

From c1ac5b63efbd927ca1971493fb49883d6807294d Mon Sep 17 00:00:00 2001
From: QI JUN <qijun1994@hotmail.com>
Date: Fri, 2 Feb 2018 10:12:04 +0800
Subject: [PATCH 062/102] memory optimization for dynamic RNN (#8041)

* init

* add delete operator

* debug

* add wait

* clean code

* fix bug

* fix bug

* refine code

* remove unused code
---
 paddle/operators/while_op.cc                  |  5 ++++
 .../fluid/memory_optimization_transpiler.py   | 24 +++++++++++++------
 2 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc
index 2fdd25dbbe..733a80ea35 100644
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -99,6 +99,9 @@ class WhileGradOp : public framework::OperatorBase {
 
   void Run(const framework::Scope &scope,
            const platform::Place &dev_place) const override {
+    // get device context from pool
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto &dev_ctx = *pool.Get(dev_place);
     framework::Executor executor(dev_place);
     auto *block = Attr<framework::BlockDesc *>(kStepBlock);
     auto *program = block->Program();
@@ -205,6 +208,8 @@ class WhileGradOp : public framework::OperatorBase {
         sum_op->Run(cur_scope, dev_place);
         cur_scope.Rename(new_inside_name, inside_grad_name);
       }
+      dev_ctx.Wait();
+      const_cast<framework::Scope &>(scope).DeleteScope(&cur_scope);
     }
   }
 };
diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py
index 956c5b66da..2b00923f5e 100644
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -31,7 +31,7 @@ dtype_to_size = {
 
 
 class ControlFlowGraph(object):
-    def __init__(self, Program, ops, forward_num):
+    def __init__(self, Program, ops, forward_num, skip_opt):
         self._program = Program
         self._ops = ops
         self._forward_num = forward_num
@@ -41,6 +41,7 @@ class ControlFlowGraph(object):
         self._defs = defaultdict(set)
         self._live_in = defaultdict(set)
         self._live_out = defaultdict(set)
+        self._skip_opt = skip_opt
 
     def _add_connections(self, connections):
         for node1, node2 in connections:
@@ -130,6 +131,10 @@ class ControlFlowGraph(object):
                     block_desc, x,
                     is_forward).type() != core.VarDesc.VarType.LOD_TENSOR:
                 return False
+            if x in self._skip_opt:
+                return False
+            if not self._find_var(block_desc, x, is_forward).shape():
+                return False
             return True
 
         self._build_graph()
@@ -140,6 +145,7 @@ class ControlFlowGraph(object):
             if op.type() == "while" or op.type() == "while_grad":
                 continue
             block_desc = op.block()
+            self.current_block_desc = block_desc
             is_forward = i < self._forward_num
             if self.pool:
                 defs_can_optimize = filter(
@@ -197,28 +203,32 @@ def get_cfgs(input_program):
     block_desc = pdesc.block(0)
     op_size = block_desc.op_size()
     # Get global block ops
-    ops_list.append(([block_desc.op(i) for i in range(op_size)], op_size))
+    ops_list.append(
+        ([block_desc.op(i) for i in range(op_size)], op_size, set()))
 
     while_sub_block_ids = []
     while_grad_sub_block_ids = []
-    while_pair = []
+    while_op_output = set()
+    while_block_id_pair = []
 
     for i in range(op_size):
         op = block_desc.op(i)
         if op.type() == "while":
             while_sub_block_ids.append(op.attr("sub_block").id)
+            while_op_output.update(op.output_arg_names())
         elif op.type() == "while_grad":
             while_grad_sub_block_ids.append(op.attr("sub_block").id)
+            while_op_output.update(op.output_arg_names())
 
     # Find while/while_grad block pair
     for grad_id in while_grad_sub_block_ids:
         parent_id = pdesc.block(grad_id).parent
         if parent_id in while_sub_block_ids:
-            while_pair.append((parent_id, grad_id))
+            while_block_id_pair.append((parent_id, grad_id))
             while_sub_block_ids.remove(parent_id)
 
     # Get while/while_grad block ops
-    for parent_id, grad_id in while_pair:
+    for parent_id, grad_id in while_block_id_pair:
         while_block_ops = []
         while_block = pdesc.block(parent_id)
         while_block_op_size = while_block.op_size()
@@ -230,7 +240,7 @@ def get_cfgs(input_program):
         for i in range(while_grad_block_op_size):
             while_block_ops.append(while_grad_block.op(i))
 
-        ops_list.append((while_block_ops, while_block_op_size))
+        ops_list.append((while_block_ops, while_block_op_size, while_op_output))
 
     # Process rest while block ops
     for parent_id in while_sub_block_ids:
@@ -242,7 +252,7 @@ def get_cfgs(input_program):
 
         ops_list.append((while_block_ops, while_block_op_size))
 
-    cfgs = [ControlFlowGraph(input_program, i, j) for i, j in ops_list]
+    cfgs = [ControlFlowGraph(input_program, i, j, k) for i, j, k in ops_list]
     return cfgs
 
 

From 0bbd7bc38e4f9f14f610961e34968f128613af52 Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Fri, 2 Feb 2018 11:05:19 +0800
Subject: [PATCH 063/102] follow comments

---
 benchmark/cluster/vgg16/Dockerfile        |  4 +++-
 paddle/gserver/layers/MultiBoxLossLayer.h | 13 -------------
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
index 54d1b09a0f..888486bece 100644
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -1,4 +1,6 @@
-FROM python:2.7.14
+#FROM python:2.7.14
+FROM nvidia/cuda:8.0-runtime-ubuntu16.04
+RUN apt-get update && apt-get install -y python python-pip
 RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
 # NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
 #       so we must build one with distribute support to install in this image.
diff --git a/paddle/gserver/layers/MultiBoxLossLayer.h b/paddle/gserver/layers/MultiBoxLossLayer.h
index 40df312a25..9935da5644 100644
--- a/paddle/gserver/layers/MultiBoxLossLayer.h
+++ b/paddle/gserver/layers/MultiBoxLossLayer.h
@@ -1,16 +1,3 @@
-//  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//    http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 /* copyright (c) 2016 paddlepaddle authors. all rights reserve.
 
 licensed under the apache license, version 2.0 (the "license");

From 8894c67d7168dfa5f5dc8e57ec2b5c60f24e368c Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Fri, 2 Feb 2018 11:35:42 +0800
Subject: [PATCH 064/102] add block graph image for debugging (#8026)

init debuger.
---
 python/paddle/v2/fluid/debuger.py   |  73 ++++++++
 python/paddle/v2/fluid/framework.py |   5 +-
 python/paddle/v2/fluid/graphviz.py  | 272 ++++++++++++++++++++++++++++
 3 files changed, 347 insertions(+), 3 deletions(-)
 create mode 100644 python/paddle/v2/fluid/debuger.py
 create mode 100644 python/paddle/v2/fluid/graphviz.py

diff --git a/python/paddle/v2/fluid/debuger.py b/python/paddle/v2/fluid/debuger.py
new file mode 100644
index 0000000000..d379352442
--- /dev/null
+++ b/python/paddle/v2/fluid/debuger.py
@@ -0,0 +1,73 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from graphviz import GraphPreviewGenerator
+import proto.framework_pb2 as framework_pb2
+
+
+def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
+    '''
+    Generate a debug graph for block.
+    Args:
+        block(Block): a block.
+    '''
+    graph = GraphPreviewGenerator("some graph")
+    # collect parameters and args
+    protostr = block.desc.serialize_to_string()
+    desc = framework_pb2.BlockDesc.FromString(str(protostr))
+
+    def need_highlight(name):
+        if highlights is None: return False
+        for pattern in highlights:
+            assert type(pattern) is str
+            if re.match(pattern, name):
+                return True
+        return False
+
+    # draw parameters and args
+    vars = {}
+    for var in desc.vars:
+        shape = [str(i) for i in var.lod_tensor.tensor.dims]
+        if not shape:
+            shape = ['null']
+        # create var
+        if var.persistable:
+            varn = graph.add_param(
+                var.name, var.type, shape, highlight=need_highlight(var.name))
+        else:
+            varn = graph.add_arg(var.name, highlight=need_highlight(var.name))
+        vars[var.name] = varn
+
+    def add_op_link_var(op, var, op2var=False):
+        for arg in var.arguments:
+            if arg not in vars:
+                # add missing variables as argument
+                vars[arg] = graph.add_arg(arg, highlight=need_highlight(arg))
+            varn = vars[arg]
+            highlight = need_highlight(op.description) or need_highlight(
+                varn.description)
+            if op2var:
+                graph.add_edge(op, varn, highlight=highlight)
+            else:
+                graph.add_edge(varn, op, highlight=highlight)
+
+    for op in desc.ops:
+        opn = graph.add_op(op.type, highlight=need_highlight(op.type))
+        for var in op.inputs:
+            add_op_link_var(opn, var, False)
+        for var in op.outputs:
+            add_op_link_var(opn, var, True)
+
+    graph(path, show=True)
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 7f5187d299..69cbebe41e 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -451,9 +451,8 @@ class Operator(object):
             if not given == need:
                 raise ValueError(("Incorrect setting for output(s) of "
                                   "operator \"%s\". Need: [%s] Given: [%s]") %
-                                 (type, ", ".join(str(e)
-                                                  for e in need), ", ".join(
-                                                      str(e) for e in given)))
+                                 (type, ", ".join(str(e) for e in need),
+                                  ", ".join(str(e) for e in given)))
 
             for out_proto in proto.outputs:
                 out_args = outputs[out_proto.name]
diff --git a/python/paddle/v2/fluid/graphviz.py b/python/paddle/v2/fluid/graphviz.py
new file mode 100644
index 0000000000..5881119c39
--- /dev/null
+++ b/python/paddle/v2/fluid/graphviz.py
@@ -0,0 +1,272 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import random
+import subprocess
+import logging
+
+
+def crepr(v):
+    if type(v) is str or type(v) is unicode:
+        return '"%s"' % v
+    return str(v)
+
+
+class Rank(object):
+    def __init__(self, kind, name, priority):
+        '''
+        kind: str
+        name: str
+        priority: int
+        '''
+        self.kind = kind
+        self.name = name
+        self.priority = priority
+        self.nodes = []
+
+    def __str__(self):
+        if not self.nodes:
+            return ''
+
+        return '{' + 'rank={};'.format(self.kind) + \
+               ','.join([node.name for node in self.nodes]) + '}'
+
+
+class Graph(object):
+    rank_counter = 0
+
+    def __init__(self, title, **attrs):
+        self.title = title
+        self.attrs = attrs
+        self.nodes = []
+        self.edges = []
+        self.rank_groups = {}
+
+    def code(self):
+        return self.__str__()
+
+    def rank_group(self, kind, priority):
+        name = "rankgroup-%d" % Graph.rank_counter
+        Graph.rank_counter += 1
+        rank = Rank(kind, name, priority)
+        self.rank_groups[name] = rank
+        return name
+
+    def node(self, label, prefix, description="", **attrs):
+        node = Node(label, prefix, description, **attrs)
+
+        if 'rank' in attrs:
+            rank = self.rank_groups[attrs['rank']]
+            del attrs['rank']
+            rank.nodes.append(node)
+        self.nodes.append(node)
+        return node
+
+    def edge(self, source, target, **attrs):
+        edge = Edge(source, target, **attrs)
+        self.edges.append(edge)
+        return edge
+
+    def compile(self, dot_path):
+        with open(dot_path, 'w') as dot_file:
+            dot_file.write(self.__str__())
+        image_path = os.path.join(
+            os.path.dirname(__file__), dot_path[:-3] + "pdf")
+        cmd = ["dot", "-Tpdf", dot_path, "-o", image_path]
+        subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE).communicate()  # wait for dot to finish
+        logging.warning("write block debug graph to {}".format(image_path))
+        return image_path
+
+    def show(self, dot_path):
+        image = self.compile(dot_path)
+        cmd = ["open", image]
+        subprocess.Popen(
+            cmd,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE)
+
+    def _rank_repr(self):
+        ranks = sorted(
+            self.rank_groups.items(),
+            key=lambda item: item[1].priority)
+        repr = []
+        for x in ranks:
+            repr.append(str(x[1]))
+        return '\n'.join(repr) + '\n'
+
+    def __str__(self):
+        reprs = [
+            'digraph G {',
+            'title = {}'.format(crepr(self.title)),
+        ]
+
+        for attr in self.attrs:
+            reprs.append("{key}={value};".format(
+                key=attr, value=crepr(self.attrs[attr])))
+
+        reprs.append(self._rank_repr())
+
+        random.shuffle(self.nodes)
+        reprs += [str(node) for node in self.nodes]
+
+        for x in self.edges:
+            reprs.append(str(x))
+
+        reprs.append('}')
+        return '\n'.join(reprs)
+
+
+class Node(object):
+    counter = 1
+
+    def __init__(self, label, prefix, description="", **attrs):
+        self.label = label
+        self.name = "%s_%d" % (prefix, Node.counter)
+        self.description = description
+        self.attrs = attrs
+        Node.counter += 1
+
+    def __str__(self):
+        reprs = '{name} [label={label} {extra} ];'.format(
+            name=self.name,
+            label=self.label,
+            extra=',' + ','.join("%s=%s" % (key, crepr(value))
+                                 for key, value in self.attrs.items())
+            if self.attrs else "")
+        return reprs
+
+
+class Edge(object):
+    def __init__(self, source, target, **attrs):
+        '''
+        Link source to target.
+        :param source: Node
+        :param target: Node
+        :param graph: Graph
+        :param attrs: dic
+        '''
+        self.source = source
+        self.target = target
+        self.attrs = attrs
+
+    def __str__(self):
+        repr = "{source} -> {target} {extra}".format(
+            source=self.source.name,
+            target=self.target.name,
+            extra="" if not self.attrs else
+            "[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
+                           for attr in self.attrs.items()) + "]")
+        return repr
+
+
+class GraphPreviewGenerator(object):
+    '''
+    Generate a graph image for ONNX proto.
+    '''
+
+    def __init__(self, title):
+        # init graphviz graph
+        self.graph = Graph(
+            title,
+            layout="dot",
+            concentrate="true",
+            rankdir="TB", )
+
+        self.op_rank = self.graph.rank_group('same', 2)
+        self.param_rank = self.graph.rank_group('same', 1)
+        self.arg_rank = self.graph.rank_group('same', 0)
+
+    def __call__(self, path='temp.dot', show=False):
+        if not show:
+            self.graph.compile(path)
+        else:
+            self.graph.show(path)
+
+    def add_param(self, name, data_type, shape, highlight=False):
+        label = '\n'.join([
+            '<<table cellpadding="5">',
+            '  <tr>',
+            '    <td bgcolor="#2b787e">',
+            '    <b>',
+            name,
+            '    </b>',
+            '    </td>',
+            '  </tr>',
+            '  <tr>',
+            '    <td>',
+            str(data_type),
+            '    </td>'
+            '  </tr>',
+            '  <tr>',
+            '    <td>',
+            '[%s]' % 'x'.join(shape),
+            '    </td>'
+            '  </tr>',
+            '</table>>',
+        ])
+        return self.graph.node(
+            label,
+            prefix="param",
+            description=name,
+            shape="none",
+            style="rounded,filled,bold",
+            width="1.3",
+            color="#148b97" if not highlight else "orange",
+            fontcolor="#ffffff",
+            fontname="Arial")
+
+    def add_op(self, opType, **kwargs):
+        highlight = False
+        if 'highlight' in kwargs:
+            highlight = kwargs['highlight']
+            del kwargs['highlight']
+        return self.graph.node(
+            "<<B>%s</B>>" % opType,
+            prefix="op",
+            description=opType,
+            shape="box",
+            style="rounded, filled, bold",
+            color="#303A3A" if not highlight else "orange",
+            fontname="Arial",
+            fontcolor="#ffffff",
+            width="1.3",
+            height="0.84", )
+
+    def add_arg(self, name, highlight=False):
+        return self.graph.node(
+            crepr(name),
+            prefix="arg",
+            description=name,
+            shape="box",
+            style="rounded,filled,bold",
+            fontname="Arial",
+            fontcolor="#999999",
+            color="#dddddd" if not highlight else "orange")
+
+    def add_edge(self, source, target, **kwargs):
+        highlight = False
+        if 'highlight' in kwargs:
+            highlight = kwargs['highlight']
+            del kwargs['highlight']
+        return self.graph.edge(
+            source,
+            target,
+            color="#000000" if not highlight else "orange",
+            **kwargs)

From 251c2fd50a787b474e49db7f7be9aab27fcd3ccb Mon Sep 17 00:00:00 2001
From: gaoyuan <gaoyuan18@baidu.com>
Date: Fri, 2 Feb 2018 13:35:00 +0800
Subject: [PATCH 065/102] Update according to the code review

---
 paddle/operators/box_coder_op.cc |  2 ++
 paddle/operators/box_coder_op.cu |  2 +-
 paddle/operators/box_coder_op.h  | 17 +++++++++--------
 3 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc
index 41123f9b6e..3836cef96d 100644
--- a/paddle/operators/box_coder_op.cc
+++ b/paddle/operators/box_coder_op.cc
@@ -26,6 +26,8 @@ class BoxCoderOp : public framework::OperatorWithKernel {
                    "Input(PriorBoxVar) of BoxCoderOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
                    "Input(TargetBox) of BoxCoderOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
+                   "Output(OutputBox) of BoxCoderOp should not be null.");
 
     auto prior_box_dims = ctx->GetInputDim("PriorBox");
     auto prior_box_var_dims = ctx->GetInputDim("PriorBoxVar");
diff --git a/paddle/operators/box_coder_op.cu b/paddle/operators/box_coder_op.cu
index 883cc54305..98bd93457f 100644
--- a/paddle/operators/box_coder_op.cu
+++ b/paddle/operators/box_coder_op.cu
@@ -109,7 +109,7 @@ class BoxCoderCUDAKernel : public framework::OpKernel<T> {
     auto* prior_box = context.Input<framework::Tensor>("PriorBox");
     auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
     auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto* output_box = context.Output<Tensor>("OutputBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
 
     if (target_box->lod().size()) {
       PADDLE_ENFORCE_EQ(target_box->lod().size(), 1,
diff --git a/paddle/operators/box_coder_op.h b/paddle/operators/box_coder_op.h
index d1c9a40459..086251f6e0 100644
--- a/paddle/operators/box_coder_op.h
+++ b/paddle/operators/box_coder_op.h
@@ -16,9 +16,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-using LoDTensor = framework::LoDTensor;
-
 enum class BoxCodeType { kEncodeCenterSize = 0, kDecodeCenterSize = 1 };
 
 inline BoxCodeType GetBoxCodeType(const std::string& type) {
@@ -33,8 +30,10 @@ inline BoxCodeType GetBoxCodeType(const std::string& type) {
 template <typename T>
 class BoxCoderKernel : public framework::OpKernel<T> {
  public:
-  void EncodeCenterSize(const Tensor& target_box, const Tensor& prior_box,
-                        const Tensor& prior_box_var, T* output) const {
+  void EncodeCenterSize(const framework::Tensor& target_box,
+                        const framework::Tensor& prior_box,
+                        const framework::Tensor& prior_box_var,
+                        T* output) const {
     int64_t row = target_box.dims()[0];
     int64_t col = prior_box.dims()[0];
     int64_t len = prior_box.dims()[1];
@@ -76,8 +75,10 @@ class BoxCoderKernel : public framework::OpKernel<T> {
       }
     }
   }
-  void DecodeCenterSize(const Tensor& target_box, const Tensor& prior_box,
-                        const Tensor& prior_box_var, T* output) const {
+  void DecodeCenterSize(const framework::Tensor& target_box,
+                        const framework::Tensor& prior_box,
+                        const framework::Tensor& prior_box_var,
+                        T* output) const {
     int64_t row = target_box.dims()[0];
     int64_t col = prior_box.dims()[0];
     int64_t len = prior_box.dims()[1];
@@ -124,7 +125,7 @@ class BoxCoderKernel : public framework::OpKernel<T> {
     auto* prior_box = context.Input<framework::Tensor>("PriorBox");
     auto* prior_box_var = context.Input<framework::Tensor>("PriorBoxVar");
     auto* target_box = context.Input<framework::LoDTensor>("TargetBox");
-    auto* output_box = context.Output<Tensor>("OutputBox");
+    auto* output_box = context.Output<framework::Tensor>("OutputBox");
 
     if (target_box->lod().size()) {
       PADDLE_ENFORCE_EQ(target_box->lod().size(), 1UL,

From a4cf6a28194ad8a3f3f04b97672755e46d5750e6 Mon Sep 17 00:00:00 2001
From: Abhinav Arora <abhinavarora28@gmail.com>
Date: Thu, 1 Feb 2018 21:48:50 -0800
Subject: [PATCH 066/102] Adding unit tests for checking that closing
 unbuffered channel closes all blocked senders and receivers (#8067)

* Adding unit tests for checking that closing unbuffered channel closes all blocked senders and receivers

* Fixing sleep time
---
 paddle/framework/channel_test.cc | 90 ++++++++++++++++++++++++++++++--
 1 file changed, 86 insertions(+), 4 deletions(-)

diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index 020f806380..31ac72eda9 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -29,16 +29,16 @@ TEST(Channel, MakeAndClose) {
   {
     // MakeChannel should return a buffered channel is buffer_size > 0.
     auto ch = MakeChannel<int>(10);
-    EXPECT_NE(dynamic_cast<Buffered<int>*>(ch), nullptr);
-    EXPECT_EQ(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
     CloseChannel(ch);
     delete ch;
   }
   {
     // MakeChannel should return an un-buffered channel is buffer_size = 0.
     auto ch = MakeChannel<int>(0);
-    EXPECT_EQ(dynamic_cast<Buffered<int>*>(ch), nullptr);
-    EXPECT_NE(dynamic_cast<UnBuffered<int>*>(ch), nullptr);
+    EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
+    EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
     CloseChannel(ch);
     delete ch;
   }
@@ -100,6 +100,88 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
   delete ch;
 }
 
+// This tests that closing an unbuffered channel also
+//  unblocks any receivers waiting for senders
+TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(0);
+  const size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          ch->Receive(&data);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+
+// This tests that closing an unbuffered channel also
+//  unblocks any senders waiting for receivers
+TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(0);
+  const size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data = 10;
+          ch->Send(&data);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+
+  // Verify that all the threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all senders
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+
 TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
   auto ch = MakeChannel<int>(0);
   unsigned sum_send = 0;

From cbc9a59c33b507f26ad4e00e740672ef99bc8fa4 Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Fri, 2 Feb 2018 10:03:09 +0800
Subject: [PATCH 067/102] Allow users to specify the name of moving mean and
 variance in batch_norm interface.

---
 python/paddle/v2/fluid/layers/nn.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index c38e21087d..cb8a4815db 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1478,7 +1478,9 @@ def batch_norm(input,
                param_attr=None,
                bias_attr=None,
                data_layout='NCHW',
-               name=None):
+               name=None,
+               moving_mean_name=None,
+               moving_variance_name=None):
     """
     This function helps create an operator to implement
     the BatchNorm layer using the configurations from the input parameters.
@@ -1508,6 +1510,7 @@ def batch_norm(input,
         attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
 
     mean = helper.create_global_variable(
+        name=moving_mean_name,
         dtype=input.dtype,
         shape=param_shape,
         persistable=True,
@@ -1515,6 +1518,7 @@ def batch_norm(input,
     helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
 
     variance = helper.create_global_variable(
+        name=moving_variance_name,
         dtype=input.dtype,
         shape=param_shape,
         persistable=True,

From 901cab9ed3e0838954f0015221093fc1d64b5795 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Fri, 2 Feb 2018 13:52:41 +0800
Subject: [PATCH 068/102] Add `make clean` in docker/build.sh

---
 paddle/scripts/docker/build.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index df7310d6b7..59f3af0398 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -79,6 +79,7 @@ function run_build() {
     Building in /paddle/build ...
     ============================================
 EOF
+    make clean
     make -j `nproc`
 }
 

From 7d8d9db9ba800833bd5bda76366722b3d37542e2 Mon Sep 17 00:00:00 2001
From: gaoyuan <gaoyuan18@baidu.com>
Date: Fri, 2 Feb 2018 13:56:38 +0800
Subject: [PATCH 069/102] Update according to the code review

---
 paddle/operators/box_coder_op.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/operators/box_coder_op.cc b/paddle/operators/box_coder_op.cc
index 3836cef96d..539813d485 100644
--- a/paddle/operators/box_coder_op.cc
+++ b/paddle/operators/box_coder_op.cc
@@ -24,7 +24,7 @@ class BoxCoderOp : public framework::OperatorWithKernel {
                    "Input(PriorBox) of BoxCoderOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("PriorBoxVar"),
                    "Input(PriorBoxVar) of BoxCoderOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("PriorBox"),
+    PADDLE_ENFORCE(ctx->HasInput("TargetBox"),
                    "Input(TargetBox) of BoxCoderOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("OutputBox"),
                    "Output(OutputBox) of BoxCoderOp should not be null.");

From 7831e0bdd39afea7c404c2d399933b61fbdeddca Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Fri, 2 Feb 2018 15:17:13 +0800
Subject: [PATCH 070/102] switch-op design (#8031)

add switch op design
---
 doc/design/switch.md | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 doc/design/switch.md

diff --git a/doc/design/switch.md b/doc/design/switch.md
new file mode 100644
index 0000000000..9db1b2782a
--- /dev/null
+++ b/doc/design/switch.md
@@ -0,0 +1,32 @@
+### Design Doc: Switch
+
+### Background
+
+Many programming languages provide `switch` as a generalization of `if-elif-else`.  We want to add it to Fluid.
+
+The following example shows the usage of `fluid.switch`.
+
+```python
+a = fluid.Var(10)
+b = fluid.Var(0)
+
+switch = fluid.switch()
+with switch.block():
+    with switch.case(fluid.less_equal(a, 10)):
+        fluid.print("Case 1")
+    with switch.case(fluid.larger(a, 0)):
+        fluid.print("Case 2")
+    with switch.default():
+        fluid.print("Case 3")
+```
+
+### The Semantics
+
+1. A `switch` control-flow checks cases one-by-one.
+1. The condition of each case is a scalar boolean value. This differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
+1. It runs the first matched case, or, if no case matches, the default case if there is one.
+1. Once it matches a case, it runs the corresponding branch and only that branch.  It is as if there were a C `break` statement at the end of each case.
+
+The above program should print and print only "Case 1".
+
+The implementation of the backward pass of the `switch` control-flow is easier than that of `if_else`, because `switch` runs at most one branch, whereas `if_else` could run more than one branch.

From 4284b857cb61f9ad090044834f3c0f62c339c0b2 Mon Sep 17 00:00:00 2001
From: wanghaox <wanghaox@gmail.com>
Date: Fri, 2 Feb 2018 15:45:13 +0800
Subject: [PATCH 071/102] update mine_hard_examples op

---
 paddle/operators/mine_hard_examples_op.cc | 52 ++++++++++++++---------
 1 file changed, 31 insertions(+), 21 deletions(-)

diff --git a/paddle/operators/mine_hard_examples_op.cc b/paddle/operators/mine_hard_examples_op.cc
index 603368f93c..2a3bd139ed 100644
--- a/paddle/operators/mine_hard_examples_op.cc
+++ b/paddle/operators/mine_hard_examples_op.cc
@@ -1,4 +1,4 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -38,7 +38,7 @@ inline bool IsEligibleMining(const MiningType mining_type, const int match_idx,
   }
 }
 
-MiningType GetMiningType(std::string str) {
+inline MiningType GetMiningType(std::string str) {
   if (str == "max_negative") {
     return MiningType::kMaxNegative;
   } else if (str == "hard_example") {
@@ -112,7 +112,7 @@ class MineHardExamplesKernel : public framework::OpKernel<T> {
         neg_sel = std::min(sample_size, neg_sel);
       }
 
-      std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<int>);
+      std::sort(loss_idx.begin(), loss_idx.end(), SortScoreDescend<size_t>);
       std::set<int> sel_indices;
       std::vector<int> neg_indices;
       std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
@@ -121,18 +121,27 @@ class MineHardExamplesKernel : public framework::OpKernel<T> {
                        return static_cast<int>(l.second);
                      });
 
-      for (int m = 0; m < prior_num; ++m) {
-        if (match_indices(n, m) > -1) {
-          if (mining_type == MiningType::kHardExample &&
-              sel_indices.find(m) == sel_indices.end()) {
-            match_indices_et(n, m) = -1;
+      if (mining_type == MiningType::kHardExample) {
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) > -1) {
+            if (sel_indices.find(m) == sel_indices.end()) {
+              match_indices_et(n, m) = -1;
+            }
+          } else {
+            if (sel_indices.find(m) != sel_indices.end()) {
+              neg_indices.push_back(m);
+            }
           }
-        } else {
-          if (sel_indices.find(m) != sel_indices.end()) {
+        }
+      } else {
+        for (int m = 0; m < prior_num; ++m) {
+          if (match_indices(n, m) == -1 &&
+              sel_indices.find(m) != sel_indices.end()) {
             neg_indices.push_back(m);
           }
         }
       }
+
       all_neg_indices.push_back(neg_indices);
       batch_starts.push_back(batch_starts.back() + neg_indices.size());
     }
@@ -253,7 +262,7 @@ class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
         "[N, Np], N is the batch size and Np is the number of prior box.");
     AddInput("LocLoss",
              "(Tensor, optional, default Tensor<float>), The localization loss "
-             "wit shape [N, Np], N is the batch size and Np is the number of "
+             "with shape [N, Np], N is the batch size and Np is the number of "
              "prior box.")
         .AsDispensable();
     AddInput("MatchIndices",
@@ -267,15 +276,15 @@ class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
              "Np], N is the batch size and Np is the number of prior box.");
     AddAttr<float>("neg_pos_ratio",
                    "(float) The ratio of the negative box to the positive "
-                   "box. Use only when mining_type is equal to max_negative.")
+                   "box. Use only when mining_type is max_negative.")
         .SetDefault(1.0);
     AddAttr<float>("neg_dist_threshold",
-                   "(float) The negative box dis value threshold. "
-                   "Use only when mining_type is equal to max_negative.")
+                   "(float) The negative overlap upper bound for the unmatched "
+                   "predictions. Use only when mining_type is max_negative.")
         .SetDefault(0.5);
     AddAttr<int>("sample_size",
                  "(float) The max sample size of negative box. Use only when "
-                 "mining_type is equal to hard_example.")
+                 "mining_type is hard_example.")
         .SetDefault(0);
     AddAttr<std::string>("mining_type",
                          "(float) The mining algorithm name, the value is "
@@ -295,7 +304,7 @@ class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
 
     AddOutput("UpdatedMatchIndices",
               "(Tensor<int>) The output of updated MatchIndices, a tensor with "
-              "shape [N, Np]. Only update when mining_type is equal to "
+              "shape [N, Np]. Only update when mining_type is "
               "hard_example. The input MatchIndices elements will be update to "
               "-1 when it is not in the candidate high loss list of negative "
               "examples.");
@@ -303,11 +312,12 @@ class MineHardExamplesOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
 Mine hard examples Operator.
 This operator implements hard example mining to select a subset of negative box indices.
-For each image, selects the box with highest losses. subject to the condition that the box cannot have
-an Matcht > neg_dist_threshold when mining_type is equals max_negative. The selected number is 
-min(sample_size, max_negative_box_number) when mining_type is equals hard_example,
-or min(neg_pos_ratio * positive_box_number, max_negative_box_number) when mining_type is 
-equals max_negative, where the max_negative_box_number is the count of MatchIndices elements with value -1.
+For each image, selects the box with highest losses. subject to the condition that the 
+box cannot have an Matcht > neg_dist_threshold when mining_type is max_negative. 
+The selected number is min(sample_size, max_negative_box_number) when mining_type is 
+hard_example, or min(neg_pos_ratio * positive_box_number, max_negative_box_number) 
+when mining_type is max_negative, where the max_negative_box_number is the count of 
+MatchIndices elements with value -1.
 )DOC");
   }
 };

From a6f3846d8ff1b9a9d6361381447d1ab7cab7f7ec Mon Sep 17 00:00:00 2001
From: dangqingqing <dangqingqing@baidu.com>
Date: Fri, 2 Feb 2018 16:33:33 +0800
Subject: [PATCH 072/102] Remove the redundant header file and make one
 function inline.

---
 paddle/operators/multiclass_nms_op.cc | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc
index cb38e9fa20..8a65fe69f1 100644
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/operators/multiclass_nms_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -92,7 +91,7 @@ static inline void GetMaxScoreIndex(
 }
 
 template <class T>
-T BBoxArea(const T* box, const bool normalized) {
+static inline T BBoxArea(const T* box, const bool normalized) {
   if (box[2] < box[0] || box[3] < box[1]) {
     // If coordinate values are is invalid
     // (e.g. xmax < xmin or ymax < ymin), return 0.

From 2c35e6389af743e9f3cb991c6c88438a37b99c29 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Fri, 2 Feb 2018 16:50:12 +0800
Subject: [PATCH 073/102] Reclassify and change of V2 API documentation
 structure

---
 doc/api/v2/config/layer.rst | 66 +++++++++++++++----------------------
 doc/api/v2/data/dataset.rst |  7 ++++
 2 files changed, 34 insertions(+), 39 deletions(-)

diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index ddf0b055a9..29388f5005 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -87,6 +87,11 @@ roi_pool
 ..  autoclass:: paddle.v2.layer.roi_pool
     :noindex:
 
+pad
+----
+..  autoclass:: paddle.v2.layer.pad
+    :noindex:
+
 Norm Layer
 ==========
 
@@ -133,6 +138,11 @@ grumemory
 ..  autoclass:: paddle.v2.layer.grumemory
     :noindex:
 
+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
+    
 Recurrent Layer Group
 =====================
 
@@ -340,6 +350,11 @@ bilinear_interp
 ..  autoclass:: paddle.v2.layer.bilinear_interp
     :noindex:
 
+dropout
+--------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
+    
 dot_prod
 ---------
 .. autoclass:: paddle.v2.layer.dot_prod
@@ -402,6 +417,11 @@ scale_shift
 ..  autoclass:: paddle.v2.layer.scale_shift
     :noindex:
 
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
+
 Sampling Layers
 ===============
 
@@ -420,22 +440,6 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
     :noindex:
 
-Factorization Machine Layer
-============================
-
-factorization_machine
----------------------
-..  autoclass:: paddle.v2.layer.factorization_machine
-    :noindex:
-
-Slicing and Joining Layers
-==========================
-
-pad
-----
-..  autoclass:: paddle.v2.layer.pad
-    :noindex:
-
 ..  _api_v2.layer_costs:
 
 Cost Layers
@@ -526,6 +530,11 @@ multibox_loss
 ..  autoclass:: paddle.v2.layer.multibox_loss
     :noindex:
 
+detection_output
+----------------
+..  autoclass:: paddle.v2.layer.detection_output
+    :noindex:
+    
 Check Layer
 ============
 
@@ -534,31 +543,10 @@ eos
 ..  autoclass:: paddle.v2.layer.eos
     :noindex:
 
-Miscs
-=====
-
-dropout
---------
-..  autoclass:: paddle.v2.layer.dropout
-    :noindex:
-
-Activation with learnable parameter
-===================================
+Activation
+==========
 
 prelu
 --------
 ..  autoclass:: paddle.v2.layer.prelu
     :noindex:
-
-gated_unit
------------
-..  autoclass:: paddle.v2.layer.gated_unit
-    :noindex:
-
-Detection output Layer
-======================
-
-detection_output
-----------------
-..  autoclass:: paddle.v2.layer.detection_output
-    :noindex:
diff --git a/doc/api/v2/data/dataset.rst b/doc/api/v2/data/dataset.rst
index 6a8ecc5bb1..02e41564b1 100644
--- a/doc/api/v2/data/dataset.rst
+++ b/doc/api/v2/data/dataset.rst
@@ -73,3 +73,10 @@ wmt14
 ..  automodule:: paddle.v2.dataset.wmt14
     :members:
     :noindex:
+
+wmt16
++++++
+
+..  automodule:: paddle.v2.dataset.wmt16
+    :members:
+    :noindex:

From 8137dd9b5ed0cab202006e2b7d0ab6ff4bee34df Mon Sep 17 00:00:00 2001
From: wanghaox <wanghaox@gmail.com>
Date: Fri, 2 Feb 2018 16:53:33 +0800
Subject: [PATCH 074/102] update mine_hard_examples_op

---
 paddle/operators/mine_hard_examples_op.cc | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/paddle/operators/mine_hard_examples_op.cc b/paddle/operators/mine_hard_examples_op.cc
index 2a3bd139ed..051cc24706 100644
--- a/paddle/operators/mine_hard_examples_op.cc
+++ b/paddle/operators/mine_hard_examples_op.cc
@@ -117,7 +117,7 @@ class MineHardExamplesKernel : public framework::OpKernel<T> {
       std::vector<int> neg_indices;
       std::transform(loss_idx.begin(), loss_idx.begin() + neg_sel,
                      std::inserter(sel_indices, sel_indices.begin()),
-                     [](std::pair<T, size_t> l) -> int {
+                     [](std::pair<T, size_t>& l) -> int {
                        return static_cast<int>(l.second);
                      });
 
@@ -134,12 +134,8 @@ class MineHardExamplesKernel : public framework::OpKernel<T> {
           }
         }
       } else {
-        for (int m = 0; m < prior_num; ++m) {
-          if (match_indices(n, m) == -1 &&
-              sel_indices.find(m) != sel_indices.end()) {
-            neg_indices.push_back(m);
-          }
-        }
+        neg_indices.resize(sel_indices.size());
+        std::copy(sel_indices.begin(), sel_indices.end(), neg_indices.begin());
       }
 
       all_neg_indices.push_back(neg_indices);

From a5acad11e8ead60413192143a0822daa408f67aa Mon Sep 17 00:00:00 2001
From: typhoonzero <typhoonzero1986@gmail.com>
Date: Fri, 2 Feb 2018 17:04:59 +0800
Subject: [PATCH 075/102] update docs

---
 benchmark/cluster/vgg16/Dockerfile |  2 +-
 benchmark/cluster/vgg16/README.md  | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmark/cluster/vgg16/Dockerfile b/benchmark/cluster/vgg16/Dockerfile
index 888486bece..98356cd761 100644
--- a/benchmark/cluster/vgg16/Dockerfile
+++ b/benchmark/cluster/vgg16/Dockerfile
@@ -1,5 +1,5 @@
 #FROM python:2.7.14
-FROM nvidia/cuda:8.0-runtime-ubuntu16.04
+FROM nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04
 RUN apt-get update && apt-get install -y python
 RUN pip install -U kubernetes opencv-python &&   apt-get update -y &&   apt-get install -y iputils-ping libgtk2.0-dev
 # NOTE: By default CI built wheel packages turn WITH_DISTRIBUTE=OFF,
diff --git a/benchmark/cluster/vgg16/README.md b/benchmark/cluster/vgg16/README.md
index 9b96f1c02d..11d00b8f85 100644
--- a/benchmark/cluster/vgg16/README.md
+++ b/benchmark/cluster/vgg16/README.md
@@ -1,4 +1,4 @@
-# Performance for distributed vgg16
+# Performance for Distributed vgg16
 
 ## Test Result
 
@@ -50,7 +50,7 @@
 
 - Trainer Count: 60
 - Batch Size: 128
-- Metrics: mini-batch / sec
+- Metrics: samples / sec
 
 | PServer Count | 3 | 6 |10 | 20 |
 | -- | -- | -- | -- | -- |
@@ -61,7 +61,7 @@
 *The performance gap between Fuild and v2 comes from the network interference.*
 
 
-## Steps to run the performance test
+## Steps to Run the Performance Test
 
 1. You must re-compile PaddlePaddle and enable `-DWITH_DISTRIBUTE` to build PaddlePaddle with distributed support.
 1. When the build finishes, copy the output `whl` package located under `build/python/dist` to current directory.
@@ -71,6 +71,6 @@
 
 Check the logs for the distributed training progress and analyze the performance.
 
-## Enable verbos logs
+## Enable Verbose Logs
 
-Edit `pserver.yaml` and `trainer.yaml` and add an environment variable `GLOG_v=3` to see what happend in detail.
+Edit `pserver.yaml` and `trainer.yaml` and add the environment variables `GLOG_v=3` and `GLOG_logtostderr=1` to see what happened in detail.

From 0575fd4647bf414662d31c02371a68689273b22c Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Fri, 2 Feb 2018 17:31:37 +0800
Subject: [PATCH 076/102] simplify shape inference code

---
 paddle/framework/op_desc.cc         | 19 -------------------
 paddle/framework/operator.cc        |  8 --------
 paddle/framework/shape_inference.cc | 23 +++++++++++++++++++----
 paddle/framework/shape_inference.h  |  8 +++-----
 4 files changed, 22 insertions(+), 36 deletions(-)

diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index f8df2cf97a..f554c77845 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -39,10 +39,6 @@ class CompileTimeInferShapeContext : public InferShapeContext {
 
   bool HasOutputs(const std::string &name) const override;
 
-  DDim GetInputDim(const std::string &name) const override;
-
-  void SetOutputDim(const std::string &name, const DDim &dim) override;
-
   AttrReader Attrs() const override;
 
   const std::vector<std::string> &Inputs(
@@ -444,21 +440,6 @@ bool CompileTimeInferShapeContext::HasOutputs(const std::string &name) const {
   return true;
 }
 
-DDim CompileTimeInferShapeContext::GetInputDim(const std::string &name) const {
-  std::vector<DDim> ddims = GetInputsDim(name);
-  auto length = ddims.size();
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have 1 value, "
-                    "but it has %d now",
-                    name, length);
-  return ddims[0];
-}
-
-void CompileTimeInferShapeContext::SetOutputDim(const std::string &name,
-                                                const DDim &dim) {
-  SetOutputsDim(name, {dim});
-}
-
 AttrReader CompileTimeInferShapeContext::Attrs() const {
   return AttrReader(op_.GetAttrMap());
 }
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 4e854f54dd..81fa8cf477 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -366,14 +366,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return true;
   }
 
-  DDim GetInputDim(const std::string& name) const override {
-    return GetDim(op_.Input(name));
-  }
-
-  void SetOutputDim(const std::string& name, const DDim& dim) override {
-    SetDim(op_.Output(name), dim);
-  }
-
   AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
 
   const std::vector<std::string>& Inputs(
diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index e53cc0cdab..14dba75808 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -18,10 +18,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+framework::DDim InferShapeContext::GetInputDim(const std::string &name) const {
+  const std::vector<std::string> &arg_names = Inputs(name);
+  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                    "Input(%s) should hold one element, but now it holds %d",
+                    name, arg_names.size());
+  return this->GetDim(arg_names[0]);  // dim of the slot's single variable
+}
+
 std::vector<framework::DDim> InferShapeContext::GetInputsDim(
     const std::string &name) const {
-  const std::vector<std::string> &names = Inputs(name);
-  return GetDims(names);
+  const std::vector<std::string> &arg_names = Inputs(name);
+  return GetDims(arg_names);
 }
 
 DDim InferShapeContext::GetInputsElementDim(const std::string &name,
@@ -30,13 +38,21 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
   return this->GetDim(names[idx]);
 }
 
+void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
+  auto &arg_names = Outputs(name);
+  PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
+                    "Output(%s) should hold one element, but now it holds %d",
+                    name, arg_names.size());
+  SetDim(arg_names[0], dim);  // set dim on the slot's single variable
+}
+
 void InferShapeContext::SetOutputsDim(
     const std::string &name, const std::vector<framework::DDim> &dims) {
   auto &names = Outputs(name);
   SetDims(names, dims);
 }
 
-std::vector<framework::DDim> InferShapeContext::GetDims(
+std::vector<DDim> InferShapeContext::GetDims(
     const std::vector<std::string> &names) const {
   std::vector<framework::DDim> ret;
   ret.reserve(names.size());
@@ -45,7 +61,6 @@ std::vector<framework::DDim> InferShapeContext::GetDims(
       [this](const std::string &name) { return this->GetDim(name); });
   return ret;
 }
-
 void InferShapeContext::SetDims(const std::vector<std::string> &names,
                                 const std::vector<framework::DDim> &dims) {
   size_t length = names.size();
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index f93319d8f2..77fc9359be 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -35,12 +35,12 @@ class InferShapeContext {
   virtual bool HasInputs(const std::string &name) const = 0;
   virtual bool HasOutputs(const std::string &name) const = 0;
 
-  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
+  framework::DDim GetInputDim(const std::string &name) const;
 
   std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
   DDim GetInputsElementDim(const std::string &name, int idx) const;
 
-  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
+  void SetOutputDim(const std::string &name, const DDim &dim);
   void SetOutputsDim(const std::string &name,
                      const std::vector<framework::DDim> &dims);
 
@@ -63,9 +63,7 @@ class InferShapeContext {
   virtual framework::DDim GetDim(const std::string &name) const = 0;
   virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
 
-  std::vector<framework::DDim> GetDims(
-      const std::vector<std::string> &names) const;
-
+  std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
   std::vector<proto::VarDesc::VarType> GetVarTypes(
       const std::vector<std::string> &names) const;
 

From affce7331ce42cb85295f932e408f0d4597ea527 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Fri, 2 Feb 2018 17:31:35 +0800
Subject: [PATCH 077/102] refine elementwise_op

---
 paddle/operators/compare_op.h              | 10 +++++++-
 paddle/operators/elementwise_add_op.h      | 21 ++++++++++++++--
 paddle/operators/elementwise_div_op.h      | 21 ++++++++++++++--
 paddle/operators/elementwise_max_op.h      | 21 ++++++++++++++--
 paddle/operators/elementwise_min_op.h      | 21 ++++++++++++++--
 paddle/operators/elementwise_mul_op.h      | 21 ++++++++++++++--
 paddle/operators/elementwise_op_function.h | 28 ++++++++--------------
 paddle/operators/elementwise_pow_op.h      |  9 ++++++-
 paddle/operators/elementwise_sub_op.h      | 21 ++++++++++++++--
 9 files changed, 141 insertions(+), 32 deletions(-)

diff --git a/paddle/operators/compare_op.h b/paddle/operators/compare_op.h
index 9c655d6c0d..b275fd75b3 100644
--- a/paddle/operators/compare_op.h
+++ b/paddle/operators/compare_op.h
@@ -54,7 +54,15 @@ class CompareOpKernel
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     using T = typename Functor::ELEM_TYPE;
-    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context);
+    using Tensor = framework::Tensor;
+
+    auto* x = context.Input<Tensor>("X");
+    auto* y = context.Input<Tensor>("Y");
+    auto* z = context.Output<Tensor>("Out");
+    z->mutable_data<T>(context.GetPlace());
+    int axis = context.Attr<int>("axis");
+    ElementwiseComputeEx<Functor, DeviceContext, T, bool>(context, x, y, axis,
+                                                          z);
   }
 };
 
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
index a8389429f2..c32288d698 100644
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<AddFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
 
@@ -92,9 +99,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseAddGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseAddGradFunctor<T>,
                            ElementwiseAddBroadCastGradFunctor<T>,
-                           ElementwiseAddBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseAddBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
index ef26cb6c91..07ebade31f 100644
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseDivKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<DivFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
 
@@ -111,9 +118,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseDivGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
                            ElementwiseDivBroadCastGradFunctor<T>,
-                           ElementwiseDivBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseDivBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_max_op.h b/paddle/operators/elementwise_max_op.h
index 255728e8e6..717e45ab31 100644
--- a/paddle/operators/elementwise_max_op.h
+++ b/paddle/operators/elementwise_max_op.h
@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseMaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<MaxFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
 
@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>,
                            ElementwiseMaxBroadCastGradFunctor<T>,
-                           ElementwiseMaxBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseMaxBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_min_op.h b/paddle/operators/elementwise_min_op.h
index e6627a0f1b..0de9a91c52 100644
--- a/paddle/operators/elementwise_min_op.h
+++ b/paddle/operators/elementwise_min_op.h
@@ -28,7 +28,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseMinKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<MinFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
 
@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseMinGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>,
                            ElementwiseMinBroadCastGradFunctor<T>,
-                           ElementwiseMinBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseMinBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index 4b86b00b5a..ae7a71e024 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<MulFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
 
@@ -110,9 +117,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseMulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
                            ElementwiseMulBroadCastGradFunctor<T>,
-                           ElementwiseMulBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseMulBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h
index d749b8e875..213fe1f5a8 100644
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -313,21 +313,18 @@ EIGEN_FUNCTOR(Div, EIGEN_DIV);
 
 template <typename DeviceContext, typename T, typename functor,
           typename broadcastfunctor, typename broadcast2functor>
-void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
-  using Tensor = framework::Tensor;
-
-  auto* x = ctx.Input<Tensor>("X");
-  auto* y = ctx.Input<Tensor>("Y");
-  auto* out = ctx.Input<Tensor>("Out");
-  auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+void ElementwiseGradCompute(const framework::ExecutionContext& ctx,
 
+                            const framework::Tensor* x,
+                            const framework::Tensor* y,
+                            const framework::Tensor* out,
+                            const framework::Tensor* dout, int axis,
+                            framework::Tensor* dx, framework::Tensor* dy) {
   auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
 
   auto x_dims = x->dims();
   auto y_dims = y->dims();
 
-  auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-  auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
   if (dx) {
     dx->mutable_data<T>(ctx.GetPlace());
   }
@@ -348,7 +345,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
     x_dims = framework::make_ddim(extended_dims);
   }
 
-  int axis = ctx.Attr<int>("axis");
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
 
   int pre, n, post;
@@ -367,13 +363,10 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
 
 template <typename Functor, typename DeviceContext, typename T,
           typename OutType = T>
-void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
-  using Tensor = framework::Tensor;
-
-  auto* x = ctx.Input<Tensor>("X");
-  auto* y = ctx.Input<Tensor>("Y");
-  auto* z = ctx.Output<Tensor>("Out");
-  z->mutable_data<OutType>(ctx.GetPlace());
+void ElementwiseComputeEx(const framework::ExecutionContext& ctx,
+                          const framework::Tensor* x,
+                          const framework::Tensor* y, int axis,
+                          framework::Tensor* z) {
   TransformFunctor<Functor, T, DeviceContext, OutType> functor(
       x, y, z, ctx.template device_context<DeviceContext>(), Functor());
 
@@ -394,7 +387,6 @@ void ElementwiseComputeEx(const framework::ExecutionContext& ctx) {
     x_dims = framework::make_ddim(extended_dims);
   }
 
-  int axis = ctx.Attr<int>("axis");
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
   PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
                  "Axis should be in range [0, x_dims)");
diff --git a/paddle/operators/elementwise_pow_op.h b/paddle/operators/elementwise_pow_op.h
index 6019e709e0..874fd3f09f 100644
--- a/paddle/operators/elementwise_pow_op.h
+++ b/paddle/operators/elementwise_pow_op.h
@@ -29,7 +29,14 @@ template <typename DeviceContext, typename T>
 class ElementwisePowKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<PowFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
 
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
index a2aca79302..c2749a8e6b 100644
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -27,7 +27,14 @@ template <typename DeviceContext, typename T>
 class ElementwiseSubKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx);
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    int axis = ctx.Attr<int>("axis");
+    ElementwiseComputeEx<SubFunctor<T>, DeviceContext, T>(ctx, x, y, axis, z);
   }
 };
 
@@ -93,9 +100,19 @@ template <typename DeviceContext, typename T>
 class ElementwiseSubGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    using Tensor = framework::Tensor;
+
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* out = ctx.Input<Tensor>("Out");
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
+    int axis = ctx.Attr<int>("axis");
     ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>,
                            ElementwiseSubBroadCastGradFunctor<T>,
-                           ElementwiseSubBroadCast2GradFunctor<T>>(ctx);
+                           ElementwiseSubBroadCast2GradFunctor<T>>(
+        ctx, x, y, out, dout, axis, dx, dy);
   }
 };
 

From 2ffa3a8bf6a7cb0e3d5e1ac211417c234ab04f04 Mon Sep 17 00:00:00 2001
From: xzl <zlx_hg@163.com>
Date: Fri, 2 Feb 2018 18:28:23 +0800
Subject: [PATCH 078/102] rename op to depthwise_conv2d, more efficient

---
 paddle/operators/conv_op.cc                   |  8 +-
 paddle/operators/conv_op.cu.cc                |  4 +-
 paddle/operators/math/depthwise_conv.cu       | 79 ++++++-------------
 python/paddle/v2/fluid/layers/nn.py           |  2 +-
 .../paddle/v2/fluid/tests/test_conv2d_op.py   |  4 +-
 5 files changed, 34 insertions(+), 63 deletions(-)

diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc
index d25f3fd1a0..cef7ddd5fe 100644
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -320,20 +320,20 @@ REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
             ops::ConvOpGrad);
 
 // depthwise convolution op
-REGISTER_OP(depthwise_conv, ops::ConvOp, ops::Conv2DOpMaker,
-            depthwise_conv_grad, ops::ConvOpGrad);
+REGISTER_OP(depthwise_conv2d, ops::ConvOp, ops::Conv2DOpMaker,
+            depthwise_conv2d_grad, ops::ConvOpGrad);
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
 
 // depthwise conv kernel
 // TODO(xingzhaolong): neon kernel for mobile
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);
 
 REGISTER_OP_CPU_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
     ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::GemmConvGradKernel<paddle::platform::CPUDeviceContext, double>);
 
diff --git a/paddle/operators/conv_op.cu.cc b/paddle/operators/conv_op.cu.cc
index 02a4e52466..d0bd40ee95 100644
--- a/paddle/operators/conv_op.cu.cc
+++ b/paddle/operators/conv_op.cu.cc
@@ -17,12 +17,12 @@ limitations under the License. */
 namespace ops = paddle::operators;
 
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv,
+    depthwise_conv2d,
     ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DepthwiseConvKernel<paddle::platform::CUDADeviceContext, double>);
 
 REGISTER_OP_CUDA_KERNEL(
-    depthwise_conv_grad,
+    depthwise_conv2d_grad,
     ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, float>,
     ops::DepthwiseConvGradKernel<paddle::platform::CUDADeviceContext, double>);
 
diff --git a/paddle/operators/math/depthwise_conv.cu b/paddle/operators/math/depthwise_conv.cu
index b9b958c92b..b212e78208 100644
--- a/paddle/operators/math/depthwise_conv.cu
+++ b/paddle/operators/math/depthwise_conv.cu
@@ -42,38 +42,23 @@ __global__ void KernelDepthwiseConv(
     T value = 0;
     const int h_in_start = -padding_height + h_out * stride_height;
     const int w_in_start = -padding_width + w_out * stride_width;
-    const int h_in_end =
-        -padding_height + h_out * stride_height + filter_height - 1;
-    const int w_in_end =
-        -padding_width + w_out * stride_width + filter_width - 1;
+    const int h_in_end = h_in_start + filter_height;
+    const int w_in_end = w_in_start + filter_width;
 
     const int in_offset =
         ((batch * input_channels + c_in) * input_height) * input_width;
 
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
-        (w_in_end < input_width)) {
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          const int offset = in_offset + h_in * input_width + w_in;
-
-          value += (*weight) * input_data[offset];
-          ++weight;
-        }
-      }
-    } else {
-      for (int kh = 0; kh < filter_height; ++kh) {
-        for (int kw = 0; kw < filter_width; ++kw) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            value += (*weight) * input_data[offset];
-          }
-          ++weight;
-        }
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        value +=
+            weight[(h_in - h_in_start) * filter_width + (w_in - w_in_start)] *
+            input_data[offset];
       }
     }
     output_data[index] = value;
@@ -162,32 +147,18 @@ __global__ void KernelDepthwiseConvFilterGrad(
         (batch * input_channels + c_in) * input_height * input_width;
 
     T* addr_offset = filter_grad_data + c_out * filter_height * filter_width;
-
-    if ((h_in_start >= 0) && (h_in_end < input_height) && (w_in_start >= 0) &&
-        (w_in_end < input_width)) {
-      for (int kw = 0; kw < filter_width; kw++) {
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          const int offset = in_offset + h_in * input_width + w_in;
-          const T diff_temp = output_grad_data[index] * input_data[offset];
-          T* addr = addr_offset + kh * filter_width + kw;
-          paddle::platform::CudaAtomicAdd(addr, diff_temp);
-        }
-      }
-    } else {
-      for (int kw = 0; kw < filter_width; kw++) {
-        for (int kh = 0; kh < filter_height; kh++) {
-          const int h_in = h_in_start + kh;
-          const int w_in = w_in_start + kw;
-          if ((h_in >= 0) && (h_in < input_height) && (w_in >= 0) &&
-              (w_in < input_width)) {
-            const int offset = in_offset + h_in * input_width + w_in;
-            const T diff_temp = output_grad_data[index] * input_data[offset];
-            T* addr = addr_offset + kh * filter_width + kw;
-            paddle::platform::CudaAtomicAdd(addr, diff_temp);
-          }
-        }
+    const int h_end = h_in_end < input_height ? h_in_end : input_height;
+    const int w_end = w_in_end < input_width ? w_in_end : input_width;
+    const int h_start = h_in_start > 0 ? h_in_start : 0;
+    const int w_start = w_in_start > 0 ? w_in_start : 0;
+
+    for (int h_in = h_start; h_in < h_end; h_in++) {
+      for (int w_in = w_start; w_in < w_end; w_in++) {
+        const int offset = in_offset + h_in * input_width + w_in;
+        const T diff_temp = output_grad_data[index] * input_data[offset];
+        T* addr = addr_offset + (h_in - h_in_start) * filter_width +
+                  (w_in - w_in_start);
+        paddle::platform::CudaAtomicAdd(addr, diff_temp);
       }
     }
   }
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 4be6ae8ed6..aaf096f0dd 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -1237,7 +1237,7 @@ def conv2d(input,
     l_type = 'conv2d'
     if (num_channels == groups and num_filters % num_channels == 0 and
             not use_cudnn):
-        l_type = 'depthwise_conv'
+        l_type = 'depthwise_conv2d'
 
     helper = LayerHelper(l_type, **locals())
     dtype = helper.input_dtype()
diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py
index a034d0ab91..7512ea333e 100644
--- a/python/paddle/v2/fluid/tests/test_conv2d_op.py
+++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py
@@ -250,7 +250,7 @@ class TestDepthwiseConv(TestConv2dOp):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 
 
 class TestDepthwiseConv2(TestConv2dOp):
@@ -262,7 +262,7 @@ class TestDepthwiseConv2(TestConv2dOp):
         assert np.mod(self.input_size[1], self.groups) == 0
         f_c = self.input_size[1] / self.groups
         self.filter_size = [6, f_c, 3, 3]
-        self.op_type = "depthwise_conv"
+        self.op_type = "depthwise_conv2d"
 
 
 #  cudnn v5 does not support dilation conv.

From ab1341eab7c67b991499ed9cad15d8901e2bc76b Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Fri, 2 Feb 2018 18:40:52 +0800
Subject: [PATCH 079/102] fix typo

---
 paddle/framework/shape_inference.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index 14dba75808..0e17219e4e 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -21,7 +21,7 @@ namespace framework {
 framework::DDim InferShapeContext::GetInputDim(const std::string &name) const {
   const std::vector<std::string> &arg_names = Inputs(name);
   PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                    "Input(%s) shoudl holds one element, but now it holds %d",
+                    "Input(%s) should hold one element, but now it holds %d",
                     name, arg_names.size());
   return this->GetDim(arg_names[0]);
 }
@@ -41,7 +41,7 @@ DDim InferShapeContext::GetInputsElementDim(const std::string &name,
 void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
   auto &arg_names = Outputs(name);
   PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                    "Output(%s) shoudl holds one element, but now it holds %d",
+                    "Output(%s) should hold one element, but now it holds %d",
                     name, arg_names.size());
   SetDim(arg_names[0], dim);
 }

From 9a970702d3cff51af75ff513c07dbeb226920f1d Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Fri, 2 Feb 2018 18:56:16 +0800
Subject: [PATCH 080/102] remove unnecessary framework::

---
 paddle/framework/shape_inference.cc | 12 ++++++------
 paddle/framework/shape_inference.h  | 13 ++++++-------
 2 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc
index 0e17219e4e..a0fa467291 100644
--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -18,7 +18,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-framework::DDim InferShapeContext::GetInputDim(const std::string &name) const {
+DDim InferShapeContext::GetInputDim(const std::string &name) const {
   const std::vector<std::string> &arg_names = Inputs(name);
   PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
                     "Input(%s) should hold one element, but now it holds %d",
@@ -26,7 +26,7 @@ framework::DDim InferShapeContext::GetInputDim(const std::string &name) const {
   return this->GetDim(arg_names[0]);
 }
 
-std::vector<framework::DDim> InferShapeContext::GetInputsDim(
+std::vector<DDim> InferShapeContext::GetInputsDim(
     const std::string &name) const {
   const std::vector<std::string> &arg_names = Inputs(name);
   return GetDims(arg_names);
@@ -46,15 +46,15 @@ void InferShapeContext::SetOutputDim(const std::string &name, const DDim &dim) {
   SetDim(arg_names[0], dim);
 }
 
-void InferShapeContext::SetOutputsDim(
-    const std::string &name, const std::vector<framework::DDim> &dims) {
+void InferShapeContext::SetOutputsDim(const std::string &name,
+                                      const std::vector<DDim> &dims) {
   auto &names = Outputs(name);
   SetDims(names, dims);
 }
 
 std::vector<DDim> InferShapeContext::GetDims(
     const std::vector<std::string> &names) const {
-  std::vector<framework::DDim> ret;
+  std::vector<DDim> ret;
   ret.reserve(names.size());
   std::transform(
       names.begin(), names.end(), std::back_inserter(ret),
@@ -62,7 +62,7 @@ std::vector<DDim> InferShapeContext::GetDims(
   return ret;
 }
 void InferShapeContext::SetDims(const std::vector<std::string> &names,
-                                const std::vector<framework::DDim> &dims) {
+                                const std::vector<DDim> &dims) {
   size_t length = names.size();
   PADDLE_ENFORCE_EQ(length, dims.size());
   for (size_t i = 0; i < length; ++i) {
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index 77fc9359be..830f199ed1 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -35,14 +35,13 @@ class InferShapeContext {
   virtual bool HasInputs(const std::string &name) const = 0;
   virtual bool HasOutputs(const std::string &name) const = 0;
 
-  framework::DDim GetInputDim(const std::string &name) const;
+  DDim GetInputDim(const std::string &name) const;
 
-  std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+  std::vector<DDim> GetInputsDim(const std::string &name) const;
   DDim GetInputsElementDim(const std::string &name, int idx) const;
 
   void SetOutputDim(const std::string &name, const DDim &dim);
-  void SetOutputsDim(const std::string &name,
-                     const std::vector<framework::DDim> &dims);
+  void SetOutputsDim(const std::string &name, const std::vector<DDim> &dims);
 
   virtual AttrReader Attrs() const = 0;
   virtual const std::vector<std::string> &Inputs(
@@ -57,11 +56,11 @@ class InferShapeContext {
 
   // Note: In while op, we need this to be public
   void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims);
+               const std::vector<DDim> &dims);
 
  protected:
-  virtual framework::DDim GetDim(const std::string &name) const = 0;
-  virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
+  virtual DDim GetDim(const std::string &name) const = 0;
+  virtual void SetDim(const std::string &name, const DDim &dim) = 0;
 
   std::vector<DDim> GetDims(const std::vector<std::string> &names) const;
   std::vector<proto::VarDesc::VarType> GetVarTypes(

From 37a251ebafce61776b2fea7a2fb2ee16defd14ea Mon Sep 17 00:00:00 2001
From: xuwei06 <xuwei06@baidu.com>
Date: Fri, 2 Feb 2018 15:46:51 -0800
Subject: [PATCH 081/102] Fix copyright for test_fetch_var.py

---
 python/paddle/v2/fluid/tests/test_fetch_var.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/python/paddle/v2/fluid/tests/test_fetch_var.py b/python/paddle/v2/fluid/tests/test_fetch_var.py
index 670ab54f51..ed75a350b0 100644
--- a/python/paddle/v2/fluid/tests/test_fetch_var.py
+++ b/python/paddle/v2/fluid/tests/test_fetch_var.py
@@ -1,3 +1,17 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import paddle.v2.fluid as fluid
 import paddle.v2.fluid.layers as layers
 import op_test

From dbe06551b86460d5ebf18ee33218cd6d11cd07e4 Mon Sep 17 00:00:00 2001
From: Abhinav Arora <aroraabhinav@baidu.com>
Date: Fri, 2 Feb 2018 16:12:53 -0800
Subject: [PATCH 082/102] Channel should notify both condition variables on
 close

---
 paddle/framework/details/buffered_channel.h | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
index b093e15892..9c806461aa 100644
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -49,6 +49,7 @@ class Buffered : public paddle::framework::Channel<T> {
   }
 
   void NotifyAllSenders(std::unique_lock<std::mutex>*);
+  void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };
 
 template <typename T>
@@ -80,7 +81,7 @@ template <typename T>
 void Buffered<T>::Close() {
   std::unique_lock<std::mutex> lock(mu_);
   closed_ = true;
-  NotifyAllSenders(&lock);
+  NotifyAllParticipants(&lock);
 }
 
 template <typename T>
@@ -88,7 +89,7 @@ Buffered<T>::~Buffered() {
   std::unique_lock<std::mutex> lock(mu_);
   closed_ = true;
   channel_.clear();
-  NotifyAllSenders(&lock);
+  NotifyAllParticipants(&lock);
 }
 
 template <typename T>
@@ -97,6 +98,13 @@ void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
   full_cond_var_.notify_all();
 }
 
+template <typename T>
+void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
+  lock->unlock();
+  full_cond_var_.notify_all();
+  empty_cond_var_.notify_all();
+}
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle

From 022e5dee8e685134e6c0199d7d0ee8762a03eb80 Mon Sep 17 00:00:00 2001
From: kavyasrinet <kavyasrinet@baidu.com>
Date: Fri, 2 Feb 2018 20:08:39 -0800
Subject: [PATCH 083/102] Added more receivers less senders. Receivers should
 block. (#8061)

* Adding more receivers less senders

* Added more receivers less senders

* Added more send

* Updated comment

* Fixed code style

* Fixing review comments
---
 paddle/framework/channel_test.cc | 36 +++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index 31ac72eda9..c3533bbb1a 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -67,7 +67,7 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
   std::thread t([&]() {
     // Try to write more than buffer size.
     for (size_t i = 0; i < 2 * buffer_size; ++i) {
-      ch->Send(&i);  // should not block
+      ch->Send(&i);  // should block after 10 iterations
       sum += i;
     }
   });
@@ -207,3 +207,37 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
   t.join();
   delete ch;
 }
+
+TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
+  auto ch = MakeChannel<int>(0);
+  unsigned sum_send = 0;
+  unsigned sum_receive = 0;
+  // The receiver should block after 5
+  // iterations, since only 5 items are sent at first.
+  std::thread t([&]() {
+    for (int i = 0; i < 8; i++) {
+      int recv;
+      ch->Receive(&recv);  // should block after the fifth iteration.
+      EXPECT_EQ(recv, i);
+      sum_receive += i;
+    }
+  });
+  for (int i = 0; i < 5; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(500));  // wait 0.5 sec
+  EXPECT_EQ(sum_send, 10U);
+  EXPECT_EQ(sum_receive, 10U);
+  // send three more elements
+  for (int i = 5; i < 8; i++) {
+    ch->Send(&i);
+    sum_send += i;
+  }
+
+  CloseChannel(ch);
+  t.join();
+  EXPECT_EQ(sum_send, 28U);
+  EXPECT_EQ(sum_receive, 28U);
+  delete ch;
+}

From b60da6729fa2484506869bc29271761de91676b7 Mon Sep 17 00:00:00 2001
From: chengduo <zhaochengduo@baidu.com>
Date: Sat, 3 Feb 2018 23:32:56 +0800
Subject: [PATCH 084/102] Refine buffer channel (#8098)

* refine buffer channel

*  refine Receive and Send

* follow comments
---
 paddle/framework/channel.h                    |  4 +--
 paddle/framework/details/buffered_channel.h   | 25 ++++++++-----------
 paddle/framework/details/unbuffered_channel.h | 14 ++++++++---
 3 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/paddle/framework/channel.h b/paddle/framework/channel.h
index 0570980c5a..b679387b11 100644
--- a/paddle/framework/channel.h
+++ b/paddle/framework/channel.h
@@ -23,8 +23,8 @@ namespace framework {
 template <typename T>
 class Channel {
  public:
-  virtual void Send(T*) = 0;
-  virtual void Receive(T*) = 0;
+  virtual bool Send(T*) = 0;
+  virtual bool Receive(T*) = 0;
   virtual size_t Cap() = 0;
   virtual void Close() = 0;
   virtual ~Channel() {}
diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
index 9c806461aa..7ac234b8d4 100644
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -30,8 +30,8 @@ class Buffered : public paddle::framework::Channel<T> {
   friend void paddle::framework::CloseChannel<T>(Channel<T>*);
 
  public:
-  virtual void Send(T*);
-  virtual void Receive(T*);
+  virtual bool Send(T*);
+  virtual bool Receive(T*);
   virtual size_t Cap() { return cap_; }
   virtual void Close();
   virtual ~Buffered();
@@ -48,33 +48,36 @@ class Buffered : public paddle::framework::Channel<T> {
     PADDLE_ENFORCE_GT(cap, 0);
   }
 
-  void NotifyAllSenders(std::unique_lock<std::mutex>*);
   void NotifyAllParticipants(std::unique_lock<std::mutex>*);
 };
 
 template <typename T>
-void Buffered<T>::Send(T* item) {
+bool Buffered<T>::Send(T* item) {
   std::unique_lock<std::mutex> lock(mu_);
   full_cond_var_.wait(lock,
                       [this]() { return channel_.size() < cap_ || closed_; });
+  bool ret = false;
   if (!closed_) {
     channel_.push_back(std::move(*item));
     lock.unlock();
     empty_cond_var_.notify_one();
+    ret = true;
   }
+  return ret;
 }
 
 template <typename T>
-void Buffered<T>::Receive(T* item) {
+bool Buffered<T>::Receive(T* item) {
   std::unique_lock<std::mutex> lock(mu_);
   empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
+  bool ret = false;
   if (!closed_) {
     *item = std::move(channel_.front());
     channel_.pop_front();
-    NotifyAllSenders(&lock);
-  } else {
-    item = nullptr;
+    full_cond_var_.notify_one();
+    ret = true;
   }
+  return ret;
 }
 
 template <typename T>
@@ -92,12 +95,6 @@ Buffered<T>::~Buffered() {
   NotifyAllParticipants(&lock);
 }
 
-template <typename T>
-void Buffered<T>::NotifyAllSenders(std::unique_lock<std::mutex>* lock) {
-  lock->unlock();
-  full_cond_var_.notify_all();
-}
-
 template <typename T>
 void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
   lock->unlock();
diff --git a/paddle/framework/details/unbuffered_channel.h b/paddle/framework/details/unbuffered_channel.h
index 0dc5afd7e5..f86a894bb4 100644
--- a/paddle/framework/details/unbuffered_channel.h
+++ b/paddle/framework/details/unbuffered_channel.h
@@ -29,8 +29,8 @@ class UnBuffered : public paddle::framework::Channel<T> {
   friend void paddle::framework::CloseChannel<T>(Channel<T>*);
 
  public:
-  virtual void Send(T*);
-  virtual void Receive(T*);
+  virtual bool Send(T*);
+  virtual bool Receive(T*);
   virtual size_t Cap() { return 0; }
   virtual void Close();
   virtual ~UnBuffered();
@@ -57,7 +57,7 @@ class UnBuffered : public paddle::framework::Channel<T> {
 // This function implements the concept of how data should
 // be sent from a writer to a reader.
 template <typename T>
-void UnBuffered<T>::Send(T* data) {
+bool UnBuffered<T>::Send(T* data) {
   // Prevent other writers from entering
   std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
   writer_found_ = true;
@@ -66,6 +66,7 @@ void UnBuffered<T>::Send(T* data) {
   cv_writer_.wait(cv_lock,
                   [this]() { return reader_found_ == true || closed_; });
   cv_reader_.notify_one();
+  bool ret = false;
   if (!closed_) {
     std::unique_lock<std::mutex> channel_lock(mu_ch_);
     item = data;
@@ -74,14 +75,16 @@ void UnBuffered<T>::Send(T* data) {
     channel_lock.lock();
     cv_channel_.wait(channel_lock,
                      [this]() { return item == nullptr || closed_; });
+    ret = true;
   }
   writer_found_ = false;
+  return ret;
 }
 
 // This function implements the concept of how
 // data that was sent by a writer is read from a reader.
 template <typename T>
-void UnBuffered<T>::Receive(T* data) {
+bool UnBuffered<T>::Receive(T* data) {
   // Prevent other readers from entering
   std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
   reader_found_ = true;
@@ -90,6 +93,7 @@ void UnBuffered<T>::Receive(T* data) {
   cv_reader_.wait(cv_lock,
                   [this]() { return writer_found_ == true || closed_; });
   cv_writer_.notify_one();
+  bool ret = false;
   if (!closed_) {
     std::unique_lock<std::mutex> lock_ch{mu_ch_};
     // Reader should wait for the writer to first write its data
@@ -98,10 +102,12 @@ void UnBuffered<T>::Receive(T* data) {
       *data = std::move(*item);
       item = nullptr;
       lock_ch.unlock();
+      ret = true;
     }
     cv_channel_.notify_one();
   }
   reader_found_ = false;
+  return ret;
 }
 
 // This function implements the sequence of events

From be65516876ae32fe2f8cfde1aaa2d22926ccc583 Mon Sep 17 00:00:00 2001
From: Yibing Liu <liuyibing01@baidu.com>
Date: Sun, 4 Feb 2018 16:37:02 +0000
Subject: [PATCH 085/102] Fix the error when sorted_key is none in profiler

---
 paddle/platform/profiler.cc        | 2 +-
 python/paddle/v2/fluid/profiler.py | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/platform/profiler.cc b/paddle/platform/profiler.cc
index 2a8afc9403..6df087d154 100644
--- a/paddle/platform/profiler.cc
+++ b/paddle/platform/profiler.cc
@@ -233,7 +233,7 @@ void ParseEvents(std::vector<std::vector<Event>>& events,
       };
       break;
     default:
-      sorted_domain = "event end time";
+      sorted_domain = "event first end time";
   }
 
   std::vector<std::vector<EventItem>> events_table;
diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py
index d4a2cd7eea..d33a4c52a8 100644
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
@@ -103,10 +103,10 @@ def profiler(state, sorted_key=None):
     core.enable_profiler(prof_state)
     yield
 
-    if sorted_key not in ['calls', 'total', 'max', 'min', 'ave']:
-        raise ValueError("The state must be in 'calls', 'total', "
-                         "'max', 'min', 'ave'")
     sorted_key = 'default' if sorted_key is None else sorted_key
+    if sorted_key not in ['default', 'calls', 'total', 'max', 'min', 'ave']:
+        raise ValueError("The sorted_key must be None or in 'calls', 'total', "
+                         "'max', 'min' and 'ave'")
     key_map = {
         'default': core.EventSortingKey.kDefault,
         'calls': core.EventSortingKey.kCalls,

From 1d2dd9c4a5b99074cec3cb642f64bfd2124e6412 Mon Sep 17 00:00:00 2001
From: Abhinav Arora <abhinavarora28@gmail.com>
Date: Sun, 4 Feb 2018 10:04:53 -0800
Subject: [PATCH 086/102] Close buffered channel should unblock the blocked 
 senders and receivers (#8109)

---
 paddle/framework/channel_test.cc | 113 +++++++++++++++++++++++++++++--
 1 file changed, 106 insertions(+), 7 deletions(-)

diff --git a/paddle/framework/channel_test.cc b/paddle/framework/channel_test.cc
index c3533bbb1a..444d68498c 100644
--- a/paddle/framework/channel_test.cc
+++ b/paddle/framework/channel_test.cc
@@ -48,12 +48,12 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
   const size_t buffer_size = 10;
   auto ch = MakeChannel<size_t>(buffer_size);
   for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Send(&i);  // should not block
+    EXPECT_EQ(ch->Send(&i), true);  // should not block
   }
 
   size_t out;
   for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Receive(&out);  // should not block
+    EXPECT_EQ(ch->Receive(&out), true);  // should not block
     EXPECT_EQ(out, i);
   }
   CloseChannel(ch);
@@ -67,7 +67,10 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
   std::thread t([&]() {
     // Try to write more than buffer size.
     for (size_t i = 0; i < 2 * buffer_size; ++i) {
-      ch->Send(&i);  // should block after 10 iterations
+      if (i < buffer_size)
+        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
+      else
+        EXPECT_EQ(ch->Send(&i), false);
       sum += i;
     }
   });
@@ -84,13 +87,13 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
   unsigned sum_send = 0;
   std::thread t([&]() {
     for (int i = 0; i < 5; i++) {
-      ch->Send(&i);
+      EXPECT_EQ(ch->Send(&i), true);
       sum_send += i;
     }
   });
   for (int i = 0; i < 5; i++) {
     int recv;
-    ch->Receive(&recv);
+    EXPECT_EQ(ch->Receive(&recv), true);
     EXPECT_EQ(recv, i);
   }
 
@@ -100,6 +103,102 @@ TEST(Channel, SimpleUnbufferedChannelTest) {
   delete ch;
 }
 
+// This tests that closing a buffered channel also unblocks
+//  any receivers waiting on the channel
+TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
+  auto ch = MakeChannel<int>(1);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+
+  // Launches threads that try to read and are blocked because of no writers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    t[i] = std::thread(
+        [&](bool *p) {
+          int data;
+          // All reads should return false
+          EXPECT_EQ(ch->Receive(&data), false);
+          *p = true;
+        },
+        &thread_ended[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that all threads are blocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], false);
+  }
+
+  // Explicitly close the channel
+  // This should unblock all receivers
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+
+// This tests that closing a buffered channel also unblocks
+//  any senders waiting for channel to have write space
+TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
+  auto ch = MakeChannel<int>(1);
+  size_t num_threads = 5;
+  std::thread t[num_threads];
+  bool thread_ended[num_threads];
+  bool send_success[num_threads];
+
+  // Launches threads that try to write and are blocked because of no readers
+  for (size_t i = 0; i < num_threads; i++) {
+    thread_ended[i] = false;
+    send_success[i] = false;
+    t[i] = std::thread(
+        [&](bool *ended, bool *success) {
+          int data = 10;
+          *success = ch->Send(&data);
+          *ended = true;
+        },
+        &thread_ended[i], &send_success[i]);
+  }
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
+
+  // Verify that at least 4 threads are blocked
+  int ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (thread_ended[i] == false) ct++;
+  }
+  // At least 4 threads must be blocked
+  EXPECT_GE(ct, 4);
+
+  // Explicitly close the channel
+  // This should unblock all senders
+  CloseChannel(ch);
+
+  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
+
+  // Verify that all threads got unblocked
+  for (size_t i = 0; i < num_threads; i++) {
+    EXPECT_EQ(thread_ended[i], true);
+  }
+
+  // Verify that only 1 send was successful
+  ct = 0;
+  for (size_t i = 0; i < num_threads; i++) {
+    if (send_success[i]) ct++;
+  }
+  // Only 1 send must be successful
+  EXPECT_EQ(ct, 1);
+
+  for (size_t i = 0; i < num_threads; i++) t[i].join();
+  delete ch;
+}
+
 // This tests that closing an unbuffered channel also unblocks
 //  unblocks any receivers waiting for senders
 TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
@@ -114,7 +213,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
     t[i] = std::thread(
         [&](bool *p) {
           int data;
-          ch->Receive(&data);
+          EXPECT_EQ(ch->Receive(&data), false);
           *p = true;
         },
         &thread_ended[i]);
@@ -155,7 +254,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
     t[i] = std::thread(
         [&](bool *p) {
           int data = 10;
-          ch->Send(&data);
+          EXPECT_EQ(ch->Send(&data), false);
           *p = true;
         },
         &thread_ended[i]);

From 6f28084b4d062100336fd3889012b91c6e278bcc Mon Sep 17 00:00:00 2001
From: Yan Chunwei <yanchunwei@outlook.com>
Date: Mon, 5 Feb 2018 10:53:26 +0800
Subject: [PATCH 087/102] debug/format protobuf to human-readable codes (#8086)

---
 python/paddle/v2/fluid/debuger.py | 192 ++++++++++++++++++++++++++++++
 1 file changed, 192 insertions(+)

diff --git a/python/paddle/v2/fluid/debuger.py b/python/paddle/v2/fluid/debuger.py
index d379352442..db1808c647 100644
--- a/python/paddle/v2/fluid/debuger.py
+++ b/python/paddle/v2/fluid/debuger.py
@@ -12,10 +12,202 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
 import re
 from graphviz import GraphPreviewGenerator
 import proto.framework_pb2 as framework_pb2
 
+_vartype2str_ = [
+    "UNK",
+    "LoDTensor",
+    "SelectedRows",
+    "FeedMinibatch",
+    "FetchList",
+    "StepScopes",
+    "LodRankTable",
+    "LoDTensorArray",
+    "PlaceList",
+]
+_dtype2str_ = [
+    "bool",
+    "int16",
+    "int32",
+    "int64",
+    "float16",
+    "float32",
+    "float64",
+]
+
+
+def repr_data_type(type):
+    return _dtype2str_[type]
+
+
+def repr_tensor(proto):
+    return "tensor(type={}, shape={})".format(_dtype2str_[int(proto.data_type)],
+                                              str(proto.dims))
+
+
+reprtpl = "{ttype} {name} ({reprs})"
+
+
+def repr_lodtensor(proto):
+    if not proto.lod_tensor: return
+    level = proto.lod_tensor.lod_level
+    reprs = repr_tensor(proto.lod_tensor.tensor)
+    return reprtpl.format(
+        ttype="LoDTensor" if level > 0 else "Tensor",
+        name=proto.name,
+        reprs="level=%d, %s" % (level, reprs) if level > 0 else reprs)
+
+
+def repr_selected_rows(proto):
+    if not proto.selected_rows: return
+    return reprtpl.format(
+        ttype="SelectedRows",
+        name=proto.name,
+        reprs=repr_tensor(proto.selected_rows))
+
+
+def repr_tensor_array(proto):
+    if not proto.tensor_array: return
+    return reprtpl.format(
+        ttype="TensorArray",
+        name=proto.name,
+        reprs="level=%d, %s" % (proto.tensor_array.lod_level,
+                                repr_tensor(proto.tensor_array.tensor)))
+
+
+type_handlers = [
+    repr_lodtensor,
+    repr_selected_rows,
+    repr_tensor_array,
+]
+
+
+def repr_var(vardesc):
+    for handler in type_handlers:
+        res = handler(vardesc)
+        if res:
+            return res
+
+
+def pprint_program_codes(program_desc):
+    reprs = []
+    for block_idx in range(program_desc.num_blocks()):
+        block_desc = program_desc.block(block_idx)
+        block_repr = pprint_block_codes(block_desc)
+        reprs.append(block_repr)
+    return '\n'.join(reprs)
+
+
+def pprint_block_codes(block_desc, show_backward=False):
+    def is_op_backward(op_desc):
+        if op_desc.type.endswith('_grad'): return True
+
+        def is_var_backward(var):
+            if "@GRAD" in var.parameter: return True
+            for arg in var.arguments:
+                if "@GRAD" in arg: return True
+
+        for var in op_desc.inputs:
+            if is_var_backward(var): return True
+        for var in op_desc.outputs:
+            if is_var_backward(var): return True
+        return False
+
+    def is_var_backward(var_desc):
+        return "@GRAD" in var_desc.name
+
+    if type(block_desc) is not framework_pb2.BlockDesc:
+        block_desc = framework_pb2.BlockDesc.FromString(
+            block_desc.serialize_to_string())
+    var_reprs = []
+    op_reprs = []
+    for var in block_desc.vars:
+        if not show_backward and is_var_backward(var):
+            continue
+        var_reprs.append(repr_var(var))
+
+    for op in block_desc.ops:
+        if not show_backward and is_op_backward(op): continue
+        op_reprs.append(repr_op(op))
+
+    tpl = "// block-{idx}  parent-{pidx}\n// variables\n{vars}\n\n// operators\n{ops}\n"
+    return tpl.format(
+        idx=block_desc.idx,
+        pidx=block_desc.parent_idx,
+        vars='\n'.join(var_reprs),
+        ops='\n'.join(op_reprs), )
+
+
+def repr_attr(desc):
+    tpl = "{key}={value}"
+    valgetter = [
+        lambda attr: attr.i,
+        lambda attr: attr.f,
+        lambda attr: attr.s,
+        lambda attr: attr.ints,
+        lambda attr: attr.floats,
+        lambda attr: attr.strings,
+        lambda attr: attr.b,
+        lambda attr: attr.bools,
+        lambda attr: attr.block_idx,
+        lambda attr: attr.l,
+    ]
+    key = desc.name
+    value = valgetter[desc.type](desc)
+    if key == "dtype":
+        value = repr_data_type(value)
+    return tpl.format(key=key, value=str(value)), (key, value)
+
+
+def _repr_op_fill_constant(optype, inputs, outputs, attrs):
+    if optype == "fill_constant":
+        return "{output} = {data} [shape={shape}]".format(
+            output=','.join(outputs),
+            data=attrs['value'],
+            shape=str(attrs['shape']))
+
+
+op_repr_handlers = [_repr_op_fill_constant, ]
+
+
+def repr_op(opdesc):
+    optype = None
+    attrs = []
+    attr_dict = {}
+    is_target = None
+    inputs = []
+    outputs = []
+
+    tpl = "{outputs} = {optype}({inputs}{is_target}) [{attrs}]"
+    args2value = lambda args: args[0] if len(args) == 1 else str(list(args))
+    for var in opdesc.inputs:
+        key = var.parameter
+        value = args2value(var.arguments)
+        inputs.append("%s=%s" % (key, value))
+    for var in opdesc.outputs:
+        value = args2value(var.arguments)
+        outputs.append(value)
+    for attr in opdesc.attrs:
+        attr_repr, attr_pair = repr_attr(attr)
+        attrs.append(attr_repr)
+        attr_dict[attr_pair[0]] = attr_pair[1]
+
+    is_target = opdesc.is_target
+
+    for handler in op_repr_handlers:
+        res = handler(opdesc.type, inputs, outputs, attr_dict)
+        if res: return res
+
+    return tpl.format(
+        outputs=', '.join(outputs),
+        optype=opdesc.type,
+        inputs=', '.join(inputs),
+        attrs="{%s}" % ','.join(attrs),
+        is_target=", is_target" if is_target else "")
+
 
 def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
     '''

From 96d4bf5337c985feff01a549c26133e3ed1c3bde Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Mon, 5 Feb 2018 12:38:37 +0800
Subject: [PATCH 088/102] prevent make clean from cleaning ExternalProject
 boost

---
 CMakeLists.txt             | 2 +-
 cmake/external/boost.cmake | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e8ea828dd2..49334279f6 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,7 @@ include(external/openblas)  # download, build, install openblas
 include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
-include(external/boost)     # download, build, install boost
+include(external/boost)     # download boost
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
diff --git a/cmake/external/boost.cmake b/cmake/external/boost.cmake
index c70d83b3f4..dbc676bdac 100644
--- a/cmake/external/boost.cmake
+++ b/cmake/external/boost.cmake
@@ -21,6 +21,7 @@ set(BOOST_URL           "http://sourceforge.net/projects/boost/files/boost/${BOO
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
 set(BOOST_DOWNLOAD_DIR  "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
 set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
+set_directory_properties(PROPERTIES CLEAN_NO_CUSTOM 1)
 
 include_directories(${BOOST_INCLUDE_DIR})
 

From eef381d07482f845a875269f1b963f1d135e2cdc Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Mon, 5 Feb 2018 12:47:25 +0800
Subject: [PATCH 089/102] remove duplicated mobile index

---
 doc/index_cn.rst        | 1 -
 doc/index_en.rst        | 1 -
 doc/mobile/index_cn.rst | 9 ---------
 doc/mobile/index_en.rst | 9 ---------
 4 files changed, 20 deletions(-)
 delete mode 100644 doc/mobile/index_cn.rst
 delete mode 100644 doc/mobile/index_en.rst

diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index ada51c2d73..9279bac7f4 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -8,4 +8,3 @@ PaddlePaddle 文档
   howto/index_cn.rst
   api/index_cn.rst
   faq/index_cn.rst
-  mobile/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 23b64b6cad..64684b8b9b 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -7,4 +7,3 @@ PaddlePaddle Documentation
   getstarted/index_en.rst
   howto/index_en.rst
   api/index_en.rst
-  mobile/index_en.rst
diff --git a/doc/mobile/index_cn.rst b/doc/mobile/index_cn.rst
deleted file mode 100644
index 1d99666e58..0000000000
--- a/doc/mobile/index_cn.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-..  toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_cn.md
-  cross_compiling_for_ios_cn.md
-  cross_compiling_for_raspberry_cn.md
diff --git a/doc/mobile/index_en.rst b/doc/mobile/index_en.rst
deleted file mode 100644
index ef421dacad..0000000000
--- a/doc/mobile/index_en.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-MOBILE
-======
-
-..  toctree::
-  :maxdepth: 1
-
-  cross_compiling_for_android_en.md
-  cross_compiling_for_ios_en.md
-  cross_compiling_for_raspberry_en.md

From 7dabee27960b5e043b85aca3ee51568443b326f4 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Mon, 5 Feb 2018 15:00:03 +0800
Subject: [PATCH 090/102] Add type Reader for VarDesc

Add a new type `Reader` for `VarDesc`, which can hold more than one
LoDTensor.
---
 paddle/framework/backward.cc                  |   4 +-
 paddle/framework/framework.proto              |  10 +-
 paddle/framework/op_desc.cc                   |   4 +-
 paddle/framework/program_desc_test.cc         |   4 +-
 paddle/framework/var_desc.cc                  | 174 ++++++++++++++++--
 paddle/framework/var_desc.h                   |  20 +-
 paddle/inference/io.cc                        |   2 +-
 paddle/pybind/protobuf.cc                     |  14 +-
 .../v2/fluid/tests/test_protobuf_descs.py     |  38 ++++
 9 files changed, 246 insertions(+), 24 deletions(-)

diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 85e693434a..f52a51519f 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -534,7 +534,7 @@ ParamGradInfoMap AppendBackward(
   auto root_block = program_desc.MutableBlock(root_block_idx);
 
   std::string fill_one_op_out = GradVarName(target.Name());
-  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+  bool is_scalar = target.GetShape() == std::vector<int64_t>{1};
   PADDLE_ENFORCE(is_scalar, "target should be scalar");
   VLOG(3) << "backward from loss=" << target.Name()
           << " data_type=" << target.GetDataType();
@@ -565,7 +565,7 @@ ParamGradInfoMap AppendBackward(
 
   auto var = root_block->Var(fill_one_op_out);
   var->SetDataType(target.GetDataType());
-  var->SetShape(target.Shape());
+  var->SetShape(target.GetShape());
   auto& target_grad = retv[target.Name()];
   target_grad.name_ = fill_one_op_out;
   target_grad.block_idx_ = root_block_idx;
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 5b6ef03f61..f65ccae6e6 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -116,6 +116,8 @@ message LoDTensorArrayDesc {
   optional int32 lod_level = 2 [ default = 0 ];
 }
 
+message Reader { repeated LoDTensorDesc lod_tensor = 1; }
+
 message VarDesc {
   enum VarType {
     LOD_TENSOR = 1;
@@ -126,13 +128,15 @@ message VarDesc {
     LOD_RANK_TABLE = 6;
     LOD_TENSOR_ARRAY = 7;
     PLACE_LIST = 8;
+    READER = 9;
   }
   required string name = 1;
   required VarType type = 2;
-  optional LoDTensorDesc lod_tensor = 3;
-  optional TensorDesc selected_rows = 4;
+  optional bool persistable = 3 [ default = false ];
+  optional LoDTensorDesc lod_tensor = 4;
+  optional TensorDesc selected_rows = 5;
   optional LoDTensorArrayDesc tensor_array = 6;
-  optional bool persistable = 5 [ default = false ];
+  optional Reader reader = 7;
 }
 
 message BlockDesc {
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index f554c77845..ad361852ec 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -458,11 +458,11 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
   PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
   try {
-    auto shape = var->Shape();
+    auto shape = var->GetShape();
     if (shape.empty()) {
       return framework::make_ddim({0UL});
     } else {
-      return framework::make_ddim(var->Shape());
+      return framework::make_ddim(var->GetShape());
     }
   } catch (...) {
     VLOG(5) << "GetDim of variable " << name << " error";
diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc
index 59947c9f21..9945aee31b 100644
--- a/paddle/framework/program_desc_test.cc
+++ b/paddle/framework/program_desc_test.cc
@@ -53,7 +53,7 @@ TEST(ProgramDesc, copy_ctor) {
     ASSERT_NE(copy, var_before);
     ASSERT_EQ(copy->Name(), var_before->Name());
     ASSERT_EQ(copy->GetType(), var_before->GetType());
-    ASSERT_EQ(copy->Shape(), var_before->Shape());
+    ASSERT_EQ(copy->GetShape(), var_before->GetShape());
     ASSERT_EQ(copy->Proto()->SerializeAsString(),
               var_before->Proto()->SerializeAsString());
   };
@@ -117,7 +117,7 @@ TEST(ProgramDescBind, serialize_and_deserialize) {
     ASSERT_NE(restored, var_before);
     ASSERT_EQ(restored->Name(), var_before->Name());
     ASSERT_EQ(restored->GetType(), var_before->GetType());
-    ASSERT_EQ(restored->Shape(), var_before->Shape());
+    ASSERT_EQ(restored->GetShape(), var_before->GetShape());
     ASSERT_EQ(restored->Proto()->SerializeAsString(),
               var_before->Proto()->SerializeAsString());
   };
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 62ab6593ef..44bd2363c8 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -26,18 +26,91 @@ void VarDesc::SetShape(const std::vector<int64_t> &dims) {
   VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
 }
 
+void VarDesc::SetTensorDescNum(size_t num) {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      auto *lod_tensors_ptr = desc_.mutable_reader()->mutable_lod_tensor();
+      lod_tensors_ptr->Clear();
+      for (size_t i = 0; i < num; ++i) {
+        lod_tensors_ptr->Add();
+      }
+      return;
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+size_t VarDesc::GetTensorDescNum() const {
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      return desc_.reader().lod_tensor_size();
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'sub_tensor_number' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetShapes(
+    const std::vector<const std::vector<int64_t>> &multiple_dims) {
+  PADDLE_ENFORCE_EQ(multiple_dims.size(), GetTensorDescNum(),
+                    "The number of given shapes(%d) doesn't equal to the "
+                    "number of sub tensor.",
+                    multiple_dims.size(), GetTensorDescNum());
+  std::vector<proto::TensorDesc *> tensors = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_dims.size(); ++i) {
+    VectorToRepeated(multiple_dims[i], tensors[i]->mutable_dims());
+  }
+}
+
+std::vector<int64_t> VarDesc::GetShape() const {
+  return RepeatedToVector(tensor_desc().dims());
+}
+
+std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<std::vector<int64_t>> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(RepeatedToVector(tensor_desc.dims()));
+  }
+  return res;
+}
+
 void VarDesc::SetDataType(proto::DataType data_type) {
   mutable_tensor_desc()->set_data_type(data_type);
 }
 
-std::vector<int64_t> VarDesc::Shape() const {
-  return RepeatedToVector(tensor_desc().dims());
+void VarDesc::SetDataTypes(
+    const std::vector<proto::DataType> &multiple_data_type) {
+  PADDLE_ENFORCE_EQ(multiple_data_type.size(), GetTensorDescNum(),
+                    "The number of given data types(%d) doesn't equal to the "
+                    "number of sub tensor.",
+                    multiple_data_type.size(), GetTensorDescNum());
+  std::vector<proto::TensorDesc *> tensor_descs = mutable_tensor_descs();
+  for (size_t i = 0; i < multiple_data_type.size(); ++i) {
+    tensor_descs[i]->set_data_type(multiple_data_type[i]);
+  }
 }
 
 proto::DataType VarDesc::GetDataType() const {
   return tensor_desc().data_type();
 }
 
+std::vector<proto::DataType> VarDesc::GetDataTypes() const {
+  std::vector<proto::TensorDesc> descs = tensor_descs();
+  std::vector<proto::DataType> res;
+  res.reserve(descs.size());
+  for (const auto &tensor_desc : descs) {
+    res.push_back(tensor_desc.data_type());
+  }
+  return res;
+}
+
 void VarDesc::SetLoDLevel(int32_t lod_level) {
   switch (desc_.type()) {
     case proto::VarDesc::LOD_TENSOR:
@@ -47,8 +120,28 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
       desc_.mutable_tensor_array()->set_lod_level(lod_level);
       break;
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
+      PADDLE_THROW(
+          "Setting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+void VarDesc::SetLoDLevels(const std::vector<int32_t> &multiple_lod_level) {
+  PADDLE_ENFORCE_EQ(multiple_lod_level.size(), GetTensorDescNum(),
+                    "The number of given lod levels(%d) doesn't equal to the "
+                    "number of sub tensor.",
+                    multiple_lod_level.size(), GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER: {
+      size_t i = 0;
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        lod_tensor.set_lod_level(multiple_lod_level[i++]);
+      }
+    } break;
+    default:
+      PADDLE_THROW(
+          "Setting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
   }
 }
 
@@ -59,13 +152,31 @@ int32_t VarDesc::GetLoDLevel() const {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().lod_level();
     default:
-      PADDLE_THROW("Tensor type=%d does not support LoDLevel",
-                   desc_.tensor_array().lod_level());
+      PADDLE_THROW(
+          "Getting 'lod_level' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<int32_t> VarDesc::GetLoDLevels() const {
+  std::vector<int32_t> res;
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      res.reserve(desc_.reader().lod_tensor_size());
+      for (auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.lod_level());
+      }
+      return res;
+      break;
+    default:
+      PADDLE_THROW(
+          "Getting 'lod_levels' is not supported by the type of var %s.",
+          this->Name());
   }
 }
 
 const proto::TensorDesc &VarDesc::tensor_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
+  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
   switch (desc_.type()) {
     case proto::VarDesc::SELECTED_ROWS:
       return desc_.selected_rows();
@@ -74,13 +185,32 @@ const proto::TensorDesc &VarDesc::tensor_desc() const {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.tensor_array().tensor();
     default:
-      PADDLE_THROW("The type of var %s is unsupported.", this->Name());
+      PADDLE_THROW(
+          "Getting 'tensor_desc' is not supported by the type of var %s.",
+          this->Name());
+  }
+}
+
+std::vector<proto::TensorDesc> VarDesc::tensor_descs() const {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (const auto &lod_tensor : desc_.reader().lod_tensor()) {
+        res.push_back(lod_tensor.tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
   }
 }
 
 proto::TensorDesc *VarDesc::mutable_tensor_desc() {
-  PADDLE_ENFORCE(desc_.has_type(),
-                 "invoke MutableTensorDesc must after set type");
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
   switch (desc_.type()) {
     case proto::VarDesc::SELECTED_ROWS:
       return desc_.mutable_selected_rows();
@@ -89,8 +219,30 @@ proto::TensorDesc *VarDesc::mutable_tensor_desc() {
     case proto::VarDesc::LOD_TENSOR_ARRAY:
       return desc_.mutable_tensor_array()->mutable_tensor();
     default:
-      PADDLE_THROW("Unexpected branch.");
+      PADDLE_THROW(
+          "Getting 'mutable_tensor_desc' is not supported by the type of var "
+          "%s.",
+          this->Name());
   }
 }
+
+std::vector<proto::TensorDesc *> VarDesc::mutable_tensor_descs() {
+  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
+  std::vector<proto::TensorDesc *> res;
+  res.reserve(GetTensorDescNum());
+  switch (desc_.type()) {
+    case proto::VarDesc::READER:
+      for (auto &lod_tensor : *desc_.mutable_reader()->mutable_lod_tensor()) {
+        res.push_back(lod_tensor.mutable_tensor());
+      }
+      return res;
+    default:
+      PADDLE_THROW(
+          "Getting 'mutable_tensor_descs' is not supported by the type of var "
+          "%s.",
+          this->Name());
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
index 9316b14bb6..862b9a5d80 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -68,18 +68,34 @@ class VarDesc {
 
   void SetName(std::string name) { desc_.set_name(name); }
 
+  void SetTensorDescNum(size_t num);
+
+  size_t GetTensorDescNum() const;
+
   void SetShape(const std::vector<int64_t> &dims);
 
+  void SetShapes(const std::vector<const std::vector<int64_t>> &multiple_dims);
+
+  std::vector<int64_t> GetShape() const;
+
+  std::vector<std::vector<int64_t>> GetShapes() const;
+
   void SetDataType(proto::DataType data_type);
 
-  std::vector<int64_t> Shape() const;
+  void SetDataTypes(const std::vector<proto::DataType> &multiple_data_type);
 
   proto::DataType GetDataType() const;
 
+  std::vector<proto::DataType> GetDataTypes() const;
+
   void SetLoDLevel(int32_t lod_level);
 
+  void SetLoDLevels(const std::vector<int32_t> &multiple_lod_level);
+
   int32_t GetLoDLevel() const;
 
+  std::vector<int32_t> GetLoDLevels() const;
+
   proto::VarDesc::VarType GetType() const;
 
   void SetType(proto::VarDesc::VarType type);
@@ -90,7 +106,9 @@ class VarDesc {
 
  private:
   const proto::TensorDesc &tensor_desc() const;
+  std::vector<proto::TensorDesc> tensor_descs() const;
   proto::TensorDesc *mutable_tensor_desc();
+  std::vector<proto::TensorDesc *> mutable_tensor_descs();
 
   proto::VarDesc desc_;
 };
diff --git a/paddle/inference/io.cc b/paddle/inference/io.cc
index 60ad7af1c0..1ed14b69c8 100644
--- a/paddle/inference/io.cc
+++ b/paddle/inference/io.cc
@@ -55,7 +55,7 @@ void LoadPersistables(framework::Executor& executor,
       VLOG(3) << "parameter's name: " << var->Name();
 
       framework::VarDesc* new_var = load_block->Var(var->Name());
-      new_var->SetShape(var->Shape());
+      new_var->SetShape(var->GetShape());
       new_var->SetDataType(var->GetDataType());
       new_var->SetType(var->GetType());
       new_var->SetLoDLevel(var->GetLoDLevel());
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 371d6119d4..0f1953abe0 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -214,11 +214,20 @@ void BindVarDsec(py::module &m) {
            py::return_value_policy::reference)
       .def("set_name", &VarDesc::SetName)
       .def("set_shape", &VarDesc::SetShape)
+      .def("set_shapes", &VarDesc::SetShapes)
       .def("set_dtype", &VarDesc::SetDataType)
-      .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
+      .def("set_dtypes", &VarDesc::SetDataTypes)
+      .def("set_tensor_num", &VarDesc::SetTensorDescNum)
+      .def("tensor_num", &VarDesc::GetTensorDescNum)
+      .def("shape", &VarDesc::GetShape, py::return_value_policy::reference)
+      .def("shapes", &VarDesc::GetShapes, py::return_value_policy::reference)
       .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
+      .def("dtypes", &VarDesc::GetDataTypes, py::return_value_policy::reference)
       .def("lod_level", &VarDesc::GetLoDLevel)
+      .def("lod_levels", &VarDesc::GetLoDLevels,
+           py::return_value_policy::reference)
       .def("set_lod_level", &VarDesc::SetLoDLevel)
+      .def("set_lod_levels", &VarDesc::SetLoDLevels)
       .def("type", &VarDesc::GetType)
       .def("set_type", &VarDesc::SetType)
       .def("serialize_to_string", SerializeMessage<VarDesc>)
@@ -233,7 +242,8 @@ void BindVarDsec(py::module &m) {
       .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES)
       .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
       .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
-      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST);
+      .value("PLACE_LIST", proto::VarDesc::PLACE_LIST)
+      .value("READER", proto::VarDesc::READER);
 }
 
 void BindOpDesc(py::module &m) {
diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
index 9034b2f4ef..ac6de68b5f 100644
--- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -115,6 +115,20 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(src_shape, res_shape)
         self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type())
 
+    def test_multiple_shape(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        var.set_tensor_num(3)
+        src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]]
+        var.set_shapes(src_shapes)
+        #import pdb
+        # pdb.set_trace()
+        res_shapes = var.shapes()
+        self.assertEqual(src_shapes, res_shapes)
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
     def test_dtype(self):
         program_desc = core.ProgramDesc()
         block = program_desc.block(0)
@@ -124,6 +138,30 @@ class TestVarDesc(unittest.TestCase):
         self.assertEqual(core.DataType.INT32, var.dtype())
         self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type())
 
+    def test_multiple_dtype(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        var.set_tensor_num(3)
+        src_types = [
+            core.DataType.INT32, core.DataType.FP64, core.DataType.FP32
+        ]
+        var.set_dtypes(src_types)
+        self.assertEqual(src_types, var.dtypes())
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
+    def test_multiple_lod_level(self):
+        program_desc = core.ProgramDesc()
+        block = program_desc.block(0)
+        var = block.var('my_reader')
+        var.set_type(core.VarDesc.VarType.READER)
+        var.set_tensor_num(3)
+        src_types = [3, 1, 2]
+        var.set_lod_levels(src_types)
+        self.assertEqual(src_types, var.lod_levels())
+        self.assertEqual(core.VarDesc.VarType.READER, var.type())
+
 
 class TestBlockDesc(unittest.TestCase):
     def test_add_var(self):

From 0d03cab5e9b16dba434ed4a25b5dff887d60a897 Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Mon, 5 Feb 2018 15:18:10 +0800
Subject: [PATCH 091/102] fix a compile error

---
 paddle/framework/var_desc.cc | 2 +-
 paddle/framework/var_desc.h  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 44bd2363c8..6d83e2e411 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -56,7 +56,7 @@ size_t VarDesc::GetTensorDescNum() const {
 }
 
 void VarDesc::SetShapes(
-    const std::vector<const std::vector<int64_t>> &multiple_dims) {
+    const std::vector<std::vector<int64_t>> &multiple_dims) {
   PADDLE_ENFORCE_EQ(multiple_dims.size(), GetTensorDescNum(),
                     "The number of given shapes(%d) doesn't equal to the "
                     "number of sub tensor.",
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
index 862b9a5d80..72da2fbb0a 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -74,7 +74,7 @@ class VarDesc {
 
   void SetShape(const std::vector<int64_t> &dims);
 
-  void SetShapes(const std::vector<const std::vector<int64_t>> &multiple_dims);
+  void SetShapes(const std::vector<std::vector<int64_t>> &multiple_dims);
 
   std::vector<int64_t> GetShape() const;
 

From 4e5202647684f4ff6525775ce62a6dd674257917 Mon Sep 17 00:00:00 2001
From: Luo Tao <luotao02@baidu.com>
Date: Mon, 5 Feb 2018 16:55:53 +0800
Subject: [PATCH 092/102] add independent sphinx tree for api

---
 doc/CMakeLists.txt                 |  2 ++
 doc/api/CMakeLists.txt             | 20 ++++++++++++++++++++
 paddle/scripts/docker/build.sh     |  2 +-
 paddle/scripts/travis/build_doc.sh |  6 ++++--
 4 files changed, 27 insertions(+), 3 deletions(-)
 create mode 100644 doc/api/CMakeLists.txt

diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 94dd3457fb..58ce5d61c9 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -47,3 +47,5 @@ sphinx_add_target(paddle_docs_cn
                   ${SPHINX_CACHE_DIR_CN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
                   ${SPHINX_HTML_DIR_CN})
+
+add_subdirectory(api)
diff --git a/doc/api/CMakeLists.txt b/doc/api/CMakeLists.txt
new file mode 100644
index 0000000000..4e0bc1d5b8
--- /dev/null
+++ b/doc/api/CMakeLists.txt
@@ -0,0 +1,20 @@
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/../templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_api_docs
+                  html
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_EN})
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 59f3af0398..ba496db5f8 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -117,7 +117,7 @@ EOF
             -DWITH_STYLE_CHECK=OFF
         make -j `nproc` gen_proto_py
         make -j `nproc` paddle_python
-        make -j `nproc` paddle_docs paddle_docs_cn
+        make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
         make -j `nproc` print_operators_doc
         paddle/pybind/print_operators_doc > doc/en/html/operators.json
         popd
diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh
index 0db8d33bbc..4af4ac4f5e 100755
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -9,13 +9,14 @@ cd $TRAVIS_BUILD_DIR/build
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make -j `nproc` gen_proto_py
 make -j `nproc` paddle_python
-make -j `nproc` paddle_docs paddle_docs_cn
+make -j `nproc` paddle_docs paddle_docs_cn paddle_api_docs
 make -j `nproc` print_operators_doc
 paddle/pybind/print_operators_doc > doc/en/html/operators.json
 
 # check websites for broken links
 linkchecker doc/en/html/index.html
 linkchecker doc/cn/html/index.html
+linkchecker doc/api/en/html/index.html
 
 # Parse Github URL
 REPO=`git config remote.origin.url`
@@ -54,10 +55,11 @@ function deploy_docs() {
   mkdir -p ${DIR}
   # remove old docs. mv new docs.
   set +e
-  rm -rf ${DIR}/doc ${DIR}/doc_cn
+  rm -rf ${DIR}/doc ${DIR}/doc_cn ${DIR}/api_doc
   set -e
   cp -r ../doc/cn/html ${DIR}/doc_cn
   cp -r ../doc/en/html ${DIR}/doc
+  cp -r ../doc/api/en/html ${DIR}/api_doc
   git add .
 }
 

From 93734a79138945e6a603b1c9b28ea8cb1b32569e Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Mon, 5 Feb 2018 19:01:26 +0800
Subject: [PATCH 093/102] fix bug

---
 paddle/operators/prior_box_op.cc | 69 ++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 29 deletions(-)

diff --git a/paddle/operators/prior_box_op.cc b/paddle/operators/prior_box_op.cc
index 105ff4ac3e..f35273bf41 100644
--- a/paddle/operators/prior_box_op.cc
+++ b/paddle/operators/prior_box_op.cc
@@ -44,12 +44,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
     auto aspect_ratios = ctx->Attrs().Get<std::vector<float>>("aspect_ratios");
     bool flip = ctx->Attrs().Get<bool>("flip");
 
-    PADDLE_ENFORCE_GT(min_sizes.size(), 0,
-                      "Size of min_sizes must be at least 1.");
-    for (size_t i = 0; i < min_sizes.size(); ++i) {
-      PADDLE_ENFORCE_GT(min_sizes[i], 0, "min_sizes[%d] must be positive.", i);
-    }
-
     std::vector<float> aspect_ratios_vec;
     ExpandAspectRatios(aspect_ratios, flip, aspect_ratios_vec);
 
@@ -65,17 +59,6 @@ class PriorBoxOp : public framework::OperatorWithKernel {
       }
     }
 
-    PADDLE_ENFORCE_EQ(variances.size(), 4, "Must and only provide 4 variance.");
-    for (size_t i = 0; i < variances.size(); ++i) {
-      PADDLE_ENFORCE_GT(variances[i], 0.0,
-                        "variance[%d] must be greater than 0.", i);
-    }
-
-    const float step_h = ctx->Attrs().Get<float>("step_h");
-    PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
-    const float step_w = ctx->Attrs().Get<float>("step_w");
-    PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
-
     std::vector<int64_t> dim_vec(4);
     dim_vec[0] = input_dims[2];
     dim_vec[1] = input_dims[3];
@@ -106,26 +89,54 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
               "PriorBoxOp. The layout is [H, W, num_priors, 4]. "
               "H is the height of input, W is the width of input, num_priors "
               "is the box count of each position.");
-    AddAttr<std::vector<int>>("min_sizes", "(vector<int>) ",
-                              "List of min sizes of generated prior boxes.");
-    AddAttr<std::vector<int>>("max_sizes", "(vector<int>) ",
-                              "List of max sizes of generated prior boxes.");
+
+    AddAttr<std::vector<int>>("min_sizes",
+                              "(vector<int>) List of min sizes "
+                              "of generated prior boxes.")
+        .AddCustomChecker([](const std::vector<int>& min_sizes) {
+          PADDLE_ENFORCE_GT(min_sizes.size(), 0,
+                            "Size of min_sizes must be at least 1.");
+          for (size_t i = 0; i < min_sizes.size(); ++i) {
+            PADDLE_ENFORCE_GT(min_sizes[i], 0,
+                              "min_sizes[%d] must be positive.", i);
+          }
+        });
+    AddAttr<std::vector<int>>(
+        "max_sizes",
+        "(vector<int>) List of max sizes of generated prior boxes.");
     AddAttr<std::vector<float>>(
-        "aspect_ratios", "(vector<float>) ",
-        "List of aspect ratios of generated prior boxes.");
+        "aspect_ratios",
+        "(vector<float>) List of aspect ratios of generated prior boxes.");
+
     AddAttr<std::vector<float>>(
-        "variances", "(vector<float>) ",
-        "List of variances to be encoded in prior boxes.");
-    AddAttr<bool>("flip", "(bool) ", "Whether to flip aspect ratios.")
+        "variances",
+        "(vector<float>) List of variances to be encoded in prior boxes.")
+        .AddCustomChecker([](const std::vector<float>& variances) {
+          PADDLE_ENFORCE_EQ(variances.size(), 4,
+                            "Must and only provide 4 variance.");
+          for (size_t i = 0; i < variances.size(); ++i) {
+            PADDLE_ENFORCE_GT(variances[i], 0.0,
+                              "variance[%d] must be greater than 0.", i);
+          }
+        });
+    AddAttr<bool>("flip", "(bool) Whether to flip aspect ratios.")
         .SetDefault(true);
-    AddAttr<bool>("clip", "(bool) ", "Whether to clip out-of-boundary boxes.")
+    AddAttr<bool>("clip", "(bool) Whether to clip out-of-boundary boxes.")
         .SetDefault(true);
+
     AddAttr<float>("step_w",
                    "Prior boxes step across width, 0 for auto calculation.")
-        .SetDefault(0.0);
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_w) {
+          PADDLE_ENFORCE_GT(step_w, 0.0, "step_h should be larger than 0.");
+        });
     AddAttr<float>("step_h",
                    "Prior boxes step across height, 0 for auto calculation.")
-        .SetDefault(0.0);
+        .SetDefault(0.0)
+        .AddCustomChecker([](const float& step_h) {
+          PADDLE_ENFORCE_GT(step_h, 0.0, "step_h should be larger than 0.");
+        });
+
     AddAttr<float>("offset",
                    "(float) "
                    "Prior boxes center offset.")

From d7a371cbf25f4dcc5dcbfbf0a043e6dc98ae322a Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Mon, 5 Feb 2018 19:51:42 +0800
Subject: [PATCH 094/102] follow comments

---
 paddle/operators/prior_box_op.cc | 2 +-
 paddle/operators/prior_box_op.h  | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/operators/prior_box_op.cc b/paddle/operators/prior_box_op.cc
index f35273bf41..1dc4b28855 100644
--- a/paddle/operators/prior_box_op.cc
+++ b/paddle/operators/prior_box_op.cc
@@ -128,7 +128,7 @@ class PriorBoxOpMaker : public framework::OpProtoAndCheckerMaker {
                    "Prior boxes step across width, 0 for auto calculation.")
         .SetDefault(0.0)
         .AddCustomChecker([](const float& step_w) {
-          PADDLE_ENFORCE_GT(step_w, 0.0, "step_h should be larger than 0.");
+          PADDLE_ENFORCE_GT(step_w, 0.0, "step_w should be larger than 0.");
         });
     AddAttr<float>("step_h",
                    "Prior boxes step across height, 0 for auto calculation.")
diff --git a/paddle/operators/prior_box_op.h b/paddle/operators/prior_box_op.h
index e0a663ace8..12ff162356 100644
--- a/paddle/operators/prior_box_op.h
+++ b/paddle/operators/prior_box_op.h
@@ -25,7 +25,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
                                std::vector<float>& output_aspect_ratior) {
   constexpr float epsilon = 1e-6;
   output_aspect_ratior.clear();
-  output_aspect_ratior.push_back(1.);
+  output_aspect_ratior.push_back(1.0f);
   for (size_t i = 0; i < input_aspect_ratior.size(); ++i) {
     float ar = input_aspect_ratior[i];
     bool already_exist = false;
@@ -38,7 +38,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
     if (!already_exist) {
       output_aspect_ratior.push_back(ar);
       if (flip) {
-        output_aspect_ratior.push_back(1. / ar);
+        output_aspect_ratior.push_back(1.0f / ar);
       }
     }
   }

From f367ad6c6cae825c46b7262c77fa0cf6f8394796 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Mon, 5 Feb 2018 20:03:50 +0800
Subject: [PATCH 095/102] add "inline" for ClipFunctor and refine code

---
 paddle/operators/prior_box_op.h | 39 ++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/paddle/operators/prior_box_op.h b/paddle/operators/prior_box_op.h
index 12ff162356..6b221cb74e 100644
--- a/paddle/operators/prior_box_op.h
+++ b/paddle/operators/prior_box_op.h
@@ -46,7 +46,7 @@ inline void ExpandAspectRatios(const std::vector<float>& input_aspect_ratior,
 
 template <typename T>
 struct ClipFunctor {
-  HOSTDEVICE T operator()(T in) const {
+  HOSTDEVICE inline T operator()(T in) const {
     return std::min<T>(std::max<T>(in, 0.), 1.);
   }
 };
@@ -97,6 +97,9 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
     boxes->mutable_data<T>(ctx.GetPlace());
     vars->mutable_data<T>(ctx.GetPlace());
 
+    T inv_img_width = 1.0 / img_width;
+    T inv_img_height = 1.0 / img_height;
+
     auto e_boxes = framework::EigenTensor<T, 4>::From(*boxes);
     for (int h = 0; h < feature_height; ++h) {
       for (int w = 0; w < feature_width; ++w) {
@@ -109,13 +112,15 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
           // first prior: aspect_ratio = 1, size = min_size
           box_width = box_height = min_size;
           // xmin
-          e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+          e_boxes(h, w, idx, 0) = (center_x - box_width * 0.5) * inv_img_width;
           // ymin
-          e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+          e_boxes(h, w, idx, 1) =
+              (center_y - box_height * 0.5) * inv_img_height;
           // xmax
-          e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+          e_boxes(h, w, idx, 2) = (center_x + box_width * 0.5) * inv_img_width;
           // ymax
-          e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+          e_boxes(h, w, idx, 3) =
+              (center_y + box_height * 0.5) * inv_img_height;
 
           idx++;
           if (max_sizes.size() > 0) {
@@ -124,13 +129,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
             // size = sqrt(min_size * max_size)
             box_width = box_height = sqrt(min_size * max_size);
             // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 0) =
+                (center_x - box_width * 0.5) * inv_img_width;
             // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 1) =
+                (center_y - box_height * 0.5) * inv_img_height;
             // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 2) =
+                (center_x + box_width * 0.5) * inv_img_width;
             // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 3) =
+                (center_y + box_height * 0.5) * inv_img_height;
             idx++;
           }
 
@@ -143,13 +152,17 @@ class PriorBoxOpKernel : public framework::OpKernel<T> {
             box_width = min_size * sqrt(ar);
             box_height = min_size / sqrt(ar);
             // xmin
-            e_boxes(h, w, idx, 0) = (center_x - box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 0) =
+                (center_x - box_width * 0.5) * inv_img_width;
             // ymin
-            e_boxes(h, w, idx, 1) = (center_y - box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 1) =
+                (center_y - box_height * 0.5) * inv_img_height;
             // xmax
-            e_boxes(h, w, idx, 2) = (center_x + box_width / 2.) / img_width;
+            e_boxes(h, w, idx, 2) =
+                (center_x + box_width * 0.5) * inv_img_width;
             // ymax
-            e_boxes(h, w, idx, 3) = (center_y + box_height / 2.) / img_height;
+            e_boxes(h, w, idx, 3) =
+                (center_y + box_height * 0.5) * inv_img_height;
             idx++;
           }
         }

From e9e24249217c1b234a9ce8f8d0d9c1e6e18fd2d3 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Mon, 5 Feb 2018 21:38:53 +0800
Subject: [PATCH 096/102] Fix warnings in multiclass_nms_op.cc.

---
 paddle/operators/multiclass_nms_op.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/operators/multiclass_nms_op.cc b/paddle/operators/multiclass_nms_op.cc
index 8a65fe69f1..41b9335fb8 100644
--- a/paddle/operators/multiclass_nms_op.cc
+++ b/paddle/operators/multiclass_nms_op.cc
@@ -85,7 +85,7 @@ static inline void GetMaxScoreIndex(
   std::stable_sort(sorted_indices->begin(), sorted_indices->end(),
                    SortScorePairDescend<int>);
   // Keep top_k scores if needed.
-  if (top_k > -1 && top_k < sorted_indices->size()) {
+  if (top_k > -1 && top_k < static_cast<int>(sorted_indices->size())) {
     sorted_indices->resize(top_k);
   }
 }
@@ -151,7 +151,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
     while (sorted_indices.size() != 0) {
       const int idx = sorted_indices.front().second;
       bool keep = true;
-      for (int k = 0; k < selected_indices->size(); ++k) {
+      for (size_t k = 0; k < selected_indices->size(); ++k) {
         if (keep) {
           const int kept_idx = (*selected_indices)[k];
           T overlap = JaccardOverlap<T>(bbox_data + idx * box_size,
@@ -201,7 +201,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
         int label = it.first;
         const T* sdata = scores_data + label * predict_dim;
         const std::vector<int>& label_indices = it.second;
-        for (int j = 0; j < label_indices.size(); ++j) {
+        for (size_t j = 0; j < label_indices.size(); ++j) {
           int idx = label_indices[j];
           PADDLE_ENFORCE_LT(idx, predict_dim);
           score_index_pairs.push_back(
@@ -215,7 +215,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
 
       // Store the new indices.
       std::map<int, std::vector<int>> new_indices;
-      for (int j = 0; j < score_index_pairs.size(); ++j) {
+      for (size_t j = 0; j < score_index_pairs.size(); ++j) {
         int label = score_index_pairs[j].second.first;
         int idx = score_index_pairs[j].second.second;
         new_indices[label].push_back(idx);
@@ -238,7 +238,7 @@ class MultiClassNMSKernel : public framework::OpKernel<T> {
       int label = it.first;
       const T* sdata = scores_data + label * predict_dim;
       const std::vector<int>& indices = it.second;
-      for (int j = 0; j < indices.size(); ++j) {
+      for (size_t j = 0; j < indices.size(); ++j) {
         int idx = indices[j];
         const T* bdata = bboxes_data + idx * kBBoxSize;
         odata[count * kOutputDim] = label;           // label

From 497a131e53316fc3d81cf92e68845d2fd33243e3 Mon Sep 17 00:00:00 2001
From: kavyasrinet <kavyasrinet@baidu.com>
Date: Mon, 5 Feb 2018 10:45:43 -0800
Subject: [PATCH 097/102] Proposing Python syntax for send and recv in design
 doc (#8093)

* Adding send and recv in design doc

* fix typo

* fixed code

* Adding threading
---
 doc/design/csp.md | 76 +++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 73 insertions(+), 3 deletions(-)

diff --git a/doc/design/csp.md b/doc/design/csp.md
index ba9cacfdea..2f6ce8d6fa 100644
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
@@ -71,14 +71,14 @@ ch1 := make(chan int, 100)  // a channel that can buffer 100 ints.
 In Fluid, we should be able to do the same:
 
 ```python
-ch  = fluid.make_chan(dtype=INT)
-ch1 = fluid.make_chan(dtype=INT, 100)
+ch  = fluid.make_channel(dtype=INT)
+ch1 = fluid.make_channel(dtype=INT, 100)
 ```
 
 In addition to that, we want channels that can hold more complex element types, e.g., Tensors of float16:
 
 ```python
-ch = fluid.make_chan(dtype=Tensor, etype=float16)
+ch = fluid.make_channel(dtype=Tensor, etype=float16)
 ```
 
 or Tensors of Tensors of float16 etc.
@@ -87,6 +87,76 @@ The point here is that we need a consistent way to compose types, like in C++ we
 
 ### Send and Recv
 
+In Go, we first create a channel as explained in the section above and then perform read and write operations on top of the channels.
+
+```go
+ch1  := make(chan int)       
+ch2  := make(chan int, 100)
+```
+
+To write (or perform a `Send` operation) the value of a variable `x`, to channel `ch1` above, we perform the following:
+
+```go
+ch1 <- x
+fmt.Println("Written to the channel")
+```
+Now to read (or perform a `Recv` operation) the value stored in `ch2` into a variable `y`, we perform the following:
+
+```go
+y <- ch2
+fmt.Println("Received from channel")
+```
+
+In Fluid, we should be able to perform the above operations on the channel objects as well. As of now, we support two different kinds of channels : [Buffered Channel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [UnBuffered Channel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+
+Send and Receive can be performed as following on a buffered channel:
+
+```python
+import threading
+
+def send_to_channel(channel, num_time=1):
+  for i in xrange(num_time):
+    channel.send(i)
+
+# Create a buffered channel of capacity 10
+buffer_size = 10;
+ch = fluid.make_channel(dtype=INT, buffer_size)
+
+# Now write three elements to the channel
+thread = threading.Thread(target=send_to_channel, args=(ch, 3, ))
+thread.daemon = True
+thread.start()
+
+# Read all the data from the channel
+for i in xrange(3):
+  y = ch.recv()
+
+# Done receiving , now close the channel
+ch.close()
+```
+
+The send and receive operations will be similar for unbuffered channel as well, except for the fact that there is no buffer in an unbuffered channel, so the operations are completely synchronized. For example:
+
+```python
+import threading
+
+def send_to_channel(channel, data):
+  channel.send(data)
+
+# Create an unbuffered channel
+ch = fluid.make_channel(dtype=INT)
+
+# Writes and Reads are synchronous otherwise the calls will block.
+thread = threading.Thread(target=send_to_channel, args=(ch, 10, ))
+thread.daemon = True
+thread.start()
+
+y = ch.recv()
+
+# Done receiving , now close the channel
+ch.close()
+```
+
 ### Select
 
 ## Example Programs

From 1ead6c2691be09f34303c06d119c17ba4e4aeab7 Mon Sep 17 00:00:00 2001
From: Abhinav Arora <abhinavarora28@gmail.com>
Date: Mon, 5 Feb 2018 11:06:02 -0800
Subject: [PATCH 098/102] Add proposed fluid syntax for select statement in
 Fluid's implementation of CSP (#7908)

* Add proposed fluid syntax for select statement in Fluid's implementation of CSP

* Fix Typo
---
 doc/design/csp.md | 49 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 49 insertions(+)

diff --git a/doc/design/csp.md b/doc/design/csp.md
index 2f6ce8d6fa..36422d8236 100644
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
@@ -159,6 +159,55 @@ ch.close()
 
 ### Select
 
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks untill one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+
+```go
+
+ch1  := make(chan int)       
+ch2  := make(chan int, 100)
+
+x := 0
+
+for {
+    select {
+    case ch1 <- x:
+      x = x + 1
+    case y := <-ch2:
+      fmt.Println("Received on channel")
+    default:
+      fmt.Println("Default")
+    }
+  }
+
+```
+
+In Fluid, we should be able to do the same:
+
+```python
+ch1 = fluid.make_channel(dtype=INT)
+ch2 = fluid.make_channel(dtype=INT, 100)
+
+sel = fluid.select()
+
+with sel.case(ch1, 'w', X):
+    fluid.layers.increment(X)
+
+with sel.case(ch2, 'r', Y):
+    fluid.print("Received on Channel")
+
+with sel.default():
+    fluid.print("Default")
+
+```
+
+In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
+
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to keep the syntax familiar, mirroring the write mode in Python I/O.
+
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to keep the syntax familiar, mirroring the read mode in Python I/O.
+
+- `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
+
 ## Example Programs
 
 ### 1. RPC between Trainers and Parameter Servers

From b0ecb36583ed97737bd5c43cbafbdc8fa29cbd68 Mon Sep 17 00:00:00 2001
From: Yi Wang <yi.wang.2005@gmail.com>
Date: Mon, 5 Feb 2018 17:11:11 -0800
Subject: [PATCH 099/102] Rewrite the Send/Recv part of csp.md (#8164)

* Update csp.md

* Update csp.md

* Update csp.md
---
 doc/design/csp.md | 110 +++++++++++++++++++++++++---------------------
 1 file changed, 59 insertions(+), 51 deletions(-)

diff --git a/doc/design/csp.md b/doc/design/csp.md
index 36422d8236..ae2e3e1b99 100644
--- a/doc/design/csp.md
+++ b/doc/design/csp.md
@@ -42,7 +42,7 @@ The type *channel* is conceptually the blocking queue.  In Go, its implemented i
 
 The `select` operation has been in OS kernels long before Go language.  All Unix kernels implement system calls *poll* and *select*.  They monitor multiple file descriptors to see if I/O is possible on any of them.  This takes O(N) time.  Since Linux 2.6, a new system call, *epoll*, can do the same in O(1) time.  In BSD systems, there is a similar system call *kqueue*.  Go's Linux implementation uses epoll.
 
-It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way, so we could focus on Python binding and the syntax.
+It might be a good idea to implement Fluid's select using epoll too.  In this design doc, we start from the O(N) way so that we could focus on Python binding and the syntax.
 
 ### Type Channel
 
@@ -87,79 +87,87 @@ The point here is that we need a consistent way to compose types, like in C++ we
 
 ### Send and Recv
 
-In Go, we first create a channel as explained in the section above and then perform read and write operations on top of the channels.
+Go's CSP implementation depends on data type *channel*. There are two types of channels:
 
-```go
-ch1  := make(chan int)       
-ch2  := make(chan int, 100)
-```
+1. The unblocked channel, or buffered channel, is a blocking queue with a non-zero sized buffer. Sending to a buffered channel blocks if the buffer is full, and receiving from it blocks if the buffer is empty.
+1. The blocked channel, or unbuffered channel, is a blocking queue with no buffer.  With an unbuffered channel, both sending and receiving block until the other side is ready.
 
-To write (or perform a `Send` operation) the value of a variable `x`, to channel `ch1` above, we perform the following:
+There are four types of actions with a channel:
 
-```go
-ch1 <- x
-fmt.Println("Written to the channel")
-```
-Now to read (or perform a `Recv` operation) the value stored in `ch2` into a variable `y`, we perform the following:
+1. Create a channel
 
-```go
-y <- ch2
-fmt.Println("Received from channel")
-```
+   ```go
+   ch := make(chan int) // this is an unbuffered channel
+   ch := make(chan int, 100) // this is a buffered channel of 100 ints.
+   ```
 
-In Fluid, we should be able to perform the above operations on the channel objects as well. As of now, we support two different kinds of channels : [Buffered Channel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [UnBuffered Channel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h)
+1. Send
 
-Send and Receive can be performed as following on a buffered channel:
+   ```go
+   ch <- 111
+   ```
 
-```python
-import threading
+1. Recv
 
-def send_to_channel(channel, num_time=1):
-  for i in xrange(num_time):
-    channel.send(i)
+   ```go
+   y, ok := <-ch
+   ```
 
-# Create a buffered channel of capacity 10
-buffer_size = 10;
-ch = fluid.make_channel(dtype=INT, buffer_size)
+1. Close
 
-# Now write three elements to the channel
-thread = threading.Thread(target=send_to_channel, args=(ch, 3, ))
-thread.daemon = True
-thread.start()
+   ```go
+   close(ch)
+   ```
+   
+   Please be aware that a closed channel is not a nil channel, which is `var ch chan int`.
+   
+There are some [axioms with channels](https://dave.cheney.net/2014/03/19/channel-axioms):
 
-# Read all the data from the channel
-for i in xrange(3):
-  y = ch.recv()
+1. A send to a nil channel blocks forever
 
-# Done receiving , now close the channel
-ch.close()
-```
+1. A receive from a nil channel blocks forever
+
+1. A send to a closed channel panics
+
+1. A receive from a closed channel returns the residual values and then zeros.
 
-The send and receive operations will be similar for unbuffered channel as well, except for the fact that there is no buffer in an unbuffered channel, so the operations are completely synchronized. For example:
+In Fluid, we have [buffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/buffered_channel.h) and [unbuffered channels](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/details/unbuffered_channel.h).
+
+The following program illustrates the Python syntax for sending to and receiving from a Fluid buffered channel.
 
 ```python
-import threading
+import fluid
+
+buffer_size = 10
+ch = fluid.make_channel(dtype=INT, buffer_size)
 
-def send_to_channel(channel, data):
-  channel.send(data)
+# Now write buffer_size elements to the channel
+with fluid.while(steps=buffer_size):
+  fluid.send(ch, step)
+  fluid.close_channel(ch)
+  
+with fluid.while(steps=buffer_size):
+  fluid.print(fluid.recv(ch))
+```
+
+The following example shows that to avoid the always-blocking behavior of unbuffered channels, we need to use Fluid's goroutines.
+
+```python
+import fluid
 
-# Create an unbuffered channel
 ch = fluid.make_channel(dtype=INT)
 
-# Writes and Reads are synchronous otherwise the calls will block.
-thread = threading.Thread(target=send_to_channel, args=(ch, 10, ))
-thread.daemon = True
-thread.start()
+with fluid.go():
+  fluid.send(ch)
 
-y = ch.recv()
+y = fluid.recv(ch)
 
-# Done receiving , now close the channel
-ch.close()
+fluid.close_channel(ch)
 ```
 
 ### Select
 
-In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks untill one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
+In Go, the `select` statement lets a goroutine wait on multiple communication operations. A `select` blocks until one of its cases can run, then it executes that case. It chooses one at random if multiple are ready.
 
 ```go
 
@@ -202,9 +210,9 @@ with sel.default():
 
 In the above code snippet, `X` and `Y` are variables. Now let us look at each of these statements one by one.
 
-- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familar to write syntax in Python I/O.
+- `sel.case(ch1, 'w', X)` : This specifies that we are writing to `ch1` and we want to write the integer in variable `X` to the channel. The character `w` is used here to make the syntax familiar to write syntax in Python I/O.
 
-- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familar to read syntax in Python I/O.
+- `sel.case(ch2, 'r', Y)` : This specifies that we would like to read the result from `ch2` into variable `Y`. The character `r` is used here to make the syntax familiar to read syntax in Python I/O.
 
 - `sel.default()` : This is equivalent to the default in Go `select`. If none of the channels are ready for read or write, then the fluid code in the default block will be executed.
 

From 165450ff6ca5bc0f02ffe63ec11f50ed4c240f09 Mon Sep 17 00:00:00 2001
From: Yiqun Liu <liuyiqun01@baidu.com>
Date: Tue, 6 Feb 2018 09:52:18 +0800
Subject: [PATCH 100/102] Refine the inference unittest recognize_digits.
 (#8147)

---
 .../book/test_inference_recognize_digits.cc   | 63 ++++++++++++++-----
 .../fluid/tests/book/test_recognize_digits.py |  4 +-
 2 files changed, 49 insertions(+), 18 deletions(-)

diff --git a/paddle/inference/tests/book/test_inference_recognize_digits.cc b/paddle/inference/tests/book/test_inference_recognize_digits.cc
index 26dc2aee04..ce8772587f 100644
--- a/paddle/inference/tests/book/test_inference_recognize_digits.cc
+++ b/paddle/inference/tests/book/test_inference_recognize_digits.cc
@@ -58,6 +58,47 @@ void TestInference(const std::string& dirname,
   delete scope;
 }
 
+template <typename T>  // Fills `input` (allocated with shape `dims`) with random values in [lower, upper].
+void SetupTensor(paddle::framework::LoDTensor& input,
+                 paddle::framework::DDim dims,
+                 T lower,
+                 T upper) {
+  srand(time(0));  // Seeded from the clock on every call, so the data differs between runs.
+  T* input_ptr = input.mutable_data<T>(dims, paddle::platform::CPUPlace());  // T* (not float*) so any T compiles
+  for (int64_t i = 0; i < input.numel(); ++i) {  // int64_t matches the numel() loop in CheckError
+    input_ptr[i] =  // map rand() / RAND_MAX from [0, 1] onto [lower, upper]
+        (static_cast<T>(rand()) / static_cast<T>(RAND_MAX)) * (upper - lower) +
+        lower;
+  }
+}
+
+template <typename T>  // Expects output1 and output2 to agree element-wise within a T-dependent tolerance.
+void CheckError(paddle::framework::LoDTensor& output1,
+                paddle::framework::LoDTensor& output2) {
+  // The two tensors must carry identical LoD information.
+  EXPECT_EQ(output1.lod(), output2.lod());
+
+  EXPECT_EQ(output1.dims(), output2.dims());    // same shape
+  EXPECT_EQ(output1.numel(), output2.numel());  // same element count
+
+  T err = static_cast<T>(0);  // tolerance; 0 (exact match) unless T is float or double
+  if (typeid(T) == typeid(float)) {
+    err = 1E-3;
+  } else if (typeid(T) == typeid(double)) {
+    err = 1E-6;
+  } else {
+    err = 0;
+  }
+
+  size_t count = 0;  // number of elements whose absolute difference exceeds err
+  for (int64_t i = 0; i < output1.numel(); ++i) {
+    if (fabs(output1.data<T>()[i] - output2.data<T>()[i]) > err) {
+      count++;
+    }
+  }
+  EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";  // 0U: count is unsigned
+}
+
 TEST(inference, recognize_digits) {
   if (FLAGS_dirname.empty()) {
     LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
@@ -70,12 +111,10 @@ TEST(inference, recognize_digits) {
   // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
 
   paddle::framework::LoDTensor input;
-  srand(time(0));
-  float* input_ptr =
-      input.mutable_data<float>({1, 28, 28}, paddle::platform::CPUPlace());
-  for (int i = 0; i < 784; ++i) {
-    input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
-  }
+  // Use normalized image pixels as input data,
+  // which should be in the range [-1.0, 1.0].
+  SetupTensor<float>(
+      input, {1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
   std::vector<paddle::framework::LoDTensor*> cpu_feeds;
   cpu_feeds.push_back(&input);
 
@@ -98,16 +137,6 @@ TEST(inference, recognize_digits) {
       dirname, cpu_feeds, cpu_fetchs2);
   LOG(INFO) << output2.dims();
 
-  EXPECT_EQ(output1.dims(), output2.dims());
-  EXPECT_EQ(output1.numel(), output2.numel());
-
-  float err = 1E-3;
-  int count = 0;
-  for (int64_t i = 0; i < output1.numel(); ++i) {
-    if (fabs(output1.data<float>()[i] - output2.data<float>()[i]) > err) {
-      count++;
-    }
-  }
-  EXPECT_EQ(count, 0) << "There are " << count << " different elements.";
+  CheckError<float>(output1, output2);
 #endif
 }
diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
index b8f55c813b..fb6b1f7192 100644
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits.py
@@ -166,7 +166,9 @@ def infer(use_cuda, save_dirname=None):
      fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
 
     # The input's dimension of conv should be 4-D or 5-D.
-    tensor_img = numpy.random.rand(1, 1, 28, 28).astype("float32")
+    # Use normalized image pixels as input data, which should be in the range [-1.0, 1.0].
+    tensor_img = numpy.random.uniform(-1.0, 1.0,
+                                      [1, 1, 28, 28]).astype("float32")
 
     # Construct feed as a dictionary of {feed_target_name: feed_target_data}
     # and results will contain a list of data corresponding to fetch_targets.

From 9a1fa890a0c510ca1863eea358423bc89fd4fdef Mon Sep 17 00:00:00 2001
From: fengjiayi <fengjiayi@baidu.com>
Date: Tue, 6 Feb 2018 11:10:34 +0800
Subject: [PATCH 101/102] remove unnecessary comments

---
 python/paddle/v2/fluid/tests/test_protobuf_descs.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/paddle/v2/fluid/tests/test_protobuf_descs.py b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
index ac6de68b5f..8f335d13db 100644
--- a/python/paddle/v2/fluid/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/fluid/tests/test_protobuf_descs.py
@@ -123,8 +123,6 @@ class TestVarDesc(unittest.TestCase):
         var.set_tensor_num(3)
         src_shapes = [[2, 3, 3], [4, 5], [6, 7, 8, 9]]
         var.set_shapes(src_shapes)
-        #import pdb
-        # pdb.set_trace()
         res_shapes = var.shapes()
         self.assertEqual(src_shapes, res_shapes)
         self.assertEqual(core.VarDesc.VarType.READER, var.type())

From 70324911e701829eb1a5ef484ec26920b6578d96 Mon Sep 17 00:00:00 2001
From: chengduoZH <zhaochengduo@163.com>
Date: Tue, 6 Feb 2018 12:52:10 +0800
Subject: [PATCH 102/102] refine buffer receive

---
 paddle/framework/details/buffered_channel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/framework/details/buffered_channel.h b/paddle/framework/details/buffered_channel.h
index 7ac234b8d4..b9761eab9b 100644
--- a/paddle/framework/details/buffered_channel.h
+++ b/paddle/framework/details/buffered_channel.h
@@ -71,7 +71,7 @@ bool Buffered<T>::Receive(T* item) {
   std::unique_lock<std::mutex> lock(mu_);
   empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
   bool ret = false;
-  if (!closed_) {
+  if (!channel_.empty()) {
     *item = std::move(channel_.front());
     channel_.pop_front();
     full_cond_var_.notify_one();