From 9773f38f99e0fbb1a19348bb0a1a60d3995afaf6 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 1 Mar 2019 12:39:49 +0800 Subject: [PATCH 01/73] cache runtime_context test=develop --- paddle/fluid/framework/operator.cc | 21 ++++++++++++++------- paddle/fluid/framework/operator.h | 1 + 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 64592d73e1..c2063b5e6a 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -916,7 +916,14 @@ std::vector* OperatorWithKernel::GetKernelConfig( void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeContext ctx(Inputs(), Outputs(), scope); + if (!runtime_ctx_) { + // RuntimeContext is used to relate input/output names of Operator with + // the corresponding variables in Scope. + // Since the input/output names of Operator do not change in the execution, + // RuntimeContext could be created only at the first iteration of + // the execution to save the elapsed time. + runtime_ctx_ = new RuntimeContext(Inputs(), Outputs(), scope); + } platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -931,7 +938,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; auto expected_kernel_key = this->GetExpectedKernelType( - ExecutionContext(*this, scope, *dev_ctx, ctx, nullptr)); + ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx_, nullptr)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -955,8 +962,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; - auto* transfer_scope = - PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx); + auto* transfer_scope = PrepareData(scope, expected_kernel_key, + &transfered_inplace_vars, runtime_ctx_); // exec scope is the scope that kernel actually executed on. const Scope& exec_scope = @@ -966,12 +973,12 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } - RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx_); this->InferShape(&infer_shape_ctx); // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // not Scope. Imperative mode only pass inputs and get outputs. - kernel_iter->second( - ExecutionContext(*this, exec_scope, *dev_ctx, ctx, kernel_configs)); + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, + *runtime_ctx_, kernel_configs)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. 
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 8a86813e93..e34a0e2141 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -541,6 +541,7 @@ class OperatorWithKernel : public OperatorBase { protected: mutable OpKernelConfigsMap kernel_configs_map_; + mutable RuntimeContext* runtime_ctx_ = nullptr; }; extern bool OpSupportGPU(const std::string& op_type); From 82b0bb9d72b0a023477e2b1361e79a432cf39957 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 1 Mar 2019 18:23:42 +0800 Subject: [PATCH 02/73] fix cpplint error test=develop --- paddle/fluid/framework/operator.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index e34a0e2141..3c3e9096c0 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -16,9 +16,11 @@ limitations under the License. */ #include #include +#include #include #include #include +#include #include #include "glog/logging.h" // For VLOG From 784826a4f507e6045d582b2cdf2332af44a46b1a Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 4 Mar 2019 13:49:38 +0800 Subject: [PATCH 03/73] enhance cache runtime_context for different scope test=develop --- paddle/fluid/framework/operator.cc | 20 +++++++++++++------- paddle/fluid/framework/operator.h | 3 ++- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0b436f4c8b..ef0a4779dc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -20,6 +20,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/data_transform.h" +#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -917,13 +918,18 @@ std::vector* OperatorWithKernel::GetKernelConfig( void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - if (!runtime_ctx_) { + const Scope* cur_scope = &scope; + if (!runtime_ctx_ || pre_scope_ != cur_scope || + scope.FindVar(details::kLocalExecScopeName)) { // RuntimeContext is used to relate input/output names of Operator with // the corresponding variables in Scope. - // Since the input/output names of Operator do not change in the execution, - // RuntimeContext could be created only at the first iteration of - // the execution to save the elapsed time. - runtime_ctx_ = new RuntimeContext(Inputs(), Outputs(), scope); + // In a same Scope, since the input/output names of Operator do not change + // in the execution, RuntimeContext could be created only at the first + // iteration of the execution to save the elapsed time. + // Note that the Scope should not be the local scope, since local scope + // would be cleaned regularly. 
+ runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + pre_scope_ = cur_scope; } platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -963,8 +969,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; - auto* transfer_scope = PrepareData(scope, expected_kernel_key, - &transfered_inplace_vars, runtime_ctx_); + auto* transfer_scope = PrepareData( + scope, expected_kernel_key, &transfered_inplace_vars, runtime_ctx_.get()); // exec scope is the scope that kernel actually executed on. const Scope& exec_scope = diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 3c3e9096c0..6d21d0c749 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -543,7 +543,8 @@ class OperatorWithKernel : public OperatorBase { protected: mutable OpKernelConfigsMap kernel_configs_map_; - mutable RuntimeContext* runtime_ctx_ = nullptr; + mutable std::unique_ptr runtime_ctx_; + mutable const Scope* pre_scope_ = nullptr; }; extern bool OpSupportGPU(const std::string& op_type); From c0b240aa433939081730f31563f38fc2f410847d Mon Sep 17 00:00:00 2001 From: luotao1 Date: Tue, 5 Mar 2019 15:33:53 +0800 Subject: [PATCH 04/73] try to fix distributed unit-test test=develop --- paddle/fluid/framework/operator.cc | 19 ++++++++++--------- paddle/fluid/framework/scope.cc | 4 ++++ paddle/fluid/framework/scope.h | 4 ++++ 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ef0a4779dc..3959728a20 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -919,15 +919,16 @@ std::vector* OperatorWithKernel::GetKernelConfig( void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { const Scope* cur_scope = &scope; - if (!runtime_ctx_ || pre_scope_ != cur_scope || - scope.FindVar(details::kLocalExecScopeName)) { - // RuntimeContext is used to relate input/output names of Operator with - // the corresponding variables in Scope. - // In a same Scope, since the input/output names of Operator do not change - // in the execution, RuntimeContext could be created only at the first - // iteration of the execution to save the elapsed time. - // Note that the Scope should not be the local scope, since local scope - // would be cleaned regularly. + // RuntimeContext is used to relate input/output names of Operator with + // the corresponding variables in Scope. + // In a same Scope, since the input/output names of Operator do not change + // in the execution, RuntimeContext could be created only at the first + // iteration of the execution to save the elapsed time. + // Note that the Scope should not be the local scope, since local scope + // would be cleaned regularly. 
+ if (scope.FindVar(details::kLocalExecScopeName)) { + runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + } else if (!runtime_ctx_ || pre_scope_ != cur_scope) { runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); pre_scope_ = cur_scope; } diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 87f0f307d3..e6de477171 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -107,6 +107,10 @@ const Scope* Scope::FindScope(const Variable* var) const { return FindScopeInternal(var); } +bool Scope::HasLocalVar(const std::string& name) const { + return vars_.find(name) != vars_.end(); +} + void Scope::DropKids() { SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index f0915d2eee..38d3b4d6ce 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -75,6 +75,10 @@ class Scope { /// Caller doesn't own the returned Variable. Variable* FindLocalVar(const std::string& name) const; + /// Find whether a variable in the current scope. + /// Return false if cannot find. + bool HasLocalVar(const std::string& name) const; + const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. From 3896d955c75aa537a30e1a8dde9ad02f6540bb73 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 18 Feb 2019 21:14:40 +0800 Subject: [PATCH 05/73] add yolo_box_op CPU kernel --- .../fluid/operators/detection/CMakeLists.txt | 1 + .../fluid/operators/detection/yolo_box_op.cc | 144 ++++++++++++++++++ .../fluid/operators/detection/yolo_box_op.cu | 71 +++++++++ .../fluid/operators/detection/yolo_box_op.h | 127 +++++++++++++++ python/paddle/fluid/layers/detection.py | 66 ++++++++ python/paddle/fluid/tests/test_detection.py | 9 +- .../fluid/tests/unittests/test_yolo_box_op.py | 105 +++++++++++++ .../tests/unittests/test_yolov3_loss_op.py | 12 +- 8 files changed, 526 insertions(+), 9 deletions(-) create mode 100644 paddle/fluid/operators/detection/yolo_box_op.cc create mode 100644 paddle/fluid/operators/detection/yolo_box_op.cu create mode 100644 paddle/fluid/operators/detection/yolo_box_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_yolo_box_op.py diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index c87837e694..94a2016aa5 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -33,6 +33,7 @@ detection_library(rpn_target_assign_op SRCS rpn_target_assign_op.cc) detection_library(generate_proposal_labels_op SRCS generate_proposal_labels_op.cc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) +detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op.cu) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) if(WITH_GPU) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc new file mode 100644 index 0000000000..4c2c5d1e6f --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -0,0 +1,144 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/detection/yolo_box_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class YoloBoxOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Boxes"), + "Output(Boxes) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Scores"), + "Output(Scores) of YoloBoxOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); + auto anchors = ctx->Attrs().Get>("anchors"); + int anchor_num = anchors.size() / 2; + auto class_num = ctx->Attrs().Get("class_num"); + auto conf_thresh = ctx->Attrs().Get("conf_thresh"); + + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ( + dim_x[1], anchor_num * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater then 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + PADDLE_ENFORCE_GT(class_num, 0, + "Attr(class_num) should be an integer greater then 0."); + + int box_num = dim_x[2] * dim_x[3] * anchor_num; + std::vector dim_boxes({dim_x[0], box_num, 4}); + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_boxes)); + + std::vector dim_scores({dim_x[0], box_num, class_num}); + ctx->SetOutputDim("Scores", framework::make_ddim(dim_scores)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace()); + } +}; + +class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of YoloBox operator, " + "This is a 4-D tensor with shape of [N, C, H, W]." + "H and W should be same, and the second dimention(C) stores" + "box locations, confidence score and classification one-hot" + "keys of each anchor box. 
Generally, X should be the output" + "of YOLOv3 network."); + AddOutput("Boxes", + "The output tensor of detection boxes of YoloBox operator, " + "This is a 3-D tensor with shape of [N, M, 4], N is the" + "batch num, M is output box number, and the 3rd dimention" + "stores [xmin, ymin, xmax, ymax] coordinates of boxes."); + AddOutput("Scores", + "The output tensor ofdetection boxes scores of YoloBox" + "operator, This is a 3-D tensor with shape of [N, M, C]," + "N is the batch num, M is output box number, C is the" + "class number."); + + AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair.") + .SetDefault(std::vector{}); + AddAttr("downsample_ratio", + "The downsample ratio from network input to YoloBox operator " + "input, so 32, 16, 8 should be set for the first, second, " + "and thrid YoloBox operators.") + .SetDefault(32); + AddAttr("conf_thresh", + "The confidence scores threshold of detection boxes." + "boxes with confidence scores under threshold should" + "be ignored.") + .SetDefault(0.01); + AddComment(R"DOC( + This operator generate YOLO detection boxes fron output of YOLOv3 network. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, specify the grid size, each grid point predict given + number boxes, this given number is specified by anchors, it should be + half anchors length, which following will be represented as S. In the + second dimention(the channel dimention), C should be S * (class_num + 5), + class_num is the box categoriy number of source dataset(such as coco), + so in the second dimention, stores 4 box location coordinates x, y, w, h + and confidence score of the box and class one-hot key of each anchor box. + + While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions + correspnd to: + + $$ + b_x = \sigma(t_x) + c_x + b_y = \sigma(t_y) + c_y + b_w = p_w e^{t_w} + b_h = p_h e^{t_h} + $$ + + While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ + is specified by anchors. + + The logistic scores of the 5rd channel of each anchor prediction boxes + represent the confidence score of each prediction scores, and the logistic + scores of the last class_num channels of each anchor prediction boxes + represent the classifcation scores. Boxes with confidence scores less then + conf_thresh should be ignored, and boxes final scores if the products result + of confidence scores and classification scores. + + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolo_box, ops::YoloBoxOp, ops::YoloBoxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(yolo_box, ops::YoloBoxKernel, + ops::YoloBoxKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu new file mode 100644 index 0000000000..9cc94794f2 --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -0,0 +1,71 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/yolo_box_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +static __global__ void GenDensityPriorBox( + const int height, const int width, const int im_height, const int im_width, + const T offset, const T step_width, const T step_height, + const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin, + const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) { + int gidx = blockIdx.x * blockDim.x + threadIdx.x; + int gidy = blockIdx.y * blockDim.y + threadIdx.y; + int step_x = blockDim.x * gridDim.x; + int step_y = blockDim.y * gridDim.y; +} + +template +class YoloBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* boxes = ctx.Output("Boxes"); + auto* scores = ctx.Output("Scores"); + + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float conf_thresh = ctx.Attr("conf_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + const T* input_data = input->data(); + T* boxes_data = boxes->mutable_data({n}, ctx.GetPlace()); + memset(loss_data, 0, boxes->numel() * sizeof(T)); + T* scores_data = scores->mutable_data({n}, ctx.GetPlace()); + memset(scores_data, 0, scores->numel() * sizeof(T)); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(density_prior_box, + ops::DensityPriorBoxOpCUDAKernel, + ops::DensityPriorBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h new file mode 100644 index 0000000000..7a9ebf46d5 --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -0,0 +1,127 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct Box { + T x, y, w, h; +}; + +template +static inline T sigmoid(T x) { + return 1.0 / (1.0 + std::exp(-x)); +} + +template +static inline Box GetYoloBox(const T* x, std::vector anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride) { + Box b; + b.x = (i + sigmoid(x[index])) * input_size / grid_size; + b.y = (j + sigmoid(x[index + stride])) * input_size / grid_size; + b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx]; + b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1]; + return b; +} + +static inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride, int entry) { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; +} + +template +static inline void CalcDetectionBox(T* boxes, Box pred, const int box_idx) { + boxes[box_idx] = pred.x - pred.w / 2; + boxes[box_idx + 1] = pred.y - pred.h / 2; + boxes[box_idx + 2] = pred.x + pred.w / 2; + boxes[box_idx + 3] = pred.y + pred.h / 2; +} + +template +static inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const T conf, + const int stride) { + for (int i = 0; i < class_num; i++) { + scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); + } +} + +template +class YoloBoxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* boxes = ctx.Output("Boxes"); + auto* scores = ctx.Output("Scores"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float conf_thresh = ctx.Attr("conf_thresh"); + int downsample_ratio = ctx.Attr("downsample_ratio"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int box_num = boxes->dims()[1]; + const int an_num = anchors.size() / 2; + int input_size = downsample_ratio * h; + + const int stride = h * w; + const int an_stride = (class_num + 5) * stride; + + const T* input_data = input->data(); + T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); + memset(boxes_data, 0, boxes->numel() * sizeof(T)); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); + memset(scores_data, 0, scores->numel() * sizeof(T)); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + int obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 4); + T conf = sigmoid(input_data[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); + Box pred = GetYoloBox(input_data, anchors, l, k, j, h, + input_size, box_idx, stride); + box_idx = (i * box_num + j * stride + k * w + l) * 4; + CalcDetectionBox(boxes_data, pred, box_idx); + + int label_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); + int score_idx = (i * box_num + j * stride + k * w + l) * class_num; + CalcLabelScore(scores_data, input_data, label_idx, score_idx, + class_num, conf, stride); + } + } + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 
cbedd70f85..29020f8246 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -49,6 +49,7 @@ __all__ = [ 'box_coder', 'polygon_box_transform', 'yolov3_loss', + 'yolo_box', 'box_clip', 'multiclass_nms', 'distribute_fpn_proposals', @@ -609,6 +610,71 @@ def yolov3_loss(x, return loss +@templatedoc(op_type="yolo_box") +def yolo_box(x, anchors, class_num, conf_thresh, downsample_ratio, name=None): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + anchors (list|tuple): ${anchors_comment} + class_num (int): ${class_num_comment} + conf_thresh (float): ${conf_thresh_comment} + downsample_ratio (int): ${downsample_ratio_comment} + name (string): the name of yolov3 loss + + Returns: + Variable: A 1-D tensor with shape [1], the value of yolov3 loss + + Raises: + TypeError: Input x of yolov_box must be Variable + TypeError: Attr anchors of yolo box must be list or tuple + TypeError: Attr class_num of yolo box must be an integer + TypeError: Attr conf_thresh of yolo box must be a float number + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') + anchors = [10, 13, 16, 30, 33, 23] + loss = fluid.layers.yolov3_loss(x=x, class_num=80, anchors=anchors, + conf_thresh=0.01, downsample_ratio=32) + """ + helper = LayerHelper('yolo_box', **locals()) + + if not isinstance(x, Variable): + raise TypeError("Input x of yolov3_loss must be Variable") + if not isinstance(anchors, list) and not isinstance(anchors, tuple): + raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple): + raise TypeError("Attr anchor_mask of yolov3_loss must be list or tuple") + if not isinstance(class_num, int): + raise TypeError("Attr class_num of yolov3_loss must be an integer") + if not isinstance(conf_thresh, float): + raise TypeError( + "Attr ignore_thresh of yolov3_loss must be a float number") + + boxes = helper.create_variable_for_type_inference(dtype=x.dtype) + scores = helper.create_variable_for_type_inference(dtype=x.dtype) + + attrs = { + "anchors": anchors, + "class_num": class_num, + "conf_thresh": ignore_thresh, + "downsample_ratio": downsample_ratio, + } + + helper.append_op( + type='yolo_box', + inputs={"X": x, }, + outputs={ + 'Boxes': boxes, + 'Scores': scores, + }, + attrs=attrs) + return boxes, scores + + @templatedoc() def detection_map(detect_res, label, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 6218db7345..9592bbe2ec 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -478,9 +478,16 @@ class TestYoloDetection(unittest.TestCase): gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], [0, 1], 10, 0.7, 32) - self.assertIsNotNone(loss) + def test_yolo_box(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') + boxes, scores = layers.yolo_box(x, [10, 13, 30, 13], 10, 0.01, 32) + self.assertIsNotNone(boxes) + self.assertIsNotNone(scores) + class TestBoxClip(unittest.TestCase): def test_box_clip(self): diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py new file mode 100644 index 0000000000..bed0be9a50 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -0,0 
+1,105 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import unittest +import numpy as np +from op_test import OpTest + +from paddle.fluid import core + + +def sigmoid(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def YoloBox(x, attrs): + n, c, h, w = x.shape + anchors = attrs['anchors'] + an_num = int(len(anchors) // 2) + class_num = attrs['class_num'] + conf_thresh = attrs['conf_thresh'] + downsample = attrs['downsample'] + input_size = downsample * h + + x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) + + pred_box = x[:, :, :, :, :4].copy() + grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1)) + grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w)) + pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w + pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h + + anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] + anchors_s = np.array( + [(an_w / input_size, an_h / input_size) for an_w, an_h in anchors]) + anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1)) + anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1)) + pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w + pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h + + pred_conf = sigmoid(x[:, :, :, :, 4:5]) + pred_conf[pred_conf < conf_thresh] = 0. 
+ pred_score = sigmoid(x[:, :, :, :, 5:]) * pred_conf + pred_box = pred_box * (pred_conf > 0.).astype('float32') + + pred_box = pred_box.reshape((n, -1, 4)) + pred_box[:, :, : + 2], pred_box[:, :, 2: + 4] = pred_box[:, :, : + 2] - pred_box[:, :, 2: + 4] / 2., pred_box[:, :, : + 2] + pred_box[:, :, + 2: + 4] / 2.0 + pred_box = pred_box * input_size + + return pred_box, pred_score.reshape((n, -1, class_num)) + + +class TestYoloBoxOp(OpTest): + def setUp(self): + self.initTestCase() + self.op_type = 'yolo_box' + x = np.random.random(self.x_shape).astype('float32') + + self.attrs = { + "anchors": self.anchors, + "class_num": self.class_num, + "conf_thresh": self.conf_thresh, + "downsample": self.downsample, + } + + self.inputs = {'X': x, } + boxes, scores = YoloBox(x, self.attrs) + self.outputs = { + "Boxes": boxes, + "Scores": scores, + } + + def test_check_output(self): + self.check_output() + + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int(len(self.anchors) // 2) + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.x_shape = (3, an_num * (5 + self.class_num), 5, 5) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 020c113923..569fe63d05 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -75,8 +75,8 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): mask_num = len(anchor_mask) class_num = attrs["class_num"] ignore_thresh = attrs['ignore_thresh'] - downsample = attrs['downsample'] - input_size = downsample * h + downsample_ratio = attrs['downsample_ratio'] + input_size = downsample_ratio * h x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) loss = np.zeros((n)).astype('float32') @@ -86,10 +86,6 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h - x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:], - np.ones_like(x[:, :, :, :, 5:]) * 1.0 / - class_num) - mask_anchors = [] for m in anchor_mask: mask_anchors.append((anchors[2 * m], anchors[2 * m + 1])) @@ -176,7 +172,7 @@ class TestYolov3LossOp(OpTest): "anchor_mask": self.anchor_mask, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, - "downsample": self.downsample, + "downsample_ratio": self.downsample_ratio, } self.inputs = { @@ -208,7 +204,7 @@ class TestYolov3LossOp(OpTest): self.anchor_mask = [1, 2] self.class_num = 5 self.ignore_thresh = 0.5 - self.downsample = 32 + self.downsample_ratio = 32 self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) self.gtbox_shape = (3, 5, 4) From 452373decbf9f196d9c3f52fd21214e439a68ece Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 19 Feb 2019 10:31:28 +0800 Subject: [PATCH 06/73] resize box in input image scale. 
test=develop --- .../fluid/operators/detection/yolo_box_op.cc | 14 +++++++++ .../fluid/operators/detection/yolo_box_op.h | 23 +++++++++----- python/paddle/fluid/layers/detection.py | 27 +++++++++++----- python/paddle/fluid/tests/test_detection.py | 4 ++- .../fluid/tests/unittests/test_yolo_box_op.py | 31 +++++++++++-------- 5 files changed, 70 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 4c2c5d1e6f..f78a980674 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -23,12 +23,15 @@ class YoloBoxOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of YoloBoxOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ImgSize"), + "Input(ImgSize) of YoloBoxOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Boxes"), "Output(Boxes) of YoloBoxOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Scores"), "Output(Scores) of YoloBoxOp should not be null."); auto dim_x = ctx->GetInputDim("X"); + auto dim_imgsize = ctx->GetInputDim("ImgSize"); auto anchors = ctx->Attrs().Get>("anchors"); int anchor_num = anchors.size() / 2; auto class_num = ctx->Attrs().Get("class_num"); @@ -39,6 +42,12 @@ class YoloBoxOp : public framework::OperatorWithKernel { dim_x[1], anchor_num * (5 + class_num), "Input(X) dim[1] should be equal to (anchor_mask_number * (5 " "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_imgsize.size(), 2, + "Input(ImgSize) should be a 2-D tensor."); + PADDLE_ENFORCE_EQ( + dim_imgsize[0], dim_x[0], + "Input(ImgSize) dim[0] and Input(X) dim[0] should be same."); + PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2."); PADDLE_ENFORCE_GT(anchors.size(), 0, "Attr(anchors) length should be greater then 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, @@ -72,6 +81,11 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { "box locations, confidence score and classification one-hot" "keys of each anchor box. Generally, X should be the output" "of YOLOv3 network."); + AddInput("ImgSize", + "The image size tensor of YoloBox operator, " + "This is a 2-D tensor with shape of [N, 2]. 
This tensor holds" + "height and width of each input image using for resize output" + "box in input image scale."); AddOutput("Boxes", "The output tensor of detection boxes of YoloBox operator, " "This is a 3-D tensor with shape of [N, M, 4], N is the" diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 7a9ebf46d5..0ea8c17861 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -32,12 +32,15 @@ static inline T sigmoid(T x) { template static inline Box GetYoloBox(const T* x, std::vector anchors, int i, int j, int an_idx, int grid_size, - int input_size, int index, int stride) { + int input_size, int index, int stride, + int img_height, int img_width) { Box b; - b.x = (i + sigmoid(x[index])) * input_size / grid_size; - b.y = (j + sigmoid(x[index + stride])) * input_size / grid_size; - b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx]; - b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1]; + b.x = (i + sigmoid(x[index])) * img_width / grid_size; + b.y = (j + sigmoid(x[index + stride])) * img_height / grid_size; + b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / + input_size; + b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * img_height / + input_size; return b; } @@ -69,6 +72,7 @@ class YoloBoxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); + auto* imgsize = ctx.Input("ImgSize"); auto* boxes = ctx.Output("Boxes"); auto* scores = ctx.Output("Scores"); auto anchors = ctx.Attr>("anchors"); @@ -87,6 +91,7 @@ class YoloBoxKernel : public framework::OpKernel { const int an_stride = (class_num + 5) * stride; const T* input_data = input->data(); + const int* imgsize_data = imgsize->data(); T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); memset(boxes_data, 0, boxes->numel() * sizeof(T)); T* scores_data = @@ -94,6 +99,9 @@ class YoloBoxKernel : public framework::OpKernel { memset(scores_data, 0, scores->numel() * sizeof(T)); for (int i = 0; i < n; i++) { + int img_height = imgsize_data[2 * i]; + int img_width = imgsize_data[2 * i + 1]; + for (int j = 0; j < an_num; j++) { for (int k = 0; k < h; k++) { for (int l = 0; l < w; l++) { @@ -106,8 +114,9 @@ class YoloBoxKernel : public framework::OpKernel { int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); - Box pred = GetYoloBox(input_data, anchors, l, k, j, h, - input_size, box_idx, stride); + Box pred = + GetYoloBox(input_data, anchors, l, k, j, h, input_size, box_idx, + stride, img_height, img_width); box_idx = (i * box_num + j * stride + k * w + l) * 4; CalcDetectionBox(boxes_data, pred, box_idx); diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 29020f8246..b64e19320b 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -611,12 +611,19 @@ def yolov3_loss(x, @templatedoc(op_type="yolo_box") -def yolo_box(x, anchors, class_num, conf_thresh, downsample_ratio, name=None): +def yolo_box(x, + img_size, + anchors, + class_num, + conf_thresh, + downsample_ratio, + name=None): """ ${comment} Args: x (Variable): ${x_comment} + img_size (Variable): ${img_size_comment} anchors (list|tuple): ${anchors_comment} class_num (int): ${class_num_comment} conf_thresh (float): ${conf_thresh_comment} @@ -643,16 +650,17 @@ def yolo_box(x, anchors, 
class_num, conf_thresh, downsample_ratio, name=None): helper = LayerHelper('yolo_box', **locals()) if not isinstance(x, Variable): - raise TypeError("Input x of yolov3_loss must be Variable") + raise TypeError("Input x of yolo_box must be Variable") + if not isinstance(img_size, Variable): + raise TypeError("Input img_size of yolo_box must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): - raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + raise TypeError("Attr anchors of yolo_box must be list or tuple") if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple): - raise TypeError("Attr anchor_mask of yolov3_loss must be list or tuple") + raise TypeError("Attr anchor_mask of yolo_box must be list or tuple") if not isinstance(class_num, int): - raise TypeError("Attr class_num of yolov3_loss must be an integer") + raise TypeError("Attr class_num of yolo_box must be an integer") if not isinstance(conf_thresh, float): - raise TypeError( - "Attr ignore_thresh of yolov3_loss must be a float number") + raise TypeError("Attr ignore_thresh of yolo_box must be a float number") boxes = helper.create_variable_for_type_inference(dtype=x.dtype) scores = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -666,7 +674,10 @@ def yolo_box(x, anchors, class_num, conf_thresh, downsample_ratio, name=None): helper.append_op( type='yolo_box', - inputs={"X": x, }, + inputs={ + "X": x, + "ImgSize": img_size, + }, outputs={ 'Boxes': boxes, 'Scores': scores, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 9592bbe2ec..b8743debe2 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -484,7 +484,9 @@ class TestYoloDetection(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') - boxes, scores = layers.yolo_box(x, [10, 13, 30, 13], 10, 0.01, 32) + img_size = layers.data(name='x', shape=[2], dtype='int32') + boxes, scores = layers.yolo_box(x, img_size, [10, 13, 30, 13], 10, + 0.01, 32) self.assertIsNotNone(boxes) self.assertIsNotNone(scores) diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index bed0be9a50..48465c8f68 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -25,7 +25,7 @@ def sigmoid(x): return 1.0 / (1.0 + np.exp(-1.0 * x)) -def YoloBox(x, attrs): +def YoloBox(x, img_size, attrs): n, c, h, w = x.shape anchors = attrs['anchors'] an_num = int(len(anchors) // 2) @@ -56,15 +56,14 @@ def YoloBox(x, attrs): pred_box = pred_box * (pred_conf > 0.).astype('float32') pred_box = pred_box.reshape((n, -1, 4)) - pred_box[:, :, : - 2], pred_box[:, :, 2: - 4] = pred_box[:, :, : - 2] - pred_box[:, :, 2: - 4] / 2., pred_box[:, :, : - 2] + pred_box[:, :, - 2: - 4] / 2.0 - pred_box = pred_box * input_size + pred_box[:, :, :2], pred_box[:, :, 2:4] = \ + pred_box[:, :, :2] - pred_box[:, :, 2:4] / 2., \ + pred_box[:, :, :2] + pred_box[:, :, 2:4] / 2.0 + # pred_box = pred_box * input_size + pred_box[:, :, 0] = pred_box[:, :, 0] * img_size[:, 1][:, np.newaxis] + pred_box[:, :, 1] = pred_box[:, :, 1] * img_size[:, 0][:, np.newaxis] + pred_box[:, :, 2] = pred_box[:, :, 2] * img_size[:, 1][:, np.newaxis] + pred_box[:, :, 3] = pred_box[:, :, 3] * img_size[:, 0][:, np.newaxis] return pred_box, 
pred_score.reshape((n, -1, class_num)) @@ -74,6 +73,7 @@ class TestYoloBoxOp(OpTest): self.initTestCase() self.op_type = 'yolo_box' x = np.random.random(self.x_shape).astype('float32') + img_size = np.random.randint(10, 20, self.imgsize_shape).astype('int32') self.attrs = { "anchors": self.anchors, @@ -82,8 +82,11 @@ class TestYoloBoxOp(OpTest): "downsample": self.downsample, } - self.inputs = {'X': x, } - boxes, scores = YoloBox(x, self.attrs) + self.inputs = { + 'X': x, + 'ImgSize': img_size, + } + boxes, scores = YoloBox(x, img_size, self.attrs) self.outputs = { "Boxes": boxes, "Scores": scores, @@ -95,10 +98,12 @@ class TestYoloBoxOp(OpTest): def initTestCase(self): self.anchors = [10, 13, 16, 30, 33, 23] an_num = int(len(self.anchors) // 2) + self.batch_size = 3 self.class_num = 2 self.conf_thresh = 0.5 self.downsample = 32 - self.x_shape = (3, an_num * (5 + self.class_num), 5, 5) + self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 5, 5) + self.imgsize_shape = (self.batch_size, 2) if __name__ == "__main__": From 04b8b9e96cee6dfbc76c0df2c013195da86079d0 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 19 Feb 2019 13:59:00 +0800 Subject: [PATCH 07/73] add yolo_box_op CUDA kernel --- .../fluid/operators/detection/yolo_box_op.cc | 1 - .../fluid/operators/detection/yolo_box_op.cu | 62 ++++++++++++++----- .../fluid/operators/detection/yolo_box_op.h | 28 +++++---- 3 files changed, 62 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index f78a980674..c018a6498a 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -35,7 +35,6 @@ class YoloBoxOp : public framework::OperatorWithKernel { auto anchors = ctx->Attrs().Get>("anchors"); int anchor_num = anchors.size() / 2; auto class_num = ctx->Attrs().Get("class_num"); - auto conf_thresh = ctx->Attrs().Get("conf_thresh"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); PADDLE_ENFORCE_EQ( diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 9cc94794f2..38b514fe90 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -20,15 +20,44 @@ namespace operators { using Tensor = framework::Tensor; template -static __global__ void GenDensityPriorBox( - const int height, const int width, const int im_height, const int im_width, - const T offset, const T step_width, const T step_height, - const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin, - const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) { - int gidx = blockIdx.x * blockDim.x + threadIdx.x; - int gidy = blockIdx.y * blockDim.y + threadIdx.y; - int step_x = blockDim.x * gridDim.x; - int step_y = blockDim.y * gridDim.y; +__global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, + T* scores, const float conf_thresh, + std::vector anchors, const int h, const in w, + const int an_num, const int class_num, + const int box_num, const int input_size) { + int tid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (; tid < box_num; tid += stride) { + int grid_num = h * w; + int i = tid / box_num; + int j = (tid % box_num) / grid_num; + int k = (tid % grid_num) / w; + int l = tid % w; + + int an_stride = an_num * grid_num; + int img_height = imgsize[2 * i]; + int img_width = imgsize[2 * i + 1]; + + int 
obj_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + T conf = sigmoid(input[obj_idx]); + if (conf < conf_thresh) { + continue; + } + + int box_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + Box pred = GetYoloBox(input, anchors, l, k, j, h, input_size, box_idx, + grid_num, img_height, img_width); + box_idx = (i * box_num + j * grid_num + k * w + l) * 4; + CalcDetectionBox(boxes, pred, box_idx); + + int label_idx = + GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int score_idx = (i * box_num + j * stride + k * w + l) * class_num; + CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, + grid_num); + } } template @@ -36,6 +65,7 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("Input"); + auto* img_size = ctx.Input("ImgSize"); auto* boxes = ctx.Output("Boxes"); auto* scores = ctx.Output("Scores"); @@ -51,14 +81,16 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { const int an_num = anchors.size() / 2; int input_size = downsample_ratio * h; - const int stride = h * w; - const int an_stride = (class_num + 5) * stride; - const T* input_data = input->data(); - T* boxes_data = boxes->mutable_data({n}, ctx.GetPlace()); - memset(loss_data, 0, boxes->numel() * sizeof(T)); - T* scores_data = scores->mutable_data({n}, ctx.GetPlace()); + const int* imgsize_data = imgsize->data(); + T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); + memset(boxes_data, 0, boxes->numel() * sizeof(T)); + T* scores_data = + scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); memset(scores_data, 0, scores->numel() * sizeof(T)); + + int grid_dim = (n * box_num + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 
8 : grid_dim; } }; // namespace operators diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 0ea8c17861..90933e123e 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -30,10 +30,10 @@ static inline T sigmoid(T x) { } template -static inline Box GetYoloBox(const T* x, std::vector anchors, int i, - int j, int an_idx, int grid_size, - int input_size, int index, int stride, - int img_height, int img_width) { +HOSTDEVICE inline Box GetYoloBox(const T* x, std::vector anchors, int i, + int j, int an_idx, int grid_size, + int input_size, int index, int stride, + int img_height, int img_width) { Box b; b.x = (i + sigmoid(x[index])) * img_width / grid_size; b.y = (j + sigmoid(x[index + stride])) * img_height / grid_size; @@ -44,13 +44,15 @@ static inline Box GetYoloBox(const T* x, std::vector anchors, int i, return b; } -static inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, - int an_stride, int stride, int entry) { +HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, + int an_num, int an_stride, int stride, + int entry) { return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; } template -static inline void CalcDetectionBox(T* boxes, Box pred, const int box_idx) { +HOSTDEVICE inline void CalcDetectionBox(T* boxes, Box pred, + const int box_idx) { boxes[box_idx] = pred.x - pred.w / 2; boxes[box_idx + 1] = pred.y - pred.h / 2; boxes[box_idx + 2] = pred.x + pred.w / 2; @@ -58,10 +60,10 @@ static inline void CalcDetectionBox(T* boxes, Box pred, const int box_idx) { } template -static inline void CalcLabelScore(T* scores, const T* input, - const int label_idx, const int score_idx, - const int class_num, const T conf, - const int stride) { +HOSTDEVICE inline void CalcLabelScore(T* scores, const T* input, + const int label_idx, const int score_idx, + const int class_num, const T conf, + const int stride) { for (int i = 0; i < class_num; i++) { scores[score_idx + i] = conf * sigmoid(input[label_idx + i * stride]); } @@ -115,8 +117,8 @@ class YoloBoxKernel : public framework::OpKernel { int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); Box pred = - GetYoloBox(input_data, anchors, l, k, j, h, input_size, box_idx, - stride, img_height, img_width); + GetYoloBox(input_data, anchors, l, k, j, h, input_size, + box_idx, stride, img_height, img_width); box_idx = (i * box_num + j * stride + k * w + l) * 4; CalcDetectionBox(boxes_data, pred, box_idx); From cb2dca53c173a725604a310855fb238e898d5079 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 19 Feb 2019 11:52:46 +0000 Subject: [PATCH 08/73] fix cuda kernel error --- .../fluid/operators/detection/yolo_box_op.cu | 38 ++++++++++++------ .../fluid/operators/detection/yolo_box_op.h | 40 +++++++++---------- .../fluid/tests/unittests/test_yolo_box_op.py | 7 ++-- 3 files changed, 49 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 38b514fe90..bc563107f8 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/detection/yolo_box_op.h" +#include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { @@ -22,11 +23,12 @@ using Tensor = framework::Tensor; template __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, T* scores, const float conf_thresh, - std::vector anchors, const int h, const in w, + const int* anchors, const int h, const int w, const int an_num, const int class_num, - const int box_num, const int input_size) { + const int box_num, int input_size) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; + T box[4]; for (; tid < box_num; tid += stride) { int grid_num = h * w; int i = tid / box_num; @@ -47,10 +49,10 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); - Box pred = GetYoloBox(input, anchors, l, k, j, h, input_size, box_idx, + GetYoloBox(box, input, anchors, l, k, j, h, input_size, box_idx, grid_num, img_height, img_width); box_idx = (i * box_num + j * grid_num + k * w + l) * 4; - CalcDetectionBox(boxes, pred, box_idx); + CalcDetectionBox(boxes, box, box_idx); int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); @@ -64,7 +66,7 @@ template class YoloBoxOpCUDAKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("Input"); + auto* input = ctx.Input("X"); auto* img_size = ctx.Input("ImgSize"); auto* boxes = ctx.Output("Boxes"); auto* scores = ctx.Output("Scores"); @@ -81,23 +83,35 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { const int an_num = anchors.size() / 2; int input_size = downsample_ratio * h; + Tensor anchors_t, cpu_anchors_t; + auto cpu_anchors_data = cpu_anchors_t.mutable_data({an_num*2}, platform::CPUPlace()); + std::copy(anchors.begin(), anchors.end(), cpu_anchors_data); + TensorCopySync(cpu_anchors_t, ctx.GetPlace(), &anchors_t); + auto anchors_data = anchors_t.data(); + const T* input_data = input->data(); - const int* imgsize_data = imgsize->data(); + const int* imgsize_data = img_size->data(); T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); - memset(boxes_data, 0, boxes->numel() * sizeof(T)); T* scores_data = scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - memset(scores_data, 0, scores->numel() * sizeof(T)); + math::SetConstant set_zero; + auto& dev_ctx = ctx.template device_context(); + set_zero(dev_ctx, boxes, static_cast(0)); + set_zero(dev_ctx, scores, static_cast(0)); int grid_dim = (n * box_num + 512 - 1) / 512; grid_dim = grid_dim > 8 ? 
8 : grid_dim; + + KeYoloBoxFw<<>>( + input_data, imgsize_data, boxes_data, scores_data, conf_thresh, + anchors_data, h, w, an_num, class_num, box_num, input_size); } -}; // namespace operators +}; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(density_prior_box, - ops::DensityPriorBoxOpCUDAKernel, - ops::DensityPriorBoxOpCUDAKernel); +REGISTER_OP_CUDA_KERNEL(yolo_box, + ops::YoloBoxOpCUDAKernel, + ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 90933e123e..6188c5f32b 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -13,35 +13,30 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -struct Box { - T x, y, w, h; -}; template -static inline T sigmoid(T x) { +HOSTDEVICE inline T sigmoid(T x) { return 1.0 / (1.0 + std::exp(-x)); } template -HOSTDEVICE inline Box GetYoloBox(const T* x, std::vector anchors, int i, +HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, int j, int an_idx, int grid_size, int input_size, int index, int stride, int img_height, int img_width) { - Box b; - b.x = (i + sigmoid(x[index])) * img_width / grid_size; - b.y = (j + sigmoid(x[index + stride])) * img_height / grid_size; - b.w = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / + box[0] = (i + sigmoid(x[index])) * img_width / grid_size; + box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; + box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / input_size; - b.h = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * img_height / + box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * img_height / input_size; - return b; } HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, @@ -51,12 +46,12 @@ HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, } template -HOSTDEVICE inline void CalcDetectionBox(T* boxes, Box pred, +HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx) { - boxes[box_idx] = pred.x - pred.w / 2; - boxes[box_idx + 1] = pred.y - pred.h / 2; - boxes[box_idx + 2] = pred.x + pred.w / 2; - boxes[box_idx + 3] = pred.y + pred.h / 2; + boxes[box_idx] = box[0] - box[2] / 2; + boxes[box_idx + 1] = box[1] - box[3] / 2; + boxes[box_idx + 2] = box[0] + box[2] / 2; + boxes[box_idx + 3] = box[1] + box[3] / 2; } template @@ -92,6 +87,9 @@ class YoloBoxKernel : public framework::OpKernel { const int stride = h * w; const int an_stride = (class_num + 5) * stride; + int anchors_[anchors.size()]; + std::copy(anchors.begin(), anchors.end(), anchors_); + const T* input_data = input->data(); const int* imgsize_data = imgsize->data(); T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); @@ -100,6 +98,7 @@ class YoloBoxKernel : public framework::OpKernel { scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); memset(scores_data, 0, scores->numel() * sizeof(T)); + T box[4]; for (int i = 0; i < n; i++) { int img_height = imgsize_data[2 * i]; int img_width = imgsize_data[2 * i + 1]; @@ -116,11 +115,10 @@ class YoloBoxKernel : public framework::OpKernel { int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); - Box pred = - GetYoloBox(input_data, anchors, l, 
k, j, h, input_size, - box_idx, stride, img_height, img_width); + GetYoloBox(box, input_data, anchors_, l, k, j, h, input_size, + box_idx, stride, img_height, img_width); box_idx = (i * box_num + j * stride + k * w + l) * 4; - CalcDetectionBox(boxes_data, pred, box_idx); + CalcDetectionBox(boxes_data, box, box_idx); int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index 48465c8f68..e28c05e3e6 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -93,16 +93,17 @@ class TestYoloBoxOp(OpTest): } def test_check_output(self): - self.check_output() + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3) def initTestCase(self): self.anchors = [10, 13, 16, 30, 33, 23] an_num = int(len(self.anchors) // 2) - self.batch_size = 3 + self.batch_size = 1 self.class_num = 2 self.conf_thresh = 0.5 self.downsample = 32 - self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 5, 5) + self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 2, 2) self.imgsize_shape = (self.batch_size, 2) From 7808f4c097cdac0eabd694f128dee4c93cd95788 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 19 Feb 2019 13:18:33 +0000 Subject: [PATCH 09/73] fix unittest for yolo_box_op. test=develop --- paddle/fluid/operators/detection/yolo_box_op.cu | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index bc563107f8..fbe934c7ea 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -36,7 +36,7 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, int k = (tid % grid_num) / w; int l = tid % w; - int an_stride = an_num * grid_num; + int an_stride = (5 + class_num) * grid_num; int img_height = imgsize[2 * i]; int img_width = imgsize[2 * i + 1]; @@ -56,7 +56,7 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); - int score_idx = (i * box_num + j * stride + k * w + l) * class_num; + int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, grid_num); } @@ -99,12 +99,12 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { set_zero(dev_ctx, boxes, static_cast(0)); set_zero(dev_ctx, scores, static_cast(0)); - int grid_dim = (n * box_num + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 8 : grid_dim; + int grid_dim = (n * box_num + 4 - 1) / 4; + grid_dim = grid_dim > 2 ? 2 : grid_dim; - KeYoloBoxFw<<>>( + KeYoloBoxFw<<>>( input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, h, w, an_num, class_num, box_num, input_size); + anchors_data, h, w, an_num, class_num, n * box_num, input_size); } }; From c9d4676bee930d8678c5c590e75c31e7a40a77be Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 19 Feb 2019 13:26:15 +0000 Subject: [PATCH 10/73] fix multi batch idx error. 
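
The multi-batch fix in this patch lets each CUDA thread stride over all n * box_num predictions rather than a single image's boxes, so the batch index, anchor index and grid cell are all recovered from the flat thread id. For orientation, a minimal Python sketch of the flattened entry indexing these kernels rely on is given below; the helper name entry_index is invented here, and the [N, an_num, 5 + class_num, H, W] layout is inferred from an_stride = (5 + class_num) * stride in the surrounding diffs rather than quoted from the source.

def entry_index(batch, an_idx, hw_idx, an_num, an_stride, stride, entry):
    # Flat offset into an [N, an_num, 5 + class_num, H, W] tensor, where
    # stride = H * W and an_stride = (5 + class_num) * stride.
    return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx


# Example: objectness entry (entry 4) of anchor 1 at grid cell (row 2, col 3)
# of image 0, on a 13x13 grid with 3 anchors per cell and 80 classes.
h = w = 13
stride = h * w
an_stride = (5 + 80) * stride
offset = entry_index(0, 1, 2 * w + 3, 3, an_stride, stride, 4)
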
test=develop --- paddle/fluid/operators/detection/yolo_box_op.cu | 14 +++++++------- .../fluid/tests/unittests/test_yolo_box_op.py | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index fbe934c7ea..a4513bd2f4 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -22,14 +22,14 @@ using Tensor = framework::Tensor; template __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, - T* scores, const float conf_thresh, - const int* anchors, const int h, const int w, + T* scores, const float conf_thresh, const int* anchors, + const int n, const int h, const int w, const int an_num, const int class_num, const int box_num, int input_size) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; T box[4]; - for (; tid < box_num; tid += stride) { + for (; tid < n * box_num; tid += stride) { int grid_num = h * w; int i = tid / box_num; int j = (tid % box_num) / grid_num; @@ -99,12 +99,12 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { set_zero(dev_ctx, boxes, static_cast(0)); set_zero(dev_ctx, scores, static_cast(0)); - int grid_dim = (n * box_num + 4 - 1) / 4; - grid_dim = grid_dim > 2 ? 2 : grid_dim; + int grid_dim = (n * box_num + 512 - 1) / 512; + grid_dim = grid_dim > 8 ? 8 : grid_dim; - KeYoloBoxFw<<>>( + KeYoloBoxFw<<>>( input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, h, w, an_num, class_num, n * box_num, input_size); + anchors_data, n, h, w, an_num, class_num, box_num, input_size); } }; diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index e28c05e3e6..a1da4f64b6 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -99,11 +99,11 @@ class TestYoloBoxOp(OpTest): def initTestCase(self): self.anchors = [10, 13, 16, 30, 33, 23] an_num = int(len(self.anchors) // 2) - self.batch_size = 1 + self.batch_size = 32 self.class_num = 2 self.conf_thresh = 0.5 self.downsample = 32 - self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 2, 2) + self.x_shape = (self.batch_size, an_num * (5 + self.class_num), 13, 13) self.imgsize_shape = (self.batch_size, 2) From deec3ac1e95d7b3501c17d10bbeaa673475859a1 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Wed, 20 Feb 2019 07:16:00 +0000 Subject: [PATCH 11/73] fix infer error. 
--- python/paddle/fluid/layers/detection.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index b64e19320b..aca5f0f1d6 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -655,8 +655,6 @@ def yolo_box(x, raise TypeError("Input img_size of yolo_box must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): raise TypeError("Attr anchors of yolo_box must be list or tuple") - if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple): - raise TypeError("Attr anchor_mask of yolo_box must be list or tuple") if not isinstance(class_num, int): raise TypeError("Attr class_num of yolo_box must be an integer") if not isinstance(conf_thresh, float): @@ -668,7 +666,7 @@ def yolo_box(x, attrs = { "anchors": anchors, "class_num": class_num, - "conf_thresh": ignore_thresh, + "conf_thresh": conf_thresh, "downsample_ratio": downsample_ratio, } From fb863b48202f690563ce932edcb474a06bf02d19 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 25 Feb 2019 20:09:38 +0800 Subject: [PATCH 12/73] add API.spec for yolo_box. test=develop --- paddle/fluid/API.spec | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index df3497de20..14d19f79aa 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -328,6 +328,7 @@ paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=Non paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) +paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) From 72a18bb16028a1c54a038eed15887b384d25f42a Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 26 Feb 2019 02:26:23 +0000 Subject: [PATCH 13/73] add bbox range limit. 
test=develop --- paddle/fluid/operators/detection/yolo_box_op.cu | 2 +- paddle/fluid/operators/detection/yolo_box_op.h | 11 ++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index a4513bd2f4..c9b5a19f82 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -52,7 +52,7 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, GetYoloBox(box, input, anchors, l, k, j, h, input_size, box_idx, grid_num, img_height, img_width); box_idx = (i * box_num + j * grid_num + k * w + l) * 4; - CalcDetectionBox(boxes, box, box_idx); + CalcDetectionBox(boxes, box, box_idx, img_height, img_width); int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 6188c5f32b..cf028a6e06 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -46,12 +46,17 @@ HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, } template -HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, - const int box_idx) { +HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, + const int img_height, const int img_width) { boxes[box_idx] = box[0] - box[2] / 2; boxes[box_idx + 1] = box[1] - box[3] / 2; boxes[box_idx + 2] = box[0] + box[2] / 2; boxes[box_idx + 3] = box[1] + box[3] / 2; + + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); + boxes[box_idx + 1] = boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 ? boxes[box_idx + 2] : static_cast(img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 ? boxes[box_idx + 3] : static_cast(img_height - 1); } template @@ -118,7 +123,7 @@ class YoloBoxKernel : public framework::OpKernel { GetYoloBox(box, input_data, anchors_, l, k, j, h, input_size, box_idx, stride, img_height, img_width); box_idx = (i * box_num + j * stride + k * w + l) * 4; - CalcDetectionBox(boxes_data, box, box_idx); + CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width); int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); From ad897304f913e3f75854c94874e8b93f84b70499 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 26 Feb 2019 13:13:30 +0800 Subject: [PATCH 14/73] fix pre-commit. 
test=develop --- .../fluid/operators/detection/yolo_box_op.cu | 18 ++++---- .../fluid/operators/detection/yolo_box_op.h | 42 +++++++++++-------- .../fluid/tests/unittests/test_yolo_box_op.py | 12 ++++-- 3 files changed, 43 insertions(+), 29 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index c9b5a19f82..a0c60ae673 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -22,9 +22,9 @@ using Tensor = framework::Tensor; template __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, - T* scores, const float conf_thresh, const int* anchors, - const int n, const int h, const int w, - const int an_num, const int class_num, + T* scores, const float conf_thresh, + const int* anchors, const int n, const int h, + const int w, const int an_num, const int class_num, const int box_num, int input_size) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; @@ -50,7 +50,7 @@ __global__ void KeYoloBoxFw(const T* input, const int* imgsize, T* boxes, int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); GetYoloBox(box, input, anchors, l, k, j, h, input_size, box_idx, - grid_num, img_height, img_width); + grid_num, img_height, img_width); box_idx = (i * box_num + j * grid_num + k * w + l) * 4; CalcDetectionBox(boxes, box, box_idx, img_height, img_width); @@ -84,7 +84,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { int input_size = downsample_ratio * h; Tensor anchors_t, cpu_anchors_t; - auto cpu_anchors_data = cpu_anchors_t.mutable_data({an_num*2}, platform::CPUPlace()); + auto cpu_anchors_data = + cpu_anchors_t.mutable_data({an_num * 2}, platform::CPUPlace()); std::copy(anchors.begin(), anchors.end(), cpu_anchors_data); TensorCopySync(cpu_anchors_t, ctx.GetPlace(), &anchors_t); auto anchors_data = anchors_t.data(); @@ -103,8 +104,8 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { grid_dim = grid_dim > 8 ? 
8 : grid_dim; KeYoloBoxFw<<>>( - input_data, imgsize_data, boxes_data, scores_data, conf_thresh, - anchors_data, n, h, w, an_num, class_num, box_num, input_size); + input_data, imgsize_data, boxes_data, scores_data, conf_thresh, + anchors_data, n, h, w, an_num, class_num, box_num, input_size); } }; @@ -112,6 +113,5 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL(yolo_box, - ops::YoloBoxOpCUDAKernel, +REGISTER_OP_CUDA_KERNEL(yolo_box, ops::YoloBoxOpCUDAKernel, ops::YoloBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index cf028a6e06..546a5a66b4 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -20,7 +20,6 @@ namespace operators { using Tensor = framework::Tensor; - template HOSTDEVICE inline T sigmoid(T x) { return 1.0 / (1.0 + std::exp(-x)); @@ -28,15 +27,15 @@ HOSTDEVICE inline T sigmoid(T x) { template HOSTDEVICE inline void GetYoloBox(T* box, const T* x, const int* anchors, int i, - int j, int an_idx, int grid_size, - int input_size, int index, int stride, - int img_height, int img_width) { + int j, int an_idx, int grid_size, + int input_size, int index, int stride, + int img_height, int img_width) { box[0] = (i + sigmoid(x[index])) * img_width / grid_size; box[1] = (j + sigmoid(x[index + stride])) * img_height / grid_size; box[2] = std::exp(x[index + 2 * stride]) * anchors[2 * an_idx] * img_width / - input_size; - box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * img_height / - input_size; + input_size; + box[3] = std::exp(x[index + 3 * stride]) * anchors[2 * an_idx + 1] * + img_height / input_size; } HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, @@ -47,16 +46,22 @@ HOSTDEVICE inline int GetEntryIndex(int batch, int an_idx, int hw_idx, template HOSTDEVICE inline void CalcDetectionBox(T* boxes, T* box, const int box_idx, - const int img_height, const int img_width) { + const int img_height, + const int img_width) { boxes[box_idx] = box[0] - box[2] / 2; boxes[box_idx + 1] = box[1] - box[3] / 2; boxes[box_idx + 2] = box[0] + box[2] / 2; boxes[box_idx + 3] = box[1] + box[3] / 2; - + boxes[box_idx] = boxes[box_idx] > 0 ? boxes[box_idx] : static_cast(0); - boxes[box_idx + 1] = boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); - boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 ? boxes[box_idx + 2] : static_cast(img_width - 1); - boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 ? boxes[box_idx + 3] : static_cast(img_height - 1); + boxes[box_idx + 1] = + boxes[box_idx + 1] > 0 ? boxes[box_idx + 1] : static_cast(0); + boxes[box_idx + 2] = boxes[box_idx + 2] < img_width - 1 + ? boxes[box_idx + 2] + : static_cast(img_width - 1); + boxes[box_idx + 3] = boxes[box_idx + 3] < img_height - 1 + ? 
boxes[box_idx + 3] + : static_cast(img_height - 1); } template @@ -92,8 +97,10 @@ class YoloBoxKernel : public framework::OpKernel { const int stride = h * w; const int an_stride = (class_num + 5) * stride; - int anchors_[anchors.size()]; - std::copy(anchors.begin(), anchors.end(), anchors_); + Tensor anchors_; + auto anchors_data = + anchors_.mutable_data({an_num * 2}, ctx.GetPlace()); + std::copy(anchors.begin(), anchors.end(), anchors_data); const T* input_data = input->data(); const int* imgsize_data = imgsize->data(); @@ -120,10 +127,11 @@ class YoloBoxKernel : public framework::OpKernel { int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 0); - GetYoloBox(box, input_data, anchors_, l, k, j, h, input_size, - box_idx, stride, img_height, img_width); + GetYoloBox(box, input_data, anchors_data, l, k, j, h, input_size, + box_idx, stride, img_height, img_width); box_idx = (i * box_num + j * stride + k * w + l) * 4; - CalcDetectionBox(boxes_data, box, box_idx, img_height, img_width); + CalcDetectionBox(boxes_data, box, box_idx, img_height, + img_width); int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, stride, 5); diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index a1da4f64b6..d4a179794c 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -59,12 +59,19 @@ def YoloBox(x, img_size, attrs): pred_box[:, :, :2], pred_box[:, :, 2:4] = \ pred_box[:, :, :2] - pred_box[:, :, 2:4] / 2., \ pred_box[:, :, :2] + pred_box[:, :, 2:4] / 2.0 - # pred_box = pred_box * input_size pred_box[:, :, 0] = pred_box[:, :, 0] * img_size[:, 1][:, np.newaxis] pred_box[:, :, 1] = pred_box[:, :, 1] * img_size[:, 0][:, np.newaxis] pred_box[:, :, 2] = pred_box[:, :, 2] * img_size[:, 1][:, np.newaxis] pred_box[:, :, 3] = pred_box[:, :, 3] * img_size[:, 0][:, np.newaxis] + for i in range(len(pred_box)): + pred_box[i, :, 0] = np.clip(pred_box[i, :, 0], 0, np.inf) + pred_box[i, :, 1] = np.clip(pred_box[i, :, 1], 0, np.inf) + pred_box[i, :, 2] = np.clip(pred_box[i, :, 2], -np.inf, + img_size[i, 1] - 1) + pred_box[i, :, 3] = np.clip(pred_box[i, :, 3], -np.inf, + img_size[i, 0] - 1) + return pred_box, pred_score.reshape((n, -1, class_num)) @@ -93,8 +100,7 @@ class TestYoloBoxOp(OpTest): } def test_check_output(self): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-3) + self.check_output() def initTestCase(self): self.anchors = [10, 13, 16, 30, 33, 23] From b399ee2a23cbdae6a979bcab00cdb0af14551b38 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 8 Mar 2019 11:19:04 +0000 Subject: [PATCH 15/73] fix doc. test=develop --- .../fluid/operators/detection/yolo_box_op.cc | 36 +++++++++++-------- .../fluid/operators/detection/yolo_box_op.cu | 2 +- .../fluid/operators/detection/yolo_box_op.h | 2 +- 3 files changed, 23 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index c018a6498a..6cc9b241c6 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at @@ -48,11 +48,11 @@ class YoloBoxOp : public framework::OperatorWithKernel { "Input(ImgSize) dim[0] and Input(X) dim[0] should be same."); PADDLE_ENFORCE_EQ(dim_imgsize[1], 2, "Input(ImgSize) dim[1] should be 2."); PADDLE_ENFORCE_GT(anchors.size(), 0, - "Attr(anchors) length should be greater then 0."); + "Attr(anchors) length should be greater than 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, "Attr(anchors) length should be even integer."); PADDLE_ENFORCE_GT(class_num, 0, - "Attr(class_num) should be an integer greater then 0."); + "Attr(class_num) should be an integer greater than 0."); int box_num = dim_x[2] * dim_x[3] * anchor_num; std::vector dim_boxes({dim_x[0], box_num, 4}); @@ -76,7 +76,7 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of YoloBox operator, " "This is a 4-D tensor with shape of [N, C, H, W]." - "H and W should be same, and the second dimention(C) stores" + "H and W should be same, and the second dimension(C) stores" "box locations, confidence score and classification one-hot" "keys of each anchor box. Generally, X should be the output" "of YOLOv3 network."); @@ -88,7 +88,7 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Boxes", "The output tensor of detection boxes of YoloBox operator, " "This is a 3-D tensor with shape of [N, M, 4], N is the" - "batch num, M is output box number, and the 3rd dimention" + "batch num, M is output box number, and the 3rd dimension" "stores [xmin, ymin, xmax, ymax] coordinates of boxes."); AddOutput("Scores", "The output tensor ofdetection boxes scores of YoloBox" @@ -112,36 +112,42 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { "be ignored.") .SetDefault(0.01); AddComment(R"DOC( - This operator generate YOLO detection boxes fron output of YOLOv3 network. + This operator generate YOLO detection boxes from output of YOLOv3 network. The output of previous network is in shape [N, C, H, W], while H and W should be the same, specify the grid size, each grid point predict given number boxes, this given number is specified by anchors, it should be half anchors length, which following will be represented as S. In the - second dimention(the channel dimention), C should be S * (class_num + 5), + second dimension(the channel dimension), C should be S * (class_num + 5), class_num is the box categoriy number of source dataset(such as coco), - so in the second dimention, stores 4 box location coordinates x, y, w, h + so in the second dimension, stores 4 box location coordinates x, y, w, h and confidence score of the box and class one-hot key of each anchor box. - While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions - correspnd to: + While the 4 location coordinates if :math:`tx, ty, tw, th`, the box + predictions correspnd to: $$ b_x = \sigma(t_x) + c_x + $$ + $$ b_y = \sigma(t_y) + c_y + $$ + $$ b_w = p_w e^{t_w} + $$ + $$ b_h = p_h e^{t_h} $$ - While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ - is specified by anchors. + While :math:`c_x, c_y` is the left top corner of current grid and + :math:`p_w, p_h` is specified by anchors. The logistic scores of the 5rd channel of each anchor prediction boxes represent the confidence score of each prediction scores, and the logistic scores of the last class_num channels of each anchor prediction boxes - represent the classifcation scores. 
Boxes with confidence scores less then - conf_thresh should be ignored, and boxes final scores if the products result - of confidence scores and classification scores. + represent the classifcation scores. Boxes with confidence scores less than + conf_thresh should be ignored, and box final scores is the product of + confidence scores and classification scores. )DOC"); } diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index a0c60ae673..30175be8bb 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 546a5a66b4..8b7c7df0f3 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at From abb5a9c7265cfb5bb51b8bd3231714fede91766b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 9 Mar 2019 14:50:13 +0800 Subject: [PATCH 16/73] fix doc statement. test=develop --- .../fluid/operators/detection/yolo_box_op.cc | 65 ++++++++++--------- python/paddle/fluid/layers/detection.py | 7 +- 2 files changed, 38 insertions(+), 34 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 6cc9b241c6..e6cf3f58dd 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -75,25 +75,25 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddInput("X", "The input tensor of YoloBox operator, " - "This is a 4-D tensor with shape of [N, C, H, W]." - "H and W should be same, and the second dimension(C) stores" - "box locations, confidence score and classification one-hot" - "keys of each anchor box. Generally, X should be the output" + "This is a 4-D tensor with shape of [N, C, H, W]. " + "H and W should be same, and the second dimension(C) stores " + "box locations, confidence score and classification one-hot " + "keys of each anchor box. Generally, X should be the output " "of YOLOv3 network."); AddInput("ImgSize", "The image size tensor of YoloBox operator, " - "This is a 2-D tensor with shape of [N, 2]. This tensor holds" - "height and width of each input image using for resize output" + "This is a 2-D tensor with shape of [N, 2]. 
This tensor holds " + "height and width of each input image using for resize output " "box in input image scale."); AddOutput("Boxes", "The output tensor of detection boxes of YoloBox operator, " - "This is a 3-D tensor with shape of [N, M, 4], N is the" - "batch num, M is output box number, and the 3rd dimension" + "This is a 3-D tensor with shape of [N, M, 4], N is the " + "batch num, M is output box number, and the 3rd dimension " "stores [xmin, ymin, xmax, ymax] coordinates of boxes."); AddOutput("Scores", - "The output tensor ofdetection boxes scores of YoloBox" - "operator, This is a 3-D tensor with shape of [N, M, C]," - "N is the batch num, M is output box number, C is the" + "The output tensor ofdetection boxes scores of YoloBox " + "operator, This is a 3-D tensor with shape of [N, M, C], " + "N is the batch num, M is output box number, C is the " "class number."); AddAttr("class_num", "The number of classes to predict."); @@ -107,30 +107,31 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { "and thrid YoloBox operators.") .SetDefault(32); AddAttr("conf_thresh", - "The confidence scores threshold of detection boxes." - "boxes with confidence scores under threshold should" + "The confidence scores threshold of detection boxes. " + "Boxes with confidence scores under threshold should " "be ignored.") .SetDefault(0.01); AddComment(R"DOC( This operator generate YOLO detection boxes from output of YOLOv3 network. The output of previous network is in shape [N, C, H, W], while H and W - should be the same, specify the grid size, each grid point predict given - number boxes, this given number is specified by anchors, it should be - half anchors length, which following will be represented as S. In the - second dimension(the channel dimension), C should be S * (class_num + 5), - class_num is the box categoriy number of source dataset(such as coco), - so in the second dimension, stores 4 box location coordinates x, y, w, h - and confidence score of the box and class one-hot key of each anchor box. - - While the 4 location coordinates if :math:`tx, ty, tw, th`, the box - predictions correspnd to: + should be the same, H and W specify the grid size, each grid point predict + given number boxes, this given number, which following will be represented as S, + is specified by the number of anchors, In the second dimension(the channel + dimension), C should be equal to S * (class_num + 5), class_num is the object + category number of source dataset(such as 80 in coco dataset), so in the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + also includes confidence score of the box and class one-hot key of each anchor + box. + + Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box + predictions should be as follows: $$ - b_x = \sigma(t_x) + c_x + b_x = \\sigma(t_x) + c_x $$ $$ - b_y = \sigma(t_y) + c_y + b_y = \\sigma(t_y) + c_y $$ $$ b_w = p_w e^{t_w} @@ -139,14 +140,14 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { b_h = p_h e^{t_h} $$ - While :math:`c_x, c_y` is the left top corner of current grid and - :math:`p_w, p_h` is specified by anchors. + in the equation above, :math:`c_x, c_y` is the left top corner of current grid + and :math:`p_w, p_h` is specified by anchors. 
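
As a hedged illustration of the decoding described above, a minimal NumPy sketch follows; decode_yolo_box is an invented name, the inputs are plain scalars rather than the operator's tensors, and the final clipping mirrors the bbox range limit added earlier in this series.

import numpy as np

def sigmoid(v):
    return 1.0 / (1.0 + np.exp(-v))

def decode_yolo_box(tx, ty, tw, th, cx, cy, pw, ph,
                    grid_size, input_size, img_h, img_w):
    # b_x = sigmoid(t_x) + c_x and b_y = sigmoid(t_y) + c_y, normalized by grid size
    bx = (sigmoid(tx) + cx) / grid_size
    by = (sigmoid(ty) + cy) / grid_size
    # b_w = p_w * exp(t_w) and b_h = p_h * exp(t_h), normalized by the network input size
    bw = np.exp(tw) * pw / input_size
    bh = np.exp(th) * ph / input_size
    # center/size to corners, rescaled to the original image
    x0 = (bx - bw / 2.0) * img_w
    y0 = (by - bh / 2.0) * img_h
    x1 = (bx + bw / 2.0) * img_w
    y1 = (by + bh / 2.0) * img_h
    # clip to the image boundary, as the range limit patch does
    x0, y0 = max(x0, 0.0), max(y0, 0.0)
    x1, y1 = min(x1, img_w - 1.0), min(y1, img_h - 1.0)
    return x0, y0, x1, y1

# Example: one prediction on the 13x13 grid of a 608x608 network input,
# rescaled to a 480x640 image with the (116, 90) anchor.
box = decode_yolo_box(0.2, -0.1, 0.3, 0.5, cx=6, cy=4, pw=116, ph=90,
                      grid_size=13, input_size=608, img_h=480, img_w=640)
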
- The logistic scores of the 5rd channel of each anchor prediction boxes - represent the confidence score of each prediction scores, and the logistic - scores of the last class_num channels of each anchor prediction boxes - represent the classifcation scores. Boxes with confidence scores less than - conf_thresh should be ignored, and box final scores is the product of + The logistic regression value of the 5rd channel of each anchor prediction boxes + represent the confidence score of each prediction box, and the logistic + regression value of the last :attr:`class_num` channels of each anchor prediction + boxes represent the classifcation scores. Boxes with confidence scores less than + :attr:`conf_thresh` should be ignored, and box final scores is the product of confidence scores and classification scores. )DOC"); diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index aca5f0f1d6..6cfd852fa5 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -628,10 +628,12 @@ def yolo_box(x, class_num (int): ${class_num_comment} conf_thresh (float): ${conf_thresh_comment} downsample_ratio (int): ${downsample_ratio_comment} - name (string): the name of yolov3 loss + name (string): the name of yolo box layer Returns: - Variable: A 1-D tensor with shape [1], the value of yolov3 loss + Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, + and a 3-D tensor with shape [N, M, C], the classification scores + of boxes. Raises: TypeError: Input x of yolov_box must be Variable @@ -640,6 +642,7 @@ def yolo_box(x, TypeError: Attr conf_thresh of yolo box must be a float number Examples: + .. code-block:: python x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') From 00e822d2a0bbcecf3deff2e79fd84afb6d867d45 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 10 Mar 2019 12:04:32 +0800 Subject: [PATCH 17/73] fix test_detection. test=develop --- python/paddle/fluid/tests/test_detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index b8743debe2..b863eb8d72 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -484,7 +484,7 @@ class TestYoloDetection(unittest.TestCase): program = Program() with program_guard(program): x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') - img_size = layers.data(name='x', shape=[2], dtype='int32') + img_size = layers.data(name='img_size', shape=[2], dtype='int32') boxes, scores = layers.yolo_box(x, img_size, [10, 13, 30, 13], 10, 0.01, 32) self.assertIsNotNone(boxes) From 33c8607ef3fd5f190be8cb91e88e2b1eba5c7fc4 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 11 Mar 2019 16:39:47 +0800 Subject: [PATCH 18/73] fix doc. 
test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/detection/yolo_box_op.cc | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 14d19f79aa..55b65e0d9a 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -328,7 +328,7 @@ paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=Non paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) -paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) +paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '170091cef6ebfcba6e54c55b496d0021')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index e6cf3f58dd..6d8dac38f7 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -83,7 +83,7 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("ImgSize", "The image size tensor of YoloBox operator, " "This is a 2-D tensor with shape of [N, 2]. 
This tensor holds " - "height and width of each input image using for resize output " + "height and width of each input image used for resizing output " "box in input image scale."); AddOutput("Boxes", "The output tensor of detection boxes of YoloBox operator, " @@ -117,9 +117,9 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { The output of previous network is in shape [N, C, H, W], while H and W should be the same, H and W specify the grid size, each grid point predict given number boxes, this given number, which following will be represented as S, - is specified by the number of anchors, In the second dimension(the channel - dimension), C should be equal to S * (class_num + 5), class_num is the object - category number of source dataset(such as 80 in coco dataset), so in the + is specified by the number of anchors. In the second dimension(the channel + dimension), C should be equal to S * (5 + class_num), class_num is the object + category number of source dataset(such as 80 in coco dataset), so the second(channel) dimension, apart from 4 box location coordinates x, y, w, h, also includes confidence score of the box and class one-hot key of each anchor box. @@ -143,10 +143,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { in the equation above, :math:`c_x, c_y` is the left top corner of current grid and :math:`p_w, p_h` is specified by anchors. - The logistic regression value of the 5rd channel of each anchor prediction boxes - represent the confidence score of each prediction box, and the logistic + The logistic regression value of the 5th channel of each anchor prediction boxes + represents the confidence score of each prediction box, and the logistic regression value of the last :attr:`class_num` channels of each anchor prediction - boxes represent the classifcation scores. Boxes with confidence scores less than + boxes represents the classifcation scores. Boxes with confidence scores less than :attr:`conf_thresh` should be ignored, and box final scores is the product of confidence scores and classification scores. From 626fb859d9d02023f383fe3566c7438a8a772891 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 12 Mar 2019 12:02:13 +0800 Subject: [PATCH 19/73] add param default doc. 
test=develop --- paddle/fluid/API.spec | 2 +- python/paddle/fluid/layers/detection.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 55b65e0d9a..6ebbf1f7c4 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -328,7 +328,7 @@ paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=Non paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) -paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '170091cef6ebfcba6e54c55b496d0021')) +paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf73e1c87dc933371ce96d66878838d9')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 6cfd852fa5..6d82b8a12e 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -628,7 +628,7 @@ def yolo_box(x, class_num (int): ${class_num_comment} conf_thresh (float): ${conf_thresh_comment} downsample_ratio (int): ${downsample_ratio_comment} - name (string): the name of yolo box layer + name (string): the name of yolo box layer. Default None. Returns: Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, From f0804433b040310e26e472fd4129a9b64967722a Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Mar 2019 14:28:03 +0800 Subject: [PATCH 20/73] add mixup score and label_smooth for yolov3_loss. 
test=develop --- .../operators/detection/yolov3_loss_op.cc | 21 +++++ .../operators/detection/yolov3_loss_op.h | 81 +++++++++++++------ 2 files changed, 76 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index ab01bdf7ca..38eb43a3cc 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -72,6 +72,18 @@ class Yolov3LossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_GT(class_num, 0, "Attr(class_num) should be an integer greater then 0."); + if (ctx->HasInput("GTScore")) { + auto dim_gtscore = ctx->GetInputDim("GTScore"); + PADDLE_ENFORCE_EQ(dim_gtscore.size(), 2, + "Input(GTScore) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ( + dim_gtscore[0], dim_gtbox[0], + "Input(GTBox) and Input(GTScore) dim[0] should be same"); + PADDLE_ENFORCE_EQ( + dim_gtscore[1], dim_gtbox[1], + "Input(GTBox) and Input(GTScore) dim[1] should be same"); + } + std::vector dim_out({dim_x[0]}); ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); @@ -112,6 +124,11 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "This is a 2-D tensor with shape of [N, max_box_num], " "and each element should be an integer to indicate the " "box class id."); + AddInput("GTScore", + "The score of GTLabel, This is a 2-D tensor in same shape " + "GTLabel, and score values should in range (0, 1). This " + "input is for GTLabel score can be not 1.0 in image mixup " + "augmentation."); AddOutput("Loss", "The output yolov3 loss tensor, " "This is a 1-D tensor with shape of [N]"); @@ -143,6 +160,8 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss.") .SetDefault(0.7); + AddAttr("use_label_smooth", "bool,default True", "use label smooth") + .SetDefault(true); AddComment(R"DOC( This operator generates yolov3 loss based on given predict result and ground truth boxes. @@ -240,6 +259,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetInput("X", Input("X")); op->SetInput("GTBox", Input("GTBox")); op->SetInput("GTLabel", Input("GTLabel")); + op->SetInput("GTScore", Input("GTScore")); op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); op->SetInput("ObjectnessMask", Output("ObjectnessMask")); op->SetInput("GTMatchMask", Output("GTMatchMask")); @@ -249,6 +269,7 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetOutput(framework::GradVarName("GTBox"), {}); op->SetOutput(framework::GradVarName("GTLabel"), {}); + op->SetOutput(framework::GradVarName("GTScore"), {}); return std::unique_ptr(op); } }; diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h index 8407d4e6e8..54038b6e65 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.h +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -37,8 +37,8 @@ static T SigmoidCrossEntropy(T x, T label) { } template -static T L2Loss(T x, T y) { - return 0.5 * (y - x) * (y - x); +static T L1Loss(T x, T y) { + return std::abs(y - x); } template @@ -47,8 +47,8 @@ static T SigmoidCrossEntropyGrad(T x, T label) { } template -static T L2LossGrad(T x, T y) { - return x - y; +static T L1LossGrad(T x, T y) { + return x > y ? 
1.0 : -1.0; } static int GetMaskIndex(std::vector mask, int val) { @@ -121,47 +121,49 @@ template static void CalcBoxLocationLoss(T* loss, const T* input, Box gt, std::vector anchors, int an_idx, int box_idx, int gi, int gj, int grid_size, - int input_size, int stride) { + int input_size, int stride, T score) { T tx = gt.x * grid_size - gi; T ty = gt.y * grid_size - gj; T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); - T scale = (2.0 - gt.w * gt.h); + T scale = (2.0 - gt.w * gt.h) * score; loss[0] += SigmoidCrossEntropy(input[box_idx], tx) * scale; loss[0] += SigmoidCrossEntropy(input[box_idx + stride], ty) * scale; - loss[0] += L2Loss(input[box_idx + 2 * stride], tw) * scale; - loss[0] += L2Loss(input[box_idx + 3 * stride], th) * scale; + loss[0] += L1Loss(input[box_idx + 2 * stride], tw) * scale; + loss[0] += L1Loss(input[box_idx + 3 * stride], th) * scale; } template static void CalcBoxLocationLossGrad(T* input_grad, const T loss, const T* input, Box gt, std::vector anchors, int an_idx, int box_idx, int gi, int gj, - int grid_size, int input_size, int stride) { + int grid_size, int input_size, int stride, + T score) { T tx = gt.x * grid_size - gi; T ty = gt.y * grid_size - gj; T tw = std::log(gt.w * input_size / anchors[2 * an_idx]); T th = std::log(gt.h * input_size / anchors[2 * an_idx + 1]); - T scale = (2.0 - gt.w * gt.h); + T scale = (2.0 - gt.w * gt.h) * score; input_grad[box_idx] = SigmoidCrossEntropyGrad(input[box_idx], tx) * scale * loss; input_grad[box_idx + stride] = SigmoidCrossEntropyGrad(input[box_idx + stride], ty) * scale * loss; input_grad[box_idx + 2 * stride] = - L2LossGrad(input[box_idx + 2 * stride], tw) * scale * loss; + L1LossGrad(input[box_idx + 2 * stride], tw) * scale * loss; input_grad[box_idx + 3 * stride] = - L2LossGrad(input[box_idx + 3 * stride], th) * scale * loss; + L1LossGrad(input[box_idx + 3 * stride], th) * scale * loss; } template static inline void CalcLabelLoss(T* loss, const T* input, const int index, const int label, const int class_num, - const int stride) { + const int stride, const T pos, const T neg, + T score) { for (int i = 0; i < class_num; i++) { T pred = input[index + i * stride]; - loss[0] += SigmoidCrossEntropy(pred, (i == label) ? 1.0 : 0.0); + loss[0] += SigmoidCrossEntropy(pred, (i == label) ? pos : neg) * score; } } @@ -169,11 +171,13 @@ template static inline void CalcLabelLossGrad(T* input_grad, const T loss, const T* input, const int index, const int label, const int class_num, - const int stride) { + const int stride, const T pos, const T neg, + T score) { for (int i = 0; i < class_num; i++) { T pred = input[index + i * stride]; input_grad[index + i * stride] = - SigmoidCrossEntropyGrad(pred, (i == label) ? 1.0 : 0.0) * loss; + SigmoidCrossEntropyGrad(pred, (i == label) ? 
pos : neg) * score * + loss; } } @@ -188,8 +192,8 @@ static inline void CalcObjnessLoss(T* loss, const T* input, const T* objness, for (int l = 0; l < w; l++) { T obj = objness[k * w + l]; if (obj > 1e-5) { - // positive sample: obj = 1 - loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0); + // positive sample: obj = mixup score + loss[i] += SigmoidCrossEntropy(input[k * w + l], 1.0) * obj; } else if (obj > -0.5) { // negetive sample: obj = 0 loss[i] += SigmoidCrossEntropy(input[k * w + l], 0.0); @@ -215,7 +219,8 @@ static inline void CalcObjnessLossGrad(T* input_grad, const T* loss, T obj = objness[k * w + l]; if (obj > 1e-5) { input_grad[k * w + l] = - SigmoidCrossEntropyGrad(input[k * w + l], 1.0) * loss[i]; + SigmoidCrossEntropyGrad(input[k * w + l], 1.0) * obj * + loss[i]; } else if (obj > -0.5) { input_grad[k * w + l] = SigmoidCrossEntropyGrad(input[k * w + l], 0.0) * loss[i]; @@ -252,6 +257,7 @@ class Yolov3LossKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* gt_box = ctx.Input("GTBox"); auto* gt_label = ctx.Input("GTLabel"); + auto* gt_score = ctx.Input("GTScore"); auto* loss = ctx.Output("Loss"); auto* objness_mask = ctx.Output("ObjectnessMask"); auto* gt_match_mask = ctx.Output("GTMatchMask"); @@ -260,6 +266,7 @@ class Yolov3LossKernel : public framework::OpKernel { int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); int downsample_ratio = ctx.Attr("downsample_ratio"); + bool use_label_smooth = ctx.Attr("use_label_smooth"); const int n = input->dims()[0]; const int h = input->dims()[2]; @@ -272,9 +279,17 @@ class Yolov3LossKernel : public framework::OpKernel { const int stride = h * w; const int an_stride = (class_num + 5) * stride; + T label_pos = 1.0; + T label_neg = 0.0; + if (use_label_smooth) { + label_pos = 1.0 - 1.0 / static_cast(class_num); + label_neg = 1.0 / static_cast(class_num); + } + const T* input_data = input->data(); const T* gt_box_data = gt_box->data(); const int* gt_label_data = gt_label->data(); + const T* gt_score_data = gt_score->data(); T* loss_data = loss->mutable_data({n}, ctx.GetPlace()); memset(loss_data, 0, loss->numel() * sizeof(T)); T* obj_mask_data = @@ -355,19 +370,20 @@ class Yolov3LossKernel : public framework::OpKernel { int mask_idx = GetMaskIndex(anchor_mask, best_n); gt_match_mask_data[i * b + t] = mask_idx; if (mask_idx >= 0) { + T score = gt_score_data[i * b + t]; int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 0); CalcBoxLocationLoss(loss_data + i, input_data, gt, anchors, best_n, - box_idx, gi, gj, h, input_size, stride); + box_idx, gi, gj, h, input_size, stride, score); int obj_idx = (i * mask_num + mask_idx) * stride + gj * w + gi; - obj_mask_data[obj_idx] = 1.0; + obj_mask_data[obj_idx] = score; int label = gt_label_data[i * b + t]; int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 5); CalcLabelLoss(loss_data + i, input_data, label_idx, label, - class_num, stride); + class_num, stride, label_pos, label_neg, score); } } } @@ -384,6 +400,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* gt_box = ctx.Input("GTBox"); auto* gt_label = ctx.Input("GTLabel"); + auto* gt_score = ctx.Input("GTScore"); auto* input_grad = ctx.Output(framework::GradVarName("X")); auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); auto* objness_mask = ctx.Input("ObjectnessMask"); @@ -392,6 +409,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { auto anchor_mask = 
ctx.Attr>("anchor_mask"); int class_num = ctx.Attr("class_num"); int downsample_ratio = ctx.Attr("downsample_ratio"); + bool use_label_smooth = ctx.Attr("use_label_smooth"); const int n = input_grad->dims()[0]; const int c = input_grad->dims()[1]; @@ -404,9 +422,17 @@ class Yolov3LossGradKernel : public framework::OpKernel { const int stride = h * w; const int an_stride = (class_num + 5) * stride; + T label_pos = 1.0; + T label_neg = 0.0; + if (use_label_smooth) { + label_pos = 1.0 - 1.0 / static_cast(class_num); + label_neg = 1.0 / static_cast(class_num); + } + const T* input_data = input->data(); const T* gt_box_data = gt_box->data(); const int* gt_label_data = gt_label->data(); + const T* gt_score_data = gt_score->data(); const T* loss_grad_data = loss_grad->data(); const T* obj_mask_data = objness_mask->data(); const int* gt_match_mask_data = gt_match_mask->data(); @@ -418,21 +444,24 @@ class Yolov3LossGradKernel : public framework::OpKernel { for (int t = 0; t < b; t++) { int mask_idx = gt_match_mask_data[i * b + t]; if (mask_idx >= 0) { + T score = gt_score_data[i * b + t]; Box gt = GetGtBox(gt_box_data, i, b, t); int gi = static_cast(gt.x * w); int gj = static_cast(gt.y * h); int box_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 0); - CalcBoxLocationLossGrad( - input_grad_data, loss_grad_data[i], input_data, gt, anchors, - anchor_mask[mask_idx], box_idx, gi, gj, h, input_size, stride); + CalcBoxLocationLossGrad(input_grad_data, loss_grad_data[i], + input_data, gt, anchors, + anchor_mask[mask_idx], box_idx, gi, gj, h, + input_size, stride, score); int label = gt_label_data[i * b + t]; int label_idx = GetEntryIndex(i, mask_idx, gj * w + gi, mask_num, an_stride, stride, 5); CalcLabelLossGrad(input_grad_data, loss_grad_data[i], input_data, - label_idx, label, class_num, stride); + label_idx, label, class_num, stride, label_pos, + label_neg, score); } } } From 0d1a9996ac07453fc87e732694664f24e295b37b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Mar 2019 16:22:05 +0800 Subject: [PATCH 21/73] fix unittest for yolov3_loss. test=develop --- .../operators/detection/yolov3_loss_op.cc | 9 +++ python/paddle/fluid/layers/detection.py | 39 +++++++++--- python/paddle/fluid/tests/test_detection.py | 12 +++- .../tests/unittests/test_yolov3_loss_op.py | 62 ++++++++++++------- 4 files changed, 86 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 38eb43a3cc..3b1d4d2a80 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -223,6 +223,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { loss = (loss_{xy} + loss_{wh}) * weight_{box} + loss_{conf} + loss_{class} $$ + + While :attr:`use_label_smooth` is set to be :attr:`True`, the classification + target will be smoothed when calculating classification loss, target of + positive samples will be smoothed to $$1.0 - 1.0/class_num$$ and target of + negetive samples will be smoothed to $$1.0/class_num$$. + + While :attr:`GTScore` is given, which means the mixup score of ground truth + boxes, all looses incured by a ground truth box will be multiplied by its + mixup score. 
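
A small NumPy sketch of this classification term follows, purely as an illustration of the smoothing and mixup weighting just described; smooth_targets and class_loss are invented names and this is not the kernel implementation.

import numpy as np

def smooth_targets(gt_label, class_num, use_label_smooth=True):
    # With smoothing, the positive target becomes 1 - 1/class_num and the
    # negative target 1/class_num; otherwise plain one-hot targets are used.
    pos = 1.0 - 1.0 / class_num if use_label_smooth else 1.0
    neg = 1.0 / class_num if use_label_smooth else 0.0
    targets = np.full(class_num, neg, dtype=np.float32)
    targets[gt_label] = pos
    return targets

def class_loss(class_logits, gt_label, gt_score, use_label_smooth=True):
    # Sigmoid cross entropy against the (possibly smoothed) targets,
    # weighted by the ground truth box's mixup score.
    targets = smooth_targets(gt_label, len(class_logits), use_label_smooth)
    probs = 1.0 / (1.0 + np.exp(-class_logits))
    ce = -(targets * np.log(probs) + (1.0 - targets) * np.log(1.0 - probs))
    return gt_score * ce.sum()

# Example: 5 classes, ground truth class 2, mixup score 0.8.
loss = class_loss(np.zeros(5, dtype=np.float32), gt_label=2, gt_score=0.8)
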
)DOC"); } }; diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index cbedd70f85..aa7b4a50be 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -515,7 +515,9 @@ def yolov3_loss(x, class_num, ignore_thresh, downsample_ratio, - name=None): + name=None, + gtscore=None, + use_label_smooth=True): """ ${comment} @@ -534,27 +536,34 @@ def yolov3_loss(x, ignore_thresh (float): ${ignore_thresh_comment} downsample_ratio (int): ${downsample_ratio_comment} name (string): the name of yolov3 loss + gtscore (Variable): mixup score of ground truth boxes, shoud be in shape + of [N, B]. + use_label_smooth (bool): ${use_label_smooth_comment} Returns: - Variable: A 1-D tensor with shape [1], the value of yolov3 loss + Variable: A 1-D tensor with shape [N], the value of yolov3 loss Raises: TypeError: Input x of yolov3_loss must be Variable TypeError: Input gtbox of yolov3_loss must be Variable" TypeError: Input gtlabel of yolov3_loss must be Variable" + TypeError: Input gtscore of yolov3_loss must be Variable" TypeError: Attr anchors of yolov3_loss must be list or tuple TypeError: Attr class_num of yolov3_loss must be an integer TypeError: Attr ignore_thresh of yolov3_loss must be a float number + TypeError: Attr use_label_smooth of yolov3_loss must be a bool value Examples: .. code-block:: python x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') - gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') - gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') + gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32') + gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32') + gtscore = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32') anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchor_mask = [0, 1, 2] - loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, anchors=anchors, + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, + gtscore=gtscore, anchors=anchors, anchor_mask=anchor_mask, class_num=80, ignore_thresh=0.7, downsample_ratio=32) """ @@ -566,6 +575,8 @@ def yolov3_loss(x, raise TypeError("Input gtbox of yolov3_loss must be Variable") if not isinstance(gtlabel, Variable): raise TypeError("Input gtlabel of yolov3_loss must be Variable") + if not isinstance(gtscore, Variable): + raise TypeError("Input gtscore of yolov3_loss must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): raise TypeError("Attr anchors of yolov3_loss must be list or tuple") if not isinstance(anchor_mask, list) and not isinstance(anchor_mask, tuple): @@ -575,6 +586,9 @@ def yolov3_loss(x, if not isinstance(ignore_thresh, float): raise TypeError( "Attr ignore_thresh of yolov3_loss must be a float number") + if not isinstance(use_label_smooth, bool): + raise TypeError( + "Attr use_label_smooth of yolov3_loss must be a bool value") if name is None: loss = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -585,21 +599,26 @@ def yolov3_loss(x, objectness_mask = helper.create_variable_for_type_inference(dtype='int32') gt_match_mask = helper.create_variable_for_type_inference(dtype='int32') + inputs = { + "X": x, + "GTBox": gtbox, + "GTLabel": gtlabel, + } + if gtscore: + inputs["GTScore"] = gtscore + attrs = { "anchors": anchors, "anchor_mask": anchor_mask, "class_num": class_num, "ignore_thresh": ignore_thresh, "downsample_ratio": 
downsample_ratio, + "use_label_smooth": use_label_smooth, } helper.append_op( type='yolov3_loss', - inputs={ - "X": x, - "GTBox": gtbox, - "GTLabel": gtlabel, - }, + inputs=inputs, outputs={ 'Loss': loss, 'ObjectnessMask': objectness_mask, diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 6218db7345..b202c0ffea 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -476,8 +476,16 @@ class TestYoloDetection(unittest.TestCase): x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') - loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], - [0, 1], 10, 0.7, 32) + gtscore = layers.data(name='gtscore', shape=[10], dtype='int32') + loss = layers.yolov3_loss( + x, + gtbox, + gtlabel, [10, 13, 30, 13], [0, 1], + 10, + 0.7, + 32, + gtscore=gtscore, + use_label_smooth=False) self.assertIsNotNone(loss) diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 020c113923..b3d1ff8058 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -23,8 +23,8 @@ from op_test import OpTest from paddle.fluid import core -def l2loss(x, y): - return 0.5 * (y - x) * (y - x) +def l1loss(x, y): + return abs(x - y) def sce(x, label): @@ -66,7 +66,7 @@ def batch_xywh_box_iou(box1, box2): return inter_area / union -def YOLOv3Loss(x, gtbox, gtlabel, attrs): +def YOLOv3Loss(x, gtbox, gtlabel, gtscore, attrs): n, c, h, w = x.shape b = gtbox.shape[1] anchors = attrs['anchors'] @@ -75,21 +75,21 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): mask_num = len(anchor_mask) class_num = attrs["class_num"] ignore_thresh = attrs['ignore_thresh'] - downsample = attrs['downsample'] - input_size = downsample * h + downsample_ratio = attrs['downsample_ratio'] + use_label_smooth = attrs['use_label_smooth'] + input_size = downsample_ratio * h x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) loss = np.zeros((n)).astype('float32') + label_pos = 1.0 - 1.0 / class_num if use_label_smooth else 1.0 + label_neg = 1.0 / class_num if use_label_smooth else 0.0 + pred_box = x[:, :, :, :, :4].copy() grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1)) grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w)) pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h - x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:], - np.ones_like(x[:, :, :, :, 5:]) * 1.0 / - class_num) - mask_anchors = [] for m in anchor_mask: mask_anchors.append((anchors[2 * m], anchors[2 * m + 1])) @@ -138,21 +138,22 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): ty = gtbox[i, j, 1] * w - gj tw = np.log(gtbox[i, j, 2] * input_size / mask_anchors[an_idx][0]) th = np.log(gtbox[i, j, 3] * input_size / mask_anchors[an_idx][1]) - scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) + scale = (2.0 - gtbox[i, j, 2] * gtbox[i, j, 3]) * gtscore[i, j] loss[i] += sce(x[i, an_idx, gj, gi, 0], tx) * scale loss[i] += sce(x[i, an_idx, gj, gi, 1], ty) * scale - loss[i] += l2loss(x[i, an_idx, gj, gi, 2], tw) * scale - loss[i] += l2loss(x[i, an_idx, gj, gi, 3], th) * scale + loss[i] += l1loss(x[i, an_idx, gj, gi, 2], tw) * scale + loss[i] += l1loss(x[i, 
an_idx, gj, gi, 3], th) * scale - objness[i, an_idx * h * w + gj * w + gi] = 1.0 + objness[i, an_idx * h * w + gj * w + gi] = gtscore[i, j] for label_idx in range(class_num): - loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], - float(label_idx == gtlabel[i, j])) + loss[i] += sce(x[i, an_idx, gj, gi, 5 + label_idx], label_pos + if label_idx == gtlabel[i, j] else + label_neg) * gtscore[i, j] for j in range(mask_num * h * w): if objness[i, j] > 0: - loss[i] += sce(pred_obj[i, j], 1.0) + loss[i] += sce(pred_obj[i, j], 1.0) * objness[i, j] elif objness[i, j] == 0: loss[i] += sce(pred_obj[i, j], 0.0) @@ -167,6 +168,7 @@ class TestYolov3LossOp(OpTest): x = logit(np.random.uniform(0, 1, self.x_shape).astype('float32')) gtbox = np.random.random(size=self.gtbox_shape).astype('float32') gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2]) + gtscore = np.random.random(self.gtbox_shape[:2]).astype('float32') gtmask = np.random.randint(0, 2, self.gtbox_shape[:2]) gtbox = gtbox * gtmask[:, :, np.newaxis] gtlabel = gtlabel * gtmask @@ -176,15 +178,18 @@ class TestYolov3LossOp(OpTest): "anchor_mask": self.anchor_mask, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, - "downsample": self.downsample, + "downsample_ratio": self.downsample_ratio, + "use_label_smooth": self.use_label_smooth, } self.inputs = { 'X': x, 'GTBox': gtbox.astype('float32'), 'GTLabel': gtlabel.astype('int32'), + 'GTScore': gtscore.astype('float32') } - loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, self.attrs) + loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, gtscore, + self.attrs) self.outputs = { 'Loss': loss, 'ObjectnessMask': objness, @@ -193,24 +198,33 @@ class TestYolov3LossOp(OpTest): def test_check_output(self): place = core.CPUPlace() - self.check_output_with_place(place, atol=1e-3) + self.check_output_with_place(place, atol=2e-3) def test_check_grad_ignore_gtbox(self): place = core.CPUPlace() self.check_grad_with_place( place, ['X'], 'Loss', - no_grad_set=set(["GTBox", "GTLabel"]), - max_relative_error=0.3) + no_grad_set=set(["GTBox", "GTLabel", "GTScore"]), + max_relative_error=0.2) def initTestCase(self): - self.anchors = [10, 13, 16, 30, 33, 23] - self.anchor_mask = [1, 2] + self.anchors = [ + 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, + 373, 326 + ] + self.anchor_mask = [0, 1, 2] self.class_num = 5 self.ignore_thresh = 0.5 - self.downsample = 32 + self.downsample_ratio = 32 self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) self.gtbox_shape = (3, 5, 4) + self.use_label_smooth = True + + +class TestYolov3LossWithoutLabelSmooth(TestYolov3LossOp): + def set_label_smooth(self): + self.use_label_smooth = False if __name__ == "__main__": From af4ef80e5b53fb44fa8dab4e857b796e2705c27c Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Mar 2019 20:46:32 +0800 Subject: [PATCH 22/73] fix API.spec not add defaults. 
test=develop --- paddle/fluid/API.spec | 2 +- .../operators/detection/yolov3_loss_op.cc | 3 +- .../operators/detection/yolov3_loss_op.h | 28 ++++++++++- .../tests/unittests/test_yolov3_loss_op.py | 46 +++++++++++++++---- 4 files changed, 66 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index df3497de20..8ad66eaa4d 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -327,7 +327,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) -paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name', 'gtscore', 'use_label_smooth'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '991e934c3e09abf0edec7c9c978b4691')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 3b1d4d2a80..2dbe46de0b 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -128,7 +128,8 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "The score of GTLabel, This is a 2-D tensor in same shape " "GTLabel, and score values should in range (0, 1). 
This " "input is for GTLabel score can be not 1.0 in image mixup " - "augmentation."); + "augmentation.") + .AsDispensable(); AddOutput("Loss", "The output yolov3 loss tensor, " "This is a 1-D tensor with shape of [N]"); diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h index 54038b6e65..df6f28d41d 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.h +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -289,7 +289,6 @@ class Yolov3LossKernel : public framework::OpKernel { const T* input_data = input->data(); const T* gt_box_data = gt_box->data(); const int* gt_label_data = gt_label->data(); - const T* gt_score_data = gt_score->data(); T* loss_data = loss->mutable_data({n}, ctx.GetPlace()); memset(loss_data, 0, loss->numel() * sizeof(T)); T* obj_mask_data = @@ -298,6 +297,19 @@ class Yolov3LossKernel : public framework::OpKernel { int* gt_match_mask_data = gt_match_mask->mutable_data({n, b}, ctx.GetPlace()); + const T* gt_score_data; + if (!gt_score) { + Tensor _gt_score; + _gt_score.mutable_data({n, b}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), &_gt_score, + static_cast(1.0)); + gt_score = &_gt_score; + gt_score_data = _gt_score.data(); + } else { + gt_score_data = gt_score->data(); + } + // calc valid gt box mask, avoid calc duplicately in following code Tensor gt_valid_mask; bool* gt_valid_mask_data = @@ -432,7 +444,6 @@ class Yolov3LossGradKernel : public framework::OpKernel { const T* input_data = input->data(); const T* gt_box_data = gt_box->data(); const int* gt_label_data = gt_label->data(); - const T* gt_score_data = gt_score->data(); const T* loss_grad_data = loss_grad->data(); const T* obj_mask_data = objness_mask->data(); const int* gt_match_mask_data = gt_match_mask->data(); @@ -440,6 +451,19 @@ class Yolov3LossGradKernel : public framework::OpKernel { input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); memset(input_grad_data, 0, input_grad->numel() * sizeof(T)); + const T* gt_score_data; + if (!gt_score) { + Tensor _gt_score; + _gt_score.mutable_data({n, b}, ctx.GetPlace()); + math::SetConstant()( + ctx.template device_context(), &_gt_score, + static_cast(1.0)); + gt_score = &_gt_score; + gt_score_data = _gt_score.data(); + } else { + gt_score_data = gt_score->data(); + } + for (int i = 0; i < n; i++) { for (int t = 0; t < b; t++) { int mask_idx = gt_match_mask_data[i * b + t]; diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index b3d1ff8058..e4d6edc72c 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -168,7 +168,6 @@ class TestYolov3LossOp(OpTest): x = logit(np.random.uniform(0, 1, self.x_shape).astype('float32')) gtbox = np.random.random(size=self.gtbox_shape).astype('float32') gtlabel = np.random.randint(0, self.class_num, self.gtbox_shape[:2]) - gtscore = np.random.random(self.gtbox_shape[:2]).astype('float32') gtmask = np.random.randint(0, 2, self.gtbox_shape[:2]) gtbox = gtbox * gtmask[:, :, np.newaxis] gtlabel = gtlabel * gtmask @@ -186,8 +185,13 @@ class TestYolov3LossOp(OpTest): 'X': x, 'GTBox': gtbox.astype('float32'), 'GTLabel': gtlabel.astype('int32'), - 'GTScore': gtscore.astype('float32') } + + gtscore = np.ones(self.gtbox_shape[:2]).astype('float32') + if self.gtscore: + gtscore = np.random.random(self.gtbox_shape[:2]).astype('float32') + self.inputs['GTScore'] = gtscore + 
loss, objness, gt_matches = YOLOv3Loss(x, gtbox, gtlabel, gtscore, self.attrs) self.outputs = { @@ -202,11 +206,7 @@ class TestYolov3LossOp(OpTest): def test_check_grad_ignore_gtbox(self): place = core.CPUPlace() - self.check_grad_with_place( - place, ['X'], - 'Loss', - no_grad_set=set(["GTBox", "GTLabel", "GTScore"]), - max_relative_error=0.2) + self.check_grad_with_place(place, ['X'], 'Loss', max_relative_error=0.2) def initTestCase(self): self.anchors = [ @@ -215,17 +215,45 @@ class TestYolov3LossOp(OpTest): ] self.anchor_mask = [0, 1, 2] self.class_num = 5 - self.ignore_thresh = 0.5 + self.ignore_thresh = 0.7 self.downsample_ratio = 32 self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) self.gtbox_shape = (3, 5, 4) + self.gtscore = True self.use_label_smooth = True class TestYolov3LossWithoutLabelSmooth(TestYolov3LossOp): - def set_label_smooth(self): + def initTestCase(self): + self.anchors = [ + 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, + 373, 326 + ] + self.anchor_mask = [0, 1, 2] + self.class_num = 5 + self.ignore_thresh = 0.7 + self.downsample_ratio = 32 + self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) + self.gtbox_shape = (3, 5, 4) + self.gtscore = True self.use_label_smooth = False +class TestYolov3LossNoGTScore(TestYolov3LossOp): + def initTestCase(self): + self.anchors = [ + 10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, + 373, 326 + ] + self.anchor_mask = [0, 1, 2] + self.class_num = 5 + self.ignore_thresh = 0.7 + self.downsample_ratio = 32 + self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) + self.gtbox_shape = (3, 5, 4) + self.gtscore = False + self.use_label_smooth = True + + if __name__ == "__main__": unittest.main() From 5b37cf0addd6bbce7e73a08d35a03156d49566e4 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Mar 2019 13:00:44 +0000 Subject: [PATCH 23/73] fix API.spec for yolov3_loss. 
test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8ad66eaa4d..e9912581c9 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -327,7 +327,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) -paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name', 'gtscore', 'use_label_smooth'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '991e934c3e09abf0edec7c9c978b4691')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name', 'gtscore', 'use_label_smooth'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '04028671d2a95a9147fd6cb70b9af773')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) From afdf3c3f84d9fc5d10ff777eb17b45772ceba558 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 5 Mar 2019 21:39:23 +0800 Subject: [PATCH 24/73] fix doc.test=develop --- paddle/fluid/operators/detection/yolov3_loss_op.cc | 4 ++-- python/paddle/fluid/layers/detection.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 2dbe46de0b..f4a6a68cd0 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -227,8 +227,8 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { While :attr:`use_label_smooth` is set to be :attr:`True`, the classification target will be smoothed when calculating classification loss, target of - positive samples will be smoothed to $$1.0 - 1.0/class_num$$ and target of - negetive samples will be smoothed to $$1.0/class_num$$. + positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of + negetive samples will be smoothed to :math:`1.0 / class\_num`. 
While :attr:`GTScore` is given, which means the mixup score of ground truth boxes, all looses incured by a ground truth box will be multiplied by its diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index aa7b4a50be..1ecb9b9e34 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -545,9 +545,9 @@ def yolov3_loss(x, Raises: TypeError: Input x of yolov3_loss must be Variable - TypeError: Input gtbox of yolov3_loss must be Variable" - TypeError: Input gtlabel of yolov3_loss must be Variable" - TypeError: Input gtscore of yolov3_loss must be Variable" + TypeError: Input gtbox of yolov3_loss must be Variable + TypeError: Input gtlabel of yolov3_loss must be Variable + TypeError: Input gtscore of yolov3_loss must be Variable TypeError: Attr anchors of yolov3_loss must be list or tuple TypeError: Attr class_num of yolov3_loss must be an integer TypeError: Attr ignore_thresh of yolov3_loss must be a float number @@ -559,7 +559,7 @@ def yolov3_loss(x, x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32') gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32') - gtscore = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32') + gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='int32') anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchor_mask = [0, 1, 2] loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, From b307533b7d13fdf3cc8dcb3361abd6b4dfde7d70 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 8 Mar 2019 16:43:08 +0800 Subject: [PATCH 25/73] fix format. test=develop --- paddle/fluid/operators/detection/yolov3_loss_op.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index f4a6a68cd0..7179f80cca 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -10,6 +10,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/yolov3_loss_op.h" +#include #include "paddle/fluid/framework/op_registry.h" namespace paddle { From 585766acc00bf3e7dc07f22ecf05a1987aa43664 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 11 Mar 2019 15:54:58 +0800 Subject: [PATCH 26/73] fix spell mistake in doc. 
test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/detection/yolov3_loss_op.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index e9912581c9..c8e2da9b6e 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -327,7 +327,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) -paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name', 'gtscore', 'use_label_smooth'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '04028671d2a95a9147fd6cb70b9af773')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name', 'gtscore', 'use_label_smooth'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '3a76e87a6d7b961612094d7b67cd8b2a')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 7179f80cca..27d8bb0132 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -232,7 +232,7 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { negetive samples will be smoothed to :math:`1.0 / class\_num`. While :attr:`GTScore` is given, which means the mixup score of ground truth - boxes, all looses incured by a ground truth box will be multiplied by its + boxes, all losses incured by a ground truth box will be multiplied by its mixup score. )DOC"); } From aad62eeca0eb1c58c2a51b1f5779e3dbb7d76e70 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 12 Mar 2019 11:51:38 +0800 Subject: [PATCH 27/73] add doc for param default. 
test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/detection/yolov3_loss_op.cc | 3 ++- python/paddle/fluid/layers/detection.py | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c8e2da9b6e..aa373efdae 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -327,7 +327,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) -paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name', 'gtscore', 'use_label_smooth'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '3a76e87a6d7b961612094d7b67cd8b2a')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name', 'gtscore', 'use_label_smooth'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '3da31ac0f0c102b15f613bdaa2c2f7ac')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.cc b/paddle/fluid/operators/detection/yolov3_loss_op.cc index 27d8bb0132..6c37da17f4 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.cc +++ b/paddle/fluid/operators/detection/yolov3_loss_op.cc @@ -162,7 +162,8 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss.") .SetDefault(0.7); - AddAttr("use_label_smooth", "bool,default True", "use label smooth") + AddAttr("use_label_smooth", + "Whether to use label smooth. 
Default True.") .SetDefault(true); AddComment(R"DOC( This operator generates yolov3 loss based on given predict result and ground diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 1ecb9b9e34..4cc10182ea 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -535,9 +535,9 @@ def yolov3_loss(x, class_num (int): ${class_num_comment} ignore_thresh (float): ${ignore_thresh_comment} downsample_ratio (int): ${downsample_ratio_comment} - name (string): the name of yolov3 loss + name (string): the name of yolov3 loss. Default None. gtscore (Variable): mixup score of ground truth boxes, shoud be in shape - of [N, B]. + of [N, B]. Default None. use_label_smooth (bool): ${use_label_smooth_comment} Returns: From d31693afec7c036a7530d18d1826e95a71d78dac Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 12 Mar 2019 08:05:49 +0000 Subject: [PATCH 28/73] no use _gt_score. test=develop --- paddle/fluid/API.spec | 2 +- .../operators/detection/yolov3_loss_op.h | 20 +++++++++---------- python/paddle/fluid/layers/detection.py | 10 +++++----- python/paddle/fluid/tests/test_detection.py | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index aa373efdae..7d342c047a 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -327,7 +327,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes', paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1')) paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) -paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name', 'gtscore', 'use_label_smooth'], varargs=None, keywords=None, defaults=(None, None, True)), ('document', '3da31ac0f0c102b15f613bdaa2c2f7ac')) +paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h index df6f28d41d..a004b022b7 100644 
--- a/paddle/fluid/operators/detection/yolov3_loss_op.h +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -299,13 +299,13 @@ class Yolov3LossKernel : public framework::OpKernel { const T* gt_score_data; if (!gt_score) { - Tensor _gt_score; - _gt_score.mutable_data({n, b}, ctx.GetPlace()); + Tensor gtscore; + gtscore.mutable_data({n, b}, ctx.GetPlace()); math::SetConstant()( - ctx.template device_context(), &_gt_score, + ctx.template device_context(), >score, static_cast(1.0)); - gt_score = &_gt_score; - gt_score_data = _gt_score.data(); + gt_score = >score; + gt_score_data = gtscore.data(); } else { gt_score_data = gt_score->data(); } @@ -453,13 +453,13 @@ class Yolov3LossGradKernel : public framework::OpKernel { const T* gt_score_data; if (!gt_score) { - Tensor _gt_score; - _gt_score.mutable_data({n, b}, ctx.GetPlace()); + Tensor gtscore; + gtscore.mutable_data({n, b}, ctx.GetPlace()); math::SetConstant()( - ctx.template device_context(), &_gt_score, + ctx.template device_context(), >score, static_cast(1.0)); - gt_score = &_gt_score; - gt_score_data = _gt_score.data(); + gt_score = >score; + gt_score_data = gtscore.data(); } else { gt_score_data = gt_score->data(); } diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 4cc10182ea..8225b7067a 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -515,9 +515,9 @@ def yolov3_loss(x, class_num, ignore_thresh, downsample_ratio, - name=None, gtscore=None, - use_label_smooth=True): + use_label_smooth=True, + name=None): """ ${comment} @@ -547,7 +547,7 @@ def yolov3_loss(x, TypeError: Input x of yolov3_loss must be Variable TypeError: Input gtbox of yolov3_loss must be Variable TypeError: Input gtlabel of yolov3_loss must be Variable - TypeError: Input gtscore of yolov3_loss must be Variable + TypeError: Input gtscore of yolov3_loss must be None or Variable TypeError: Attr anchors of yolov3_loss must be list or tuple TypeError: Attr class_num of yolov3_loss must be an integer TypeError: Attr ignore_thresh of yolov3_loss must be a float number @@ -559,7 +559,7 @@ def yolov3_loss(x, x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32') gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32') - gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='int32') + gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='float32') anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchor_mask = [0, 1, 2] loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, @@ -575,7 +575,7 @@ def yolov3_loss(x, raise TypeError("Input gtbox of yolov3_loss must be Variable") if not isinstance(gtlabel, Variable): raise TypeError("Input gtlabel of yolov3_loss must be Variable") - if not isinstance(gtscore, Variable): + if gtsocre is not None and not isinstance(gtscore, Variable): raise TypeError("Input gtscore of yolov3_loss must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): raise TypeError("Attr anchors of yolov3_loss must be list or tuple") diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index b202c0ffea..b756c532ca 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -476,7 +476,7 @@ class TestYoloDetection(unittest.TestCase): x = layers.data(name='x', shape=[30, 
7, 7], dtype='float32') gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') - gtscore = layers.data(name='gtscore', shape=[10], dtype='int32') + gtscore = layers.data(name='gtscore', shape=[10], dtype='float32') loss = layers.yolov3_loss( x, gtbox, From e4e3764060a292a35e7ce8ec692240ad3d9fd597 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 12 Mar 2019 08:55:02 +0000 Subject: [PATCH 29/73] use memory Copy. test=develop --- .../fluid/operators/detection/yolo_box_op.cc | 19 ++++++++------- .../fluid/operators/detection/yolo_box_op.cu | 23 +++++++++++++------ python/paddle/fluid/layers/detection.py | 6 ++--- .../fluid/tests/unittests/test_yolo_box_op.py | 2 +- .../tests/unittests/test_yolov3_loss_op.py | 12 ++++++---- 5 files changed, 39 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cc b/paddle/fluid/operators/detection/yolo_box_op.cc index 6d8dac38f7..e0d7e25d94 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cc +++ b/paddle/fluid/operators/detection/yolo_box_op.cc @@ -74,9 +74,8 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor of YoloBox operator, " - "This is a 4-D tensor with shape of [N, C, H, W]. " - "H and W should be same, and the second dimension(C) stores " + "The input tensor of YoloBox operator is a 4-D tensor with " + "shape of [N, C, H, W]. The second dimension(C) stores " "box locations, confidence score and classification one-hot " "keys of each anchor box. Generally, X should be the output " "of YOLOv3 network."); @@ -91,10 +90,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { "batch num, M is output box number, and the 3rd dimension " "stores [xmin, ymin, xmax, ymax] coordinates of boxes."); AddOutput("Scores", - "The output tensor ofdetection boxes scores of YoloBox " - "operator, This is a 3-D tensor with shape of [N, M, C], " - "N is the batch num, M is output box number, C is the " - "class number."); + "The output tensor of detection boxes scores of YoloBox " + "operator, This is a 3-D tensor with shape of " + "[N, M, :attr:`class_num`], N is the batch num, M is " + "output box number."); AddAttr("class_num", "The number of classes to predict."); AddAttr>("anchors", @@ -112,7 +111,7 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { "be ignored.") .SetDefault(0.01); AddComment(R"DOC( - This operator generate YOLO detection boxes from output of YOLOv3 network. + This operator generates YOLO detection boxes from output of YOLOv3 network. The output of previous network is in shape [N, C, H, W], while H and W should be the same, H and W specify the grid size, each grid point predict @@ -150,6 +149,10 @@ class YoloBoxOpMaker : public framework::OpProtoAndCheckerMaker { :attr:`conf_thresh` should be ignored, and box final scores is the product of confidence scores and classification scores. 
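The scoring rule stated above amounts to the following numpy sketch for a single anchor; raw_conf and raw_cls are stand-in names for the confidence logit and the per-class logits, and this is illustrative only, not the CUDA kernel.

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def box_scores(raw_conf, raw_cls, conf_thresh):
    raw_cls = np.asarray(raw_cls, dtype='float64')
    conf = sigmoid(raw_conf)
    if conf < conf_thresh:
        # Boxes below the confidence threshold are ignored.
        return np.zeros_like(raw_cls)
    # score_pred = score_conf * score_class
    return conf * sigmoid(raw_cls)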
+ $$ + score_{pred} = score_{conf} * score_{class} + $$ + )DOC"); } }; diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 30175be8bb..12555f5347 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -83,12 +83,22 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { const int an_num = anchors.size() / 2; int input_size = downsample_ratio * h; - Tensor anchors_t, cpu_anchors_t; - auto cpu_anchors_data = - cpu_anchors_t.mutable_data({an_num * 2}, platform::CPUPlace()); - std::copy(anchors.begin(), anchors.end(), cpu_anchors_data); - TensorCopySync(cpu_anchors_t, ctx.GetPlace(), &anchors_t); - auto anchors_data = anchors_t.data(); + /* Tensor anchors_t, cpu_anchors_t; */ + /* auto cpu_anchors_data = */ + /* cpu_anchors_t.mutable_data({an_num * 2}, platform::CPUPlace()); */ + /* std::copy(anchors.begin(), anchors.end(), cpu_anchors_data); */ + /* TensorCopySync(cpu_anchors_t, ctx.GetPlace(), &anchors_t); */ + /* auto anchors_data = anchors_t.data(); */ + auto& dev_ctx = ctx.cuda_device_context(); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + int bytes = sizeof(int) * anchors.size(); + auto anchors_ptr = allocator.Allocate(sizeof(int) * anchors.size()); + int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); + const auto gplace = boost::get(ctx.GetPlace()); + const auto cplace = platform::CPUPlace(); + memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, + dev_ctx.stream()); const T* input_data = input->data(); const int* imgsize_data = img_size->data(); @@ -96,7 +106,6 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { T* scores_data = scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); math::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, boxes, static_cast(0)); set_zero(dev_ctx, scores, static_cast(0)); diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 6d82b8a12e..56589c1728 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -632,8 +632,8 @@ def yolo_box(x, Returns: Variable: A 3-D tensor with shape [N, M, 4], the coordinates of boxes, - and a 3-D tensor with shape [N, M, C], the classification scores - of boxes. + and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification + scores of boxes. Raises: TypeError: Input x of yolov_box must be Variable @@ -647,7 +647,7 @@ def yolo_box(x, x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') anchors = [10, 13, 16, 30, 33, 23] - loss = fluid.layers.yolov3_loss(x=x, class_num=80, anchors=anchors, + loss = fluid.layers.yolo_box(x=x, class_num=80, anchors=anchors, conf_thresh=0.01, downsample_ratio=32) """ helper = LayerHelper('yolo_box', **locals()) diff --git a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py index d4a179794c..416e6ea9f4 100644 --- a/python/paddle/fluid/tests/unittests/test_yolo_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolo_box_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 569fe63d05..020c113923 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -75,8 +75,8 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): mask_num = len(anchor_mask) class_num = attrs["class_num"] ignore_thresh = attrs['ignore_thresh'] - downsample_ratio = attrs['downsample_ratio'] - input_size = downsample_ratio * h + downsample = attrs['downsample'] + input_size = downsample * h x = x.reshape((n, mask_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) loss = np.zeros((n)).astype('float32') @@ -86,6 +86,10 @@ def YOLOv3Loss(x, gtbox, gtlabel, attrs): pred_box[:, :, :, :, 0] = (grid_x + sigmoid(pred_box[:, :, :, :, 0])) / w pred_box[:, :, :, :, 1] = (grid_y + sigmoid(pred_box[:, :, :, :, 1])) / h + x[:, :, :, :, 5:] = np.where(x[:, :, :, :, 5:] < -0.5, x[:, :, :, :, 5:], + np.ones_like(x[:, :, :, :, 5:]) * 1.0 / + class_num) + mask_anchors = [] for m in anchor_mask: mask_anchors.append((anchors[2 * m], anchors[2 * m + 1])) @@ -172,7 +176,7 @@ class TestYolov3LossOp(OpTest): "anchor_mask": self.anchor_mask, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, - "downsample_ratio": self.downsample_ratio, + "downsample": self.downsample, } self.inputs = { @@ -204,7 +208,7 @@ class TestYolov3LossOp(OpTest): self.anchor_mask = [1, 2] self.class_num = 5 self.ignore_thresh = 0.5 - self.downsample_ratio = 32 + self.downsample = 32 self.x_shape = (3, len(self.anchor_mask) * (5 + self.class_num), 5, 5) self.gtbox_shape = (3, 5, 4) From 746740c41b356913bf0e15245b09d41d6a7027c6 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 12 Mar 2019 08:58:32 +0000 Subject: [PATCH 30/73] fix API.spec. 
test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 6ebbf1f7c4..5a81c1b4c7 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -328,7 +328,7 @@ paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=Non paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e')) paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b')) paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '991e934c3e09abf0edec7c9c978b4691')) -paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf73e1c87dc933371ce96d66878838d9')) +paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340')) paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e')) paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0')) paddle.fluid.layers.distribute_fpn_proposals (ArgSpec(args=['fpn_rois', 'min_level', 'max_level', 'refer_level', 'refer_scale', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '7bb011ec26bace2bc23235aa4a17647d')) From b33e6bf5ef14c7c7d769ec127dcc6ed2e602ee2b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 12 Mar 2019 09:12:04 +0000 Subject: [PATCH 31/73] remove comment code. test=develop --- paddle/fluid/operators/detection/yolo_box_op.cu | 6 ------ 1 file changed, 6 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 12555f5347..7d223e84a8 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -83,12 +83,6 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { const int an_num = anchors.size() / 2; int input_size = downsample_ratio * h; - /* Tensor anchors_t, cpu_anchors_t; */ - /* auto cpu_anchors_data = */ - /* cpu_anchors_t.mutable_data({an_num * 2}, platform::CPUPlace()); */ - /* std::copy(anchors.begin(), anchors.end(), cpu_anchors_data); */ - /* TensorCopySync(cpu_anchors_t, ctx.GetPlace(), &anchors_t); */ - /* auto anchors_data = anchors_t.data(); */ auto& dev_ctx = ctx.cuda_device_context(); auto& allocator = platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); From 0ff9a403d00778e85f718665e09e54485c72908d Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 12 Mar 2019 18:40:17 +0800 Subject: [PATCH 32/73] fix format. 
test=develop --- paddle/fluid/operators/detection/yolo_box_op.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index 7d223e84a8..5a882958e6 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -84,15 +84,15 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { int input_size = downsample_ratio * h; auto& dev_ctx = ctx.cuda_device_context(); - auto& allocator = - platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); int bytes = sizeof(int) * anchors.size(); auto anchors_ptr = allocator.Allocate(sizeof(int) * anchors.size()); int* anchors_data = reinterpret_cast(anchors_ptr->ptr()); const auto gplace = boost::get(ctx.GetPlace()); const auto cplace = platform::CPUPlace(); memory::Copy(gplace, anchors_data, cplace, anchors.data(), bytes, - dev_ctx.stream()); + dev_ctx.stream()); const T* input_data = input->data(); const int* imgsize_data = img_size->data(); From 2c0abba0c388a051db2e2aad1268e14c1fc9ce90 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 12 Mar 2019 19:25:30 +0800 Subject: [PATCH 33/73] fix test_detection. test=develop --- python/paddle/fluid/layers/detection.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 8225b7067a..9183bfd43b 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -575,7 +575,7 @@ def yolov3_loss(x, raise TypeError("Input gtbox of yolov3_loss must be Variable") if not isinstance(gtlabel, Variable): raise TypeError("Input gtlabel of yolov3_loss must be Variable") - if gtsocre is not None and not isinstance(gtscore, Variable): + if gtscore is not None and not isinstance(gtscore, Variable): raise TypeError("Input gtscore of yolov3_loss must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): raise TypeError("Attr anchors of yolov3_loss must be list or tuple") From d94fd972306e2fb237212c341adc4f90f7181c06 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Wed, 13 Mar 2019 17:22:39 +0800 Subject: [PATCH 34/73] add runtime_context_cache_pass test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../ir/runtime_context_cache_pass.cc | 39 +++++++++++++++++++ .../framework/ir/runtime_context_cache_pass.h | 32 +++++++++++++++ paddle/fluid/framework/operator.cc | 20 ++++------ paddle/fluid/framework/operator.h | 8 ++++ paddle/fluid/framework/scope.cc | 4 -- paddle/fluid/framework/scope.h | 4 -- paddle/fluid/inference/api/analysis_config.cc | 8 ++++ .../inference/api/paddle_analysis_config.h | 26 +++++++++++++ .../inference/tests/api/config_printer.h | 3 +- paddle/fluid/pybind/inference_api.cc | 4 ++ 11 files changed, 127 insertions(+), 22 deletions(-) create mode 100644 paddle/fluid/framework/ir/runtime_context_cache_pass.cc create mode 100644 paddle/fluid/framework/ir/runtime_context_cache_pass.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index ca6b0229e9..f7d82d5ead 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -66,6 +66,7 @@ pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass 
inference) pass_library(identity_scale_op_clean_pass base) +pass_library(runtime_context_cache_pass base) # There may be many transpose-flatten structures in a model, and the output of # these structures will be used as inputs to the concat Op. This pattern will diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc new file mode 100644 index 0000000000..75f3795185 --- /dev/null +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/runtime_context_cache_pass.h" +#include +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr RuntimeContextCachePass::ApplyImpl( + std::unique_ptr graph) const { + VLOG(3) << "Applies Runtime Context Cache strategy."; + for (const Node* n : graph->Nodes()) { + if (n->IsOp()) { + n->Op()->SetAttr(kEnableRuntimeContext, true); + } + } + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(runtime_context_cache_pass, + paddle::framework::ir::RuntimeContextCachePass); diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.h b/paddle/fluid/framework/ir/runtime_context_cache_pass.h new file mode 100644 index 0000000000..a6cf1a9ae5 --- /dev/null +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +class RuntimeContextCachePass : public Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index eeced516ed..980c5d8582 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -20,7 +20,6 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/data_transform.h" -#include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -877,19 +876,14 @@ std::vector* OperatorWithKernel::GetKernelConfig( void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - const Scope* cur_scope = &scope; - // RuntimeContext is used to relate input/output names of Operator with - // the corresponding variables in Scope. - // In a same Scope, since the input/output names of Operator do not change - // in the execution, RuntimeContext could be created only at the first - // iteration of the execution to save the elapsed time. - // Note that the Scope should not be the local scope, since local scope - // would be cleaned regularly. - if (scope.FindVar(details::kLocalExecScopeName)) { + if (!HasAttr(kEnableRuntimeContext)) { runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); - } else if (!runtime_ctx_ || pre_scope_ != cur_scope) { - runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); - pre_scope_ = cur_scope; + } else { + const Scope* cur_scope = &scope; + if (!runtime_ctx_ || pre_scope_ != cur_scope) { + runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + pre_scope_ = cur_scope; + } } platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 6a2d4478a1..29b9c45ccf 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -62,6 +62,14 @@ constexpr char kZeroVarSuffix[] = "@ZERO"; /// Variables with this suffix are the new Gradient. constexpr char kNewGradSuffix[] = "@NEWGRAD@"; +/// RuntimeContext is used to relate input/output names of Operator with +/// the corresponding variables in Scope. +/// If an Op has attribute kEnableRuntimeContext, it means that in a same Scope, +/// since the input/output names of this Op do not change in the execution, +/// RuntimeContext could be created only at the first iteration of this Op's +/// execution to save the elapsed time. +constexpr char kEnableRuntimeContext[] = "@ENABLE_RUNTIME_CONTEXT@"; + // define some kernel priority /* Define multiple kernel type fallback order*/ extern std::vector> kKernelPriority; diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index e6de477171..87f0f307d3 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -107,10 +107,6 @@ const Scope* Scope::FindScope(const Variable* var) const { return FindScopeInternal(var); } -bool Scope::HasLocalVar(const std::string& name) const { - return vars_.find(name) != vars_.end(); -} - void Scope::DropKids() { SCOPE_KIDS_WRITER_LOCK for (Scope* s : kids_) delete s; diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 38d3b4d6ce..f0915d2eee 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -75,10 +75,6 @@ class Scope { /// Caller doesn't own the returned Variable. Variable* FindLocalVar(const std::string& name) const; - /// Find whether a variable in the current scope. - /// Return false if cannot find. - bool HasLocalVar(const std::string& name) const; - const Scope* parent() const { return parent_; } /// Find the scope or an ancestor scope that contains the given variable. 
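The runtime context cache attribute introduced above is wired into the inference AnalysisConfig in the remainder of this patch; once the Python binding (added below) is available, it can be toggled from user code. A minimal usage sketch, assuming a saved inference model directory (the import path and model path are assumptions, not part of this patch):

from paddle.fluid.core import AnalysisConfig  # assumed import path for this release

config = AnalysisConfig("/path/to/inference/model")  # hypothetical model dir
config.switch_runtime_context_cache(True)   # reuse RuntimeContext across iterations
print(config.runtime_context_cache_enabled())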
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 7741111222..a9e477f883 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -118,6 +118,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(serialized_info_cache_); + // framework related. + CP_MEMBER(enable_runtime_context_cache_); + if (use_gpu_) { pass_builder_.reset(new GpuPassStrategy( *static_cast(other.pass_builder()))); @@ -225,6 +228,10 @@ void AnalysisConfig::Update() { if (ir_debug_) { pass_builder()->TurnOnDebug(); } + + if (enable_runtime_context_cache_) { + pass_builder()->AppendPass("runtime_context_cache_pass"); + } } std::string AnalysisConfig::SerializeInfoCache() { @@ -258,6 +265,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << specify_input_name_; ss << cpu_math_library_num_threads_; + ss << enable_runtime_context_cache_; return ss.str(); } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 9b05c33504..85639eebe4 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -194,6 +194,23 @@ struct AnalysisConfig { /** Tell whether the memory optimization is activated. */ bool enable_memory_optim() const; + // framework related + /** \brief Control whether to perform runtime context cache optimization. + * + * If turned off, in Op's every execution, RuntimeContext would be called to + * relate input/output names of this Op with the corresponding variables in + * Scope. + */ + void SwitchRuntimeContextCache(int x = true) { + enable_runtime_context_cache_ = x; + } + /** A boolean state tell whether the runtime context cache optimization is + * actived. + */ + bool runtime_context_cache_enabled() const { + return enable_runtime_context_cache_; + } + friend class ::paddle::AnalysisPredictor; /** NOTE just for developer, not an official API, easily to be broken. @@ -254,6 +271,15 @@ struct AnalysisConfig { int cpu_math_library_num_threads_{1}; + // framework related + // RuntimeContext is used to relate input/output names of Operator with + // the corresponding variables in Scope. + // If enable_runtime_context_cache_ is true, it means that in a same Scope, + // since the input/output names of this Op do not change in the execution, + // RuntimeContext could be created only at the first iteration of this Op's + // execution to save the elapsed time. + bool enable_runtime_context_cache_{true}; + // A runtime cache, shouldn't be transferred to others. 
std::string serialized_info_cache_; diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index b0c23fbd53..b7b39d4dd4 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -72,7 +72,8 @@ std::ostream &operator<<(std::ostream &os, const AnalysisConfig &config) { } os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() << "\n"; - os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.ir_optim() + os << GenSpaces(num_spaces) + << "use_runtime_context_cache: " << config.runtime_context_cache_enabled() << "\n"; os << GenSpaces(num_spaces) << "use_feed_fetch_ops: " << config.use_feed_fetch_ops_enabled() << "\n"; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 236afc77f7..11e9725aea 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -242,6 +242,10 @@ void BindAnalysisConfig(py::module *m) { .def("set_mkldnn_op", &AnalysisConfig::SetMKLDNNOp) .def("set_model_buffer", &AnalysisConfig::SetModelBuffer) .def("model_from_memory", &AnalysisConfig::model_from_memory) + .def("runtime_context_cache_enabled", + &AnalysisConfig::runtime_context_cache_enabled) + .def("switch_runtime_context_cache", + &AnalysisConfig::SwitchRuntimeContextCache, py::arg("x") = true) .def("pass_builder", &AnalysisConfig::pass_builder, py::return_value_policy::reference); } From 1510b866b6b254584e3ff9fddebb76a436c1071f Mon Sep 17 00:00:00 2001 From: luotao1 Date: Wed, 13 Mar 2019 21:00:52 +0800 Subject: [PATCH 35/73] turn off runtime_context_cache for tensorrt test=develop --- paddle/fluid/inference/api/analysis_config.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index a9e477f883..2189b87381 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -205,6 +205,8 @@ void AnalysisConfig::Update() { // Append after the Affine_channel_conv_fuse pass. pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } + // runtime_context_cache isn't fit for tensorrt. 
+ enable_runtime_context_cache_ = false; } if (use_mkldnn_) { From 42e96a029fcf07d691fc61502432fdd4bce091ae Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 13 Mar 2019 22:44:35 +0800 Subject: [PATCH 36/73] Accelerate CPU part --- CMakeLists.txt | 10 ++++++ paddle/fluid/framework/grad_op_desc_maker.h | 5 ++- paddle/fluid/imperative/CMakeLists.txt | 1 + paddle/fluid/imperative/layer.cc | 6 ++-- paddle/fluid/imperative/layer.h | 28 ++++++++++------ paddle/fluid/imperative/tracer.cc | 37 ++------------------- paddle/fluid/imperative/tracer.h | 4 +++ paddle/fluid/inference/CMakeLists.txt | 3 +- paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/pybind.cc | 7 ++++ python/paddle/fluid/__init__.py | 3 +- python/paddle/fluid/framework.py | 4 +++ python/paddle/fluid/imperative/__init__.py | 4 +++ python/paddle/fluid/imperative/nn.py | 3 ++ 14 files changed, 64 insertions(+), 53 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e7ffe72b5..ce06c73bdc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -37,6 +37,16 @@ if(WIN32) set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") +# else() + # set(CMAKE_C_ARCHIVE_CREATE " --target elf64-x86-64 cr ") + # set(CMAKE_C_ARCHIVE_APPEND " --target elf64-x86-64 r ") + # # set(CMAKE_C_ARCHIVE_FINISH " --enable-64-bit-archive ") + # set(CMAKE_CXX_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE}) + # set(CMAKE_CXX_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND}) + # # set(CMAKE_CXX_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH}) + # set(CMAKE_Fortran_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE}) + # set(CMAKE_Fortran_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND}) + # # set(CMAKE_Fortran_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH}) endif(WIN32) find_package(CUDA QUIET) diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index 9bccb1a32b..46ebf4051f 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -55,7 +55,10 @@ class GradOpDescMakerBase { std::back_inserter(ret_val), [this](const std::string& fwd_var_name) -> std::string { auto g_name = GradVarName(fwd_var_name); - if (no_grad_set_.count(g_name)) { + if (no_grad_set_.empty()) { + (*this->grad_to_var_)[g_name] = fwd_var_name; + return g_name; + } else if (no_grad_set_.count(g_name)) { return kEmptyVarName; } else { (*this->grad_to_var_)[g_name] = fwd_var_name; diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index ec8dedd605..0d116a6495 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -2,4 +2,5 @@ if(WITH_PYTHON) cc_library(layer SRCS layer.cc DEPS proto_desc operator device_context blas pybind) cc_library(tracer SRCS tracer.cc DEPS proto_desc device_context pybind) cc_library(engine SRCS engine.cc) +cc_library(imperative_profiler SRCS profiler.cc) endif() diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 5530823b90..d7f7967f72 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -239,7 +239,7 @@ std::map> OpBase::ApplyGrad() { VLOG(3) << "apply grad op " << grad_op_desc->Type(); // Allocate tmp grad output variable - for (auto it : grad_output_variable_map) { + for (const auto& it : grad_output_variable_map) { auto& outputs = tmp_grad_outputs[k][it.first]; 
outputs.reserve(it.second.size()); for (size_t i = 0; i < it.second.size(); ++i) { @@ -273,9 +273,9 @@ std::map> OpBase::ApplyGrad() { // Add tmp grad outputs to original grad vars for (size_t k = 0; k < grad_output_vars_.size(); ++k) { - for (auto it : grad_output_vars_[k]) { + for (const auto& it : grad_output_vars_[k]) { auto& outputs = tmp_grad_outputs[k][it.first]; - auto& origin_outputs = it.second; + const auto& origin_outputs = it.second; PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); for (size_t i = 0; i < outputs.size(); ++i) { diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 618a5b7a03..27cb1c84f5 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -294,17 +294,23 @@ class PYBIND11_HIDDEN OpBase { void InvokeBackwardHooks(); - void TrackPreOp(const VarBase* inp_var, const std::string& inp_name) { - if (inp_var->PreOp() && !inp_var->IsStopGradient()) { - VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " - << inp_name; - pre_ops_[inp_name].push_back(inp_var->PreOp()); - pre_ops_out_idx_[inp_name].push_back(inp_var->PreOpOutIdx()); - } else { - VLOG(3) << "no pre op in slot " << inp_name - << " input var stop_gradient: " << inp_var->IsStopGradient(); - pre_ops_[inp_name].push_back(nullptr); - // pre_ops_out_idx_[inp_name].push_back(-1); + void TrackPreOp(const std::string& inp_name, + const std::vector& inputs) { + auto& pre_ops_list = pre_ops_[inp_name]; + pre_ops_list.reserve(inputs.size()); + auto& pre_ops_out_idx_list = pre_ops_out_idx_[inp_name]; + for (VarBase* inp_var : inputs) { + if (inp_var->PreOp() && !inp_var->IsStopGradient()) { + VLOG(3) << "add pre op " << inp_var->PreOp()->Type() << " in slot " + << inp_name; + pre_ops_list.emplace_back(inp_var->PreOp()); + pre_ops_out_idx_list.push_back(inp_var->PreOpOutIdx()); + } else { + VLOG(3) << "no pre op in slot " << inp_name + << " input var stop_gradient: " << inp_var->IsStopGradient(); + pre_ops_list.emplace_back(nullptr); + // pre_ops_out_idx_list.push_back(-1); + } } } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7ee92b4d8c..7773a3f8fc 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -23,23 +23,9 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#ifdef WITH_GPERFTOOLS -#include "gperftools/profiler.h" -#endif - -DEFINE_string( - tracer_profile_fname, "", - "Profiler filename for imperative tracer, which generated by gperftools." - "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); - namespace paddle { namespace imperative { -static std::once_flag gTracerProfileOnce; -#ifdef WITH_GPERFTOOLS -static bool gTracerProfilerStarted = false; -#endif - void CreateGradOp(const framework::OpDesc& op_desc, const std::unordered_set& no_grad_set, const std::vector& grad_sub_block, @@ -146,17 +132,6 @@ framework::VariableNameMap CreateOutputVarNameMap( } Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { - if (!FLAGS_tracer_profile_fname.empty()) { - std::call_once(gTracerProfileOnce, [] { -#ifdef WITH_GPERFTOOLS - ProfilerStart(FLAGS_tracer_profile_fname.c_str()); - gTracerProfilerStarted = true; -#else - LOG(WARNING) << "Paddle is not compiled with gperftools. 
" - "FLAGS_tracer_profile_fname will be ignored"; -#endif - }); - } } std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, @@ -164,12 +139,6 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient) { -#ifdef WITH_GPERFTOOLS - if (gTracerProfilerStarted) { - ProfilerFlush(); - } -#endif - framework::VariableValueMap invars_map; framework::VariableValueMap outvars_map; @@ -184,7 +153,6 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, inp->Name()); invars.emplace_back(inp->var_); - op->TrackPreOp(inp, it.first); if (!stop_gradient) { current_vars_map[inp->Name()] = inp; } @@ -192,6 +160,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, << " inited: " << inp->var_->IsInitialized() << " stop_grad: " << inp->IsStopGradient(); } + op->TrackPreOp(it.first, it.second); } op->output_vars_ = outputs; @@ -319,9 +288,7 @@ std::vector Tracer::PyTrace(OpBase* op, std::vector ret_vars = PyLayer::Apply(op->forward_id_, inputs); - for (VarBase* inp : inputs) { - op->TrackPreOp(inp, PyLayer::kFwdInp); - } + op->TrackPreOp(PyLayer::kFwdInp, inputs); std::vector& outputs = op->output_vars_[PyLayer::kFwdOut]; outputs.reserve(ret_vars.size()); diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 7b65d55e9e..62d8eecfcc 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -56,6 +56,10 @@ class Tracer { std::vector PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient = false); + static void StartProfile(); + + static void StopProfile(); + private: platform::Place GetPlace(const VarBasePtrMap& inputs); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 762640d6d1..0d682fc0a5 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -29,6 +29,7 @@ endif(WIN32) if(WIN32) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) else(WIN32) + #set(CMAKE_C_ARCHIVE_CREATE " --target elf64-x86-64 cr paddle_fluid_origin ${fluid_modules} paddle_fluid_api") cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) endif(WIN32) @@ -91,5 +92,5 @@ if(WITH_TESTING) add_subdirectory(tests/book) if(WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) - endif() + endif() endif() diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 4ac5b83c56..f1385f5718 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,6 @@ set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer scope_pool - tracer analysis_predictor) + tracer analysis_predictor imperative_profiler) if(WITH_PYTHON) list(APPEND PYBIND_DEPS py_func_op) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 395093a1f5..7a6a5b8645 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,6 +36,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/profiler.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include "paddle/fluid/operators/activation_op.h" @@ -148,6 +149,12 @@ PYBIND11_MODULE(core, m) { m.def("print_mem_usage", []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); + m.def("start_imperative_profiler", + []() { imperative::StartProfile(); }); + + m.def("stop_imperative_profiler", + []() { imperative::StopProfile(); }); + py::class_(m, "VarBase", R"DOC()DOC") .def( py::init Date: Thu, 14 Mar 2019 10:20:03 +0800 Subject: [PATCH 37/73] Release GIL lock --- paddle/fluid/CMakeLists.txt | 2 +- paddle/fluid/inference/CMakeLists.txt | 52 +++++++++++++-------------- paddle/fluid/pybind/imperative.cc | 2 ++ 3 files changed, 29 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 595454e90b..6e951e42a0 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,4 +9,4 @@ add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) -add_subdirectory(train) +#add_subdirectory(train) diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 0d682fc0a5..a89dc3caf5 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -25,13 +25,13 @@ if (WIN32) list(APPEND fluid_third_partys gflags glog protobuf cblas) endif(WIN32) -# paddle_fluid_origin exclude inference api interface -if(WIN32) - sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) -else(WIN32) - #set(CMAKE_C_ARCHIVE_CREATE " --target elf64-x86-64 cr paddle_fluid_origin ${fluid_modules} paddle_fluid_api") - cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) -endif(WIN32) +# # paddle_fluid_origin exclude inference api interface +# if(WIN32) + # sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +# else(WIN32) + # set(CMAKE_C_ARCHIVE_CREATE " --target elf64-x86-64 cr paddle_fluid_origin ${fluid_modules} paddle_fluid_api") + # cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +# endif(WIN32) add_subdirectory(api) @@ -41,19 +41,19 @@ set(SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) -if(WIN32) - sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array - analysis_config paddle_pass_builder) -else(WIN32) - cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} - zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) -endif(WIN32) +# if(WIN32) + # sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array + # analysis_config paddle_pass_builder) +# else(WIN32) + # cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} + # zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) +# endif(WIN32) -if(NOT APPLE) - # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
- set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") - set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}") -endif() +# if(NOT APPLE) + # # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. + # set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") + # set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}") +# endif() # Create shared library if(WIN32) @@ -87,10 +87,10 @@ if(NOT APPLE AND NOT WIN32) add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") endif() -if(WITH_TESTING) - # tests/book depends the models that generated by python/paddle/fluid/tests/book - add_subdirectory(tests/book) - if(WITH_INFERENCE_API_TEST) - add_subdirectory(tests/api) - endif() -endif() +# if(WITH_TESTING) + # # tests/book depends the models that generated by python/paddle/fluid/tests/book + # add_subdirectory(tests/book) + # if(WITH_INFERENCE_API_TEST) + # add_subdirectory(tests/api) + # endif() +# endif() diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 6bbda69297..29c8e6a129 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -42,6 +42,7 @@ void BindTracer(pybind11::module* m) { framework::AttributeMap attrs_map, const platform::CPUPlace expected_place, const bool stop_gradient = false) { + pybind11::gil_scoped_release release; return self.Trace(op, inputs, outputs, attrs_map, expected_place, stop_gradient); }) @@ -52,6 +53,7 @@ void BindTracer(pybind11::module* m) { framework::AttributeMap attrs_map, const platform::CUDAPlace expected_place, const bool stop_gradient = false) { + pybind11::gil_scoped_release release; return self.Trace(op, inputs, outputs, attrs_map, expected_place, stop_gradient); }) From 7355d41834928eeb341180fef50acdb4620aa8f1 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 14 Mar 2019 14:55:26 +0800 Subject: [PATCH 38/73] 1. Add imperative gperf profiler 2. 
Add binutils 2.27 in manylinux support test=develop --- CMakeLists.txt | 12 +--- paddle/fluid/CMakeLists.txt | 2 +- paddle/fluid/imperative/profiler.cc | 65 ++++++++++++++++++++++ paddle/fluid/imperative/profiler.h | 25 +++++++++ paddle/fluid/inference/CMakeLists.txt | 51 +++++++++-------- paddle/fluid/pybind/pybind.cc | 5 +- python/paddle/fluid/imperative/profiler.py | 30 ++++++++++ tools/manylinux1/build_scripts/build.sh | 6 ++ 8 files changed, 156 insertions(+), 40 deletions(-) create mode 100644 paddle/fluid/imperative/profiler.cc create mode 100644 paddle/fluid/imperative/profiler.h create mode 100644 python/paddle/fluid/imperative/profiler.py diff --git a/CMakeLists.txt b/CMakeLists.txt index ce06c73bdc..6bb0e5f51f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,8 @@ message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: " "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}") message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: " "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}") +message(STATUS "AR tools: ${CMAKE_AR}") + if(WIN32) set(CMAKE_SUPPRESS_REGENERATION ON) set(CMAKE_STATIC_LIBRARY_PREFIX lib) @@ -37,16 +39,6 @@ if(WIN32) set(CMAKE_STATIC_LINKER_FLAGS "${CMAKE_STATIC_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${PADDLE_LINK_FLAGS}") -# else() - # set(CMAKE_C_ARCHIVE_CREATE " --target elf64-x86-64 cr ") - # set(CMAKE_C_ARCHIVE_APPEND " --target elf64-x86-64 r ") - # # set(CMAKE_C_ARCHIVE_FINISH " --enable-64-bit-archive ") - # set(CMAKE_CXX_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE}) - # set(CMAKE_CXX_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND}) - # # set(CMAKE_CXX_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH}) - # set(CMAKE_Fortran_ARCHIVE_CREATE ${CMAKE_C_ARCHIVE_CREATE}) - # set(CMAKE_Fortran_ARCHIVE_APPEND ${CMAKE_C_ARCHIVE_APPEND}) - # # set(CMAKE_Fortran_ARCHIVE_FINISH ${CMAKE_C_ARCHIVE_FINISH}) endif(WIN32) find_package(CUDA QUIET) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 6e951e42a0..595454e90b 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -9,4 +9,4 @@ add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) -#add_subdirectory(train) +add_subdirectory(train) diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc new file mode 100644 index 0000000000..828c36c5ae --- /dev/null +++ b/paddle/fluid/imperative/profiler.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/profiler.h" + +#ifdef WITH_GPERFTOOLS +#include "gperftools/profiler.h" +#endif +#include +#include +#include // NOLINT +#include // NOLINT + +DEFINE_string( + tracer_profile_fname, "xxgperf", + "Profiler filename for imperative tracer, which generated by gperftools." 
+ "Only valid when compiled `WITH_PROFILER=ON`. Empty if disable."); + +namespace paddle { +namespace imperative { + +static std::once_flag gTracerProfileOnce; +#ifdef WITH_GPERFTOOLS +static bool gTracerProfilerStarted = false; +#endif + +void StartProfile() { + LOG(ERROR) << "XX " << FLAGS_tracer_profile_fname; + if (!FLAGS_tracer_profile_fname.empty()) { + std::call_once(gTracerProfileOnce, [] { +#ifdef WITH_GPERFTOOLS + ProfilerStart(FLAGS_tracer_profile_fname.c_str()); + gTracerProfilerStarted = true; + LOG(ERROR) << "YY"; +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_tracer_profile_fname will be ignored"; +#endif + }); + } +} + +void StopProfile() { + LOG(ERROR) << "ZZ " << FLAGS_tracer_profile_fname; +#ifdef WITH_GPERFTOOLS + ProfilerFlush(); +#else + LOG(WARNING) << "Paddle is not compiled with gperftools. " + "FLAGS_tracer_profile_fname will be ignored"; +#endif +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/profiler.h b/paddle/fluid/imperative/profiler.h new file mode 100644 index 0000000000..d52aeed4e8 --- /dev/null +++ b/paddle/fluid/imperative/profiler.h @@ -0,0 +1,25 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +namespace paddle { +namespace imperative { + +extern void StartProfile(); + +extern void StopProfile(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index a89dc3caf5..d27ef8fe3c 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -25,13 +25,12 @@ if (WIN32) list(APPEND fluid_third_partys gflags glog protobuf cblas) endif(WIN32) -# # paddle_fluid_origin exclude inference api interface -# if(WIN32) - # sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) -# else(WIN32) - # set(CMAKE_C_ARCHIVE_CREATE " --target elf64-x86-64 cr paddle_fluid_origin ${fluid_modules} paddle_fluid_api") - # cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) -# endif(WIN32) +# paddle_fluid_origin exclude inference api interface +if(WIN32) + sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +else(WIN32) + cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) +endif(WIN32) add_subdirectory(api) @@ -41,19 +40,19 @@ set(SHARED_INFERENCE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc) -# if(WIN32) - # sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array - # analysis_config paddle_pass_builder) -# else(WIN32) - # cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} - # zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) -# endif(WIN32) +if(WIN32) + sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array + analysis_config paddle_pass_builder) +else(WIN32) + cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} + zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) +endif(WIN32) -# if(NOT APPLE) - # # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. - # set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") - # set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}") -# endif() +if(NOT APPLE) + # TODO(liuyiqu: Temporarily disable the link flag because it is not support on Mac. 
+ set(LINK_FLAGS "-Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/paddle_fluid.sym") + set_target_properties(paddle_fluid PROPERTIES LINK_FLAGS "${LINK_FLAGS}") +endif() # Create shared library if(WIN32) @@ -87,10 +86,10 @@ if(NOT APPLE AND NOT WIN32) add_custom_target(check_symbol ALL DEPENDS "${CMAKE_CURRENT_BINARY_DIR}/.check_symbol") endif() -# if(WITH_TESTING) - # # tests/book depends the models that generated by python/paddle/fluid/tests/book - # add_subdirectory(tests/book) - # if(WITH_INFERENCE_API_TEST) - # add_subdirectory(tests/api) - # endif() -# endif() +if(WITH_TESTING) + # tests/book depends the models that generated by python/paddle/fluid/tests/book + add_subdirectory(tests/book) + if(WITH_INFERENCE_API_TEST) + add_subdirectory(tests/api) + endif() +endif() diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 7a6a5b8645..df6e2fbab6 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -149,11 +149,10 @@ PYBIND11_MODULE(core, m) { m.def("print_mem_usage", []() { return memory::allocation::GPUMemMonitor.PrintMemUsage(); }); - m.def("start_imperative_profiler", + m.def("start_imperative_gperf_profiler", []() { imperative::StartProfile(); }); - m.def("stop_imperative_profiler", - []() { imperative::StopProfile(); }); + m.def("stop_imperative_gperf_profiler", []() { imperative::StopProfile(); }); py::class_(m, "VarBase", R"DOC()DOC") .def( diff --git a/python/paddle/fluid/imperative/profiler.py b/python/paddle/fluid/imperative/profiler.py new file mode 100644 index 0000000000..04c865500b --- /dev/null +++ b/python/paddle/fluid/imperative/profiler.py @@ -0,0 +1,30 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from .. 
import core + +__all__ = [ + 'start_gperf_profiler', + 'stop_gperf_profiler', +] + + +def start_gperf_profiler(): + core.start_imperative_gperf_profiler() + + +def stop_gperf_profiler(): + core.stop_imperative_gperf_profiler() diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh index 1b0059a8c6..3be94a42d5 100644 --- a/tools/manylinux1/build_scripts/build.sh +++ b/tools/manylinux1/build_scripts/build.sh @@ -153,3 +153,9 @@ done # Restore LD_LIBRARY_PATH LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}" + +# According to ar issues: https://lists.gnu.org/archive/html/bug-binutils/2016-05/msg00211.html +# we should install new version ar with 64-bit supported here +wget https://ftp.gnu.org/gnu/binutils/binutils-2.27.tar.gz +tar xzf binutils-2.27.tar.gz && cd binutils-2.27 +./configure --prefix=/opt/rh/devtoolset-2/root/usr/ --enable-64-bit-archive && make -j `nproc` && make install From f83739499c9d43fa4260884ad79de8d91f87c841 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 14 Mar 2019 14:57:58 +0800 Subject: [PATCH 39/73] Polish code test=develop --- paddle/fluid/imperative/tracer.h | 4 ---- python/paddle/fluid/framework.py | 4 ---- python/paddle/fluid/imperative/nn.py | 3 --- 3 files changed, 11 deletions(-) diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 62d8eecfcc..7b65d55e9e 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -56,10 +56,6 @@ class Tracer { std::vector PyTrace(OpBase* op, const std::vector& inputs, bool stop_gradient = false); - static void StartProfile(); - - static void StopProfile(); - private: platform::Place GetPlace(const VarBasePtrMap& inputs); diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 3fcea010db..5b9dd86931 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -290,7 +290,6 @@ class Variable(object): dtype='float32') """ - # @profile def __init__(self, block, type=core.VarDesc.VarType.LOD_TENSOR, @@ -646,7 +645,6 @@ class Operator(object): 'checkpoint_notify', 'gen_nccl_id' } - # @profile def __init__(self, block, desc, @@ -1241,7 +1239,6 @@ class Block(object): return (item[1] for item in six.iteritems(self.vars) if isinstance(item[1], Parameter)) - # @profile def create_var(self, *args, **kwargs): var = Variable(block=self, *args, **kwargs) if 'initializer' in kwargs: @@ -1350,7 +1347,6 @@ class Block(object): initializer(param, self) return param - # @profile def append_op(self, *args, **kwargs): """ Appends a new Operator according to the giving arguments. 
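For reference, the profiling hooks added above (paddle::imperative::StartProfile and StopProfile, exported to Python as start_imperative_gperf_profiler / stop_imperative_gperf_profiler and wrapped by the new profiler.py helpers) would be driven from C++ roughly as sketched below; this assumes a Paddle build with gperftools available and omits the actual dygraph workload.

#include "paddle/fluid/imperative/profiler.h"

int main() {
  // Start writing gperftools samples to the file named by
  // FLAGS_tracer_profile_fname; without gperftools this only logs a warning.
  paddle::imperative::StartProfile();

  // ... run the imperative (dygraph) workload to be profiled here ...

  // Flush the collected samples so the profile can be inspected with pprof.
  paddle::imperative::StopProfile();
  return 0;
}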
diff --git a/python/paddle/fluid/imperative/nn.py b/python/paddle/fluid/imperative/nn.py index 07a690e704..bf3f7ce52e 100644 --- a/python/paddle/fluid/imperative/nn.py +++ b/python/paddle/fluid/imperative/nn.py @@ -97,7 +97,6 @@ class Conv2D(layers.Layer): dtype=self._dtype, is_bias=True) - # @profile def forward(self, input): pre_bias = self._helper.create_variable_for_type_inference( dtype=self._dtype) @@ -170,7 +169,6 @@ class Pool2D(layers.Layer): self._exclusive = exclusive self._l_type = 'pool2d' - # @profile def forward(self, input): pool_out = self._helper.create_variable_for_type_inference(self._dtype) @@ -231,7 +229,6 @@ class FC(layers.Layer): else: self._b = None - # @profile def forward(self, input): tmp = self._helper.create_variable_for_type_inference(self._dtype) self._helper.append_op( From a59b7d47a86018418d9d9eaef77789feb706aecc Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 15 Mar 2019 02:06:58 +0000 Subject: [PATCH 40/73] improve layers.fc api doc test=develop --- paddle/fluid/API.spec | 16 +--------- python/paddle/fluid/layers/nn.py | 55 +++++++++++++++++++++++++------- 2 files changed, 44 insertions(+), 27 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 68c6c8fd67..c4b984cd23 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -68,7 +68,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'unifor paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee')) paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29')) paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '1929058262994f212620599c63aea6bd')) +paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '0fd03868c3c4f25d7f8d43daac69e6d3')) paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a')) paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6')) paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d')) @@ -511,17 +511,3 @@ 
paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310')) paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7')) paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope -paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d')) -paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb')) -paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d')) -paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) -paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) -paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) -paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560')) -paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69')) -paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) -paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada')) -paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a')) -paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8')) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9886f4e84c..8cf0a457fd 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -205,16 +205,23 @@ def fc(input, **Fully Connected Layer** This function creates a fully connected layer in the network. It can take - multiple tensors as its inputs. It creates a variable called weights for - each input tensor, which represents a fully connected weight matrix from - each input unit to each output unit. 
The fully connected layer multiplies - each input tensor with its coresponding weight to produce an output Tensor. - If multiple input tensors are given, the results of multiple multiplications - will be sumed up. If bias_attr is not None, a bias variable will be created - and added to the output. Finally, if activation is not None, it will be applied - to the output as well. + one or multiple tensors as its inputs (input can be a list of Variable; see + Args for details). It creates a variable called weights for each input tensor, + which represents a fully connected weight matrix from each input unit to + each output unit. The fully connected layer multiplies each input tensor + with its corresponding weight to produce an output Tensor with shape [M, `size`], + where M is the batch size. If multiple input tensors are given, the results of + multiple output tensors with shape [M, `size`] will be summed up. If bias_attr + is not None, a bias variable will be created and added to the output. + Finally, if activation is not None, it will be applied to the output as well. + + When the input is a single tensor: - This process can be formulated as follows: + .. math:: + + Out = Act({XW + b}) + + When the input consists of multiple tensors: .. math:: @@ -222,13 +229,31 @@ def fc(input, In the above equation: - * :math:`N`: Number of the input. - * :math:`X_i`: The input tensor. - * :math:`W`: The weights created by this layer. + * :math:`N`: Number of the inputs. N equals len(input) if input is a list of Variable. + * :math:`X_i`: The i-th input tensor. + * :math:`W_i`: The i-th weight matrix corresponding to the i-th input tensor. * :math:`b`: The bias parameter created by this layer (if needed). * :math:`Act`: The activation function. * :math:`Out`: The output tensor. + See below for an example. + + .. code-block:: text + + Given: + data_1.data = [[[0.1, 0.2], + [0.3, 0.4]]] + data_1.shape = (1, 2, 2) # 1 is batch_size + + data_2.data = [[[0.1, 0.2, 0.3]]] + data_2.shape = (1, 1, 3) + + out = fluid.layers.fc(input=[data_1, data_2], size=2) + + Then: + out.data = [[0.18669507, 0.1893476]] + out.shape = (1, 2) + Args: input (Variable|list of Variable): The input tensor(s) of this layer, and the dimension of the input tensor(s) is at least 2. @@ -260,8 +285,14 @@ def fc(input, Examples: ..
code-block:: python + # when input is single tensor data = fluid.layers.data(name="data", shape=[32, 32], dtype="float32") fc = fluid.layers.fc(input=data, size=1000, act="tanh") + + # when input are multiple tensors + data_1 = fluid.layers.data(name="data_1", shape=[32, 32], dtype="float32") + data_2 = fluid.layers.data(name="data_2", shape=[24, 36], dtype="float32") + fc = fluid.layers.fc(input=[data_1, data_2], size=1000, act="tanh") """ helper = LayerHelper("fc", **locals()) From e5e7628a623f86a6bd6831c6e5ad64753fa92817 Mon Sep 17 00:00:00 2001 From: Dang Qingqing Date: Fri, 15 Mar 2019 10:09:59 +0800 Subject: [PATCH 41/73] Skip compile infer shape in box_coder_op test=develop --- paddle/fluid/operators/detection/box_coder_op.cc | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/detection/box_coder_op.cc b/paddle/fluid/operators/detection/box_coder_op.cc index 0a51d50e06..de36126774 100644 --- a/paddle/fluid/operators/detection/box_coder_op.cc +++ b/paddle/fluid/operators/detection/box_coder_op.cc @@ -60,14 +60,15 @@ class BoxCoderOp : public framework::OperatorWithKernel { } else if (code_type == BoxCodeType::kDecodeCenterSize) { PADDLE_ENFORCE_EQ(target_box_dims.size(), 3, "The rank of Input TargetBox must be 3"); - if (axis == 0) { - PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); - } else if (axis == 1) { - PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); - } else { - PADDLE_THROW("axis must be 0 or 1."); + PADDLE_ENFORCE(axis == 0 || axis == 1, "axis must be 0 or 1"); + if (ctx->IsRuntime()) { + if (axis == 0) { + PADDLE_ENFORCE_EQ(target_box_dims[1], prior_box_dims[0]); + } else if (axis == 1) { + PADDLE_ENFORCE_EQ(target_box_dims[0], prior_box_dims[0]); + } + PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); } - PADDLE_ENFORCE_EQ(target_box_dims[2], prior_box_dims[1]); ctx->ShareDim("TargetBox", /*->*/ "OutputBox"); } From f0d108f58903bf8ade17186838c8a4517b5b2ecf Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 15 Mar 2019 02:06:34 +0000 Subject: [PATCH 42/73] fix const_cast test=develop --- paddle/fluid/operators/optimizers/adam_op.h | 49 ++++++------------- .../fluid/operators/optimizers/momentum_op.h | 19 +++---- .../fluid/operators/optimizers/rmsprop_op.h | 18 ++----- 3 files changed, 25 insertions(+), 61 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 09255f60e6..6262ef0c2d 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include // for sqrt in CPU and CUDA #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/threadpool.h" @@ -311,17 +312,17 @@ struct SparseAdamFunctor { T beta1_pow = *beta1_pow_; T beta2_pow = *beta2_pow_; lr *= sqrt(1 - beta2_pow) / (1 - beta1_pow); - size_t row_count = numel / row_numel_; + int64_t row_count = static_cast(numel / row_numel_); - for (size_t i = 0U, j = 0U; i != row_count; ++i) { + for (int64_t i = 0, j = 0; i != row_count; ++i) { if (i == *(rows_ + j)) { - for (size_t k = 0U; k != row_numel_; ++k) { + for (int64_t k = 0; k != row_numel_; ++k) { T g = grad_[j * row_numel_ + k]; adam_update(i * row_numel_ + k, g); } ++j; } else { - for (size_t k = 0U; k != row_numel_; ++k) { + for (int64_t k = 0; k != row_numel_; ++k) { T mom1 = moment1_[i * row_numel_ + k]; T mom2 = moment2_[i * row_numel_ + k]; T p = param_[i * row_numel_ + k]; @@ -427,43 +428,23 @@ class AdamOpKernel : public framework::OpKernel { } } - framework::SelectedRows cpu_grad_merge; + framework::SelectedRows tmp_grad_merge; const framework::SelectedRows* grad_merge_ptr; if (is_strict_sorted) { grad_merge_ptr = &grad; } else { // merge duplicated rows if any. // The rows of grad_merge have been sorted inside MergeAdd functor - framework::SelectedRows* grad_merge_var; scatter::MergeAdd merge_func; - if (platform::is_cpu_place(ctx.GetPlace())) { - grad_merge_var = &cpu_grad_merge; - } else { - // FIXME(qiao): GPU also need to fix this - grad_merge_var = const_cast(ctx.scope()) - .Var() - ->GetMutable(); - } merge_func(ctx.template device_context(), grad, - grad_merge_var, true); - grad_merge_ptr = grad_merge_var; + &tmp_grad_merge, true); + grad_merge_ptr = &tmp_grad_merge; } auto& grad_merge = *grad_merge_ptr; auto& grad_tensor = grad_merge.value(); const T* grad_data = grad_tensor.template data(); - const int64_t* rows = nullptr; -// When compiled without CUDA, the CUDAData() interface should not be -// provided. 
-#if defined(PADDLE_WITH_CUDA) - if (platform::is_gpu_place(ctx.GetPlace())) { - rows = grad_merge.rows().CUDAData(ctx.GetPlace()); - } else { -#endif - rows = grad_merge.rows().data(); -#if defined(PADDLE_WITH_CUDA) - } -#endif + const int64_t* rows = grad_merge.rows().Data(ctx.GetPlace()); auto row_numel = grad_tensor.numel() / grad_merge.rows().size(); if (platform::is_cpu_place(ctx.GetPlace())) { @@ -488,7 +469,7 @@ class AdamOpKernel : public framework::OpKernel { } } #ifndef _WIN32 - else if (FLAGS_inner_op_parallelism > 1 && + else if (FLAGS_inner_op_parallelism > 1 && // NOLINT min_row_size_to_use_multithread > 0 && param.dims()[0] > min_row_size_to_use_multithread) { VLOG(3) << "use multi thread, inner_op_parallelism=" @@ -516,11 +497,11 @@ class AdamOpKernel : public framework::OpKernel { for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) { int64_t start = i * line_in_each_thread; int64_t end = (i + 1) * line_in_each_thread; - if (start >= param_row_count) { + if (start >= static_cast(param_row_count)) { break; } - if (end > param_row_count) { - end = param_row_count; + if (end > static_cast(param_row_count)) { + end = static_cast(param_row_count); } fs.push_back( framework::Async([&functor, &row_id_to_grad_row_offset, @@ -545,8 +526,8 @@ class AdamOpKernel : public framework::OpKernel { } for (size_t i = 0; i < fs.size(); ++i) fs[i].wait(); } -#endif // !_WIN32 - else { +#endif // !_WIN32 + else { // NOLINT functor(param.numel()); } } else if (platform::is_gpu_place(ctx.GetPlace())) { diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index 3ed1bff5ff..29a2ae6755 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" @@ -69,6 +70,7 @@ class MomentumOp : public framework::OperatorWithKernel { ctx->SetOutputDim("ParamOut", param_dim); ctx->SetOutputDim("VelocityOut", param_dim); } + framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param")); @@ -351,23 +353,14 @@ class MomentumOpKernel : public framework::OpKernel { VLOG(3) << "Grad SelectedRows contains no data!"; return; } - auto* merged_grad = const_cast(ctx.scope()) - .Var() - ->GetMutable(); + + framework::SelectedRows tmp_merged_grad; + framework::SelectedRows* merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); - const int64_t* rows = nullptr; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - rows = merged_grad->rows().CUDAData(ctx.GetPlace()); - } else { -#endif - rows = merged_grad->rows().data(); -#ifdef PADDLE_WITH_CUDA - } -#endif + const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace()); int64_t row_numel = merged_grad->value().numel() / merged_grad->rows().size(); platform::ForRange for_range( diff --git a/paddle/fluid/operators/optimizers/rmsprop_op.h b/paddle/fluid/operators/optimizers/rmsprop_op.h index 389c84d246..4550052b2d 100644 --- a/paddle/fluid/operators/optimizers/rmsprop_op.h +++ b/paddle/fluid/operators/optimizers/rmsprop_op.h @@ -216,24 +216,14 @@ class RmspropOpKernel : public framework::OpKernel { } } else if (grad_var->IsType()) { auto &grad = grad_var->Get(); - auto *merged_grad = const_cast(ctx.scope()) - .Var() - ->GetMutable(); - + framework::SelectedRows tmp_merged_grad; + framework::SelectedRows *merged_grad = &tmp_merged_grad; math::scatter::MergeAdd merge_func; merge_func(dev_ctx, grad, merged_grad); platform::ForRange for_range(dev_ctx, limit); - const int64_t *rows; -#ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - rows = merged_grad->rows().CUDAData(ctx.GetPlace()); - } else { -#endif - rows = merged_grad->rows().data(); -#ifdef PADDLE_WITH_CUDA - } -#endif + const int64_t *rows = merged_grad->rows().Data(ctx.GetPlace()); + auto &merged_tensor = merged_grad->value(); int64_t row_count = merged_grad->rows().size(); int64_t row_numel = merged_tensor.numel() / row_count; From ca392c7e971d64eeddd38c86b532044ca7e2593e Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 15 Mar 2019 10:24:06 +0800 Subject: [PATCH 43/73] Implement infer var type context --- .../fluid/framework/details/graph_test_base.h | 10 +- paddle/fluid/framework/details/op_registry.h | 4 +- paddle/fluid/framework/ir/graph_test.cc | 14 +-- paddle/fluid/framework/op_desc.cc | 4 +- paddle/fluid/framework/type_defs.h | 3 +- paddle/fluid/framework/var_type_inference.h | 106 ++++++++++++++++-- .../framework/var_type_inference_test.cc | 12 +- .../fluid/operators/beam_search_decode_op.cc | 21 ++-- paddle/fluid/operators/beam_search_op.cc | 15 +-- .../operators/controlflow/get_places_op.cc | 8 +- .../controlflow/tensor_array_read_write_op.cc | 15 +-- .../operators/distributed_ops/merge_ids_op.cc | 9 +- .../operators/distributed_ops/split_ids_op.cc | 9 +- paddle/fluid/operators/fill_constant_op.cc | 9 +- .../fused/fused_embedding_seq_pool_op.cc | 14 +-- .../get_tensor_from_selected_rows_op.cc | 15 +-- .../operators/hierarchical_sigmoid_op.cc | 24 ++-- 
paddle/fluid/operators/lod_rank_table_op.cc | 8 +- .../fluid/operators/lod_tensor_to_array_op.cc | 7 +- paddle/fluid/operators/lookup_table_op.cc | 14 +-- paddle/fluid/operators/nce_op.cc | 14 +-- .../operators/ngraph/ngraph_engine_op.cc | 3 +- .../operators/optimizers/lars_momentum_op.cc | 7 +- .../fluid/operators/optimizers/momentum_op.cc | 18 ++- paddle/fluid/operators/optimizers/sgd_op.cc | 14 +-- paddle/fluid/operators/py_func_op.cc | 36 +++--- .../reader/create_custom_reader_op.cc | 23 ++-- paddle/fluid/operators/reader/read_op.cc | 17 ++- .../operators/reader/reader_op_registry.h | 6 +- paddle/fluid/operators/save_op.cc | 9 +- paddle/fluid/operators/scale_op.cc | 14 +-- .../fluid/operators/split_selected_rows_op.cc | 7 +- paddle/fluid/operators/sum_op.cc | 31 ++--- .../operators/tensor_array_to_tensor_op.cc | 7 +- .../operators/tensorrt/tensorrt_engine_op.cc | 3 +- paddle/fluid/operators/uniform_random_op.cc | 14 +-- 36 files changed, 283 insertions(+), 261 deletions(-) diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h index 126959bcd8..2fae684516 100644 --- a/paddle/fluid/framework/details/graph_test_base.h +++ b/paddle/fluid/framework/details/graph_test_base.h @@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker { class DummyVarTypeInference : public VarTypeInference { public: - void operator()(const OpDesc& op_desc, BlockDesc* block) const override { - auto& inputs = op_desc.Input("X"); - auto type = block->Var(inputs.front())->GetType(); - auto out_var_name = op_desc.Output("Out").front(); - block->Var(out_var_name)->SetType(type); + void operator()(framework::InferVarTypeContext& ctx) const override { + auto& inputs = ctx.Input("X"); + auto type = ctx.GetType(inputs.front()); + auto out_var_name = ctx.Output("Out").front(); + ctx.SetType(out_var_name, type); } }; diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 0901e59f97..79281863f6 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -127,9 +127,9 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->infer_var_type_ = [](const OpDesc& fwd_op, BlockDesc* block) { + info->infer_var_type_ = [](InferVarTypeContext& context) { T inference; - inference(fwd_op, block); + inference(context); }; } }; diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 7ed2f96eb2..2940f3ceeb 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -43,20 +43,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker { class SumOpVarTypeInference : public VarTypeInference { public: - void operator()(const OpDesc &op_desc, BlockDesc *block) const override { - auto &inputs = op_desc.Input("X"); + void operator()(InferVarTypeContext &ctx) const override { + auto &inputs = ctx.Input("X"); auto default_var_type = proto::VarType::SELECTED_ROWS; bool any_input_is_lod_tensor = std::any_of( - inputs.begin(), inputs.end(), [block](const std::string &name) { - return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR; + inputs.begin(), inputs.end(), [ctx](const std::string &name) { + return ctx.GetType(name) == proto::VarType::LOD_TENSOR; }); if (any_input_is_lod_tensor) { default_var_type = proto::VarType::LOD_TENSOR; } - auto out_var_name = op_desc.Output("Out").front(); - 
block->Var(out_var_name)->SetType(default_var_type); + auto out_var_name = ctx.Output("Out").front(); + ctx.SetType(out_var_name, default_var_type); } }; @@ -71,7 +71,7 @@ class DummyOpMaker : public OpProtoAndCheckerMaker { class DummyOpVarTypeInference : public VarTypeInference { public: - void operator()(const OpDesc &op_desc, BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext &ctx) const override {} }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 0e7b0cbeb9..aae0eafe6c 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/shape_inference.h" +#include "paddle/fluid/framework/var_type_inference.h" namespace paddle { namespace framework { @@ -677,7 +678,8 @@ void OpDesc::InferVarType(BlockDesc *block) const { // var type inference. Hence, we don't do any "default" setting here. auto &info = OpInfoMap::Instance().Get(this->Type()); if (info.infer_var_type_) { - info.infer_var_type_(*this, block); + InferVarTypeContext context(this, block); + info.infer_var_type_(context); } } diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index d02c699b97..a774f9ff49 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -27,6 +27,7 @@ namespace framework { class OperatorBase; class OpDesc; class InferShapeContext; +class InferVarTypeContext; class BlockDesc; class Variable; @@ -53,7 +54,7 @@ using GradOpMakerFN = std::function>( const std::vector& grad_block)>; using InferVarTypeFN = - std::function; + std::function; using InferShapeFN = std::function; diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index 64236b78d2..ed52e1ad81 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/type_defs.h" @@ -21,26 +22,113 @@ limitations under the License. 
*/ namespace paddle { namespace framework { +class OpDesc; +class BlockDesc; +// default infer var type context +class InferVarTypeContext { + public: + InferVarTypeContext(const OpDesc* op, BlockDesc* block) + : op_(op), block_(block) {} + + Attribute GetAttr(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + return op_->GetAttr(name); + } + + inline bool HasVar(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindVarRecursive(name) != nullptr; + } + + inline bool HasInput(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + return op_->Inputs().count(name) > 0; + } + + inline bool HasOutput(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + return op_->Outputs().count(name) > 0; + } + + inline const std::vector& Input(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + return op_->Input(name); + } + + inline const std::vector& Output(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(op_); + return op_->Output(name); + } + + inline proto::VarType::Type GetType(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetType(); + } + + inline void SetType(const std::string& name, proto::VarType::Type type) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetType(type); + } + + inline proto::VarType::Type GetDataType(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetDataType(); + } + + inline void SetDataType(const std::string& name, proto::VarType::Type type) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetDataType(type); + } + + inline std::vector GetShape(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetShape(); + } + + inline void SetShape(const std::string& name, + const std::vector& dims) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetShape(dims); + } + + inline int32_t GetLoDLevel(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetLoDLevel(); + } + + inline void SetLoDLevel(const std::string& name, int32_t lod_level) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetLoDLevel(lod_level); + } + + private: + const OpDesc* op_; + BlockDesc* block_; +}; + +// infer var type context for imperative mode +class RuntimeInferVarTypeContext : public InferVarTypeContext { + public: + RuntimeInferVarTypeContext() : InferVarTypeContext(nullptr, nullptr) {} +}; + class VarTypeInference { public: virtual ~VarTypeInference() {} - virtual void operator()(const OpDesc& op_desc, BlockDesc* block) const = 0; + virtual void operator()(InferVarTypeContext& context) const = 0; // NOLINT }; class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const final { + void operator()(framework::InferVarTypeContext& ctx) const final { // NOLINT auto in_out_var_names = this->GetInputOutputWithSameType(); for (auto& i_o_n : in_out_var_names) { - auto& x_name = op_desc.Input(i_o_n.first).at(0); - auto& out_name = op_desc.Output(i_o_n.second).at(0); + auto& x_name = ctx.Input(i_o_n.first).at(0); + auto& out_name = ctx.Output(i_o_n.second).at(0); - auto& x = block->FindRecursiveOrCreateVar(x_name); - auto& out = 
block->FindRecursiveOrCreateVar(out_name); - out.SetType(x.GetType()); - out.SetDataType(x.GetDataType()); + ctx.SetType(out_name, ctx.GetType(x_name)); + ctx.SetDataType(out_name, ctx.GetDataType(x_name)); } } diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 2a75394fca..d7d3e0a033 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -44,20 +44,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker { class SumOpVarTypeInference : public VarTypeInference { public: - void operator()(const OpDesc &op_desc, BlockDesc *block) const override { - auto &inputs = op_desc.Input("X"); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto &inputs = ctx.Input("X"); auto default_var_type = proto::VarType::SELECTED_ROWS; bool any_input_is_lod_tensor = std::any_of( - inputs.begin(), inputs.end(), [block](const std::string &name) { - return block->Var(name)->GetType() == proto::VarType::LOD_TENSOR; + inputs.begin(), inputs.end(), [ctx](const std::string &name) { + return ctx.GetType(name) == proto::VarType::LOD_TENSOR; }); if (any_input_is_lod_tensor) { default_var_type = proto::VarType::LOD_TENSOR; } - auto out_var_name = op_desc.Output("Out").front(); - block->Var(out_var_name)->SetType(default_var_type); + auto out_var_name = ctx.Output("Out").front(); + ctx.SetType(out_var_name, default_var_type); } }; } // namespace framework diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index cf78c83297..703edcad11 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -178,10 +178,10 @@ Beam Search Decode Operator. This Operator constructs the full hypotheses for each source sentence by walking back along the LoDTensorArray Input(ids) whose lods can be used to restore the path in the beam search tree. -The Output(SentenceIds) and Output(SentenceScores) separately contain the -generated id sequences and the corresponding scores. The shapes and lods of the -two LodTensor are same. The lod level is 2 and the two levels separately -indicate how many hypotheses each source sentence has and how many ids each +The Output(SentenceIds) and Output(SentenceScores) separately contain the +generated id sequences and the corresponding scores. The shapes and lods of the +two LodTensor are same. The lod level is 2 and the two levels separately +indicate how many hypotheses each source sentence has and how many ids each hypothesis has. 
)DOC"); } @@ -203,15 +203,12 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase { class BeamSearchDecodeInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - for (auto& o : op_desc.Output("SentenceIds")) { - auto& sentence_ids = block->FindRecursiveOrCreateVar(o); - sentence_ids.SetType(framework::proto::VarType::LOD_TENSOR); + void operator()(framework::InferVarTypeContext& ctx) const override { + for (auto& o : ctx.Output("SentenceIds")) { + ctx.SetType(o, framework::proto::VarType::LOD_TENSOR); } - for (auto& o : op_desc.Output("SentenceScores")) { - auto& sentence_scores = block->FindRecursiveOrCreateVar(o); - sentence_scores.SetType(framework::proto::VarType::LOD_TENSOR); + for (auto& o : ctx.Output("SentenceScores")) { + ctx.SetType(o, framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index fa6b09b4e7..8958d00a68 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -65,7 +65,7 @@ class BeamSearchOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(true); AddComment(R"DOC( -This operator does the search in beams for one time step. +This operator does the search in beams for one time step. Specifically, it selects the top-K candidate word ids of current step from Input(ids) according to their Input(scores) for all source sentences, where K is Attr(beam_size) and Input(ids), Input(scores) are predicted results @@ -120,15 +120,12 @@ class BeamSearchOp : public framework::OperatorWithKernel { class BeamSearchInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &o : op_desc.Output("selected_ids")) { - auto &selected_ids = block->FindRecursiveOrCreateVar(o); - selected_ids.SetType(framework::proto::VarType::LOD_TENSOR); + void operator()(framework::InferVarTypeContext &ctx) const override { + for (auto &o : ctx.Output("selected_ids")) { + ctx.SetType(o, framework::proto::VarType::LOD_TENSOR); } - for (auto &o : op_desc.Output("selected_scores")) { - auto &selected_scores = block->FindRecursiveOrCreateVar(o); - selected_scores.SetType(framework::proto::VarType::LOD_TENSOR); + for (auto &o : ctx.Output("selected_scores")) { + ctx.SetType(o, framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 1a157688f3..0258739d6d 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -93,11 +93,9 @@ execution. 
class GetPlacesInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &o_name : op_desc.Output("Out")) { - block->FindRecursiveOrCreateVar(o_name).SetType( - framework::proto::VarType::PLACE_LIST); + void operator()(framework::InferVarTypeContext &ctx) const override { + for (auto &o_name : ctx.Output("Out")) { + ctx.SetType(o_name, framework::proto::VarType::PLACE_LIST); } } }; diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc index fa18ade323..041eef602e 100644 --- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -100,16 +100,13 @@ class WriteToArrayInferShape : public framework::InferShapeBase { class WriteToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto x_name = op_desc.Input("X")[0]; - auto out_name = op_desc.Output("Out")[0]; + void operator()(framework::InferVarTypeContext &ctx) const override { + auto x_name = ctx.Input("X")[0]; + auto out_name = ctx.Output("Out")[0]; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; - auto &out = block->FindRecursiveOrCreateVar(out_name); - out.SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); - auto *x = block->FindVarRecursive(x_name); - if (x != nullptr) { - out.SetDataType(x->GetDataType()); + ctx.SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY); + if (ctx.HasVar(x_name)) { + ctx.SetDataType(out_name, ctx.GetDataType(x_name)); } } }; diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc index da0185b8c4..0a269c7575 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc @@ -114,11 +114,10 @@ class MergeIdsOp : public framework::OperatorWithKernel { class MergeIdsOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto *input_var = block->Var(op_desc.Input("Ids")[0]); - for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(input_var->GetType()); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto input_type = ctx.GetType(ctx.Input("Ids")[0]); + for (auto &out_var : ctx.Output("Out")) { + ctx.SetType(out_var, input_type); } } }; diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.cc b/paddle/fluid/operators/distributed_ops/split_ids_op.cc index f61d387fbe..2932a202a5 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.cc @@ -71,11 +71,10 @@ class SplitIdsOp : public framework::OperatorWithKernel { class SplitIdsOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto *input_var = block->Var(op_desc.Input("Ids")[0]); - for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(input_var->GetType()); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto input_type = ctx.GetType(ctx.Input("Ids")[0]); + for (auto &out_var : ctx.Output("Out")) { + 
ctx.SetType(out_var, input_type); } } }; diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index c86430524e..eb5996d50e 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -39,12 +39,11 @@ class FillConstantOp : public framework::OperatorWithKernel { class FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { + void operator()(framework::InferVarTypeContext& ctx) const override { auto data_type = static_cast( - boost::get(op_desc.GetAttr("dtype"))); - auto& out_var_name = op_desc.Output("Out").front(); - block->Var(out_var_name)->SetDataType(data_type); + boost::get(ctx.GetAttr("dtype"))); + auto& out_var_name = ctx.Output("Out").front(); + ctx.SetDataType(out_var_name, data_type); } }; diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index a0026427e2..27a761c29f 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -137,22 +137,20 @@ class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { class FusedEmbeddingSeqPoolOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); - auto attr = op_desc.GetAttr("is_sparse"); + void operator()(framework::InferVarTypeContext& ctx) const override { + auto out_var_name = ctx.Output(framework::GradVarName("W")).front(); + auto attr = ctx.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(3) << "fused_embedding_seq_pool_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - block->Var(out_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); + ctx.SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(3) << "fused_embedding_seq_pool_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + ctx.SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); } - block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + ctx.SetDataType(out_var_name, ctx.GetDataType(ctx.Input("W")[0])); } }; diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index a4ae19d9c1..5388e65497 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -81,15 +81,12 @@ GetTensorFromSelectedRows is used to get the tensor from SelectedRows. 
class GetTensorFromSelectedRowsOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const final { - auto out_var_name = op_desc.Output("Out").front(); - auto in_var_name = op_desc.Input("X").front(); - - auto out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto in_var = block->FindRecursiveOrCreateVar(in_var_name); - out_var.SetType(framework::proto::VarType::LOD_TENSOR); - out_var.SetDataType(in_var.GetDataType()); + void operator()(framework::InferVarTypeContext &ctx) const { // NOLINT + auto out_var_name = ctx.Output("Out").front(); + auto in_var_name = ctx.Input("X").front(); + + ctx.SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); + ctx.SetDataType(out_var_name, ctx.GetDataType(in_var_name)); } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 6ca6f0bc04..508c99b953 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -197,38 +197,32 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { class HierarchicalSigmoidGradOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto w_grad_var_name = op_desc.Output(framework::GradVarName("W")).front(); - auto bias_grad_var_name_vec = - op_desc.Output(framework::GradVarName("Bias")); + void operator()(framework::InferVarTypeContext& ctx) const override { + auto w_grad_var_name = ctx.Output(framework::GradVarName("W")).front(); + auto bias_grad_var_name_vec = ctx.Output(framework::GradVarName("Bias")); std::string bias_grad_var_name; bool hasBias = false; if (bias_grad_var_name_vec.size()) { hasBias = true; - bias_grad_var_name = - op_desc.Output(framework::GradVarName("Bias")).front(); + bias_grad_var_name = ctx.Output(framework::GradVarName("Bias")).front(); } - auto attr = op_desc.GetAttr("is_sparse"); + auto attr = ctx.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - block->Var(w_grad_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); + ctx.SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - block->Var(w_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); + ctx.SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR); } if (hasBias) { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("Bias") << " is set to LoDTensor"; - block->Var(bias_grad_var_name) - ->SetType(framework::proto::VarType::LOD_TENSOR); + ctx.SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR); } - block->Var(w_grad_var_name)->SetDataType(block->Var("W")->GetDataType()); + ctx.SetDataType(w_grad_var_name, ctx.GetDataType(ctx.Input("W")[0])); } }; diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc index 166952fe23..a7bbb49827 100644 --- a/paddle/fluid/operators/lod_rank_table_op.cc +++ b/paddle/fluid/operators/lod_rank_table_op.cc @@ -64,11 +64,9 @@ class LoDRankTableInferShape : public framework::InferShapeBase { class LoDRankTableInferVarType : public framework::VarTypeInference { public: - void 
operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &o : op_desc.Output("Out")) { - block->FindRecursiveOrCreateVar(o).SetType( - framework::proto::VarType::LOD_RANK_TABLE); + void operator()(framework::InferVarTypeContext &ctx) const override { + for (auto &o : ctx.Output("Out")) { + ctx.SetType(o, framework::proto::VarType::LOD_RANK_TABLE); } } }; diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 9b91cf5260..4fd45db67b 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -201,10 +201,9 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase { class LoDTensorToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); + void operator()(framework::InferVarTypeContext &ctx) const override { + for (auto &out_var : ctx.Output("Out")) { + ctx.SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY); } } }; diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 0029932bc0..a59ff23f93 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -147,22 +147,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); - auto attr = op_desc.GetAttr("is_sparse"); + void operator()(framework::InferVarTypeContext& ctx) const override { + auto out_var_name = ctx.Output(framework::GradVarName("W")).front(); + auto attr = ctx.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - block->Var(out_var_name) - ->SetType(framework::proto::VarType::SELECTED_ROWS); + ctx.SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + ctx.SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); } - block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + ctx.SetDataType(out_var_name, ctx.GetDataType(ctx.Input("W")[0])); } }; diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 256da34912..3c3d79cc7b 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -237,23 +237,21 @@ class NCEOpGrad : public framework::OperatorWithKernel { class NCEOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto weight_grad = op_desc.Output(framework::GradVarName("Weight")).front(); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto weight_grad = ctx.Output(framework::GradVarName("Weight")).front(); - auto attr = op_desc.GetAttr("is_sparse"); + auto attr = ctx.GetAttr("is_sparse"); bool is_sparse = boost::get(attr); 
if (is_sparse) { VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to SelectedRows"; - block->Var(weight_grad) - ->SetType(framework::proto::VarType::SELECTED_ROWS); + ctx.SetType(weight_grad, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to LoDTensor"; - block->Var(weight_grad)->SetType(framework::proto::VarType::LOD_TENSOR); + ctx.SetType(weight_grad, framework::proto::VarType::LOD_TENSOR); } - block->Var(weight_grad)->SetDataType(block->Var("Input")->GetDataType()); + ctx.SetDataType(weight_grad, ctx.GetDataType(ctx.Input("Input")[0])); } }; diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc index f941f917c8..a88ddf33a0 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc @@ -37,8 +37,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { class NgraphEngineInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext &ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc index 574a03680b..668fa889ac 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -56,9 +56,9 @@ This optimizer use LARS (https://arxiv.org/abs/1708.03888) to optimize each weight using a local learning rate: $$ -local\_lr = \eta * +local\_lr = \eta * \frac{\left \| param \right \|}{\left \| grad \right \| + \beta *\left \| param \right \|} \\ -velocity = mu * velocity + +velocity = mu * velocity + local\_lr * (grad + \beta * param) \\ param = param - velocity. \\ $$ @@ -72,8 +72,7 @@ use L2 regularizers in case of using LARS. 
class LarsMomentumOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext &ctx) const override {} }; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc index cde238c076..1be423da5b 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cc +++ b/paddle/fluid/operators/optimizers/momentum_op.cc @@ -21,18 +21,14 @@ using Tensor = framework::Tensor; class MomentumOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto input_var = op_desc.Input("Param")[0]; - for (auto& out_var : op_desc.Output("ParamOut")) { - if (block->FindRecursiveOrCreateVar(input_var).GetType() == - framework::proto::VarType::SELECTED_ROWS) { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::SELECTED_ROWS); - } else if (block->FindRecursiveOrCreateVar(input_var).GetType() == + void operator()(framework::InferVarTypeContext& ctx) const override { + auto& input_var = ctx.Input("Param")[0]; + for (auto& out_var : ctx.Output("ParamOut")) { + if (ctx.GetType(input_var) == framework::proto::VarType::SELECTED_ROWS) { + ctx.SetType(out_var, framework::proto::VarType::SELECTED_ROWS); + } else if (ctx.GetType(input_var) == framework::proto::VarType::LOD_TENSOR) { - block->FindRecursiveOrCreateVar(out_var).SetType( - framework::proto::VarType::LOD_TENSOR); + ctx.SetType(out_var, framework::proto::VarType::LOD_TENSOR); } else { PADDLE_THROW( "Only support LodTensor and SelectedRows, Unexpected Input Type."); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index 690381a67f..cac3d9b68f 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -50,20 +50,18 @@ class SGDOp : public framework::OperatorWithKernel { class SGDOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto input_var_n = op_desc.Input("Param")[0]; - auto in_var_type = block->FindRecursiveOrCreateVar(input_var_n).GetType(); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto &input_var_n = ctx.Input("Param")[0]; + auto in_var_type = ctx.GetType(input_var_n); PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS || in_var_type == framework::proto::VarType::LOD_TENSOR, "The input Var's type should be LoDtensor or SelectedRows," " but the received var(%s)'s type is %s", input_var_n, in_var_type); - for (auto &out_var_n : op_desc.Output("ParamOut")) { - auto &out_var = block->FindRecursiveOrCreateVar(out_var_n); - if (out_var.GetType() != in_var_type) { - out_var.SetType(in_var_type); + for (auto &out_var_n : ctx.Output("ParamOut")) { + if (ctx.GetType(out_var_n) != in_var_type) { + ctx.SetType(out_var_n, in_var_type); } } } diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 53eff2de3e..f630ad678f 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -91,15 +91,12 @@ static void CallPythonFunc(py::object *callable, } } -class PyFuncOpVarTypInference : public framework::VarTypeInference { +class PyFuncOpVarTypeInference : 
public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op, - framework::BlockDesc *block) const override { - auto &outs = op.Outputs(); - bool has_out = (outs.count("Out") > 0 && !outs.at("Out").empty()); + void operator()(framework::InferVarTypeContext &ctx) const override { + bool has_out = (ctx.HasOutput("Out") && !ctx.Output("Out").empty()); - auto &ins = op.Inputs(); - bool has_in = (ins.count("X") > 0 && !ins.at("X").empty()); + bool has_in = (ctx.HasInput("X") && !ctx.Input("Out").empty()); /** * X or Out can be empty, so that py_func can be more flexible @@ -107,7 +104,7 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference { */ PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist"); - PADDLE_ENFORCE_GE(boost::get(op.GetAttr(kForwardPythonCallableId)), 0, + PADDLE_ENFORCE_GE(boost::get(ctx.GetAttr(kForwardPythonCallableId)), 0, "Function id cannot be less than 0"); if (!has_out) return; @@ -118,7 +115,7 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference { * the corresponding forward variable */ const std::string kGradVarSuffix = framework::kGradVarSuffix; - auto &out_var_names = outs.at("Out"); + auto &out_var_names = ctx.Output("Out"); for (auto &out_var_name : out_var_names) { if (out_var_name == framework::kEmptyVarName || out_var_name.size() < kGradVarSuffix.size()) { @@ -128,18 +125,17 @@ class PyFuncOpVarTypInference : public framework::VarTypeInference { size_t len = out_var_name.size() - kGradVarSuffix.size(); if (out_var_name.substr(len) == kGradVarSuffix) { auto fwd_var_name = out_var_name.substr(0, len); - auto *out_var_desc = block->FindVarRecursive(out_var_name); - auto *fwd_var_desc = block->FindVarRecursive(fwd_var_name); - PADDLE_ENFORCE_NOT_NULL(out_var_desc, "Backward variable %s not found", - out_var_name); - PADDLE_ENFORCE_NOT_NULL(fwd_var_desc, "Forward variable %s not found", - fwd_var_name); + PADDLE_ENFORCE(ctx.HasVar(out_var_name), + "Backward variable %s not found", out_var_name); + PADDLE_ENFORCE(ctx.HasVar(fwd_var_name), + "Backward variable %s not found", fwd_var_name); VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input(" << fwd_var_name << ")"; - out_var_desc->SetShape(fwd_var_desc->GetShape()); - out_var_desc->SetDataType(fwd_var_desc->GetDataType()); - out_var_desc->SetLoDLevel(fwd_var_desc->GetLoDLevel()); - out_var_desc->SetType(fwd_var_desc->GetType()); + + ctx.SetShape(out_var_name, ctx.GetShape(fwd_var_name)); + ctx.SetDataType(out_var_name, ctx.GetDataType(fwd_var_name)); + ctx.SetLoDLevel(out_var_name, ctx.GetLoDLevel(fwd_var_name)); + ctx.SetType(out_var_name, ctx.GetType(fwd_var_name)); } } } @@ -309,5 +305,5 @@ class PyFuncOp : public framework::OperatorBase { namespace ops = paddle::operators; REGISTER_OPERATOR(py_func, ops::PyFuncOp, ops::PyFuncOpMaker, - ops::PyFuncOpVarTypInference, ops::PyFuncOpShapeInference, + ops::PyFuncOpVarTypeInference, ops::PyFuncOpShapeInference, ops::PyFuncOpGradDescMaker); diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 85394b336f..915325905b 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -85,10 +85,10 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase { AddComment(R"DOC( CreateCustomReader Operator - A custom reader can be used for input data preprocessing. 
- A custom reader holds its own sub-block, which will be executed in CPU - in its 'ReadNext()' function. Users can configurate their own - preprocessing pipelines by inserting operators into custom reader's + A custom reader can be used for input data preprocessing. + A custom reader holds its own sub-block, which will be executed in CPU + in its 'ReadNext()' function. Users can configurate their own + preprocessing pipelines by inserting operators into custom reader's sub-block. )DOC"); } @@ -123,23 +123,22 @@ class CustomReaderInferShape : public framework::InferShapeBase { class CustomReaderInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - framework::VarDesc* out_reader = block->FindVar(op_desc.Output("Out")[0]); - PADDLE_ENFORCE_NOT_NULL(out_reader); - out_reader->SetType(framework::proto::VarType::READER); + void operator()(const framework::InferVarTypeContext& ctx) const override { + auto& out_var_name = ctx.Output("Out")[0]; + PADDLE_ENFORCE(ctx.HasVar(out_var_name)); + ctx.SetType(out_var_name, framework::proto::VarType::READER); auto sink_var_names = - boost::get>(op_desc.GetAttr("sink_var_names")); + boost::get>(ctx.GetAttr("sink_var_names")); const auto* sub_block = - boost::get(op_desc.GetAttr("sub_block")); + boost::get(ctx.GetAttr("sub_block")); std::vector res_data_types; for (const std::string& var_name : sink_var_names) { framework::VarDesc* var = sub_block->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(var); res_data_types.emplace_back(var->GetDataType()); } - out_reader->SetDataTypes(res_data_types); + ctx.SetDataTypes(out_var_name, res_data_types); } }; diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 846b2ed77e..9a98d68e13 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -51,19 +51,16 @@ class ReadInferShape : public framework::InferShapeBase { class ReadInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - bool infer_out = boost::get(op_desc.GetAttr("infer_out")); + void operator()(const framework::InferVarTypeContext& ctx) const override { + bool infer_out = boost::get(ctx.GetAttr("infer_out")); if (infer_out) { - std::string reader_name = op_desc.Input("Reader")[0]; - std::vector out_names = op_desc.Output("Out"); - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - auto dtypes = reader->GetDataTypes(); + std::string reader_name = ctx.Input("Reader")[0]; + std::vector out_names = ctx.Output("Out"); + auto dtypes = ctx.GetDataTypes(reader_name); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); for (size_t i = 0; i < dtypes.size(); ++i) { - framework::VarDesc& out = block->FindRecursiveOrCreateVar(out_names[i]); - out.SetType(framework::proto::VarType::LOD_TENSOR); - out.SetDataType(dtypes[i]); + ctx.SetType(out_names[i], framework::proto::VarType::LOD_TENSOR); + ctx.SetDataType(out_names[i], dtypes[i]); } } } diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h index 25c3e7d77b..58b0dfd555 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.h +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -59,8 +59,7 @@ class FileReaderInferShape : public framework::InferShapeBase { class FileReaderInferVarType : public framework::VarTypeInference { public: 
- void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override; + void operator()(framework::InferVarTypeContext& ctx) const override; }; // general infershape for decorated reader @@ -72,8 +71,7 @@ class DecoratedReaderInferShape : public framework::InferShapeBase { // general var type inference for decorated reader class DecoratedReaderInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override; + void operator()(framework::InferVarTypeContext& ctx) const override; }; class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index fcc598f4f1..45da2ac4c6 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -159,12 +159,9 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file class SaveOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto out_var_name = op_desc.Output(LOOKUP_TABLE_PATH).front(); - auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto var_type = framework::proto::VarType::RAW; - out_var.SetType(var_type); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto out_var_name = ctx.Output(LOOKUP_TABLE_PATH).front(); + ctx.SetType(out_var_name, framework::proto::VarType::RAW); } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 4ea77ed30d..d2f05c42a7 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -69,17 +69,13 @@ $$Out = scale*(X + bias)$$ class ScaleOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto &in_var_name = op_desc.Input("X").front(); - auto &in_var = detail::Ref(block->FindVarRecursive(in_var_name)); - - auto out_var_name = op_desc.Output("Out").front(); - auto *out_var = block->FindVarRecursive(out_var_name); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto &in_var_name = ctx.Input("X").front(); + auto out_var_name = ctx.Output("Out").front(); if (in_var_name != out_var_name) { - out_var->SetType(in_var.GetType()); - out_var->SetDataType(in_var.GetDataType()); + ctx.SetType(out_var_name, ctx.GetType(in_var_name)); + ctx.SetDataType(out_var_name, ctx.GetDataType(in_var_name)); } } }; diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc index 0e7b1463d1..e950f30a42 100644 --- a/paddle/fluid/operators/split_selected_rows_op.cc +++ b/paddle/fluid/operators/split_selected_rows_op.cc @@ -60,10 +60,9 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel { class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &out_var : op_desc.Output("Out")) { - block->Var(out_var)->SetType(framework::proto::VarType::SELECTED_ROWS); + void operator()(framework::InferVarTypeContext &ctx) const override { + for (auto &out_var : ctx.Output("Out")) { + ctx.SetType(out_var, framework::proto::VarType::SELECTED_ROWS); } } }; diff --git a/paddle/fluid/operators/sum_op.cc 
b/paddle/fluid/operators/sum_op.cc index 7abfbbd3cb..d674711392 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -159,24 +159,20 @@ the LoD information with the first input. class SumOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - auto& inputs = op_desc.Input("X"); + void operator()(framework::InferVarTypeContext& ctx) const override { + auto& inputs = ctx.Input("X"); auto var_type = framework::proto::VarType::SELECTED_ROWS; - for (auto& name : op_desc.Input("X")) { - VLOG(10) << name << " " - << block->FindRecursiveOrCreateVar(name).GetType(); + for (auto& name : ctx.Input("X")) { + VLOG(10) << name << " " << ctx.GetType(name); } bool any_input_is_lod_tensor = std::any_of( - inputs.begin(), inputs.end(), [block](const std::string& name) { - return block->FindRecursiveOrCreateVar(name).GetType() == - framework::proto::VarType::LOD_TENSOR; + inputs.begin(), inputs.end(), [ctx](const std::string& name) { + return ctx.GetType(name) == framework::proto::VarType::LOD_TENSOR; }); - auto is_tensor_array = [block](const std::string& name) { - return block->FindRecursiveOrCreateVar(name).GetType() == - framework::proto::VarType::LOD_TENSOR_ARRAY; + auto is_tensor_array = [ctx](const std::string& name) { + return ctx.GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY; }; bool any_input_is_tensor_array = @@ -188,8 +184,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { if (!all_inputs_are_tensor_array) { std::ostringstream os; for (auto& each : inputs) { - os << " " << each << " type is " - << block->FindRecursiveOrCreateVar(each).GetType() << "\n"; + os << " " << each << " type is " << ctx.GetType(each) << "\n"; } PADDLE_ENFORCE(all_inputs_are_tensor_array, "Not all inputs are tensor array:\n%s", os.str()); @@ -199,11 +194,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference { var_type = framework::proto::VarType::LOD_TENSOR; } - auto out_var_name = op_desc.Output("Out").front(); - auto& out_var = block->FindRecursiveOrCreateVar(out_var_name); - out_var.SetType(var_type); - auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front())); - out_var.SetDataType(in_var.GetDataType()); + auto out_var_name = ctx.Output("Out").front(); + ctx.SetType(out_var_name, var_type); + ctx.SetDataType(out_var_name, ctx.GetDataType(inputs.front())); } }; diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 58a74ec2c1..d7f67ccb2f 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -177,10 +177,9 @@ class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase { class LoDTensorArray2TensorGradInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - for (auto &out_var : op_desc.Output(framework::GradVarName("X"))) { - block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR_ARRAY); + void operator()(framework::InferVarTypeContext &ctx) const override { + for (auto &out_var : ctx.Output(framework::GradVarName("X"))) { + ctx.SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY); } } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index a8c86de9f9..845629d40f 
100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -46,8 +46,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { class TensorRTEngineInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext &ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index e3132ae76f..b3a8b6a141 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -112,17 +112,15 @@ uniform distribution. The random result is in set [min, max]. class UniformRandomOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto out_var_name = op_desc.Output("Out").front(); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto out_var_name = ctx.Output("Out").front(); auto var_data_type = static_cast( - boost::get(op_desc.GetAttr("dtype"))); + boost::get(ctx.GetAttr("dtype"))); - auto out_var = block->FindRecursiveOrCreateVar(out_var_name); - if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) { - out_var.SetType(framework::proto::VarType::LOD_TENSOR); + if (ctx.GetType(out_var_name) != framework::proto::VarType::SELECTED_ROWS) { + ctx.SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); } - out_var.SetDataType(var_data_type); + ctx.SetDataType(out_var_name, var_data_type); } }; From 6cfd20dea8ddd8d6aaefdf3f389b9b186debd259 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 15 Mar 2019 04:10:00 +0000 Subject: [PATCH 44/73] fix words spell error test=develop --- paddle/fluid/API.spec | 2 +- python/paddle/fluid/layers/nn.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c4b984cd23..b8faa53c62 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -68,7 +68,7 @@ paddle.fluid.initializer.MSRAInitializer.__init__ (ArgSpec(args=['self', 'unifor paddle.fluid.initializer.force_init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', '6d0f3e22c90d9d500d36ff57daf056ee')) paddle.fluid.initializer.init_on_cpu (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'a6d7011ca3d8c0d454dac3a56eae0c29')) paddle.fluid.initializer.NumpyArrayInitializer.__init__ (ArgSpec(args=['self', 'value'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '0fd03868c3c4f25d7f8d43daac69e6d3')) +paddle.fluid.layers.fc (ArgSpec(args=['input', 'size', 'num_flatten_dims', 'param_attr', 'bias_attr', 'act', 'is_test', 'name'], varargs=None, keywords=None, defaults=(1, None, None, None, False, None)), ('document', '424e898365195e3ccbc2e7dc8b63605e')) paddle.fluid.layers.embedding (ArgSpec(args=['input', 'size', 'is_sparse', 'is_distributed', 'padding_idx', 'param_attr', 'dtype'], varargs=None, keywords=None, defaults=(False, False, None, None, 'float32')), ('document', '89c2c55a0b0656b106064048e068e77a')) 
paddle.fluid.layers.dynamic_lstm (ArgSpec(args=['input', 'size', 'h_0', 'c_0', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'dtype', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'float32', None)), ('document', 'dfbb624f85015df29e994ca6999e8ff6')) paddle.fluid.layers.dynamic_lstmp (ArgSpec(args=['input', 'size', 'proj_size', 'param_attr', 'bias_attr', 'use_peepholes', 'is_reverse', 'gate_activation', 'cell_activation', 'candidate_activation', 'proj_activation', 'dtype', 'name', 'h_0', 'c_0', 'cell_clip', 'proj_clip'], varargs=None, keywords=None, defaults=(None, None, True, False, 'sigmoid', 'tanh', 'tanh', 'tanh', 'float32', None, None, None, None, None)), ('document', 'b4b608b986eb9617aa0525e1be21d32d')) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 8cf0a457fd..9eca56e7e7 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -206,12 +206,12 @@ def fc(input, This function creates a fully connected layer in the network. It can take one or multiple tensors as its inputs(input can be a list of Variable, see - Args in detail). It creates a variable called weights foreach input tensor, + Args in detail). It creates a variable called weights for each input tensor, which represents a fully connected weight matrix from each input unit to each output unit. The fully connected layer multiplies each input tensor with its corresponding weight to produce an output Tensor with shape [M, `size`], where M is batch size. If multiple input tensors are given, the results of - multiple output tensors with shape [M, `size`] will be sumed up. If bias_attr + multiple output tensors with shape [M, `size`] will be summed up. If bias_attr is not None, a bias variable will be created and added to the output. Finally, if activation is not None, it will be applied to the output as well. 
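As context for the docstring fix above: fc creates one weight variable per input tensor and sums the per-input products into a single [M, size] output before bias and activation are applied. A minimal sketch of that behavior (illustrative only; the names and sizes below are not taken from this patch):

    import paddle.fluid as fluid

    # two inputs of different widths; fc creates a separate weight for each
    a = fluid.layers.data(name='a', shape=[16], dtype='float32')
    b = fluid.layers.data(name='b', shape=[32], dtype='float32')
    # the [M, 64] results of a*W_a and b*W_b are summed up, then the
    # (optional) bias and activation are applied to the summed result
    out = fluid.layers.fc(input=[a, b], size=64, act='relu')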
From 2d1e76fb0c3b531742e38d9fe4aed0147f15ee85 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 15 Mar 2019 05:04:40 +0000 Subject: [PATCH 45/73] fix API.spec test=develop --- paddle/fluid/API.spec | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index b8faa53c62..0eb88084c3 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -511,3 +511,17 @@ paddle.fluid.unique_name.guard (ArgSpec(args=['new_generator'], varargs=None, ke paddle.fluid.recordio_writer.convert_reader_to_recordio_file (ArgSpec(args=['filename', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', '65c7523e86f0c50bb729b01667f36310')) paddle.fluid.recordio_writer.convert_reader_to_recordio_files (ArgSpec(args=['filename', 'batch_per_file', 'reader_creator', 'feeder', 'compressor', 'max_num_records', 'feed_order'], varargs=None, keywords=None, defaults=(Compressor.Snappy, 1000, None)), ('document', 'bc643f0f5f1b9db57ff0d8a57d379bd7')) paddle.fluid.Scope Scope() -> paddle.fluid.core._Scope +paddle.reader.map_readers (ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None), ('document', '77cbadb09df588e21e5cc0819b69c87d')) +paddle.reader.buffered (ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None), ('document', '0d6186f109feceb99f60ec50a0a624cb')) +paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', defaults=None), ('document', '884291104e1c3f37f33aae44b7deeb0d')) +paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) +paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) +paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) +paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560')) +paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69')) +paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) +paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada')) +paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a')) +paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8')) From 46ee6bb1aa9b187a260b5c0080f28be16ab453a3 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 15 Mar 2019 
13:27:51 +0800 Subject: [PATCH 46/73] fix distributed unit-tests test=develop --- paddle/fluid/framework/operator.cc | 22 ++++++++++++++-------- paddle/fluid/framework/operator.h | 1 + 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index a9a53b0d74..ac1ad2b05e 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -874,17 +874,23 @@ std::vector* OperatorWithKernel::GetKernelConfig( return kernel_configs; } -void OperatorWithKernel::RunImpl(const Scope& scope, - const platform::Place& place) const { +RuntimeContext* OperatorWithKernel::GetRuntimeContext( + const Scope& scope) const { if (!HasAttr(kEnableRuntimeContext)) { - runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); + return new RuntimeContext(Inputs(), Outputs(), scope); } else { const Scope* cur_scope = &scope; if (!runtime_ctx_ || pre_scope_ != cur_scope) { runtime_ctx_.reset(new RuntimeContext(Inputs(), Outputs(), scope)); pre_scope_ = cur_scope; } + return runtime_ctx_.get(); } +} + +void OperatorWithKernel::RunImpl(const Scope& scope, + const platform::Place& place) const { + auto runtime_ctx = GetRuntimeContext(scope); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -899,7 +905,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; auto expected_kernel_key = this->GetExpectedKernelType( - ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx_, nullptr)); + ExecutionContext(*this, scope, *dev_ctx, *runtime_ctx, nullptr)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -923,8 +929,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; - auto* transfer_scope = PrepareData( - scope, expected_kernel_key, &transfered_inplace_vars, runtime_ctx_.get()); + auto* transfer_scope = PrepareData(scope, expected_kernel_key, + &transfered_inplace_vars, runtime_ctx); // exec scope is the scope that kernel actually executed on. const Scope& exec_scope = @@ -935,13 +941,13 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } if (!HasAttr(kAllKernelsMustComputeRuntimeShape)) { - RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx_); + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, *runtime_ctx); this->InferShape(&infer_shape_ctx); } // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext // not Scope. Imperative mode only pass inputs and get outputs. kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, - *runtime_ctx_, kernel_configs)); + *runtime_ctx, kernel_configs)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 323aa5a7f5..f0592f4f5f 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -464,6 +464,7 @@ class OperatorWithKernel : public OperatorBase { // same. proto::VarType::Type IndicateDataType(const ExecutionContext& ctx) const; void RunImpl(const Scope& scope, const platform::Place& place) const final; + RuntimeContext* GetRuntimeContext(const Scope& scope) const; /** * Transfer data from scope to a transfered scope. 
If there is no data need to From 721c2c00ef7d056a67d0be5364eda4435d02d166 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 15 Mar 2019 15:06:20 +0800 Subject: [PATCH 47/73] refine fc_infershape test=develop --- paddle/fluid/operators/fc_op.cc | 34 +++++++++++-------- paddle/fluid/operators/fc_op.h | 16 +++++++++ .../fused/fused_embedding_seq_pool_op.cc | 6 +++- paddle/fluid/operators/hash_op.cc | 6 +++- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 28 +++++++++------ .../sequence_ops/sequence_enumerate_op.cc | 6 +++- 6 files changed, 68 insertions(+), 28 deletions(-) diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index eb4617a935..033eca967a 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -55,17 +55,8 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { "The input tensor Input's rank of FCOp should be larger than " "in_num_col_dims."); - auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); - PADDLE_ENFORCE_EQ( - in_mat_dims[1], w_dims[0], - "Fully Connected input and weigth size do not match. %s, %s"); - std::vector output_dims; - output_dims.reserve(static_cast(in_num_col_dims + 1)); - for (int i = 0; i < in_num_col_dims; ++i) { - output_dims.push_back(in_dims[i]); - } - output_dims.push_back(w_dims[1]); + FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims); ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); ctx->ShareLoD("Input", "Out"); @@ -128,6 +119,12 @@ void FCOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + framework::kAllKernelsMustComputeRuntimeShape, + "If an Op has this attribute, all its kernels should calculate output" + "variable's shape in the corresponding Compute() function. Note that " + "this temporal attribute would be deleted after all ops contain it.") + .SetDefault(true); AddComment(R"DOC( Fully Connected Operator. 
@@ -142,13 +139,20 @@ class FCOpKernel : public framework::OpKernel { void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); - auto input = ctx.Input("Input"); - auto w = ctx.Input("W"); - auto bias = ctx.Input("Bias"); - auto output = ctx.Output("Out"); + auto input = ctx.Input("Input"); + auto w = ctx.Input("W"); + auto bias = ctx.Input("Bias"); + auto output = ctx.Output("Out"); + int in_num_col_dims = ctx.Attr("in_num_col_dims"); auto w_dims = w->dims(); + + std::vector output_dims; + FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims); + output->Resize(framework::make_ddim(output_dims)); + output->set_lod(input->lod()); + auto out_dims = output->dims(); - int M = framework::product(out_dims) / out_dims[out_dims.size() - 1]; + int M = framework::product(out_dims) / w_dims[1]; const T* input_data = input->data(); const T* w_data = w->data(); diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index e1b780fc0c..b82a63cd83 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -48,5 +48,21 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override; }; +inline void FCOutputSize(const framework::DDim& in_dims, + const framework::DDim& w_dims, + std::vector& out_dims, // NOLINT + int in_num_col_dims) { + auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims); + PADDLE_ENFORCE_EQ( + in_mat_dims[1], w_dims[0], + "Fully Connected input and weigth size do not match. %s, %s"); + + out_dims.reserve(static_cast(in_num_col_dims + 1)); + for (int i = 0; i < in_num_col_dims; ++i) { + out_dims.push_back(in_dims[i]); + } + out_dims.push_back(w_dims[1]); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index a0026427e2..40a411985c 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -88,7 +88,11 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update.") .SetDefault(false); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, "") + AddAttr( + framework::kAllKernelsMustComputeRuntimeShape, + "If an Op has this attribute, all its kernels should calculate output" + "variable's shape in the corresponding Compute() function. Note that " + "this temporal attribute would be deleted after all ops contain it.") .SetDefault(true); AddComment(R"DOC( FusedEmbeddingSeqPool Operator. diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index f6395fb32f..4deee8b433 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -54,7 +54,11 @@ $$Out = scale * X$$ )DOC"); AddAttr("num_hash", "").SetDefault(1); AddAttr("mod_by", "").SetDefault(100000); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, "") + AddAttr( + framework::kAllKernelsMustComputeRuntimeShape, + "If an Op has this attribute, all its kernels should calculate output" + "variable's shape in the corresponding Compute() function. 
Note that " + "this temporal attribute would be deleted after all ops contain it.") .SetDefault(true); } }; diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 3a926a716f..2bdf146f4d 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -123,9 +123,9 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel { auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto input = ctx.Input("Input"); - auto w = ctx.Input("W"); - auto bias = ctx.Input("Bias"); + auto input = ctx.Input("Input"); + auto w = ctx.Input("W"); + auto bias = ctx.Input("Bias"); PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4, "Input must be with 2 or 4 dimensions, i.e. NCHW"); @@ -151,7 +151,13 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel { const T* input_data = input->data(); const T* w_data = w->data(); - auto output = ctx.Output("Out"); + auto output = ctx.Output("Out"); + int in_num_col_dims = ctx.Attr("in_num_col_dims"); + std::vector output_dims; + FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims); + output->Resize(framework::make_ddim(output_dims)); + output->set_lod(input->lod()); + T* output_data = output->mutable_data(ctx.GetPlace()); auto dst_memory = mem.dst(output_data); @@ -204,19 +210,21 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel { Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); Tensor* w_grad = ctx.Output(framework::GradVarName("W")); + const Tensor* input = ctx.Input("Input"); + const T* input_data = input->data(); + + const Tensor* w = ctx.Input("W"); + const T* w_data = w->data(); + if (input_grad) { + input_grad->Resize(input->dims()); input_grad_data = input_grad->mutable_data(ctx.GetPlace()); } if (w_grad) { + w_grad->Resize(w->dims()); w_grad_data = w_grad->mutable_data(ctx.GetPlace()); } - const Tensor* input = ctx.Input("Input"); - const T* input_data = input->data(); - - const Tensor* w = ctx.Input("W"); - const T* w_data = w->data(); - const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); const T* out_grad_data = out_grad->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index f357c9c08d..75bcd3c47f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -59,7 +59,11 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("pad_value", "(int) The enumerate sequence padding value.") .SetDefault(0); - AddAttr(framework::kAllKernelsMustComputeRuntimeShape, "") + AddAttr( + framework::kAllKernelsMustComputeRuntimeShape, + "If an Op has this attribute, all its kernels should calculate output" + "variable's shape in the corresponding Compute() function. Note that " + "this temporal attribute would be deleted after all ops contain it.") .SetDefault(true); AddComment(R"DOC( Sequence Enumerate Operator. From 81b4fad8b94d85a473875ae0efe634fe46697314 Mon Sep 17 00:00:00 2001 From: achao2013 Date: Fri, 15 Mar 2019 15:40:37 +0800 Subject: [PATCH 48/73] add moving average absmax op and fix bug (#15155) * Add moving average absmax op in quantilize-aware training. 
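The scale maintained by the new moving-average abs-max op is an exponentially weighted average of the per-batch abs-max (the new moving_rate attribute defaults to 0.9). A rough Python sketch of the update performed by FindMovingAverageAbsMaxFunctor below (for illustration only; not code from this patch):

    def update_scale(state, accum, cur_abs_max, rate=0.9):
        # state is an exponentially weighted count and accum an exponentially
        # weighted sum of per-batch abs-max values; their ratio is the scale
        state = rate * state + 1
        accum = rate * accum + cur_abs_max
        return state, accum, accum / state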
--- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/fake_quantize_op.cc | 102 ++++++++++++++++++ paddle/fluid/operators/fake_quantize_op.cu | 38 +++++++ paddle/fluid/operators/fake_quantize_op.h | 59 +++++++++- .../contrib/quantize/quantize_transpiler.py | 84 +++++++++++++-- .../slim/quantization/quantization_pass.py | 86 ++++++++++++++- .../slim/tests/test_quantization_pass.py | 13 +++ .../tests/unittests/test_fake_quantize_op.py | 42 ++++++++ 8 files changed, 409 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 6b25d6a14f..fdd23681af 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -367,7 +367,7 @@ paddle.fluid.contrib.BeamSearchDecoder.read_array (ArgSpec(args=['self', 'init', paddle.fluid.contrib.BeamSearchDecoder.update_array (ArgSpec(args=['self', 'array', 'value'], varargs=None, keywords=None, defaults=None), ('document', '5754e9b3212b7c09497151516a0de5a7')) paddle.fluid.contrib.memory_usage (ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None), ('document', '8fcb2f93bb743693baa8d4860a5ccc47')) paddle.fluid.contrib.op_freq_statistic (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4d43687113c4bf5b29d15aee2f4e4afa')) -paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) +paddle.fluid.contrib.QuantizeTranspiler.__init__ (ArgSpec(args=['self', 'weight_bits', 'activation_bits', 'activation_quantize_type', 'weight_quantize_type', 'window_size', 'moving_rate'], varargs=None, keywords=None, defaults=(8, 8, 'abs_max', 'abs_max', 10000, 0.9)), ('document', '14b39f1fcd5667ff556b1aad94357d1d')) paddle.fluid.contrib.QuantizeTranspiler.convert_to_int8 (ArgSpec(args=['self', 'program', 'place', 'scope'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.QuantizeTranspiler.freeze_program (ArgSpec(args=['self', 'program', 'place', 'fuse_bn', 'scope'], varargs=None, keywords=None, defaults=(False, None)), ('document', '909675a1ab055c69b436a7893fcae4fd')) paddle.fluid.contrib.QuantizeTranspiler.training_transpile (ArgSpec(args=['self', 'program', 'startup_program'], varargs=None, keywords=None, defaults=(None, None)), ('document', '6dd9909f10b283ba2892a99058a72884')) diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index 70186e5efa..d51d51b495 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -81,6 +81,30 @@ struct FindRangeAbsMaxFunctor { template struct FindRangeAbsMaxFunctor; +template +struct FindMovingAverageAbsMaxFunctor { + void operator()(const platform::CPUDeviceContext& ctx, + const framework::Tensor& in_accum, + const framework::Tensor& in_state, const T* cur_scale, + const float rate, framework::Tensor* out_state, + framework::Tensor* out_accum, framework::Tensor* out_scale) { + T accum = in_accum.data()[0]; + T state = in_state.data()[0]; + T scale = cur_scale[0]; + + state = rate * state + 1; + accum = rate * accum + scale; + scale = accum / state; + + out_state->mutable_data(ctx.GetPlace())[0] = state; + out_accum->mutable_data(ctx.GetPlace())[0] = accum; + out_scale->mutable_data(ctx.GetPlace())[0] = scale; + } +}; + +template struct 
FindMovingAverageAbsMaxFunctor; + class FakeQuantizeAbsMaxOp : public framework::OperatorWithKernel { public: FakeQuantizeAbsMaxOp(const std::string& type, @@ -255,6 +279,78 @@ $$Out = round(X/scale * range)$$ } }; +class FakeQuantizeMovingAverageAbsMaxOp : public framework::OperatorWithKernel { + public: + FakeQuantizeMovingAverageAbsMaxOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorWithKernel(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE( + ctx->HasInput("X"), + "Input(X) of FakeQuantizeMovingAverageAbsMaxOp should not be null."); + PADDLE_ENFORCE( + ctx->HasOutput("Out"), + "Output(Out) of FakeQuantizeMovingAverageAbsMaxOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("OutScale"), + "Output(OutScale) of FakeQuantizeMovingAverageAbsMaxOp " + "should not be null"); + if (ctx->HasOutput("OutState")) { + ctx->SetOutputDim("OutState", {1}); + } + if (ctx->HasOutput("OutAccum")) { + ctx->SetOutputDim("OutAccum", {1}); + } + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + ctx->SetOutputDim("OutScale", {1}); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class FakeQuantizeMovingAverageAbsMaxOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) Input is float data type."); + AddInput("InScale", "Last scale."); + AddInput("InAccum", "Last accum.").AsDispensable(); + AddInput("InState", "Last state.").AsDispensable(); + AddOutput("Out", "(Tensor) Output of quantized low level tensor."); + AddOutput("OutScale", " Current scale"); + AddOutput("OutState", "(Tensor) state buffer.").AsDispensable(); + AddOutput("OutAccum", "(Tensor) accum buffer.").AsDispensable(); + AddAttr("moving_rate", "(float, default 0.9) moving rate.") + .SetDefault(0.9); + AddAttr("bit_length", "(int, default 8), quantization bit number.") + .SetDefault(8) + .AddCustomChecker([](const int& bit_length) { + PADDLE_ENFORCE(bit_length >= 1 && bit_length <= 16, + "'bit_length' should be between 1 and 16."); + }); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddComment(R"DOC( +FakeQuantize operator is used in static quantization. 
+ +$$scale = (0.9*max(abs(x))+accum)/(0.9*state+1)$$ +$$range = 2^{bit_length - 1} - 1$$ +$$Out = round(X/scale * range)$$ + +)DOC"); + } +}; + } // namespace operators } // namespace paddle @@ -273,6 +369,12 @@ REGISTER_OPERATOR(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxOp, REGISTER_OP_CPU_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); +REGISTER_OPERATOR(fake_quantize_moving_average_abs_max, + ops::FakeQuantizeMovingAverageAbsMaxOp, + ops::FakeQuantizeMovingAverageAbsMaxOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(fake_quantize_moving_average_abs_max, + ops::FakeQuantizeMovingAverageAbsMaxKernel); REGISTER_OPERATOR(fake_channel_wise_quantize_abs_max, ops::FakeChannelWiseQuantizeAbsMaxOp, ops::FakeChannelWiseQuantizeAbsMaxOpMaker, diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 5da16a7c73..3707f6772e 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -147,6 +147,41 @@ struct FindRangeAbsMaxFunctor { template struct FindRangeAbsMaxFunctor; +template +struct FindMovingAverageAbsMaxFunctor { + void operator()(const platform::CUDADeviceContext& ctx, + const framework::Tensor& in_accum, + const framework::Tensor& in_state, const T* cur_scale, + const float rate, framework::Tensor* out_state, + framework::Tensor* out_accum, framework::Tensor* out_scale) { + const auto gpu_place = boost::get(ctx.GetPlace()); + + T accum; + memory::Copy(platform::CPUPlace(), &accum, gpu_place, in_accum.data(), + sizeof(T), 0); + T state; + memory::Copy(platform::CPUPlace(), &state, gpu_place, in_state.data(), + sizeof(T), 0); + T scale; + memory::Copy(platform::CPUPlace(), &scale, gpu_place, cur_scale, sizeof(T), + 0); + + state = rate * state + 1; + accum = rate * accum + scale; + scale = accum / state; + + memory::Copy(gpu_place, out_accum->mutable_data(gpu_place), + platform::CPUPlace(), &accum, sizeof(T), 0); + memory::Copy(gpu_place, out_state->mutable_data(gpu_place), + platform::CPUPlace(), &state, sizeof(T), 0); + memory::Copy(gpu_place, out_scale->mutable_data(gpu_place), + platform::CPUPlace(), &scale, sizeof(T), 0); + } +}; + +template struct FindMovingAverageAbsMaxFunctor; + template struct ClipAndFakeQuantFunctor { void operator()(const platform::CUDADeviceContext& ctx, @@ -178,3 +213,6 @@ REGISTER_OP_CUDA_KERNEL(fake_channel_wise_quantize_abs_max, ops::FakeChannelWiseQuantizeAbsMaxKernel); REGISTER_OP_CUDA_KERNEL(fake_quantize_range_abs_max, ops::FakeQuantizeRangeAbsMaxKernel); +REGISTER_OP_CUDA_KERNEL( + fake_quantize_moving_average_abs_max, + ops::FakeQuantizeMovingAverageAbsMaxKernel); diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 8b47600e7d..ec667e89e7 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -42,12 +42,20 @@ struct FindRangeAbsMaxFunctor { framework::Tensor* scales_arr, framework::Tensor* out_scale); }; +template +struct FindMovingAverageAbsMaxFunctor { + void operator()(const DeviceContext& ctx, const framework::Tensor& in_accum, + const framework::Tensor& in_state, + const framework::Tensor& cur_scale, + framework::Tensor* out_state, framework::Tensor* out_accum, + framework::Tensor* out_scale); +}; + template class FakeQuantizeAbsMaxKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); - auto* 
out = context.Output("Out"); auto* out_scale = context.Output("OutScale"); T* out_s = out_scale->mutable_data(context.GetPlace()); @@ -138,5 +146,54 @@ class FakeQuantizeRangeAbsMaxKernel : public framework::OpKernel { } }; +template +class FakeQuantizeMovingAverageAbsMaxKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* in_scale = context.Input("InScale"); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + bool is_test = context.Attr("is_test"); + int bit_length = context.Attr("bit_length"); + int bin_cnt = std::pow(2, bit_length - 1) - 1; + auto& dev_ctx = context.template device_context(); + + // testing + if (is_test) { + ClipAndFakeQuantFunctor()(dev_ctx, *in, *in_scale, + bin_cnt, out); + return; + } + + // training + auto* in_accum = context.Input("InAccum"); + auto* in_state = context.Input("InState"); + auto& allocator = + platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx); + auto cur_scale = allocator.Allocate(1 * sizeof(T)); + T* cur_scale_data = static_cast(cur_scale->ptr()); + + FindAbsMaxFunctor()(dev_ctx, in->data(), in->numel(), + cur_scale_data); + + auto* out_state = context.Output("OutState"); + auto* out_accum = context.Output("OutAccum"); + auto* out_scale = context.Output("OutScale"); + out_state->mutable_data(context.GetPlace()); + out_accum->mutable_data(context.GetPlace()); + out_scale->mutable_data(context.GetPlace()); + float moving_rate = context.Attr("moving_rate"); + + FindMovingAverageAbsMaxFunctor()( + dev_ctx, *in_accum, *in_state, cur_scale_data, moving_rate, out_state, + out_accum, out_scale); + + ClipAndFakeQuantFunctor()(dev_ctx, *in, *out_scale, + bin_cnt, out); + } +}; + } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py index 032d0353ea..8eddf18cec 100644 --- a/python/paddle/fluid/contrib/quantize/quantize_transpiler.py +++ b/python/paddle/fluid/contrib/quantize/quantize_transpiler.py @@ -84,7 +84,8 @@ class QuantizeTranspiler(object): activation_bits=8, activation_quantize_type='abs_max', weight_quantize_type='abs_max', - window_size=10000): + window_size=10000, + moving_rate=0.9): """ Convert and rewrite the fluid Program according to weight and activation quantization type. @@ -117,23 +118,27 @@ class QuantizeTranspiler(object): """ self.weight_bits = weight_bits self.activation_bits = activation_bits - quant_type = ['abs_max', 'range_abs_max'] + quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max'] if weight_quantize_type not in quant_type: raise ValueError( "Unknown weight_quantize_type: '%s'. It can only be ", - "'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) + "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", + str(weight_quantize_type)) if activation_quantize_type not in quant_type: raise ValueError( "Unknown activation_quantize_type : '%s'. 
It can only be ", - "'abs_max' or 'range_abs_max'.", str(activation_quantize_type)) + "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", + str(activation_quantize_type)) self.weight_quantize_type = weight_quantize_type self.activation_quantize_type = activation_quantize_type self.window_size = window_size + self.moving_rate = moving_rate self.helper = LayerHelper(self.__class__.__name__) self.fake_quant_op_types = [ - 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' + 'fake_quantize_abs_max', 'fake_quantize_range_abs_max', + 'fake_quantize_moving_average_abs_max' ] self.fake_dequant_op_types = ['fake_dequantize_max_abs'] self.is_test = None @@ -168,6 +173,7 @@ class QuantizeTranspiler(object): block_id = block.idx # insert quant op and dequant op for name in op.input_arg_names: + #if share input between ops if name in dequanted_vars[block_id]: dequant_var = dequanted_vars[block_id][name] else: @@ -261,6 +267,7 @@ class QuantizeTranspiler(object): max_range = None scale_var = None for name in op.input_arg_names: + #rename input name of the op to the input name of last op which has be removed if name in op_in_rename_map[block_id]: op._rename_input(name, op_in_rename_map[block_id][name]) @@ -272,8 +279,7 @@ class QuantizeTranspiler(object): max_range = param_range * act_range / scale_v else: assert isinstance(scale_v, Variable) - scale_var = var_scale_map[block_id][_original_var_name( - name)] + scale_var = scale_v if len(op.output_arg_names) != 1: raise ValueError("Only support one output, but op %s has" @@ -309,7 +315,7 @@ class QuantizeTranspiler(object): op_type = op.type # insert dequant_op after fc/conv, need to rename - # input of the followed ops + # input of the followed ops(of fc/conv) to the dquant_op for name in op.input_arg_names: if name in op_out_rename_map[block_id]: op._rename_input(name, @@ -389,8 +395,8 @@ class QuantizeTranspiler(object): for op in block.ops: args += op.input_arg_names args += op.output_arg_names - args = list(set(args)) - var_names = block.vars.keys() + args = list(set(args)) #vals of all left ops + var_names = block.vars.keys() # all vals sub_block_remove_vars = [] for var in var_names: if var not in args: @@ -471,6 +477,61 @@ class QuantizeTranspiler(object): return quant_var, scale + def _insert_quant_moving_average_abs_max_op(self, block, idx, var, + quant_bits): + """Insert fake_quantize_moving_average_abs_max + """ + quant_var = block.create_var( + name=_quantized_var_name(var.name), + type=var.type, + shape=var.shape, + dtype=var.dtype) + state = self.helper.create_global_variable( + name=unique_name.generate('state'), + persistable=True, + dtype=var.dtype, + shape=[1]) + self.helper.set_variable_initializer( + state, initializer=Constant(value=1)) + accum = self.helper.create_global_variable( + name=unique_name.generate('accum'), + persistable=True, + dtype=var.dtype, + shape=[1]) + self.helper.set_variable_initializer( + accum, initializer=Constant(value=1)) + scale = self.helper.create_parameter( + attr=ParamAttr( + name=_quantized_scale_name(var.name), + initializer=Constant(0.001), + trainable=False), + shape=[1], + dtype=var.dtype) + scale.stop_gradient = True + + ins = {'X': var, 'InScale': scale} + outs = {'Out': quant_var, 'OutScale': scale} + if not self.is_test: + ins['InState'] = state + ins['InAccum'] = accum + outs['OutState'] = state + outs['OutAccum'] = accum + + attrs = { + 'bit_length': quant_bits, + 'moving_rate': self.moving_rate, + 'is_test': self.is_test + } + + quant_op = block._insert_op( + idx, + 
type='fake_quantize_moving_average_abs_max', + attrs=attrs, + inputs=ins, + outputs=outs) + + return quant_var, scale + def _insert_quant_op(self, block, idx, var, quant_bits, quant_type): """ Insert fake_quantize_op @@ -480,6 +541,9 @@ class QuantizeTranspiler(object): elif quant_type == 'range_abs_max': return self._insert_quant_range_abs_max_op(block, idx, var, quant_bits) + elif quant_type == 'moving_average_abs_max': + return self._insert_quant_moving_average_abs_max_op(block, idx, var, + quant_bits) def _insert_dequant_op(self, block, idx, var, scale, quant_bits): """ diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index 622add4843..919db4c78e 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -38,7 +38,8 @@ class QuantizationTransformPass(object): activation_bits=8, activation_quantize_type='abs_max', weight_quantize_type='abs_max', - window_size=10000): + window_size=10000, + moving_rate=0.9): """ Convert and rewrite the IrGraph according to weight and activation quantization type. @@ -83,19 +84,22 @@ class QuantizationTransformPass(object): self._weight_bits = weight_bits self._activation_bits = activation_bits - quant_type = ['abs_max', 'range_abs_max'] + quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max'] if activation_quantize_type not in quant_type: raise ValueError( "Unknown activation_quantize_type : '%s'. It can only be ", - "'abs_max' or 'range_abs_max'.", str(activation_quantize_type)) + "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", + str(activation_quantize_type)) if weight_quantize_type not in quant_type: raise ValueError( "Unknown weight_quantize_type: '%s'. 
It can only be ", - "'abs_max' or 'range_abs_max'.", str(weight_quantize_type)) + "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'.", + str(weight_quantize_type)) self._activation_quantize_type = activation_quantize_type self._weight_quantize_type = weight_quantize_type self._window_size = window_size + self._moving_rate = moving_rate self._need_initialized = collections.OrderedDict() self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] @@ -222,6 +226,9 @@ class QuantizationTransformPass(object): elif quant_type == 'range_abs_max': return self._insert_quant_range_abs_max_op(graph, var_node, quant_bits) + elif quant_type == 'moving_average_abs_max': + return self._insert_quant_moving_average_abs_max_op(graph, var_node, + quant_bits) def _insert_quant_abs_max_op(self, graph, var_node, quant_bits): """ @@ -309,6 +316,74 @@ class QuantizationTransformPass(object): return quant_var_node, scale_out_node + def _insert_quant_moving_average_abs_max_op(self, graph, var_node, + quant_bits): + """Insert fake_quantize_moving_average_abs_max + """ + quant_var_node = graph.create_var_node( + name=self._quantized_var_name(var_node.name()), + var_type=var_node.type(), + shape=var_node.shape(), + var_dtype=var_node.dtype()) + scale_in_node = graph.create_persistable_node( + name=self._quantized_scale_name(var_node.name()), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=[1], + var_dtype=var_node.dtype()) + self._need_initialized[scale_in_node.var()] = Constant(value=0.001) + + scale_out_node = graph.create_var_node_from_desc(scale_in_node.var()) + ins = {'X': var_node, 'InScale': scale_in_node} + outs = {'Out': quant_var_node, 'OutScale': scale_out_node} + if not self._is_test: + state_in_node = graph.create_persistable_node( + name=unique_name.generate('state'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + var_dtype=var_node.dtype(), + shape=[1]) + self._need_initialized[state_in_node.var()] = Constant(value=1) + accum_in_node = graph.create_persistable_node( + name=unique_name.generate('accum'), + var_type=core.VarDesc.VarType.LOD_TENSOR, + var_dtype=var_node.dtype(), + shape=[1]) + self._need_initialized[accum_in_node.var()] = Constant(value=1) + state_out_node = graph.create_var_node_from_desc(state_in_node.var( + )) + accum_out_node = graph.create_var_node_from_desc(accum_in_node.var( + )) + + ins['InState'] = state_in_node + ins['InAccum'] = accum_in_node + outs['OutState'] = state_out_node + outs['OutAccum'] = accum_out_node + + attrs = { + 'bit_length': quant_bits, + 'moving_rate': self._moving_rate, + 'is_test': self._is_test, + 'op_role': core.op_proto_and_checker_maker.OpRole.Forward + } + + quant_op_node = graph.create_op_node( + op_type='fake_quantize_moving_average_abs_max', + attrs=attrs, + inputs=ins, + outputs=outs) + + graph.link_to(var_node, quant_op_node) + graph.link_to(scale_in_node, quant_op_node) + graph.link_to(quant_op_node, quant_var_node) + graph.link_to(quant_op_node, scale_out_node) + + if not self._is_test: + graph.link_to(state_in_node, quant_op_node) + graph.link_to(accum_in_node, quant_op_node) + graph.link_to(quant_op_node, state_out_node) + graph.link_to(quant_op_node, accum_out_node) + + return quant_var_node, scale_out_node + def _insert_dequant_op(self, graph, var_node, scale_var_node, quant_bits): """ Insert fake_dequantize_op in the graph. 
@@ -389,7 +464,8 @@ class QuantizationFreezePass(object): self._weight_quantize_type = weight_quantize_type self._quantizable_ops = ['conv2d', 'depthwise_conv2d', 'mul'] self._fake_quant_op_names = [ - 'fake_quantize_abs_max', 'fake_quantize_range_abs_max' + 'fake_quantize_abs_max', 'fake_quantize_range_abs_max', + 'fake_quantize_moving_average_abs_max' ] self._fake_dequant_op_names = ['fake_dequantize_max_abs'] self._op_input_rename_map = collections.OrderedDict() diff --git a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py index c6a301b7f4..0b4b2a285f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/tests/test_quantization_pass.py @@ -164,6 +164,9 @@ class TestQuantizationTransformPass(unittest.TestCase): def test_linear_fc_quant_range_abs_max(self): self.linear_fc_quant('range_abs_max', for_ci=True) + def test_linear_fc_quant_moving_average_abs_max(self): + self.linear_fc_quant('moving_average_abs_max', for_ci=True) + def residual_block_quant(self, quant_type, for_ci=False): main = fluid.Program() startup = fluid.Program() @@ -201,6 +204,9 @@ class TestQuantizationTransformPass(unittest.TestCase): def test_residual_block_range_abs_max(self): self.residual_block_quant('range_abs_max', for_ci=True) + def test_residual_block_moving_average_abs_max(self): + self.residual_block_quant('moving_average_abs_max', for_ci=True) + class TestQuantizationFreezePass(unittest.TestCase): def freeze_graph(self, use_cuda, seed, quant_type, for_ci=False): @@ -380,11 +386,18 @@ class TestQuantizationFreezePass(unittest.TestCase): with fluid.unique_name.guard(): self.freeze_graph( True, seed=1, quant_type='range_abs_max', for_ci=True) + self.freeze_graph( + True, + seed=1, + quant_type='moving_average_abs_max', + for_ci=True) def test_freeze_graph_cpu_static(self): with fluid.unique_name.guard(): self.freeze_graph( False, seed=2, quant_type='range_abs_max', for_ci=True) + self.freeze_graph( + False, seed=2, quant_type='moving_average_abs_max', for_ci=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py index 90a90112bd..cf8f01edb9 100644 --- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py +++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py @@ -17,6 +17,7 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core class TestFakeQuantizeOp(OpTest): @@ -75,6 +76,7 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest): 'InScale': np.zeros(1).astype("float32") } scale = np.max(np.abs(self.inputs['X'])).astype("float32") + out_scales = np.zeros(self.attrs['window_size']).astype("float32") out_scales[0] = scale self.outputs = { @@ -88,6 +90,46 @@ class TestFakeQuantizeRangeAbsMaxOp(OpTest): self.check_output() +class TestFakeQuantizeMovingOp(OpTest): + def setUp(self): + self.op_type = "fake_quantize_moving_average_abs_max" + self.attrs = { + 'bit_length': int(5), + 'moving_rate': float(0.9), + 'is_test': False + } + accum = np.zeros(1).astype("float32") + accum[0] = 1 + state = np.zeros(1).astype("float32") + state[0] = 1 + scale = np.zeros(1).astype("float32") + scale[0] = 0.001 + self.inputs = { + 'X': np.random.random((8, 16, 7, 7)).astype("float32"), + 'InScale': scale, + 'InAccum': accum, + 'InState': state, + } + + out_accum = 
np.zeros(1).astype("float32") + out_state = np.zeros(1).astype("float32") + out_scale = np.zeros(1).astype("float32") + out_accum[0] = self.attrs['moving_rate'] * accum[0] + np.max( + np.abs(self.inputs['X'])).astype("float32") + out_state[0] = self.attrs['moving_rate'] * state[0] + 1 + out_scale = out_accum / out_state + self.outputs = { + 'Out': np.round(self.inputs['X'] / out_scale * ( + (1 << (self.attrs['bit_length'] - 1)) - 1)), + 'OutAccum': out_accum, + 'OutState': out_state, + 'OutScale': out_scale, + } + + def test_check_output(self): + self.check_output() + + class TestFakeQuantizeRangeAbsMaxOp2(OpTest): def setUp(self): self.op_type = "fake_quantize_range_abs_max" From 5ecdc49c6b234b3709e7466dd3e3b30c6326368d Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 15 Mar 2019 16:14:12 +0800 Subject: [PATCH 49/73] set enable_runtime_context_cache_ default false test=develop --- paddle/fluid/inference/api/analysis_config.cc | 2 -- paddle/fluid/inference/api/paddle_analysis_config.h | 2 +- paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc | 1 + 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 2189b87381..a9e477f883 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -205,8 +205,6 @@ void AnalysisConfig::Update() { // Append after the Affine_channel_conv_fuse pass. pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); } - // runtime_context_cache isn't fit for tensorrt. - enable_runtime_context_cache_ = false; } if (use_mkldnn_) { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 85639eebe4..3b7faa5400 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -278,7 +278,7 @@ struct AnalysisConfig { // since the input/output names of this Op do not change in the execution, // RuntimeContext could be created only at the first iteration of this Op's // execution to save the elapsed time. - bool enable_runtime_context_cache_{true}; + bool enable_runtime_context_cache_{false}; // A runtime cache, shouldn't be transferred to others. 
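  // Illustrative usage, a minimal sketch rather than part of this header:
  // with enable_runtime_context_cache_ now defaulting to false, an inference
  // config has to opt in explicitly, e.g.
  //   AnalysisConfig cfg;
  //   cfg.DisableGpu();
  //   cfg.SwitchIrOptim();
  //   cfg.SwitchRuntimeContextCache();  // re-enable per-op RuntimeContext caching
  // which is what the analyzer_pyramid_dnn_tester.cc change below does.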
std::string serialized_info_cache_; diff --git a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc index 5157bd280d..e1787a7177 100644 --- a/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_pyramid_dnn_tester.cc @@ -107,6 +107,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->DisableGpu(); cfg->SwitchSpecifyInputNames(); cfg->SwitchIrOptim(); + cfg->SwitchRuntimeContextCache(); if (FLAGS_zero_copy) { cfg->SwitchUseFeedFetchOps(false); } From 50ff898378af0de73217785d72d3bc0595c9b3e4 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 15 Mar 2019 18:00:13 +0800 Subject: [PATCH 50/73] graph neural network for imperative mode test=develop --- paddle/fluid/operators/squeeze_op.cc | 1 + python/paddle/fluid/framework.py | 5 ++ python/paddle/fluid/layers/nn.py | 7 +- .../tests/unittests/test_imperative_gnn.py | 89 +++++++++++++++++++ .../fluid/tests/unittests/test_layers.py | 22 +++++ 5 files changed, 122 insertions(+), 2 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_imperative_gnn.py diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index e389c6a65e..ecfb4e8956 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -94,6 +94,7 @@ class SqueezeOpInferShape : public framework::InferShapeBase { } }; +// TODO(paddle-dev): Should use OpKernel. class SqueezeOp : public framework::OperatorBase { public: using OperatorBase::OperatorBase; diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 8988c55096..38ffd0c382 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -430,6 +430,11 @@ class Variable(object): Returns: str: The debug string. """ + if _in_imperative_mode(): + # TODO(panyx0718): add imperative debug info. + return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype, + self.shape) + assert isinstance(throw_on_error, bool) and isinstance(with_details, bool) protostr = self.desc.serialize_to_string() diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ea028b0566..ca917871da 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -23,7 +23,7 @@ import os import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder +from ..framework import Variable, OpProtoHolder, _in_imperative_mode from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .tensor import concat, assign @@ -4864,7 +4864,8 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): if transpose_y: y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2] if x_shape[-1] != y_shape[-2]: - raise ValueError("Invalid inputs for matmul.") + raise ValueError("Invalid inputs for matmul. 
x: %s, y: %s\n" % + (x_shape, y_shape)) if len(y_shape) > 2 and len(x_shape) > 2: for i, dim_x in enumerate(x_shape[:-2]): @@ -6367,6 +6368,8 @@ def squeeze(input, axes, name=None): x = layers.data(name='x', shape=[5, 1, 10]) y = layers.sequeeze(input=x, axes=[1]) """ + assert not _in_imperative_mode(), ( + "squeeze layer is not supported in imperative mode yet.") helper = LayerHelper("squeeze", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py new file mode 100644 index 0000000000..d471a9baf8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -0,0 +1,89 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import unittest +import numpy as np +import six +import sys + +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC +from test_imperative_base import new_program_scope +from paddle.fluid.imperative.base import to_variable + + +def gen_data(): + pass + + +class GraphConv(fluid.imperative.Layer): + def __init__(self, name_scope, in_features, out_features): + super(GraphConv, self).__init__(name_scope) + + self._in_features = in_features + self._out_features = out_features + self.weight = self.create_parameter( + attr=None, + dtype='float32', + shape=[self._in_features, self._out_features]) + self.bias = self.create_parameter( + attr=None, dtype='float32', shape=[self._out_features]) + + def forward(self, features, adj): + support = fluid.layers.matmul(features, self.weight) + # TODO(panyx0718): sparse matmul? + return fluid.layers.matmul(adj, support) + self.bias + + +class GCN(fluid.imperative.Layer): + def __init__(self, name_scope, num_hidden): + super(GCN, self).__init__(name_scope) + self.gc = GraphConv(self.full_name(), num_hidden, 32) + self.gc2 = GraphConv(self.full_name(), 32, 10) + + def forward(self, x, adj): + x = fluid.layers.relu(self.gc(x, adj)) + return self.gc2(x, adj) + + +class TestImperativeGNN(unittest.TestCase): + def test_gnn_float32(self): + seed = 90 + + with fluid.imperative.guard(): + fluid.default_startup_program().random_seed = seed + fluid.default_main_program().random_seed = seed + + features = np.zeros([1, 100, 50], dtype=np.float32) + adj = np.zeros([1, 100, 100], dtype=np.float32) + labels = np.zeros([100, 1], dtype=np.int64) + + model = GCN('test_gcn', 50) + logits = model(to_variable(features), to_variable(adj)) + sys.stderr.write('%s\n' % logits) + logits = fluid.layers.reshape(logits, logits.shape[1:]) + # In other example, it's nll with log_softmax. However, paddle's + # log_loss only supports binary classification now. 
+ loss = fluid.layers.softmax_with_cross_entropy(logits, + to_variable(labels)) + loss = fluid.layers.reduce_sum(loss) + sys.stderr.write('%s\n' % loss._numpy()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 5b186ae038..09b16a72d2 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -84,6 +84,28 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + def test_matmul(self): + with self.static_graph(): + t = layers.data(name='t', shape=[3, 3], dtype='float32') + t2 = layers.data(name='t2', shape=[3, 3], dtype='float32') + ret = layers.matmul(t, t2) + static_ret = self.get_static_graph_result( + feed={ + 't': np.ones( + [3, 3], dtype='float32'), + 't2': np.ones( + [3, 3], dtype='float32') + }, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + t = np.ones([3, 3], dtype='float32') + t2 = np.ones([3, 3], dtype='float32') + ret = layers.matmul(t, t2) + dy_ret = layers.relu(base.to_variable(ret)) + + self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) + def test_conv2d(self): with self.static_graph(): images = layers.data(name='pixel', shape=[3, 5, 5], dtype='float32') From 438bca9c3dedb2bd7deffb6be956ffc1ed5f4447 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 15 Mar 2019 20:17:33 +0800 Subject: [PATCH 51/73] Implement Runtime Var Type Inference test=develop --- paddle/fluid/framework/details/op_registry.h | 2 + paddle/fluid/framework/var_type_inference.h | 22 ++- paddle/fluid/imperative/layer.cc | 84 ++++++++--- paddle/fluid/imperative/layer.h | 142 ++++++++++++++++-- paddle/fluid/imperative/tracer.cc | 25 +-- paddle/fluid/imperative/type_defs.h | 1 + .../fluid/operators/controlflow/while_op.cc | 17 +-- .../operators/distributed_ops/split_ids_op.cc | 2 + paddle/fluid/operators/nccl/nccl_op.cc | 9 +- paddle/fluid/operators/py_func_op.cc | 3 + .../reader/create_custom_reader_op.cc | 2 +- paddle/fluid/operators/reader/read_op.cc | 2 +- .../operators/reader/reader_op_registry.cc | 21 ++- .../operators/reader/reader_op_registry.h | 2 + paddle/fluid/operators/scale_op.cc | 1 + .../fluid/operators/split_selected_rows_op.cc | 2 + paddle/fluid/operators/sum_op.cc | 1 + paddle/fluid/pybind/imperative.cc | 4 +- 18 files changed, 265 insertions(+), 77 deletions(-) diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 79281863f6..346aba07d1 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -16,6 +16,8 @@ limitations under the License. */ #include #include +#include +#include #include #include "paddle/fluid/framework/grad_op_desc_maker.h" #include "paddle/fluid/framework/inplace_op_inference.h" diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index ed52e1ad81..b4b7be619a 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_desc.h" @@ -80,6 +81,19 @@ class InferVarTypeContext { block_->FindRecursiveOrCreateVar(name).SetDataType(type); } + inline std::vector GetDataTypes( + const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(block_); + return block_->FindRecursiveOrCreateVar(name).GetDataTypes(); + } + + inline void SetDataTypes( + const std::string& name, + const std::vector& multiple_data_type) { + PADDLE_ENFORCE_NOT_NULL(block_); + block_->FindRecursiveOrCreateVar(name).SetDataTypes(multiple_data_type); + } + inline std::vector GetShape(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(block_); return block_->FindRecursiveOrCreateVar(name).GetShape(); @@ -101,17 +115,11 @@ class InferVarTypeContext { block_->FindRecursiveOrCreateVar(name).SetLoDLevel(lod_level); } - private: + protected: const OpDesc* op_; BlockDesc* block_; }; -// infer var type context for imperative mode -class RuntimeInferVarTypeContext : public InferVarTypeContext { - public: - RuntimeInferVarTypeContext() : InferVarTypeContext(nullptr, nullptr) {} -}; - class VarTypeInference { public: virtual ~VarTypeInference() {} diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 5530823b90..aee905aa41 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -220,7 +220,7 @@ std::map> OpBase::ApplyGrad() { } VLOG(3) << "apply op grad: " << Type(); - std::vector tmp_grad_outputs; + std::vector tmp_grad_outputs; if (backward_id_ > 0) { VLOG(3) << "py_layer_grad"; tmp_grad_outputs.resize(1); @@ -246,23 +246,59 @@ std::map> OpBase::ApplyGrad() { // Allocate a new variable Variable* tmp_var = new framework::Variable(); tmp_var->GetMutable(); - outputs.emplace_back(tmp_var); + VarBase* tmp_var_base = + new VarBase(it.second[i]->Name(), tmp_var, nullptr, true); + outputs.emplace_back(tmp_var_base); } } - // Run grad op - framework::RuntimeContext ctx(grad_input_vars_[k], tmp_grad_outputs[k]); - // No need to do compile time infer shape here. 
// grad_op_desc_->InferShape(*block_); // grad_op_desc->InferVarType(block_); std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc); + + // auto& info = + // framework::OpInfoMap::Instance().Get(grad_op_desc->Type()); + // if (info.infer_var_type_) { + // framework::RuntimeInferVarTypeContext infer_var_type_ctx( + // this, &grad_inputs, &outputs, &attrs_map); + // info.infer_var_type_(infer_var_type_ctx); + // } + framework::OperatorWithKernel* op_kernel = dynamic_cast(opbase.get()); PADDLE_ENFORCE_NOT_NULL(op_kernel, "only support op with kernel"); + // Run grad op + framework::VariableValueMap grad_invars_map; + framework::VariableValueMap grad_outvars_map; + + for (const auto& it : grad_input_vars_[k]) { + auto& grad_invars = grad_invars_map[it.first]; + grad_invars.reserve(it.second.size()); + for (const VarBase* grad_inp : it.second) { + PADDLE_ENFORCE_NOT_NULL(grad_inp->var_, "op %s input %s nullptr", + grad_op_desc->Type(), grad_inp->Name()); + + grad_invars.emplace_back(grad_inp->var_); + } + } + + for (const auto& it : tmp_grad_outputs[k]) { + auto& grad_outvars = grad_outvars_map[it.first]; + grad_outvars.reserve(it.second.size()); + for (VarBase* grad_out : it.second) { + PADDLE_ENFORCE_NOT_NULL(grad_out->var_, "op %s output %s nullptr", + grad_op_desc->Type(), grad_out->Name()); + + grad_outvars.emplace_back(grad_out->var_); + } + } + + framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map); + framework::Scope scope; PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); p.op.RuntimeInferShape(scope, place_, ctx); @@ -279,8 +315,8 @@ std::map> OpBase::ApplyGrad() { PADDLE_ENFORCE_EQ(outputs.size(), origin_outputs.size()); for (size_t i = 0; i < outputs.size(); ++i) { - framework::Variable* grad = outputs[i]; - framework::Variable* orig_grad = origin_outputs[i]; + framework::Variable* grad = outputs[i]->var_; + framework::Variable* orig_grad = origin_outputs[i]->var_; AddTo(grad, orig_grad, place_); delete grad; } @@ -328,28 +364,35 @@ void PyLayer::RegisterFunc(int func_id, const py::object& py_func) { int PyLayer::NumFuncs() { return py_funcs_.size(); } -std::vector PyLayer::Apply(int func_id, - const std::vector& inputs) { - std::vector invars; - for (const VarBase* in : inputs) { - invars.push_back(in->var_); - } +std::vector PyLayer::Apply( + int func_id, const std::vector& inputs) { PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); - return CallPythonFunc(py_funcs_[func_id], invars); + return CallPythonFunc(py_funcs_[func_id], inputs); } -std::vector PyLayer::ApplyGrad( - int func_id, const std::vector& inputs) { +std::vector PyLayer::ApplyGrad(int func_id, + const std::vector& inputs) { PADDLE_ENFORCE(py_funcs_.find(func_id) != py_funcs_.end()); - return CallPythonFunc(py_funcs_[func_id], inputs); + auto rets = CallPythonFunc(py_funcs_[func_id], inputs); + + std::vector outs; + outs.reserve(rets.size()); + for (size_t i = 0U; i != rets.size(); ++i) { + outs.emplace_back(new VarBase( + string::Sprintf("%s_out_%d", framework::GradVarName(PyLayer::kFwdOut), + i), + rets[i], nullptr, true)); + } + + return outs; } std::vector PyLayer::CallPythonFunc( - const py::object& callable, const std::vector& ins) { + const py::object& callable, const std::vector& ins) { py::gil_scoped_acquire guard; py::tuple in_args(ins.size()); for (size_t i = 0; i < ins.size(); ++i) { - const framework::LoDTensor& t = ins[i]->Get(); + const framework::LoDTensor& t = ins[i]->var_->Get(); in_args[i] = t.IsInitialized() ? 
py::cast(t) : py::cast(nullptr); } VLOG(3) << "pyfunc in " << py::len(in_args); @@ -359,6 +402,7 @@ std::vector PyLayer::CallPythonFunc( auto ret_tuple = py::cast(ret); size_t ret_num = py::len(ret_tuple); std::vector outs; + outs.reserve(ret_num); VLOG(3) << "pyfunc out " << ret_num; for (size_t i = 0; i < ret_num; ++i) { try { @@ -369,7 +413,7 @@ std::vector PyLayer::CallPythonFunc( auto* tensor = var->GetMutable(); tensor->ShareDataWith(*py_out_tensor); tensor->set_lod(py_out_tensor->lod()); - outs.push_back(var); + outs.emplace_back(var); } catch (py::cast_error&) { PADDLE_THROW("The %d-th output must be LoDTensor", i); } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 618a5b7a03..494988608e 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -18,14 +18,16 @@ #include "paddle/fluid/framework/python_headers.h" // clang-format on -#include // NOLINT -#include // NOLINT -#include // NOLINT -#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/operators/math/math_function.h" @@ -184,6 +186,10 @@ class VarBase { } } + inline void SetDType(framework::proto::VarType::Type type) { + auto tensor = var_->GetMutable(); + tensor->mutable_data(place_, dtype_); + } inline framework::proto::VarType::Type DType() const { return dtype_; } inline void SetStopGradient(bool stop_gradient) { @@ -328,9 +334,9 @@ class PYBIND11_HIDDEN OpBase { std::map> pre_ops_out_idx_; // Inputs to a vector of bwd ops. - std::vector grad_input_vars_; + std::vector grad_input_vars_; // Outputs to a vector of bwd ops. 
- std::vector grad_output_vars_; + std::vector grad_output_vars_; std::vector backward_hooks_; }; @@ -359,12 +365,130 @@ class PyLayer { static std::vector Apply( int func_id, const std::vector& inputs); - static std::vector ApplyGrad( - int func_id, const std::vector& inputs); + static std::vector ApplyGrad(int func_id, + const std::vector& inputs); private: static std::vector CallPythonFunc( - const py::object& callable, const std::vector& ins); + const py::object& callable, const std::vector& ins); +}; + +// infer var type context for imperative mode +class PYBIND11_HIDDEN RuntimeInferVarTypeContext + : public framework::InferVarTypeContext { + public: + RuntimeInferVarTypeContext(imperative::OpBase* op, + const imperative::VarBasePtrMap* inputs, + imperative::VarBasePtrMap* outputs, + const framework::AttributeMap* attrs_map) + : InferVarTypeContext(nullptr, nullptr), + op_(op), + inputs_(inputs), + outputs_(outputs), + attrs_(attrs_map), + input_names_(), + output_names_(), + var_set_() { + input_names_.reserve(inputs_->size()); + for (auto& it : *inputs_) { + for (imperative::VarBase* var : it.second) { + input_names_[it.first].emplace_back(var->Name()); + var_set_[var->Name()] = var; + } + } + + output_names_.reserve(outputs_->size()); + for (auto& it : *outputs_) { + for (imperative::VarBase* var : it.second) { + output_names_[it.first].emplace_back(var->Name()); + var_set_[var->Name()] = var; + } + } + } + + framework::Attribute GetAttr(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(attrs_); + return attrs_->at(name); + } + + inline bool HasVar(const std::string& name) const { + return var_set_.count(name) > 0; + } + + inline bool HasInput(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(inputs_); + return inputs_->count(name) > 0; + } + + inline bool HasOutput(const std::string& name) const { + PADDLE_ENFORCE_NOT_NULL(outputs_); + return outputs_->count(name) > 0; + } + + inline const std::vector& Input(const std::string& name) const { + return input_names_.at(name); + } + + inline const std::vector& Output(const std::string& name) const { + return output_names_.at(name); + } + + inline framework::proto::VarType::Type GetType( + const std::string& name) const { + return var_set_.at(name)->DType(); + } + + inline void SetType(const std::string& name, + framework::proto::VarType::Type type) { + var_set_[name]->SetDType(type); + } + + inline framework::proto::VarType::Type GetDataType( + const std::string& name) const { + return var_set_.at(name)->DType(); + } + + inline void SetDataType(const std::string& name, + framework::proto::VarType::Type type) { + var_set_[name]->SetDType(type); + } + + inline std::vector GetDataTypes( + const std::string& name) const { + PADDLE_THROW("GetDataTypes is not supported in runtime InferVarType"); + } + + inline void SetDataTypes( + const std::string& name, + const std::vector& multiple_data_type) { + PADDLE_THROW("SetDataTypes is not supported in runtime InferVarType"); + } + + inline std::vector GetShape(const std::string& name) const { + PADDLE_THROW("Do not handle Shape in runtime InferVarType"); + } + + inline void SetShape(const std::string& name, + const std::vector& dims) { + PADDLE_THROW("Do not handle Shape in runtime InferVarType"); + } + + inline int32_t GetLoDLevel(const std::string& name) const { + PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType"); + } + + inline void SetLoDLevel(const std::string& name, int32_t lod_level) { + PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType"); + } + + 
private: + imperative::OpBase* op_; + const imperative::VarBasePtrMap* inputs_; + imperative::VarBasePtrMap* outputs_; + const framework::AttributeMap* attrs_; + std::unordered_map> input_names_; + std::unordered_map> output_names_; + std::unordered_map var_set_; }; } // namespace imperative diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7ee92b4d8c..7a07ec358d 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -19,6 +19,7 @@ #include #include +#include "paddle/fluid/framework/var_type_inference.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -160,7 +161,7 @@ Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { } std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, - const VarBasePtrMap& outputs, + VarBasePtrMap& outputs, framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient) { @@ -228,6 +229,12 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, framework::OpRegistry::CreateOp(op->Type(), invars_name_map, outvars_name_map, attrs_map); + if (info.infer_var_type_) { + RuntimeInferVarTypeContext infer_var_type_ctx(op, &inputs, &outputs, + &attrs_map); + info.infer_var_type_(infer_var_type_ctx); + } + // TODO(minqiyang): Support infer var type in imperative mode // Run forward op VLOG(3) << "tracer running " << op->Type(); @@ -278,12 +285,12 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, auto fwd_var_it = current_vars_map.find(grad_invar); PADDLE_ENFORCE(fwd_var_it != current_vars_map.end()); // Forward inputs or outputs. - grad_in_vars.emplace_back(fwd_var_it->second->var_); + grad_in_vars.emplace_back(fwd_var_it->second); } else { VarBase* var = current_vars_map[var_it->second]; InitGrad(var, prepared_op.GetDeviceContext()); // Douts. 
- grad_in_vars.emplace_back(var->grads_->var_); + grad_in_vars.emplace_back(var->grads_); } vars_saved_for_backward.insert(it.first); @@ -300,7 +307,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, op->Type()); VarBase* var = current_vars_map[var_it->second]; InitGrad(var, prepared_op.GetDeviceContext()); - grad_out_vars.push_back(var->grads_->var_); + grad_out_vars.push_back(var->grads_); } } } @@ -342,23 +349,23 @@ std::vector Tracer::PyTrace(OpBase* op, auto& grad_output_vars = op->grad_output_vars_[0][framework::GradVarName(PyLayer::kFwdOut)]; - for (const VarBase* inp : inputs) { - grad_input_vars.push_back(inp->var_); + for (VarBase* inp : inputs) { + grad_input_vars.push_back(inp); } for (VarBase* out : outputs) { - grad_input_vars.push_back(out->var_); + grad_input_vars.push_back(out); } // TODO(minqiyang): Add GPU support for PyLayer, only support CPU now platform::CPUPlace place; for (VarBase* out : outputs) { InitGrad(out, platform::DeviceContextPool::Instance().Get(place)); - grad_input_vars.push_back(out->grads_->var_); + grad_input_vars.push_back(out->grads_); } for (VarBase* inp : inputs) { InitGrad(inp, platform::DeviceContextPool::Instance().Get(place)); - grad_output_vars.push_back(inp->grads_->var_); + grad_output_vars.push_back(inp->grads_); } } return outputs; diff --git a/paddle/fluid/imperative/type_defs.h b/paddle/fluid/imperative/type_defs.h index fc9e42f8d0..c51ce931de 100644 --- a/paddle/fluid/imperative/type_defs.h +++ b/paddle/fluid/imperative/type_defs.h @@ -25,6 +25,7 @@ class VarBase; class OpBase; typedef std::map> VarBasePtrMap; +typedef std::map> ConstVarBasePtrMap; typedef std::map> OpBasePtrMap; } // namespace imperative diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 8352ba4f2b..90c3067868 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -365,19 +365,16 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { class WhileGradOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto p_names = op_desc.Input(kX); - auto pg_ig_names = op_desc.Output(framework::GradVarName(kX)); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto p_names = ctx.Input(kX); + auto pg_ig_names = ctx.Output(framework::GradVarName(kX)); for (size_t i = 0; i < p_names.size(); ++i) { - auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); - auto *g_var = block->FindVarRecursive(pg_ig_names[i]); - if (g_var != nullptr) { // Gradient could be @EMPTY@ + if (ctx.HasVar(pg_ig_names[i])) { VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i] - << " type: " << p_var.GetType(); - g_var->SetType(p_var.GetType()); - g_var->SetDataType(p_var.GetDataType()); + << " type: " << ctx.GetType(p_names[i]); + ctx.SetType(pg_ig_names[i], ctx.GetType(p_names[i])); + ctx.SetDataType(pg_ig_names[i], ctx.GetDataType(p_names[i])); } } } diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.cc b/paddle/fluid/operators/distributed_ops/split_ids_op.cc index 2932a202a5..e9f3f89c6e 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.cc @@ -14,6 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/distributed_ops/split_ids_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc index 0018139cb0..7df5a881f5 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cc @@ -60,12 +60,9 @@ class NCCLInitOp : public framework::OperatorBase { class NCCLInitOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - auto out_var_name = op_desc.Output("Communicator").front(); - auto &out_var = block->FindRecursiveOrCreateVar(out_var_name); - auto var_type = framework::proto::VarType::RAW; - out_var.SetType(var_type); + void operator()(framework::InferVarTypeContext &ctx) const override { + auto out_var_name = ctx.Output("Communicator").front(); + ctx.SetType(out_var_name, framework::proto::VarType::RAW); } }; diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index f630ad678f..6472b9c163 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -14,8 +14,11 @@ #include "paddle/fluid/operators/py_func_op.h" +#include #include #include +#include +#include #include #include "paddle/fluid/framework/op_registry.h" diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 915325905b..b65e236856 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -123,7 +123,7 @@ class CustomReaderInferShape : public framework::InferShapeBase { class CustomReaderInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::InferVarTypeContext& ctx) const override { + void operator()(framework::InferVarTypeContext& ctx) const override { auto& out_var_name = ctx.Output("Out")[0]; PADDLE_ENFORCE(ctx.HasVar(out_var_name)); ctx.SetType(out_var_name, framework::proto::VarType::READER); diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 9a98d68e13..40549ce54d 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -51,7 +51,7 @@ class ReadInferShape : public framework::InferShapeBase { class ReadInferVarType : public framework::VarTypeInference { public: - void operator()(const framework::InferVarTypeContext& ctx) const override { + void operator()(framework::InferVarTypeContext& ctx) const override { bool infer_out = boost::get(ctx.GetAttr("infer_out")); if (infer_out) { std::string reader_name = ctx.Input("Reader")[0]; diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index 3921eedf94..44772281be 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -98,11 +98,10 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { } } -void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const { - std::string reader_name = op_desc.Output("Out")[0]; - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - reader->SetType(framework::proto::VarType::READER); +void FileReaderInferVarType::operator()( + framework::InferVarTypeContext& ctx) const { + std::string reader_name 
= ctx.Output("Out")[0]; + ctx.SetType(reader_name, framework::proto::VarType::READER); } void DecoratedReaderInferShape::operator()( @@ -125,13 +124,11 @@ void DecoratedReaderInferShape::operator()( } void DecoratedReaderInferVarType::operator()( - const framework::OpDesc& op_desc, framework::BlockDesc* block) const { - std::string in_reader_name = op_desc.Input("UnderlyingReader")[0]; - framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name); - std::string out_reader_name = op_desc.Output("Out")[0]; - framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name); - out_reader->SetType(framework::proto::VarType::READER); - out_reader->SetDataTypes(in_reader->GetDataTypes()); + framework::InferVarTypeContext& ctx) const { + const std::string& in_reader_name = ctx.Input("UnderlyingReader")[0]; + const std::string& out_reader_name = ctx.Output("Out")[0]; + ctx.SetType(out_reader_name, framework::proto::VarType::READER); + ctx.SetDataTypes(out_reader_name, ctx.GetDataTypes(in_reader_name)); } void DecoratedReaderMakerBase::Make() { diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h index 58b0dfd555..5a775b82f5 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.h +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -14,7 +14,9 @@ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/reader.h" diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index d2f05c42a7..208a6f8009 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/scale_op.h" +#include #include #include "paddle/fluid/operators/detail/safe_ref.h" diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc index e950f30a42..f102b911b5 100644 --- a/paddle/fluid/operators/split_selected_rows_op.cc +++ b/paddle/fluid/operators/split_selected_rows_op.cc @@ -14,6 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/split_selected_rows_op.h" +#include + namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index d674711392..7dba00fffa 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -12,6 +12,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sum_op.h" #include +#include #include #include diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 6bbda69297..21e7793e0a 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -38,7 +38,7 @@ void BindTracer(pybind11::module* m) { .def("trace", [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, - const imperative::VarBasePtrMap& outputs, + imperative::VarBasePtrMap& outputs, framework::AttributeMap attrs_map, const platform::CPUPlace expected_place, const bool stop_gradient = false) { @@ -48,7 +48,7 @@ void BindTracer(pybind11::module* m) { .def("trace", [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, - const imperative::VarBasePtrMap& outputs, + imperative::VarBasePtrMap& outputs, framework::AttributeMap attrs_map, const platform::CUDAPlace expected_place, const bool stop_gradient = false) { From 9041b238e3814daa7753c905a4017a5488545d40 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 15 Mar 2019 20:22:09 +0800 Subject: [PATCH 52/73] Polish code test=develop --- paddle/fluid/imperative/tracer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 7b65d55e9e..ae3b16727d 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -48,7 +48,7 @@ class Tracer { virtual ~Tracer() {} std::set Trace(OpBase* op, const VarBasePtrMap& inputs, - const VarBasePtrMap& outputs, + VarBasePtrMap& outputs, // NOLINT framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient = false); From 3be7e971ab4286ad95f3fcc7503512e77f0fcd56 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 15 Mar 2019 20:55:01 +0800 Subject: [PATCH 53/73] polish test=develop --- python/paddle/fluid/framework.py | 2 +- .../tests/unittests/test_imperative_gnn.py | 61 ++++++++++++++++++- .../fluid/tests/unittests/test_layers.py | 3 +- 3 files changed, 60 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 38ffd0c382..556ce71ee5 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -431,7 +431,7 @@ class Variable(object): str: The debug string. """ if _in_imperative_mode(): - # TODO(panyx0718): add imperative debug info. + # TODO(panyx0718): add more imperative debug info. 
return 'name %s, dtype: %s shape: %s' % (self.name, self.dtype, self.shape) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index d471a9baf8..2086fab5c8 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -21,7 +21,7 @@ import sys import paddle import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.optimizer import SGDOptimizer +from paddle.fluid.optimizer import AdamOptimizer from paddle.fluid.imperative.nn import Conv2D, Pool2D, FC from test_imperative_base import new_program_scope from paddle.fluid.imperative.base import to_variable @@ -65,24 +65,79 @@ class TestImperativeGNN(unittest.TestCase): def test_gnn_float32(self): seed = 90 + startup = fluid.Program() + startup.random_seed = seed + main = fluid.Program() + main.random_seed = seed + + scope = fluid.core.Scope() + with new_program_scope(main=main, startup=startup, scope=scope): + features = fluid.layers.data( + name='features', + shape=[1, 100, 50], + dtype='float32', + append_batch_size=False) + # Use selected rows when it's supported. + adj = fluid.layers.data( + name='adj', + shape=[1, 100, 100], + dtype='float32', + append_batch_size=False) + labels = fluid.layers.data( + name='labels', + shape=[100, 1], + dtype='int64', + append_batch_size=False) + + model = GCN('test_gcn', 50) + logits = model(features, adj) + logits = fluid.layers.reshape(logits, logits.shape[1:]) + # In other example, it's nll with log_softmax. However, paddle's + # log_loss only supports binary classification now. + loss = fluid.layers.softmax_with_cross_entropy(logits, labels) + loss = fluid.layers.reduce_sum(loss) + + adam = AdamOptimizer(learning_rate=1e-3) + adam.minimize(loss) + exe = fluid.Executor(fluid.CPUPlace( + ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) + exe.run(startup) + static_loss = exe.run(feed={ + 'features': np.zeros( + [1, 100, 50], dtype=np.float32), + 'adj': np.zeros( + [1, 100, 100], dtype=np.float32), + 'labels': np.zeros( + [100, 1], dtype=np.int64) + }, + fetch_list=[loss])[0] + + static_weight = np.array( + scope.find_var(model.gc.weight.name).get_tensor()) + with fluid.imperative.guard(): fluid.default_startup_program().random_seed = seed fluid.default_main_program().random_seed = seed features = np.zeros([1, 100, 50], dtype=np.float32) + # Use selected rows when it's supported. adj = np.zeros([1, 100, 100], dtype=np.float32) labels = np.zeros([100, 1], dtype=np.int64) model = GCN('test_gcn', 50) logits = model(to_variable(features), to_variable(adj)) - sys.stderr.write('%s\n' % logits) logits = fluid.layers.reshape(logits, logits.shape[1:]) # In other example, it's nll with log_softmax. However, paddle's # log_loss only supports binary classification now. 
loss = fluid.layers.softmax_with_cross_entropy(logits, to_variable(labels)) loss = fluid.layers.reduce_sum(loss) - sys.stderr.write('%s\n' % loss._numpy()) + adam = AdamOptimizer(learning_rate=1e-3) + adam.minimize(loss) + self.assertEqual(static_loss, loss._numpy()) + self.assertTrue( + np.allclose(static_weight, model.gc.weight._numpy())) + sys.stderr.write('%s %s\n' % (static_loss, loss._numpy())) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 09b16a72d2..42ec845b42 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -101,8 +101,7 @@ class TestLayer(LayerTest): with self.dynamic_graph(): t = np.ones([3, 3], dtype='float32') t2 = np.ones([3, 3], dtype='float32') - ret = layers.matmul(t, t2) - dy_ret = layers.relu(base.to_variable(ret)) + dy_ret = layers.matmul(base.to_variable(t), base.to_variable(t2)) self.assertTrue(np.allclose(static_ret, dy_ret._numpy())) From b5078c211a536326727c1eb0c3b29a7047fdaecb Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 15 Mar 2019 21:04:30 +0800 Subject: [PATCH 54/73] Make infer var type virtual test=develop --- paddle/fluid/framework/var_type_inference.h | 37 +++++++------ paddle/fluid/imperative/layer.h | 57 +++++++++++---------- paddle/fluid/imperative/tracer.cc | 2 +- 3 files changed, 50 insertions(+), 46 deletions(-) diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index b4b7be619a..5dd08442c2 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -31,86 +31,89 @@ class InferVarTypeContext { InferVarTypeContext(const OpDesc* op, BlockDesc* block) : op_(op), block_(block) {} - Attribute GetAttr(const std::string& name) const { + virtual ~InferVarTypeContext() {} + + virtual Attribute GetAttr(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(op_); return op_->GetAttr(name); } - inline bool HasVar(const std::string& name) const { + virtual bool HasVar(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(block_); return block_->FindVarRecursive(name) != nullptr; } - inline bool HasInput(const std::string& name) const { + virtual bool HasInput(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(op_); return op_->Inputs().count(name) > 0; } - inline bool HasOutput(const std::string& name) const { + virtual bool HasOutput(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(op_); return op_->Outputs().count(name) > 0; } - inline const std::vector& Input(const std::string& name) const { + virtual const std::vector& Input(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(op_); return op_->Input(name); } - inline const std::vector& Output(const std::string& name) const { + virtual const std::vector& Output( + const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(op_); return op_->Output(name); } - inline proto::VarType::Type GetType(const std::string& name) const { + virtual proto::VarType::Type GetType(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(block_); return block_->FindRecursiveOrCreateVar(name).GetType(); } - inline void SetType(const std::string& name, proto::VarType::Type type) { + virtual void SetType(const std::string& name, proto::VarType::Type type) { PADDLE_ENFORCE_NOT_NULL(block_); block_->FindRecursiveOrCreateVar(name).SetType(type); } - inline proto::VarType::Type GetDataType(const std::string& name) const { + virtual 
proto::VarType::Type GetDataType(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(block_); return block_->FindRecursiveOrCreateVar(name).GetDataType(); } - inline void SetDataType(const std::string& name, proto::VarType::Type type) { + virtual void SetDataType(const std::string& name, proto::VarType::Type type) { PADDLE_ENFORCE_NOT_NULL(block_); block_->FindRecursiveOrCreateVar(name).SetDataType(type); } - inline std::vector GetDataTypes( + virtual std::vector GetDataTypes( const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(block_); return block_->FindRecursiveOrCreateVar(name).GetDataTypes(); } - inline void SetDataTypes( + virtual void SetDataTypes( const std::string& name, const std::vector& multiple_data_type) { PADDLE_ENFORCE_NOT_NULL(block_); block_->FindRecursiveOrCreateVar(name).SetDataTypes(multiple_data_type); } - inline std::vector GetShape(const std::string& name) const { + virtual std::vector GetShape(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(block_); return block_->FindRecursiveOrCreateVar(name).GetShape(); } - inline void SetShape(const std::string& name, - const std::vector& dims) { + virtual void SetShape(const std::string& name, + const std::vector& dims) { PADDLE_ENFORCE_NOT_NULL(block_); block_->FindRecursiveOrCreateVar(name).SetShape(dims); } - inline int32_t GetLoDLevel(const std::string& name) const { + virtual int32_t GetLoDLevel(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(block_); return block_->FindRecursiveOrCreateVar(name).GetLoDLevel(); } - inline void SetLoDLevel(const std::string& name, int32_t lod_level) { + virtual void SetLoDLevel(const std::string& name, int32_t lod_level) { PADDLE_ENFORCE_NOT_NULL(block_); block_->FindRecursiveOrCreateVar(name).SetLoDLevel(lod_level); } diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 494988608e..4ad7d847c1 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -377,12 +377,10 @@ class PyLayer { class PYBIND11_HIDDEN RuntimeInferVarTypeContext : public framework::InferVarTypeContext { public: - RuntimeInferVarTypeContext(imperative::OpBase* op, - const imperative::VarBasePtrMap* inputs, + RuntimeInferVarTypeContext(const imperative::VarBasePtrMap* inputs, imperative::VarBasePtrMap* outputs, const framework::AttributeMap* attrs_map) : InferVarTypeContext(nullptr, nullptr), - op_(op), inputs_(inputs), outputs_(outputs), attrs_(attrs_map), @@ -406,83 +404,86 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext } } - framework::Attribute GetAttr(const std::string& name) const { + virtual ~RuntimeInferVarTypeContext() {} + + framework::Attribute GetAttr(const std::string& name) const override { PADDLE_ENFORCE_NOT_NULL(attrs_); return attrs_->at(name); } - inline bool HasVar(const std::string& name) const { + bool HasVar(const std::string& name) const override { return var_set_.count(name) > 0; } - inline bool HasInput(const std::string& name) const { + bool HasInput(const std::string& name) const override { PADDLE_ENFORCE_NOT_NULL(inputs_); return inputs_->count(name) > 0; } - inline bool HasOutput(const std::string& name) const { + bool HasOutput(const std::string& name) const override { PADDLE_ENFORCE_NOT_NULL(outputs_); return outputs_->count(name) > 0; } - inline const std::vector& Input(const std::string& name) const { + const std::vector& Input( + const std::string& name) const override { return input_names_.at(name); } - inline const std::vector& Output(const std::string& name) const { + const std::vector& 
Output( + const std::string& name) const override { return output_names_.at(name); } - inline framework::proto::VarType::Type GetType( - const std::string& name) const { + framework::proto::VarType::Type GetType( + const std::string& name) const override { return var_set_.at(name)->DType(); } - inline void SetType(const std::string& name, - framework::proto::VarType::Type type) { + void SetType(const std::string& name, + framework::proto::VarType::Type type) override { var_set_[name]->SetDType(type); } - inline framework::proto::VarType::Type GetDataType( - const std::string& name) const { + framework::proto::VarType::Type GetDataType( + const std::string& name) const override { return var_set_.at(name)->DType(); } - inline void SetDataType(const std::string& name, - framework::proto::VarType::Type type) { + void SetDataType(const std::string& name, + framework::proto::VarType::Type type) override { var_set_[name]->SetDType(type); } - inline std::vector GetDataTypes( - const std::string& name) const { + std::vector GetDataTypes( + const std::string& name) const override { PADDLE_THROW("GetDataTypes is not supported in runtime InferVarType"); } - inline void SetDataTypes( - const std::string& name, - const std::vector& multiple_data_type) { + void SetDataTypes(const std::string& name, + const std::vector& + multiple_data_type) override { PADDLE_THROW("SetDataTypes is not supported in runtime InferVarType"); } - inline std::vector GetShape(const std::string& name) const { + std::vector GetShape(const std::string& name) const override { PADDLE_THROW("Do not handle Shape in runtime InferVarType"); } - inline void SetShape(const std::string& name, - const std::vector& dims) { + void SetShape(const std::string& name, + const std::vector& dims) override { PADDLE_THROW("Do not handle Shape in runtime InferVarType"); } - inline int32_t GetLoDLevel(const std::string& name) const { + int32_t GetLoDLevel(const std::string& name) const override { PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType"); } - inline void SetLoDLevel(const std::string& name, int32_t lod_level) { + void SetLoDLevel(const std::string& name, int32_t lod_level) override { PADDLE_THROW("Do not handle LoDLevel in runtime InferVarType"); } private: - imperative::OpBase* op_; const imperative::VarBasePtrMap* inputs_; imperative::VarBasePtrMap* outputs_; const framework::AttributeMap* attrs_; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7a07ec358d..166883bd6f 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -230,7 +230,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, outvars_name_map, attrs_map); if (info.infer_var_type_) { - RuntimeInferVarTypeContext infer_var_type_ctx(op, &inputs, &outputs, + RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, &outputs, &attrs_map); info.infer_var_type_(infer_var_type_ctx); } From c0ddb93ccc84be81cbbffd1f18c6b19ba154dc86 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 15 Mar 2019 21:06:16 +0800 Subject: [PATCH 55/73] Polish code test=develop --- paddle/fluid/framework/grad_op_desc_maker.h | 7 ++----- paddle/fluid/imperative/profiler.cc | 3 --- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index 46ebf4051f..3645cd4e43 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -55,14 +55,11 @@ class GradOpDescMakerBase { 
std::back_inserter(ret_val), [this](const std::string& fwd_var_name) -> std::string { auto g_name = GradVarName(fwd_var_name); - if (no_grad_set_.empty()) { + if (no_grad_set_.empty() || !no_grad_set_.count(g_name)) { (*this->grad_to_var_)[g_name] = fwd_var_name; return g_name; - } else if (no_grad_set_.count(g_name)) { - return kEmptyVarName; } else { - (*this->grad_to_var_)[g_name] = fwd_var_name; - return g_name; + return kEmptyVarName; } }); if (!drop_empty_grad) { diff --git a/paddle/fluid/imperative/profiler.cc b/paddle/fluid/imperative/profiler.cc index 828c36c5ae..34570b3a60 100644 --- a/paddle/fluid/imperative/profiler.cc +++ b/paddle/fluid/imperative/profiler.cc @@ -36,13 +36,11 @@ static bool gTracerProfilerStarted = false; #endif void StartProfile() { - LOG(ERROR) << "XX " << FLAGS_tracer_profile_fname; if (!FLAGS_tracer_profile_fname.empty()) { std::call_once(gTracerProfileOnce, [] { #ifdef WITH_GPERFTOOLS ProfilerStart(FLAGS_tracer_profile_fname.c_str()); gTracerProfilerStarted = true; - LOG(ERROR) << "YY"; #else LOG(WARNING) << "Paddle is not compiled with gperftools. " "FLAGS_tracer_profile_fname will be ignored"; @@ -52,7 +50,6 @@ void StartProfile() { } void StopProfile() { - LOG(ERROR) << "ZZ " << FLAGS_tracer_profile_fname; #ifdef WITH_GPERFTOOLS ProfilerFlush(); #else From 362253732c09499eb496d351f1fe7a7bcb1779ea Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 15 Mar 2019 21:14:36 +0800 Subject: [PATCH 56/73] Polish code test=develop --- paddle/fluid/framework/grad_op_desc_maker.h | 2 ++ paddle/fluid/imperative/tracer.cc | 3 +-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index 3645cd4e43..f2f4c53eea 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -14,7 +14,9 @@ limitations under the License. 
*/ #pragma once #include +#include #include +#include #include #include #include "paddle/fluid/framework/op_desc.h" diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7773a3f8fc..3a6427aa91 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -131,8 +131,7 @@ framework::VariableNameMap CreateOutputVarNameMap( return result; } -Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { -} +Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) {} std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, const VarBasePtrMap& outputs, From db0c9708239b46d9461ef21cf5dce1d2b9c9cbfe Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 15 Mar 2019 21:21:04 +0800 Subject: [PATCH 57/73] Polish code test=develop --- paddle/fluid/operators/distributed_ops/fake_init_op.cc | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc b/paddle/fluid/operators/distributed_ops/fake_init_op.cc index 28ebdcb03e..89228c7243 100644 --- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc +++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc @@ -56,8 +56,7 @@ class FakeInitOp : public framework::OperatorBase { class FakeInitOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override {} + void operator()(framework::InferVarTypeContext &ctx) const override {} }; class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker { From 86e912c544937b02abcc80e856a3ac2d62ca22e6 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Sat, 16 Mar 2019 22:43:30 +0800 Subject: [PATCH 58/73] Fix windows compiling (#16230) test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 4 +++- paddle/fluid/operators/CMakeLists.txt | 6 ++++-- paddle/fluid/platform/device_context.cc | 2 ++ paddle/fluid/platform/device_context.h | 4 ++++ 4 files changed, 13 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index faf7768a7b..bfab221a9e 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -102,7 +102,9 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) -cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass) +if(NOT WIN32) + cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass) +endif() cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) if (WITH_MKLDNN) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 2f8e0b3a30..651c5e6e75 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -58,8 +58,10 @@ if (WITH_GPU) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() - op_library(sync_batch_norm_op) - 
file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") + if (NOT WIN32) + op_library(sync_batch_norm_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") + endif() else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index ada9a19736..d54a3e8670 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -316,7 +316,9 @@ CUDADeviceContext::~CUDADeviceContext() { eigen_stream_.reset(); eigen_device_.reset(); PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +#if !defined(_WIN32) PADDLE_ENFORCE(dynload::ncclCommDestroy(nccl_comm_)); +#endif } Place CUDADeviceContext::GetPlace() const { return place_; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 3f7ce3d944..1eb8d9691a 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -265,11 +265,13 @@ class CUDADeviceContext : public DeviceContext { /*! \brief Return cuda stream in the device context. */ cudaStream_t stream() const; +#if !defined(_WIN32) /*! \brief Return nccl communicators. */ ncclComm_t nccl_comm() const { return nccl_comm_; } /*! \brief Set nccl communicators. */ void set_nccl_comm(ncclComm_t comm) { nccl_comm_ = comm; } +#endif template void RecordEvent(cudaEvent_t ev, Callback callback) { @@ -295,12 +297,14 @@ class CUDADeviceContext : public DeviceContext { std::unique_ptr cublas_handle_; std::unique_ptr cublas_tensor_core_handle_; +#if !defined(_WIN32) // NCCL communicator (single process version) for NCCL collective operations. // NCCL collective operations provides fast collectives over multiple GPUs // both within and across nodes. // But, this collectives is used for collectives over multiple GPUs within // nodes. 
ncclComm_t nccl_comm_{nullptr}; +#endif int compute_capability_; int runtime_version_; From efca4de78ebb38e1e249a8dae005f48c34f7384d Mon Sep 17 00:00:00 2001 From: chengduo Date: Sun, 17 Mar 2019 10:10:39 -0500 Subject: [PATCH 59/73] Fix cross_entropy bug (#16236) test=develop --- paddle/fluid/operators/softmax_with_cross_entropy_op.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 52b8dcc681..89aaac4cbe 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -439,7 +439,8 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { context.Input(framework::GradVarName("Loss"))->data(); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - logit_grad->ShareDataWith(*context.Input("Softmax")); + framework::TensorCopy(*context.Input("Softmax"), context.GetPlace(), + context.device_context(), logit_grad); T* logit_grad_data = logit_grad->data(); const int batch_size = logit_grad->dims()[0]; From 2579ade45fb0d2d698ee4ee0a8f6f4162f9ddb5f Mon Sep 17 00:00:00 2001 From: Wojciech Uss Date: Mon, 18 Mar 2019 04:28:11 +0100 Subject: [PATCH 60/73] Add cpu_quantize_pass for C-API quantization (#16127) * Add cpu_quantize_pass for C-API quantization test=develop * add cpu_quantize_pass test * fix lint: add include memory unordered_map and unordered_set test=develop * fuse_relu 1 test=develop * tuned 2 without squash * fixes test=develop * remove unused vars test=develop * refactored test=develop * fix lint c-style cast -> C++ style cast test=develop * remove QuantMax and c style casts test=develop * last usage of QuantMax removed test=develop * Fix Analysis Predictor UT: Check if memory_optimize_pass has already been added to the analysis config before adding a new one, so that it is not added multiple times.
test=develop * change map to unordered_map fix the forgotten part of cpu_quantize_pass_tester.cc test=develop * removed quantized attribute * fixed cpu_quantize_pass_tester and op attr comments test=develop * removed redundant line test=debug * removed gmock test=develop * fix after merge --- paddle/fluid/framework/ir/CMakeLists.txt | 4 +- .../fluid/framework/ir/cpu_quantize_pass.cc | 239 ++++++++++++++++++ paddle/fluid/framework/ir/cpu_quantize_pass.h | 66 +++++ .../framework/ir/cpu_quantize_pass_tester.cc | 211 ++++++++++++++++ .../framework/ir/graph_pattern_detector.cc | 51 +++- .../framework/ir/graph_pattern_detector.h | 29 +++ paddle/fluid/inference/analysis/argument.h | 6 + .../inference/analysis/ir_pass_manager.cc | 11 +- paddle/fluid/inference/api/analysis_config.cc | 9 +- paddle/fluid/operators/conv_op.cc | 7 + .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 1 + paddle/fluid/operators/pool_op.cc | 7 + 12 files changed, 631 insertions(+), 10 deletions(-) create mode 100644 paddle/fluid/framework/ir/cpu_quantize_pass.cc create mode 100644 paddle/fluid/framework/ir/cpu_quantize_pass.h create mode 100644 paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index bfab221a9e..3808dd5fba 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -46,6 +46,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(lock_free_optimize_pass base) +pass_library(cpu_quantize_pass inference) pass_library(cpu_quantize_squash_pass inference) pass_library(fc_fuse_pass inference) pass_library(attention_lstm_fuse_pass inference) @@ -102,10 +103,11 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) +cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) +cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) if(NOT WIN32) cc_test(test_sync_batch_norm_pass SRCS sync_batch_norm_pass_tester.cc DEPS sync_batch_norm_pass) endif() -cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) if (WITH_MKLDNN) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor) diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_pass.cc new file mode 100644 index 0000000000..edfaf47f01 --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_pass.cc @@ -0,0 +1,239 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/cpu_quantize_pass.h" +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace { + +void UnlinkNodes(ir::Node* a, ir::Node* b) { + a->outputs.erase(std::remove(a->outputs.begin(), a->outputs.end(), b), + a->outputs.end()); + b->inputs.erase(std::remove(b->inputs.begin(), b->inputs.end(), a), + b->inputs.end()); +} + +} // namespace + +enum { U8_MAX = 255, S8_MAX = 127 }; + +using EigenVectorArrayMap = Eigen::Map>; +using string::PrettyLogDetail; + +void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, + std::string input_name, double scale_to_one, + bool is_unsigned, + std::string scale_attr_name) const { + unsigned max = is_unsigned ? U8_MAX : S8_MAX; + float scale = scale_to_one * max; + + // Create quantize output variable + VarDesc quantize_out_desc(patterns::PDNodeName("quantize", "out")); + auto* quantize_out_node = g->CreateVarNode(&quantize_out_desc); + + // create a quantize op node + OpDesc q_desc; + q_desc.SetType("quantize"); + q_desc.SetInput("Input", std::vector({input->Name()})); + q_desc.SetOutput("Output", + std::vector({quantize_out_node->Name()})); + q_desc.SetAttr("Scale", scale); + q_desc.SetAttr("is_negative_input", !is_unsigned); + auto quantize_op = g->CreateOpNode(&q_desc); // OpDesc will be copied. + + // update op's input + op->Op()->SetInput(input_name, + std::vector({quantize_out_node->Name()})); + + // link quantize op + UnlinkNodes(input, op); + IR_NODE_LINK_TO(input, quantize_op); + IR_NODE_LINK_TO(quantize_op, quantize_out_node); + IR_NODE_LINK_TO(quantize_out_node, op); + + if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); +} + +void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, + std::string output_name, + double scale_to_one, bool is_unsigned, + std::string scale_attr_name) const { + unsigned max = is_unsigned ? U8_MAX : S8_MAX; + float scale = scale_to_one * max; + + // Create dequantize input variable + VarDesc dequantize_in_desc(patterns::PDNodeName("dequantize", "in")); + auto* dequantize_in_node = g->CreateVarNode(&dequantize_in_desc); + + // create a dequantize op node for output. + OpDesc deq_desc; + deq_desc.SetType("dequantize"); + deq_desc.SetInput("Input", + std::vector({dequantize_in_node->Name()})); + deq_desc.SetOutput("Output", std::vector({output->Name()})); + deq_desc.SetAttr("Scale", scale); + auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
+ + // update op's output + op->Op()->SetOutput(output_name, + std::vector({dequantize_in_node->Name()})); + + // link dequantize op + UnlinkNodes(op, output); + IR_NODE_LINK_TO(op, dequantize_in_node); + IR_NODE_LINK_TO(dequantize_in_node, dequantize_op); + IR_NODE_LINK_TO(dequantize_op, output); + + if (!scale_attr_name.empty()) op->Op()->SetAttr(scale_attr_name, scale); +} + +void CPUQuantizePass::QuantizeConv(Graph* graph, + bool with_residual_data) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::ConvResidual conv_pattern{pattern, name_scope_}; + conv_pattern(with_residual_data); + + int quantize_conv_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize conv2d op"; + GET_IR_NODE_FROM_SUBGRAPH(conv_op, conv_op, conv_pattern); + auto* conv_op_desc = conv_op->Op(); + + // skip if should not be quantized + if (!conv_op_desc->HasAttr("use_quantizer") || + !boost::get(conv_op_desc->GetAttr("use_quantizer"))) + return; + + GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(conv_output, conv_output, conv_pattern); + + // get scales calculated after warmup, they scale variables to MAX=1.0 + auto scales = Get("quant_var_scales"); + + auto input_scale = scales[conv_input->Name()].second.data()[0]; + bool is_input_unsigned = scales[conv_input->Name()].first; + QuantizeInput(g, conv_op, conv_input, "Input", input_scale, + is_input_unsigned, "Scale_in"); + + auto filter_scale_tensor = scales[conv_filter->Name()].second; + EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data(), + filter_scale_tensor.numel(), 1}; + eigen_tensor *= static_cast(S8_MAX); + std::vector filter_scale{ + filter_scale_tensor.data(), + filter_scale_tensor.data() + filter_scale_tensor.numel()}; + + conv_op->Op()->SetAttr("Scale_weights", filter_scale); + + if (with_residual_data) { + GET_IR_NODE_FROM_SUBGRAPH(conv_residual_data, conv_residual_data, + conv_pattern); + auto residual_scale = + scales[conv_residual_data->Name()].second.data()[0]; + bool is_residual_unsigned = scales[conv_residual_data->Name()].first; + + QuantizeInput(g, conv_op, conv_residual_data, "ResidualData", + residual_scale, is_residual_unsigned, "Scale_in_eltwise"); + } + + auto output_scale = scales[conv_output->Name()].second.data()[0]; + bool is_output_unsigned = scales[conv_output->Name()].first; + DequantizeOutput(g, conv_op, conv_output, "Output", output_scale, + is_output_unsigned, "Scale_out"); + + ++quantize_conv_count; + }; + + gpd(graph, handler); + AddStatis(quantize_conv_count); + + std::stringstream msg_ss; + msg_ss << "--- quantized " << quantize_conv_count << " conv2d ops"; + if (with_residual_data) msg_ss << " with residual connection"; + PrettyLogDetail(msg_ss.str().c_str()); +} + +void CPUQuantizePass::QuantizePool(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::Pool pool_pattern{pattern, name_scope_}; + pool_pattern(); + + int quantize_pool_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize pool2d op"; + GET_IR_NODE_FROM_SUBGRAPH(pool_op, pool_op, pool_pattern); + auto* pool_op_desc = pool_op->Op(); + + // skip if should not be quantized + if (!pool_op_desc->HasAttr("use_quantizer") || + !boost::get(pool_op_desc->GetAttr("use_quantizer"))) + return; + + GET_IR_NODE_FROM_SUBGRAPH(pool_input, 
pool_input, pool_pattern); + GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern); + + // get scales calculated after warmup, they scale variables to MAX=1.0 + auto scales = Get("quant_var_scales"); + + auto input_scale = scales[pool_input->Name()].second.data()[0]; + bool is_input_unsigned = scales[pool_input->Name()].first; + QuantizeInput(g, pool_op, pool_input, "X", input_scale, is_input_unsigned); + + auto output_scale = scales[pool_output->Name()].second.data()[0]; + bool is_output_unsigned = scales[pool_output->Name()].first; + DequantizeOutput(g, pool_op, pool_output, "Out", output_scale, + is_output_unsigned); + + ++quantize_pool_count; + }; + + gpd(graph, handler); + AddStatis(quantize_pool_count); + + PrettyLogDetail("--- quantized %d pool2d ops", quantize_pool_count); +} + +std::unique_ptr CPUQuantizePass::ApplyImpl( + std::unique_ptr graph) const { + VLOG(3) << "Quantizing the graph."; + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init(name_scope_, graph.get()); + + PADDLE_ENFORCE(param_scope()); + + QuantizeConv(graph.get(), true /* with_residual_data */); + QuantizeConv(graph.get()); + QuantizePool(graph.get()); + + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_quantize_pass, paddle::framework::ir::CPUQuantizePass) + .RequirePassAttr("quant_var_scales"); diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass.h b/paddle/fluid/framework/ir/cpu_quantize_pass.h new file mode 100644 index 0000000000..9873bb04e1 --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_pass.h @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Map variable name to tensor of scaling factors scaling it to MAX=1.0. + * bool denotes whether quantization of the variable should be done to unsigned + * type. + */ +using VarQuantScale = + std::unordered_map>; + +/* + * Quantize all supported operators. 
+ */ +class CPUQuantizePass : public FusePassBase { + public: + virtual ~CPUQuantizePass() {} + + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + void QuantizeConv(Graph* graph, bool with_residual_data = false) const; + + void QuantizePool(Graph* graph) const; + + void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, + double scale_to_one, bool is_unsigned, + std::string scale_attr_name = "") const; + + void DequantizeOutput(Graph* g, Node* op, Node* output, + std::string output_name, double scale_to_one, + bool is_unsigned, + std::string scale_attr_name = "") const; + + const std::string name_scope_{"quantize"}; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc new file mode 100644 index 0000000000..89601be7d1 --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_pass_tester.cc @@ -0,0 +1,211 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/cpu_quantize_pass.h" +#include +#include "paddle/fluid/framework/naive_executor.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, bool use_mkldnn, + bool use_quantizer = false) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + op->SetAttr("use_mkldnn", use_mkldnn); + op->SetAttr("name", name); + if (type == "conv2d") { + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + if (inputs.size() > 2) + op->SetInput("Bias", {inputs[2]}); + else + op->SetInput("Bias", {}); + if (inputs.size() > 3) { + op->SetInput("ResidualData", {inputs[3]}); + op->SetAttr("fuse_residual_connection", true); + } else { + op->SetInput("ResidualData", {}); + op->SetAttr("fuse_residual_connection", false); + } + op->SetOutput("Output", {outputs[0]}); + op->SetAttr("use_quantizer", use_quantizer); + op->SetAttr("Scale_in", 1.0f); + op->SetAttr("Scale_out", 1.0f); + op->SetAttr("Scale_weights", std::vector{1.0f}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + op->SetAttr("use_quantizer", use_quantizer); + } else if (type == "dropout") { + op->SetInput("X", {inputs[0]}); + op->SetOutput("Out", {outputs[0]}); + } else if (type == "fc") { + op->SetInput("Input", {inputs[0]}); + if (inputs.size() > 1) op->SetInput("W", {inputs[1]}); + if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); + op->SetOutput("Out", {outputs[0]}); + } +} + +static const std::initializer_list variable_names{ + "a", "w1", "c", "d", "w2", "e", "f", "g", + "h", "w3", "b1", "i", "j", "w4", "b2"}; +// (a,w1)->Conv1->c and c->Pool1->d +// +// (d,w2)->Conv2->e and e->Pool2->f +// +// 
d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j +// +// (d,w4, b2)->Conv4->i +ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) { + ProgramDesc prog; + for (auto& v : variable_names) { + auto* var = prog.MutableBlock(0)->Var(v); + if (v.find("w") == 0 || v.find("b") == 0) { + var->SetPersistable(true); + } + } + + SetOp(&prog, "conv2d", "Conv1", {"a", "w1"}, {"c"}, use_mkldnn, + use_quantizer); + SetOp(&prog, "pool2d", "Pool1", {"c"}, {"d"}, use_mkldnn, use_quantizer); + + SetOp(&prog, "conv2d", "Conv2", {"d", "w2"}, {"e"}, use_mkldnn, + use_quantizer); + SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer); + + SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn); + SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn); + SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn, + use_quantizer); + + SetOp(&prog, "conv2d", "Conv4", {"c", "w4", "b2"}, {"i"}, use_mkldnn, + use_quantizer); + + return prog; +} + +void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, + const char* var_name) { + auto x = scope->Var(var_name); + auto tensor = x->GetMutable(); + tensor->mutable_data(place, proto::VarType::FP32, + ::paddle::memory::Allocator::kDefault, 1); +} + +void MainTest(const ProgramDesc& prog, int conv_count, int pool_count, + int quant_count, int dequant_count, int added_nodes_count, + float scale) { + std::unique_ptr graph(new ir::Graph(prog)); + + // Init scope, as it is used in pass + auto place = paddle::platform::CPUPlace(); + NaiveExecutor exe{place}; + Scope scope; + exe.CreateVariables(prog, 0, true, &scope); + + auto* scales = new VarQuantScale(); + + for (auto& v : variable_names) { + InitTensorHolder(&scope, place, v.c_str()); + LoDTensor tensor; + tensor.Resize({1}); + auto* ptr = tensor.mutable_data(place); + ptr[0] = 2.0; + + (*scales)[v] = std::make_pair(false, std::move(tensor)); + } + + graph->Set(kParamScopeAttr, new framework::Scope*(&scope)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_pass"); + pass->Set("quant_var_scales", scales); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + int quantize_nodes_count = 0; + int dequantize_nodes_count = 0; + int conv2d_nodes_count = 0; + int pool2d_nodes_count = 0; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->Type() == "conv2d") { + conv2d_nodes_count++; + auto op_name = boost::get(op->GetAttr("name")); + EXPECT_EQ(boost::get(op->GetAttr("Scale_in")), scale) + << "Scale_in for node '" + op_name + "'."; + EXPECT_EQ(boost::get(op->GetAttr("Scale_out")), scale) + << "Scale_out for node '" + op_name + "'."; + EXPECT_EQ( + boost::get>(op->GetAttr("Scale_weights"))[0], + scale) + << "Scale_weights for node '" + op_name + "'."; + } else if (op->Type() == "pool2d") { + pool2d_nodes_count++; + } else if (op->Type() == "quantize") { + quantize_nodes_count++; + } else if (op->Type() == "dequantize") { + dequantize_nodes_count++; + } + } + } + EXPECT_EQ(conv2d_nodes_count, conv_count); + EXPECT_EQ(pool2d_nodes_count, pool_count); + EXPECT_EQ(quantize_nodes_count, quant_count); + EXPECT_EQ(dequantize_nodes_count, dequant_count); + EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num); +} + +TEST(CpuQuantizePass, quantize) { + bool use_mkldnn = true; + bool use_quantizer = true; + // (a->QUANT1->IN1,w1)->Conv1->OUT1->DEQUANT1->c and + // c->QUANT2->IN2->Pool1->OUT2->DEQUANT2->d + // 
+ // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and + // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f + // + // d->Dropout1->g and g->Fc1->h and + // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j + // + // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i + // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT + int added_nodes = 7 + 7 + 6 + 6; + MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes, + 2.0f * 127); +} + +TEST(CpuQuantizePass, do_not_quantize) { + bool use_mkldnn = true; + bool use_quantizer = false; + int added_nodes = 0; + MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 0, 0, added_nodes, + 1.0f); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_quantize_pass); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 08354b526a..b653e5a521 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -90,7 +90,8 @@ void GraphPatternDetector::operator()(Graph *graph, ValidateByNodeRole(&subgraphs); if (subgraphs.empty()) return; - PrettyLogEndl(Style::detail(), "--- detect %d subgraphs", subgraphs.size()); + PrettyLogEndl(Style::detail(), "--- detected %d subgraphs", + subgraphs.size()); int id = 0; for (auto &g : subgraphs) { VLOG(3) << "optimizing #" << id++ << " subgraph"; @@ -1074,9 +1075,53 @@ PDNode *patterns::Conv::operator()() { ->AsOutput() ->assert_is_op_output("conv2d", "Output"); - conv_op->LinksFrom({input_var, filter_var}); - conv_op->LinksTo({output_var}); + conv_op->LinksFrom({input_var, filter_var}).LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::ConvResidual::operator()(bool with_residual_data) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + + if (!with_residual_data) + conv_op->assert_op_attr("fuse_residual_connection", false); + + auto input_var = pattern->NewNode(conv_input_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + + auto filter_var = pattern->NewNode(conv_filter_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "Filter"); + + auto output_var = pattern->NewNode(conv_output_repr()) + ->AsOutput() + ->assert_is_op_output("conv2d", "Output"); + + std::vector links_from{input_var, filter_var}; + + if (with_residual_data) { + auto res_conn_var = pattern->NewNode(conv_residual_data_repr()) + ->AsInput() + ->assert_is_op_input("conv2d", "ResidualData"); + links_from.push_back(res_conn_var); + } + + conv_op->LinksFrom(links_from).LinksTo({output_var}); + return output_var; +} + +PDNode *patterns::Pool::operator()() { + auto pool_op = pattern->NewNode(pool_op_repr())->assert_is_op("pool2d"); + + auto input_var = pattern->NewNode(pool_input_repr()) + ->AsInput() + ->assert_is_op_input("pool2d", "X"); + + auto output_var = pattern->NewNode(pool_output_repr()) + ->AsOutput() + ->assert_is_op_output("pool2d", "Out"); + pool_op->LinksFrom({input_var}).LinksTo({output_var}); return output_var; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 3db4bba10d..fc30b5b21c 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -659,6 +659,35 @@ struct Conv : public PatternBase { PATTERN_DECL_NODE(conv_output); }; +// Convolution op with residual data +struct ConvResidual : public PatternBase { + ConvResidual(PDPattern* pattern, const 
std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_residual") {} + + PDNode* operator()(bool with_residual_data); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_input); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_residual_data); + PATTERN_DECL_NODE(conv_output); +}; + +// Pool op +// Forward pass for pooling. +// pool_input is the input. +// pool_output is a result of the operator. +struct Pool : public PatternBase { + Pool(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "pooling") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(pool_op); + PATTERN_DECL_NODE(pool_input); + PATTERN_DECL_NODE(pool_output); +}; + // ElementwiseAdd used in residual connections. // y_var is used and convolution output. // The operator is removed, when residual diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 89e934ae27..321deccf86 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -38,7 +39,10 @@ namespace paddle { namespace inference { namespace analysis { + using framework::ir::Graph; +using VarQuantScale = + std::unordered_map>; /* * The argument definition of both Pass and PassManagers. @@ -127,6 +131,8 @@ struct Argument { // Pass a set of op types to enable its mkldnn kernel DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, std::unordered_set); + // Scales for variables to be quantized + DECL_ARGUMENT_FIELD(quant_var_scales, QuantVarScales, VarQuantScale); // Passed from config. DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 1cdb4881fb..8fd86b2cc5 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/analysis/ir_pass_manager.h" #include +#include #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/graph.h" @@ -55,14 +56,14 @@ void IRPassManager::CreatePasses(Argument *argument, ".dot"; pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass_num++; - } - if (pass_name == "mkldnn_placement_pass") { + } else if (pass_name == "mkldnn_placement_pass") { pass->Set("mkldnn_enabled_op_types", new std::unordered_set( argument->mkldnn_enabled_op_types())); - } - - if (pass_name == "tensorrt_subgraph_pass") { + } else if (pass_name == "cpu_quantize_pass") { + pass->Set("quant_var_scales", + new VarQuantScale(argument->quant_var_scales())); + } else if (pass_name == "tensorrt_subgraph_pass") { pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); pass->Set("max_batch_size", new int(argument->tensorrt_max_batch_size())); pass->Set("min_subgraph_size", diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 7741111222..92526f4e74 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -219,7 +219,14 @@ void AnalysisConfig::Update() { } if (enable_memory_optim_) { - pass_builder()->AppendAnalysisPass("memory_optimize_pass"); + auto analysis_passes = pass_builder()->AnalysisPasses(); + auto memory_opti_pass_name = "memory_optimize_pass"; + bool already_exists = + 
std::find(analysis_passes.begin(), analysis_passes.end(), + memory_opti_pass_name) != analysis_passes.end(); + if (!already_exists) { + pass_builder()->AppendAnalysisPass(memory_opti_pass_name); + } } if (ir_debug_) { diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index ca6bc4df0f..c6121d00da 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" +#include #include #include @@ -194,6 +195,12 @@ void Conv2DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. " + "Only used on CPU.") + .SetDefault(false); AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); AddAttr("fuse_residual_connection", diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 14ca3e8073..8d96ae7e42 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -592,6 +592,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { platform::SetDstMemoryHandler(ctx, output, handler, &dst_memory_p); } else { + need_s8_to_u8 = fuse_relu; platform::SetDstMemoryHandler(ctx, output, handler, &dst_memory_p); } diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 0a0ece162c..7963c27a01 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" +#include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" #endif @@ -212,6 +213,12 @@ void Pool2dOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. 
" + "Only used on CPU.") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " From cc0ae1f1a17adc687499e0ee03f40374722869d5 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 18 Mar 2019 13:00:23 +0800 Subject: [PATCH 61/73] refine with comments test=develop --- .../fluid/framework/ir/runtime_context_cache_pass.cc | 2 +- paddle/fluid/framework/operator.cc | 2 +- paddle/fluid/framework/operator.h | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc index 75f3795185..67b29512c4 100644 --- a/paddle/fluid/framework/ir/runtime_context_cache_pass.cc +++ b/paddle/fluid/framework/ir/runtime_context_cache_pass.cc @@ -25,7 +25,7 @@ std::unique_ptr RuntimeContextCachePass::ApplyImpl( VLOG(3) << "Applies Runtime Context Cache strategy."; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { - n->Op()->SetAttr(kEnableRuntimeContext, true); + n->Op()->SetAttr(kEnableCacheRuntimeContext, true); } } return graph; diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index ac1ad2b05e..ab96201b33 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -876,7 +876,7 @@ std::vector* OperatorWithKernel::GetKernelConfig( RuntimeContext* OperatorWithKernel::GetRuntimeContext( const Scope& scope) const { - if (!HasAttr(kEnableRuntimeContext)) { + if (!HasAttr(kEnableCacheRuntimeContext)) { return new RuntimeContext(Inputs(), Outputs(), scope); } else { const Scope* cur_scope = &scope; diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index f0592f4f5f..ca5f0e27b3 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -63,12 +63,12 @@ constexpr char kZeroVarSuffix[] = "@ZERO"; constexpr char kNewGradSuffix[] = "@NEWGRAD@"; /// RuntimeContext is used to relate input/output names of Operator with -/// the corresponding variables in Scope. -/// If an Op has attribute kEnableRuntimeContext, it means that in a same Scope, -/// since the input/output names of this Op do not change in the execution, -/// RuntimeContext could be created only at the first iteration of this Op's -/// execution to save the elapsed time. -constexpr char kEnableRuntimeContext[] = "@ENABLE_RUNTIME_CONTEXT@"; +/// the corresponding variables in name scope. +/// If an Op has attribute kEnableCacheRuntimeContext, it means that in a same +/// name scope, since the input/output names of this Op do not change in the +/// execution, RuntimeContext could be created only at the first iteration of +/// this Op's execution to save the elapsed time. +constexpr char kEnableCacheRuntimeContext[] = "@ENABLE_CACHE_RUNTIME_CONTEXT@"; /// If an Op has this attribute, all its kernels should calculate output /// variable's shape in the corresponding Compute() function. 
And From d9f0e7252a6f9828fdd224bc6089e5c29e987107 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Mon, 18 Mar 2019 13:42:43 +0800 Subject: [PATCH 62/73] refine with comments test=develop --- paddle/fluid/operators/fc_op.cc | 11 ++++------- .../operators/fused/fused_embedding_seq_pool_op.cc | 7 ++----- paddle/fluid/operators/hash_op.cc | 7 ++----- paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc | 4 ++-- .../operators/sequence_ops/sequence_enumerate_op.cc | 7 ++----- 5 files changed, 12 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 033eca967a..242f5390b8 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -119,11 +119,8 @@ void FCOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); - AddAttr( - framework::kAllKernelsMustComputeRuntimeShape, - "If an Op has this attribute, all its kernels should calculate output" - "variable's shape in the corresponding Compute() function. Note that " - "this temporal attribute would be deleted after all ops contain it.") + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") .SetDefault(true); AddComment(R"DOC( Fully Connected Operator. @@ -140,8 +137,8 @@ class FCOpKernel : public framework::OpKernel { PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); auto input = ctx.Input("Input"); - auto w = ctx.Input("W"); - auto bias = ctx.Input("Bias"); + auto w = ctx.Input("W"); + auto bias = ctx.Input("Bias"); auto output = ctx.Output("Out"); int in_num_col_dims = ctx.Attr("in_num_col_dims"); auto w_dims = w->dims(); diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index 40a411985c..ecb8918499 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -88,11 +88,8 @@ class FusedEmbeddingSeqPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(boolean, default false) " "Sparse update.") .SetDefault(false); - AddAttr( - framework::kAllKernelsMustComputeRuntimeShape, - "If an Op has this attribute, all its kernels should calculate output" - "variable's shape in the corresponding Compute() function. Note that " - "this temporal attribute would be deleted after all ops contain it.") + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") .SetDefault(true); AddComment(R"DOC( FusedEmbeddingSeqPool Operator. diff --git a/paddle/fluid/operators/hash_op.cc b/paddle/fluid/operators/hash_op.cc index 4deee8b433..82222d0a7e 100644 --- a/paddle/fluid/operators/hash_op.cc +++ b/paddle/fluid/operators/hash_op.cc @@ -54,11 +54,8 @@ $$Out = scale * X$$ )DOC"); AddAttr("num_hash", "").SetDefault(1); AddAttr("mod_by", "").SetDefault(100000); - AddAttr( - framework::kAllKernelsMustComputeRuntimeShape, - "If an Op has this attribute, all its kernels should calculate output" - "variable's shape in the corresponding Compute() function. 
Note that " - "this temporal attribute would be deleted after all ops contain it.") + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") .SetDefault(true); } }; diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 2bdf146f4d..69c0486eb6 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -124,8 +124,8 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel { const auto& mkldnn_engine = dev_ctx.GetEngine(); auto input = ctx.Input("Input"); - auto w = ctx.Input("W"); - auto bias = ctx.Input("Bias"); + auto w = ctx.Input("W"); + auto bias = ctx.Input("Bias"); PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4, "Input must be with 2 or 4 dimensions, i.e. NCHW"); diff --git a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc index 75bcd3c47f..f5d6060bc3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_enumerate_op.cc @@ -59,11 +59,8 @@ class SequenceEnumerateOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("pad_value", "(int) The enumerate sequence padding value.") .SetDefault(0); - AddAttr( - framework::kAllKernelsMustComputeRuntimeShape, - "If an Op has this attribute, all its kernels should calculate output" - "variable's shape in the corresponding Compute() function. Note that " - "this temporal attribute would be deleted after all ops contain it.") + AddAttr(framework::kAllKernelsMustComputeRuntimeShape, + "Skip calling InferShape() function in the runtime.") .SetDefault(true); AddComment(R"DOC( Sequence Enumerate Operator. From 3e9319f3ab25390e7ab74533a59aa2a45cd19471 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Mon, 18 Mar 2019 14:21:29 +0800 Subject: [PATCH 63/73] add more imperative layer tests. 
test=develop --- paddle/fluid/imperative/layer.cc | 6 +-- paddle/fluid/imperative/tracer.cc | 9 ++-- python/paddle/fluid/layers/nn.py | 5 ++ .../fluid/tests/unittests/test_layers.py | 54 +++++++++++++++++++ 4 files changed, 66 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 5530823b90..33064b8302 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -214,10 +214,8 @@ framework::LoDTensor& VarBase::GradValue() { } std::map> OpBase::ApplyGrad() { - if (grad_op_descs_.empty() && backward_id_ <= 0) { - VLOG(3) << "op with no grad: " << Type(); - return {}; - } + PADDLE_ENFORCE(!grad_op_descs_.empty() || backward_id_ > 0, + "%s has no backward implementation", Type()); VLOG(3) << "apply op grad: " << Type(); std::vector tmp_grad_outputs; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 7ee92b4d8c..77d6394510 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -46,11 +46,12 @@ void CreateGradOp(const framework::OpDesc& op_desc, std::vector* grad_op_descs, std::unordered_map* grad_to_var) { PADDLE_ENFORCE(grad_op_descs->empty()); - std::vector> descs = - framework::OpInfoMap::Instance() - .Get(op_desc.Type()) - .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + const framework::OpInfo& op_info = + framework::OpInfoMap::Instance().Get(op_desc.Type()); + if (!op_info.grad_op_maker_) return; + std::vector> descs = + op_info.GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); for (auto& desc : descs) { grad_op_descs->emplace_back(desc.release()); } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 270cacecf2..dbe495b75c 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -24,6 +24,7 @@ import inspect from ..layer_helper import LayerHelper from ..initializer import Normal, Constant, NumpyArrayInitializer from ..framework import Variable, OpProtoHolder, _in_imperative_mode +from ..imperative import base from ..param_attr import ParamAttr from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ from .tensor import concat, assign @@ -9138,6 +9139,10 @@ def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) y = helper.kwargs.get('y', None) + if _in_imperative_mode(): + x = base.to_variable(x) + y = base.to_variable(y) + assert x is not None, 'x cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type) axis = helper.kwargs.get('axis', -1) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 42ec845b42..885ee170e8 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -174,6 +174,60 @@ class TestLayer(LayerTest): self.assertTrue(np.allclose(static_ret[i], static_ret2[i])) self.assertTrue(np.allclose(static_ret[i], dy_ret[i]._numpy())) + def test_elementwise_math(self): + n = np.ones([3, 3], dtype='float32') + n2 = np.ones([3, 3], dtype='float32') * 1.1 + n3 = np.ones([3, 3], dtype='float32') * 2 + n4 = np.ones([3, 3], dtype='float32') * 3 + n5 = np.ones([3, 3], dtype='float32') * 4 + n6 = np.ones([3, 3], dtype='float32') * 5 + + with self.static_graph(): + t = layers.data(name='t', shape=[3, 3], dtype='float32') + t2 = layers.data(name='t2', shape=[3, 3], dtype='float32') + t3 = 
layers.data(name='t3', shape=[3, 3], dtype='float32') + t4 = layers.data(name='t4', shape=[3, 3], dtype='float32') + t5 = layers.data(name='t5', shape=[3, 3], dtype='float32') + t6 = layers.data(name='t6', shape=[3, 3], dtype='float32') + + ret = layers.elementwise_add(t, t2) + ret = layers.elementwise_pow(ret, t3) + ret = layers.elementwise_div(ret, t4) + ret = layers.elementwise_sub(ret, t5) + ret = layers.elementwise_mul(ret, t6) + + static_ret = self.get_static_graph_result( + feed={ + 't': n, + 't2': n2, + 't3': n3, + 't4': n4, + 't5': n5, + 't6': n6 + }, + fetch_list=[ret])[0] + + with self.dynamic_graph(): + ret = layers.elementwise_add(n, n2) + ret = layers.elementwise_pow(ret, n3) + ret = layers.elementwise_div(ret, n4) + ret = layers.elementwise_sub(ret, n5) + dy_ret = layers.elementwise_mul(ret, n6) + self.assertTrue( + np.allclose(static_ret, dy_ret._numpy()), + '%s vs %s' % (static_ret, dy_ret._numpy())) + + def test_elementwise_minmax(self): + n = np.ones([3, 3], dtype='float32') + n2 = np.ones([3, 3], dtype='float32') * 2 + + with self.dynamic_graph(): + min_ret = layers.elementwise_min(n, n2) + max_ret = layers.elementwise_max(n, n2) + + self.assertTrue(np.allclose(n, min_ret._numpy())) + self.assertTrue(np.allclose(n2, max_ret._numpy())) + class TestBook(unittest.TestCase): def test_fit_a_line(self): From 36dce65bb389f3251752519e54913c24b933fccc Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 18 Mar 2019 15:54:35 +0800 Subject: [PATCH 64/73] Take DataType and VarType apart test=develop --- paddle/fluid/framework/details/op_registry.h | 4 +- paddle/fluid/framework/ir/graph_test.cc | 2 +- paddle/fluid/framework/op_desc.cc | 2 +- paddle/fluid/framework/type_defs.h | 2 +- .../framework/var_type_inference_test.cc | 2 +- paddle/fluid/imperative/layer.cc | 26 ++++++------ paddle/fluid/imperative/layer.h | 42 +++++++++++++------ paddle/fluid/imperative/tracer.cc | 3 +- paddle/fluid/operators/sum_op.cc | 4 +- paddle/fluid/pybind/pybind.cc | 2 +- 10 files changed, 53 insertions(+), 36 deletions(-) diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 346aba07d1..420d4da8d5 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -129,9 +129,9 @@ struct OpInfoFiller { template struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { - info->infer_var_type_ = [](InferVarTypeContext& context) { + info->infer_var_type_ = [](InferVarTypeContext* context) { T inference; - inference(context); + inference(*context); }; } }; diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 2940f3ceeb..851c1b80a8 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -48,7 +48,7 @@ class SumOpVarTypeInference : public VarTypeInference { auto default_var_type = proto::VarType::SELECTED_ROWS; bool any_input_is_lod_tensor = std::any_of( - inputs.begin(), inputs.end(), [ctx](const std::string &name) { + inputs.begin(), inputs.end(), [&ctx](const std::string &name) { return ctx.GetType(name) == proto::VarType::LOD_TENSOR; }); if (any_input_is_lod_tensor) { diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index aae0eafe6c..8f9c6cb5e9 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -679,7 +679,7 @@ void OpDesc::InferVarType(BlockDesc *block) const { auto &info = OpInfoMap::Instance().Get(this->Type()); if 
(info.infer_var_type_) { InferVarTypeContext context(this, block); - info.infer_var_type_(context); + info.infer_var_type_(&context); } } diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index a774f9ff49..f55520901c 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -54,7 +54,7 @@ using GradOpMakerFN = std::function>( const std::vector& grad_block)>; using InferVarTypeFN = - std::function; + std::function; using InferShapeFN = std::function; diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index d7d3e0a033..60e1d610da 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -49,7 +49,7 @@ class SumOpVarTypeInference : public VarTypeInference { auto default_var_type = proto::VarType::SELECTED_ROWS; bool any_input_is_lod_tensor = std::any_of( - inputs.begin(), inputs.end(), [ctx](const std::string &name) { + inputs.begin(), inputs.end(), [&ctx](const std::string &name) { return ctx.GetType(name) == proto::VarType::LOD_TENSOR; }); if (any_input_is_lod_tensor) { diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index aee905aa41..28ab208f3f 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -243,12 +243,14 @@ std::map> OpBase::ApplyGrad() { auto& outputs = tmp_grad_outputs[k][it.first]; outputs.reserve(it.second.size()); for (size_t i = 0; i < it.second.size(); ++i) { + VarBase* origin_grad_var_base = it.second[i]; + // Allocate a new variable - Variable* tmp_var = new framework::Variable(); - tmp_var->GetMutable(); - VarBase* tmp_var_base = - new VarBase(it.second[i]->Name(), tmp_var, nullptr, true); - outputs.emplace_back(tmp_var_base); + VarBase* tmp_grad_var_base = new VarBase( + string::Sprintf("%s@IGrad", origin_grad_var_base->Name()), + origin_grad_var_base->DataType(), origin_grad_var_base->Dims(), + place_, true, false); + outputs.emplace_back(tmp_grad_var_base); } } @@ -259,13 +261,12 @@ std::map> OpBase::ApplyGrad() { std::unique_ptr opbase = framework::OpRegistry::CreateOp(*grad_op_desc); - // auto& info = - // framework::OpInfoMap::Instance().Get(grad_op_desc->Type()); - // if (info.infer_var_type_) { - // framework::RuntimeInferVarTypeContext infer_var_type_ctx( - // this, &grad_inputs, &outputs, &attrs_map); - // info.infer_var_type_(infer_var_type_ctx); - // } + auto& info = framework::OpInfoMap::Instance().Get(grad_op_desc->Type()); + if (info.infer_var_type_) { + RuntimeInferVarTypeContext infer_var_type_ctx( + &grad_input_vars_[k], &tmp_grad_outputs[k], &attrs_); + info.infer_var_type_(&infer_var_type_ctx); + } framework::OperatorWithKernel* op_kernel = dynamic_cast(opbase.get()); @@ -298,7 +299,6 @@ std::map> OpBase::ApplyGrad() { } framework::RuntimeContext ctx(grad_invars_map, grad_outvars_map); - framework::Scope scope; PreparedOp p = PreparedOp::Prepare(ctx, *op_kernel, place_); p.op.RuntimeInferShape(scope, place_, ctx); diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index 4ad7d847c1..f210cd1745 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -137,13 +137,13 @@ class VarBase { persistable) {} private: + // TODO(minqiyang): need support SelectedRows VarBase(const std::string& name, framework::proto::VarType::Type dtype, const framework::DDim& shape, const platform::Place& place, framework::Variable* var, VarBase* grad, bool 
stop_gradient, bool persistable) : name_(name), - dtype_(dtype), - place_(place), + type_(framework::proto::VarType::LOD_TENSOR), var_(var), grads_(grad), stop_gradient_(stop_gradient), @@ -153,10 +153,12 @@ class VarBase { pre_op_out_idx_(-1) { if (!var_) { var_ = new framework::Variable(); - auto tensor = var_->GetMutable(); - tensor->Resize(shape); - tensor->mutable_data(place_, dtype_); } + auto tensor = var_->GetMutable(); + tensor->Resize(shape); + tensor->mutable_data(place, dtype); + VLOG(10) << "create varbase: " << name_ << " type: " << dtype + << " place: " << place; } public: @@ -186,11 +188,23 @@ class VarBase { } } - inline void SetDType(framework::proto::VarType::Type type) { + inline framework::DDim Dims() const { + return var_->Get().dims(); + } + + // data type. e.g.. FP32 + inline void SetDataType(framework::proto::VarType::Type type) { auto tensor = var_->GetMutable(); - tensor->mutable_data(place_, dtype_); + tensor->mutable_data(place_, type); } - inline framework::proto::VarType::Type DType() const { return dtype_; } + inline framework::proto::VarType::Type DataType() const { + auto tensor = var_->Get(); + return tensor.type(); + } + + // tensor type. e.g.. LoDTensor + inline void SetType(framework::proto::VarType::Type type) { type_ = type; } + inline framework::proto::VarType::Type Type() const { return type_; } inline void SetStopGradient(bool stop_gradient) { stop_gradient_ = stop_gradient; @@ -244,7 +258,7 @@ class VarBase { } std::string name_; - framework::proto::VarType::Type dtype_; + framework::proto::VarType::Type type_; platform::Place place_; framework::Variable* var_; @@ -339,6 +353,8 @@ class PYBIND11_HIDDEN OpBase { std::vector grad_output_vars_; std::vector backward_hooks_; + + framework::AttributeMap attrs_; }; class Layer { @@ -437,22 +453,22 @@ class PYBIND11_HIDDEN RuntimeInferVarTypeContext framework::proto::VarType::Type GetType( const std::string& name) const override { - return var_set_.at(name)->DType(); + return var_set_.at(name)->Type(); } void SetType(const std::string& name, framework::proto::VarType::Type type) override { - var_set_[name]->SetDType(type); + var_set_[name]->SetType(type); } framework::proto::VarType::Type GetDataType( const std::string& name) const override { - return var_set_.at(name)->DType(); + return var_set_.at(name)->DataType(); } void SetDataType(const std::string& name, framework::proto::VarType::Type type) override { - var_set_[name]->SetDType(type); + var_set_[name]->SetDataType(type); } std::vector GetDataTypes( diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 166883bd6f..0f7a241537 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -232,7 +232,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, if (info.infer_var_type_) { RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, &outputs, &attrs_map); - info.infer_var_type_(infer_var_type_ctx); + info.infer_var_type_(&infer_var_type_ctx); } // TODO(minqiyang): Support infer var type in imperative mode @@ -259,6 +259,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, VLOG(5) << "start construct backward op"; // construct grad op descs + op->attrs_ = attrs_map; std::unique_ptr fwd_op_desc(new framework::OpDesc( op->Type(), invars_name_map, outvars_name_map, attrs_map)); std::unique_ptr> grad_to_var( diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 7dba00fffa..2405a74d2b 100644 --- a/paddle/fluid/operators/sum_op.cc 
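The point of splitting the accessors above: Type() now answers what kind of variable a VarBase is (LOD_TENSOR, SELECTED_ROWS, ...), while DataType() answers what element type the tensor it holds uses (FP32, INT8, ...). A small illustrative helper built only from the accessors shown in this patch; DescribeVar itself is hypothetical:

    // e.g. kind = proto::VarType::LOD_TENSOR, dtype = proto::VarType::FP32
    void DescribeVar(const paddle::imperative::VarBase* var) {
      VLOG(3) << var->Name() << " kind=" << var->Type()
              << " dtype=" << var->DataType();
    }
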
+++ b/paddle/fluid/operators/sum_op.cc @@ -168,11 +168,11 @@ class SumOpVarTypeInference : public framework::VarTypeInference { } bool any_input_is_lod_tensor = std::any_of( - inputs.begin(), inputs.end(), [ctx](const std::string& name) { + inputs.begin(), inputs.end(), [&ctx](const std::string& name) { return ctx.GetType(name) == framework::proto::VarType::LOD_TENSOR; }); - auto is_tensor_array = [ctx](const std::string& name) { + auto is_tensor_array = [&ctx](const std::string& name) { return ctx.GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY; }; diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 552a5e0c32..5a80b785e8 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -194,7 +194,7 @@ PYBIND11_MODULE(core, m) { .def_property("name", &imperative::VarBase::Name, &imperative::VarBase::SetName) .def_property_readonly("shape", &imperative::VarBase::Shape) - .def_property_readonly("dtype", &imperative::VarBase::DType) + .def_property_readonly("dtype", &imperative::VarBase::DataType) .def_property("persistable", &imperative::VarBase::IsPersistable, &imperative::VarBase::SetPersistable) .def_property("stop_gradient", &imperative::VarBase::IsStopGradient, From 8ea4218ce132a8ea4428efb753fe179aadd01aa8 Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Mon, 18 Mar 2019 17:25:16 +0800 Subject: [PATCH 65/73] update load persistables for increment, test=develop (#15576) * update load persistables for increment, test=develop * update load persistables for increment, test=develop * update API Spec, test=develop * update API Spec, test=develop * add doc, test=develop * add doc, test=develop * Update lookup_table_utils.py * Update API.spec * Update lookup_table_utils.py test=develop * Update API.spec test=develop * fix api spec * Update lookup_table_utils.py test=develop --- paddle/fluid/API.spec | 6 +- .../fluid/contrib/utils/lookup_table_utils.py | 294 ++++++++++++++---- 2 files changed, 230 insertions(+), 70 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 056463205a..66fc323e6b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -393,9 +393,9 @@ paddle.fluid.contrib.MagnitudePruner.__init__ (ArgSpec(args=['self', 'threshold' paddle.fluid.contrib.MagnitudePruner.prune (ArgSpec(args=['self', 'param', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.RatioPruner.__init__ (ArgSpec(args=['self', 'ratios'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e7a81a325b296a9ca502ee5adb4fc85d')) paddle.fluid.contrib.RatioPruner.prune (ArgSpec(args=['self', 'param', 'ratio'], varargs=None, keywords=None, defaults=(None,)), ('document', '358cbf2978c91028fb96a195a9884645')) -paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '11fbf7e8dd2289805de291b453a33ee7')) -paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '5b5577bb3d24070da819674255d16196')) -paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '4efbd93876832d4d35497cdbc7a1e6d8')) +paddle.fluid.contrib.load_persistables_for_increment (ArgSpec(args=['dirname', 'executor', 'program', 
'lookup_table_var', 'lookup_table_var_path'], varargs=None, keywords=None, defaults=None), ('document', '2ab36d4f7a564f5f65e455807ad06c67')) +paddle.fluid.contrib.load_persistables_for_inference (ArgSpec(args=['dirname', 'executor', 'program', 'lookup_table_var_name'], varargs=None, keywords=None, defaults=None), ('document', '59066bac9db0ac6ce414d05780b7333f')) +paddle.fluid.contrib.convert_dist_to_sparse_program (ArgSpec(args=['program'], varargs=None, keywords=None, defaults=None), ('document', '74c39c595dc70d6be2f16d8e462d282b')) paddle.fluid.contrib.HDFSClient.__init__ (ArgSpec(args=['self', 'hadoop_home', 'configs'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.HDFSClient.delete (ArgSpec(args=['self', 'hdfs_path'], varargs=None, keywords=None, defaults=None), ('document', 'c3721aa2d4d9ef5a857dd47b2681c03e')) paddle.fluid.contrib.HDFSClient.download (ArgSpec(args=['self', 'hdfs_path', 'local_path', 'overwrite', 'unzip'], varargs=None, keywords=None, defaults=(False, False)), ('document', 'ca55bde92184d3fd0f9f5c963b25e634')) diff --git a/python/paddle/fluid/contrib/utils/lookup_table_utils.py b/python/paddle/fluid/contrib/utils/lookup_table_utils.py index 20e6328d81..a127f5b11b 100644 --- a/python/paddle/fluid/contrib/utils/lookup_table_utils.py +++ b/python/paddle/fluid/contrib/utils/lookup_table_utils.py @@ -18,6 +18,7 @@ import os import time import logging +import paddle from paddle.fluid import core from paddle.fluid import io from paddle.fluid import Program @@ -84,8 +85,9 @@ def convert_dist_to_sparse_program(program): when we train model with distributed lookup table but want to do the local inference, we can use this function to convert the train program with distributed lookup table to sparse lookup table. - :param program(Program): the program must be the trainer program, which will be get by the distribute transpiler. - :return: + Args: + program(Program): the program must be the trainer program, which will be get by the distribute transpiler. + Returns: program: The `program` is a Program, it's the program replace distributed lookup table to sparse lookup table. """ if not program._distributed_lookup_table: @@ -128,68 +130,92 @@ def convert_dist_to_sparse_program(program): return program -def _load_persistable_vars(executor, dirname, program, lookup_table_vars): - def _is_checkpoint_var(exclude_fluid_vars=None): - """ - the checkpoint will not save or load all the variables. - var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. - - : param var(Variable) - """ - - if exclude_fluid_vars is None: - exclude_fluid_vars = [] - - def is_valid(var): - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.RAW: - return False - # @GRAD are named for gradient variables, checkpoint will not save it. - if "@GRAD" in var.name: - return False - # .trainer_ are named for distribute train variables, checkpoint will not save it. - if ".trainer_" in var.name: - return False - - # .block is named for distribute train variables, checkpoint will not save it. 
- if ".block" in var.name: - return False - - if "tmp_" in var.name: - return False - - if var.name in exclude_fluid_vars: - return False - - return var.persistable - - return is_valid - - io.load_vars( - executor, - dirname=dirname, - main_program=program, - predicate=_is_checkpoint_var(lookup_table_vars), - filename=None) - - def load_persistables_for_increment(dirname, executor, program, lookup_table_var, lookup_table_var_path): """ WARNING: this function will only be used for distributed training with distributed lookup table. for increment trainning, the pserver will not only load dense variables, - but also load the suitable lookup table var. Because of slice lookup table - var with HASH, we must load the correct slice var. + but also load the suitable lookup table var. Because of sliced lookup table + var with HASH, we must load the correct sliced var. + + Args: + dirname(str): The directory path + executor(Executor): The executor to run for loading inference model. + program(Program): The parameter server program, which will run on Pserver. + lookup_table_var: the distributed lookup tables var name. + lookup_table_var_path: the the distributed lookup tables var location. + + Returns: + None + """ + def _load_persistable_vars(executor, dirname, need_load_vars): + load_prog = Program() + load_block = load_prog.global_block() + need_delete_vars = [] + + for param in need_load_vars: + origin_var = param.origin + slice_var = param.slice + is_slice = param.is_slice + offset = param.offset + + if is_slice: + origin = load_block.create_var( + name="{}.load".format(origin_var.name), + type=origin_var.type, + shape=origin_var.shape, + dtype=origin_var.dtype, + persistable=True) + + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [origin]}, + attrs={ + 'file_path': os.path.join(dirname, origin_var.name) + }) + + slice = load_block.create_var( + name=slice_var.name, + type=slice_var.type, + shape=slice_var.shape, + dtype=slice_var.dtype, + persistable=True) + + dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:]) + start = int(offset / dim1_flatten) + end = int(offset / dim1_flatten + slice.shape[0]) + + load_block.append_op( + type="slice", + inputs={'Input': origin}, + outputs={'Out': slice}, + attrs={'axes': [0], + 'starts': [start], + 'ends': [end]}) + + need_delete_vars.append(origin) + else: + origin = load_block.create_var( + name="{}".format(origin_var.name), + type=origin_var.type, + shape=origin_var.shape, + dtype=origin_var.dtype, + persistable=True) + load_block.append_op( + type='load', + inputs={}, + outputs={'Out': [origin]}, + attrs={ + 'file_path': os.path.join(dirname, origin_var.name) + }) - :param dirname(str): The directory path - :param executor(Executor): The executor to run for loading inference model. - :param program(Program): The parameter server program, which will run on Pserver. - :param lookup_table_var: the distributed lookup tables var name. - :param lookup_table_var_path: the the distributed lookup tables var location. 
- :return: None - """ + load_block.append_op( + type='delete_var', + inputs={'X': need_delete_vars}, ) + + executor.run(load_prog) def __load_lookup_table_vars(executor, main_program, lookup_table_var, lookup_table_var_path): @@ -217,7 +243,9 @@ def load_persistables_for_increment(dirname, executor, program, "Distributed Lookup Table Vars from {}, time = {}".format( dirname, time.ctime())) - _load_persistable_vars(executor, dirname, program, [lookup_table_var]) + need_load_vars = program._parameters_on_pservers.get_distributed_vars_by_ep( + program._ps_endpoint) + _load_persistable_vars(executor, dirname, need_load_vars) __load_lookup_table_vars(executor, program, lookup_table_var, lookup_table_var_path) @@ -233,15 +261,62 @@ def load_persistables_for_inference(dirname, executor, program, Inference with distributed lookup table is a little funky, this function will load distributed lookup table vars into sparse var, can be used in local inference mode. - :param dirname(str): The directory path - :param executor(Executor): The executor to run for loading inference model. - :param program(Program): The parameter server program, which will run on Pserver. - :param lookup_table_var_name: the distributed lookup tables var name. - :return: None + Args: + dirname(str): The directory path + executor(Executor): The executor to run for loading inference model. + program(Program): The parameter server program, which will run on Pserver. + lookup_table_var_name: the distributed lookup tables var name. + Returns: + None """ - def __load_lookup_table_vars(executor, dirname, main_program, - lookup_table_vars): + def _load_persistable_vars(executor, dirname, program, lookup_table_vars): + def _is_checkpoint_var(exclude_fluid_vars=None): + """ + the checkpoint will not save or load all the variables. + var type is FEED_MINIBATCH/FETCH_LIST/RAW or var name ends with @GRAD are discarded. + + : param var(Variable) + """ + + if exclude_fluid_vars is None: + exclude_fluid_vars = [] + + def is_valid(var): + if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ + var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ + var.desc.type() == core.VarDesc.VarType.RAW: + return False + # @GRAD are named for gradient variables, checkpoint will not save it. + if "@GRAD" in var.name: + return False + # .trainer_ are named for distribute train variables, checkpoint will not save it. + if ".trainer_" in var.name: + return False + + # .block is named for distribute train variables, checkpoint will not save it. 
+ if ".block" in var.name: + return False + + if "tmp_" in var.name: + return False + + if var.name in exclude_fluid_vars: + return False + + return var.persistable + + return is_valid + + io.load_vars( + executor, + dirname=dirname, + main_program=program, + predicate=_is_checkpoint_var(lookup_table_vars), + filename=None) + + def _load_lookup_table_vars(executor, dirname, main_program, + lookup_table_vars): if not os.path.isdir(dirname): raise ValueError("There is no directory named '%s'", dirname) @@ -313,11 +388,96 @@ def load_persistables_for_inference(dirname, executor, program, dirname, time.ctime())) _load_persistable_vars(executor, dirname, program, [lookup_table_var_name]) - __load_lookup_table_vars(executor, dirname, program, - [lookup_table_var_name]) + _load_lookup_table_vars(executor, dirname, program, [lookup_table_var_name]) _logger.info("Finish Load Sparse Program With " "Distributed Lookup Table Vars from {}, time = {}".format( dirname, time.ctime())) return program + + +def get_inference_model(main_program, feeded_var_names, target_vars): + """ + Prune the given `main_program` to build a new program especially for inference with distributed lookup table , + and then add `feeded_vars` and `target_vars` in this program. + + Args: + main_program(Program|None): The original program, which will be pruned to + build the inference model. If is setted None, + the default main program will be used. + Default: None. + feeded_var_names(list[str]): Names of variables that need to be feeded data + during inference. + target_vars(list[Variable]): Variables from which we can get inference + results. + Returns: + program(Program) + + Raises: + ValueError: If `feed_var_names` is not a list of basestring. + ValueError: If `target_vars` is not a list of Variable. 
+ + """ + + def prepend_feed_ops(inference_program, + feed_target_names, + feed_holder_name='feed'): + if len(feed_target_names) == 0: + return + + global_block = inference_program.global_block() + + feed_var = global_block.create_var( + name=feed_holder_name, + type=core.VarDesc.VarType.FEED_MINIBATCH, + persistable=True) + + for i, name in enumerate(feed_target_names): + out = global_block.var(name) + global_block._prepend_op( + type='feed', + inputs={'X': [feed_var]}, + outputs={'Out': [out]}, + attrs={'col': i}) + + def append_fetch_ops(inference_program, + fetch_target_names, + fetch_holder_name='fetch'): + global_block = inference_program.global_block() + fetch_var = global_block.create_var( + name=fetch_holder_name, + type=core.VarDesc.VarType.FETCH_LIST, + persistable=True) + + for i, name in enumerate(fetch_target_names): + global_block.append_op( + type='fetch', + inputs={'X': [name]}, + outputs={'Out': [fetch_var]}, + attrs={'col': i}) + + origin_program = main_program.clone() + main_program = main_program.clone() + global_block = main_program.global_block() + + need_to_remove_op_index = [] + for i, op in enumerate(global_block.ops): + op.desc.set_is_target(False) + if op.type == "feed" or op.type == "fetch": + need_to_remove_op_index.append(i) + + for index in need_to_remove_op_index[::-1]: + global_block._remove_op(index) + + main_program.desc.flush() + + main_program = main_program._prune(targets=target_vars) + main_program = main_program._inference_optimize(prune_read_op=True) + + fetch_var_names = [v.name for v in target_vars] + + prepend_feed_ops(main_program, feeded_var_names) + append_fetch_ops(main_program, fetch_var_names) + + return main_program From e818fa1004d2552fb5743d8d61cd1d86e63d349a Mon Sep 17 00:00:00 2001 From: xiaolil1 <39753926+xiaolil1@users.noreply.github.com> Date: Mon, 18 Mar 2019 19:21:52 +0800 Subject: [PATCH 66/73] Enable INT8 transpose kernel for MobileNet-SSD improvement. (#16159) * Enable INT8 transpose kernel for MobileNet-SSD improvement. test=develop * Refine the license year. test=develop * Delete redundant code. test=develop * Add axis check. 
test=develop --- .../operators/mkldnn/transpose_mkldnn_op.cc | 28 ++++++- .../mkldnn/test_transpose_int8_mkldnn_op.py | 78 +++++++++++++++++++ 2 files changed, 105 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index e41bfb80df..4debc7ca5e 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -73,6 +73,29 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { } }; +template +class TransposeINT8MKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + std::vector axis = ctx.Attr>("axis"); + std::vector axis_int8 = {0, 2, 3, 1}; + if (axis.size() != 1) { + PADDLE_ENFORCE_EQ(axis.size(), axis_int8.size()); + for (size_t i = 0; i < axis.size(); i++) { + PADDLE_ENFORCE_EQ(axis[i], axis_int8[i], + "Current INT8 MKLDNN Transpose kernel only surpport " + "axis with [0, 2, 3, 1] due to MKL-DNN kernel " + "implementation."); + } + } + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + output->ShareDataWith(*input); + output->set_layout(DataLayout::kMKLDNN); + output->set_format(input->format()); + } +}; + template class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: @@ -140,7 +163,10 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(transpose2, MKLDNN, ::paddle::platform::CPUPlace, - ops::TransposeMKLDNNOpKernel); + ops::TransposeMKLDNNOpKernel, + ops::TransposeINT8MKLDNNOpKernel, + ops::TransposeINT8MKLDNNOpKernel); + REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNOpKernel); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py new file mode 100644 index 0000000000..a8127bcc78 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_int8_mkldnn_op.py @@ -0,0 +1,78 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +from mkldnn_op_test import format_reorder + + +class TestTransposeOp(OpTest): + def setUp(self): + self.init_op_type() + self.initTestCase() + self.initInputData() + self.use_mkldnn = True + self.axis = (0, 2, 3, 1) + + self.inputs = { + 'X': format_reorder(self.input_data, self.shape) + } #transform data format to 'NHWC' for INT8 transpose specially. 
+ + self.attrs = { + 'axis': list(self.axis), + 'use_mkldnn': self.use_mkldnn, + } + + self.outputs = { + 'XShape': np.random.random(self.shape).astype('int8'), + 'Out': self.inputs['X'].transpose(self.axis) + } + + def init_op_type(self): + self.op_type = "transpose2" + + def test_check_output(self): + self.check_output(no_check_set=['XShape']) + + def initTestCase(self): + self.shape = (2, 3, 4, 5) + + def initInputData(self): + self.input_data = ( + np.random.randint(0, 100, self.shape) - 50).astype('int8') + + +class TestINT8Case(TestTransposeOp): + def initTestCase(self): + self.shape = (2, 4, 6, 8) + + def initInputData(self): + self.input_data = ( + np.random.randint(0, 100, self.shape) - 50).astype('int8') + + +class TestUINT8Case(TestTransposeOp): + def initTestCase(self): + self.shape = (1, 3, 5, 7) + + def initDataType(self): + self.input_data = (np.random.randint(0, 100, + self.shape)).astype('uint8') + + +if __name__ == '__main__': + unittest.main() From b40e41fbd1b8d5305067d198d4285c958fa3691a Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 18 Mar 2019 20:02:42 +0800 Subject: [PATCH 67/73] Polish code style test=develop --- .../fluid/framework/details/graph_test_base.h | 10 ++++---- paddle/fluid/framework/details/op_registry.h | 2 +- paddle/fluid/framework/ir/graph_test.cc | 12 +++++----- paddle/fluid/framework/var_type_inference.h | 12 +++++----- .../framework/var_type_inference_test.cc | 10 ++++---- paddle/fluid/imperative/tracer.cc | 9 ++++--- paddle/fluid/imperative/tracer.h | 2 +- .../fluid/operators/beam_search_decode_op.cc | 10 ++++---- paddle/fluid/operators/beam_search_op.cc | 10 ++++---- .../controlflow/tensor_array_read_write_op.cc | 12 +++++----- .../fluid/operators/controlflow/while_op.cc | 14 +++++------ .../operators/distributed_ops/fake_init_op.cc | 2 +- .../operators/distributed_ops/merge_ids_op.cc | 8 +++---- .../operators/distributed_ops/split_ids_op.cc | 8 +++---- paddle/fluid/operators/fill_constant_op.cc | 8 +++---- .../fused/fused_embedding_seq_pool_op.cc | 12 +++++----- .../get_tensor_from_selected_rows_op.cc | 10 ++++---- .../operators/hierarchical_sigmoid_op.cc | 18 +++++++------- paddle/fluid/operators/lod_rank_table_op.cc | 6 ++--- .../fluid/operators/lod_tensor_to_array_op.cc | 6 ++--- paddle/fluid/operators/lookup_table_op.cc | 12 +++++----- paddle/fluid/operators/nccl/nccl_op.cc | 6 ++--- paddle/fluid/operators/nce_op.cc | 12 +++++----- .../operators/ngraph/ngraph_engine_op.cc | 2 +- .../operators/optimizers/lars_momentum_op.cc | 2 +- .../fluid/operators/optimizers/momentum_op.cc | 14 +++++------ paddle/fluid/operators/optimizers/sgd_op.cc | 12 +++++----- paddle/fluid/operators/py_func_op.cc | 24 +++++++++---------- .../reader/create_custom_reader_op.cc | 14 +++++------ paddle/fluid/operators/reader/read_op.cc | 14 +++++------ .../operators/reader/reader_op_registry.cc | 16 ++++++------- .../operators/reader/reader_op_registry.h | 4 ++-- paddle/fluid/operators/save_op.cc | 6 ++--- paddle/fluid/operators/scale_op.cc | 10 ++++---- .../fluid/operators/split_selected_rows_op.cc | 6 ++--- paddle/fluid/operators/sum_op.cc | 24 +++++++++---------- .../operators/tensor_array_to_tensor_op.cc | 6 ++--- .../operators/tensorrt/tensorrt_engine_op.cc | 2 +- paddle/fluid/operators/uniform_random_op.cc | 13 +++++----- paddle/fluid/pybind/imperative.cc | 4 ++-- 40 files changed, 192 insertions(+), 192 deletions(-) diff --git a/paddle/fluid/framework/details/graph_test_base.h b/paddle/fluid/framework/details/graph_test_base.h index 2fae684516..d139f84883 
100644 --- a/paddle/fluid/framework/details/graph_test_base.h +++ b/paddle/fluid/framework/details/graph_test_base.h @@ -68,11 +68,11 @@ class SplitOpMaker : public OpProtoAndCheckerMaker { class DummyVarTypeInference : public VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { - auto& inputs = ctx.Input("X"); - auto type = ctx.GetType(inputs.front()); - auto out_var_name = ctx.Output("Out").front(); - ctx.SetType(out_var_name, type); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto& inputs = ctx->Input("X"); + auto type = ctx->GetType(inputs.front()); + auto out_var_name = ctx->Output("Out").front(); + ctx->SetType(out_var_name, type); } }; diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 420d4da8d5..e13ff99f3f 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -131,7 +131,7 @@ struct OpInfoFiller { void operator()(const char* op_type, OpInfo* info) const { info->infer_var_type_ = [](InferVarTypeContext* context) { T inference; - inference(*context); + inference(context); }; } }; diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 851c1b80a8..a95588a57b 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -43,20 +43,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker { class SumOpVarTypeInference : public VarTypeInference { public: - void operator()(InferVarTypeContext &ctx) const override { - auto &inputs = ctx.Input("X"); + void operator()(InferVarTypeContext *ctx) const override { + auto &inputs = ctx->Input("X"); auto default_var_type = proto::VarType::SELECTED_ROWS; bool any_input_is_lod_tensor = std::any_of( inputs.begin(), inputs.end(), [&ctx](const std::string &name) { - return ctx.GetType(name) == proto::VarType::LOD_TENSOR; + return ctx->GetType(name) == proto::VarType::LOD_TENSOR; }); if (any_input_is_lod_tensor) { default_var_type = proto::VarType::LOD_TENSOR; } - auto out_var_name = ctx.Output("Out").front(); - ctx.SetType(out_var_name, default_var_type); + auto out_var_name = ctx->Output("Out").front(); + ctx->SetType(out_var_name, default_var_type); } }; @@ -71,7 +71,7 @@ class DummyOpMaker : public OpProtoAndCheckerMaker { class DummyOpVarTypeInference : public VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override {} + void operator()(framework::InferVarTypeContext *ctx) const override {} }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index 5dd08442c2..2e9c64d3e6 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -126,20 +126,20 @@ class InferVarTypeContext { class VarTypeInference { public: virtual ~VarTypeInference() {} - virtual void operator()(InferVarTypeContext& context) const = 0; // NOLINT + virtual void operator()(InferVarTypeContext* context) const = 0; // NOLINT }; class PassInDtypeAndVarTypeToOutput : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const final { // NOLINT + void operator()(framework::InferVarTypeContext* ctx) const final { // NOLINT auto in_out_var_names = this->GetInputOutputWithSameType(); for (auto& i_o_n : in_out_var_names) { - auto& x_name = ctx.Input(i_o_n.first).at(0); - auto& out_name = 
ctx.Output(i_o_n.second).at(0); + auto& x_name = ctx->Input(i_o_n.first).at(0); + auto& out_name = ctx->Output(i_o_n.second).at(0); - ctx.SetType(out_name, ctx.GetType(x_name)); - ctx.SetDataType(out_name, ctx.GetDataType(x_name)); + ctx->SetType(out_name, ctx->GetType(x_name)); + ctx->SetDataType(out_name, ctx->GetDataType(x_name)); } } diff --git a/paddle/fluid/framework/var_type_inference_test.cc b/paddle/fluid/framework/var_type_inference_test.cc index 60e1d610da..6bbb25a573 100644 --- a/paddle/fluid/framework/var_type_inference_test.cc +++ b/paddle/fluid/framework/var_type_inference_test.cc @@ -44,20 +44,20 @@ class SumOpMaker : public OpProtoAndCheckerMaker { class SumOpVarTypeInference : public VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - auto &inputs = ctx.Input("X"); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto &inputs = ctx->Input("X"); auto default_var_type = proto::VarType::SELECTED_ROWS; bool any_input_is_lod_tensor = std::any_of( inputs.begin(), inputs.end(), [&ctx](const std::string &name) { - return ctx.GetType(name) == proto::VarType::LOD_TENSOR; + return ctx->GetType(name) == proto::VarType::LOD_TENSOR; }); if (any_input_is_lod_tensor) { default_var_type = proto::VarType::LOD_TENSOR; } - auto out_var_name = ctx.Output("Out").front(); - ctx.SetType(out_var_name, default_var_type); + auto out_var_name = ctx->Output("Out").front(); + ctx->SetType(out_var_name, default_var_type); } }; } // namespace framework diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 0f7a241537..18bd1d1938 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -161,7 +161,7 @@ Tracer::Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { } std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, - VarBasePtrMap& outputs, + VarBasePtrMap* outputs, framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient) { @@ -195,7 +195,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, } } - op->output_vars_ = outputs; + op->output_vars_ = *outputs; for (auto it : op->output_vars_) { auto& outvars = outvars_map[it.first]; const std::vector& outputs = it.second; @@ -218,7 +218,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, framework::VariableNameMap invars_name_map = CreateInputVarNameMap(op, inputs); framework::VariableNameMap outvars_name_map = - CreateOutputVarNameMap(op, outputs); + CreateOutputVarNameMap(op, *outputs); auto& info = framework::OpInfoMap::Instance().Get(op->Type()); if (info.Checker() != nullptr) { @@ -230,8 +230,7 @@ std::set Tracer::Trace(OpBase* op, const VarBasePtrMap& inputs, outvars_name_map, attrs_map); if (info.infer_var_type_) { - RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, &outputs, - &attrs_map); + RuntimeInferVarTypeContext infer_var_type_ctx(&inputs, outputs, &attrs_map); info.infer_var_type_(&infer_var_type_ctx); } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index ae3b16727d..a87f3b8009 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -48,7 +48,7 @@ class Tracer { virtual ~Tracer() {} std::set Trace(OpBase* op, const VarBasePtrMap& inputs, - VarBasePtrMap& outputs, // NOLINT + VarBasePtrMap* outputs, // NOLINT framework::AttributeMap attrs_map, const platform::Place expected_place, const bool stop_gradient = false); diff --git 
a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 703edcad11..4cef49280d 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -203,12 +203,12 @@ class BeamSearchDecodeInferShape : public framework::InferShapeBase { class BeamSearchDecodeInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { - for (auto& o : ctx.Output("SentenceIds")) { - ctx.SetType(o, framework::proto::VarType::LOD_TENSOR); + void operator()(framework::InferVarTypeContext* ctx) const override { + for (auto& o : ctx->Output("SentenceIds")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); } - for (auto& o : ctx.Output("SentenceScores")) { - ctx.SetType(o, framework::proto::VarType::LOD_TENSOR); + for (auto& o : ctx->Output("SentenceScores")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 8958d00a68..a6aa35e056 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -120,12 +120,12 @@ class BeamSearchOp : public framework::OperatorWithKernel { class BeamSearchInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - for (auto &o : ctx.Output("selected_ids")) { - ctx.SetType(o, framework::proto::VarType::LOD_TENSOR); + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &o : ctx->Output("selected_ids")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); } - for (auto &o : ctx.Output("selected_scores")) { - ctx.SetType(o, framework::proto::VarType::LOD_TENSOR); + for (auto &o : ctx->Output("selected_scores")) { + ctx->SetType(o, framework::proto::VarType::LOD_TENSOR); } } }; diff --git a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc index 041eef602e..45f18ac925 100644 --- a/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc +++ b/paddle/fluid/operators/controlflow/tensor_array_read_write_op.cc @@ -100,13 +100,13 @@ class WriteToArrayInferShape : public framework::InferShapeBase { class WriteToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - auto x_name = ctx.Input("X")[0]; - auto out_name = ctx.Output("Out")[0]; + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = ctx->Input("X")[0]; + auto out_name = ctx->Output("Out")[0]; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; - ctx.SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY); - if (ctx.HasVar(x_name)) { - ctx.SetDataType(out_name, ctx.GetDataType(x_name)); + ctx->SetType(out_name, framework::proto::VarType::LOD_TENSOR_ARRAY); + if (ctx->HasVar(x_name)) { + ctx->SetDataType(out_name, ctx->GetDataType(x_name)); } } }; diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 90c3067868..deb8ec3bb2 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -365,16 +365,16 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { class WhileGradOpVarTypeInference : public framework::VarTypeInference { public: - 
void operator()(framework::InferVarTypeContext &ctx) const override { - auto p_names = ctx.Input(kX); - auto pg_ig_names = ctx.Output(framework::GradVarName(kX)); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto p_names = ctx->Input(kX); + auto pg_ig_names = ctx->Output(framework::GradVarName(kX)); for (size_t i = 0; i < p_names.size(); ++i) { - if (ctx.HasVar(pg_ig_names[i])) { + if (ctx->HasVar(pg_ig_names[i])) { VLOG(5) << "Setting " << pg_ig_names[i] << " following " << p_names[i] - << " type: " << ctx.GetType(p_names[i]); - ctx.SetType(pg_ig_names[i], ctx.GetType(p_names[i])); - ctx.SetDataType(pg_ig_names[i], ctx.GetDataType(p_names[i])); + << " type: " << ctx->GetType(p_names[i]); + ctx->SetType(pg_ig_names[i], ctx->GetType(p_names[i])); + ctx->SetDataType(pg_ig_names[i], ctx->GetDataType(p_names[i])); } } } diff --git a/paddle/fluid/operators/distributed_ops/fake_init_op.cc b/paddle/fluid/operators/distributed_ops/fake_init_op.cc index 89228c7243..5ee35e0458 100644 --- a/paddle/fluid/operators/distributed_ops/fake_init_op.cc +++ b/paddle/fluid/operators/distributed_ops/fake_init_op.cc @@ -56,7 +56,7 @@ class FakeInitOp : public framework::OperatorBase { class FakeInitOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override {} + void operator()(framework::InferVarTypeContext *ctx) const override {} }; class FakeInitOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc index 0a269c7575..1b0b4dd316 100644 --- a/paddle/fluid/operators/distributed_ops/merge_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/merge_ids_op.cc @@ -114,10 +114,10 @@ class MergeIdsOp : public framework::OperatorWithKernel { class MergeIdsOpInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - auto input_type = ctx.GetType(ctx.Input("Ids")[0]); - for (auto &out_var : ctx.Output("Out")) { - ctx.SetType(out_var, input_type); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto input_type = ctx->GetType(ctx->Input("Ids")[0]); + for (auto &out_var : ctx->Output("Out")) { + ctx->SetType(out_var, input_type); } } }; diff --git a/paddle/fluid/operators/distributed_ops/split_ids_op.cc b/paddle/fluid/operators/distributed_ops/split_ids_op.cc index e9f3f89c6e..191ca1efe8 100644 --- a/paddle/fluid/operators/distributed_ops/split_ids_op.cc +++ b/paddle/fluid/operators/distributed_ops/split_ids_op.cc @@ -73,10 +73,10 @@ class SplitIdsOp : public framework::OperatorWithKernel { class SplitIdsOpInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - auto input_type = ctx.GetType(ctx.Input("Ids")[0]); - for (auto &out_var : ctx.Output("Out")) { - ctx.SetType(out_var, input_type); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto input_type = ctx->GetType(ctx->Input("Ids")[0]); + for (auto &out_var : ctx->Output("Out")) { + ctx->SetType(out_var, input_type); } } }; diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index eb5996d50e..cf2f4776cf 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -39,11 +39,11 @@ class FillConstantOp : public framework::OperatorWithKernel { class 
FillConstantOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { + void operator()(framework::InferVarTypeContext* ctx) const override { auto data_type = static_cast( - boost::get(ctx.GetAttr("dtype"))); - auto& out_var_name = ctx.Output("Out").front(); - ctx.SetDataType(out_var_name, data_type); + boost::get(ctx->GetAttr("dtype"))); + auto& out_var_name = ctx->Output("Out").front(); + ctx->SetDataType(out_var_name, data_type); } }; diff --git a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc index 27a761c29f..5edeeae14e 100644 --- a/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_seq_pool_op.cc @@ -137,20 +137,20 @@ class FusedEmbeddingSeqPoolOpGrad : public framework::OperatorWithKernel { class FusedEmbeddingSeqPoolOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { - auto out_var_name = ctx.Output(framework::GradVarName("W")).front(); - auto attr = ctx.GetAttr("is_sparse"); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto out_var_name = ctx->Output(framework::GradVarName("W")).front(); + auto attr = ctx->GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(3) << "fused_embedding_seq_pool_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - ctx.SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS); + ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(3) << "fused_embedding_seq_pool_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - ctx.SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); + ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); } - ctx.SetDataType(out_var_name, ctx.GetDataType(ctx.Input("W")[0])); + ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0])); } }; diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc index 5388e65497..c0893359af 100644 --- a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -81,12 +81,12 @@ GetTensorFromSelectedRows is used to get the tensor from SelectedRows. 
class GetTensorFromSelectedRowsOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const { // NOLINT - auto out_var_name = ctx.Output("Out").front(); - auto in_var_name = ctx.Input("X").front(); + void operator()(framework::InferVarTypeContext *ctx) const { // NOLINT + auto out_var_name = ctx->Output("Out").front(); + auto in_var_name = ctx->Input("X").front(); - ctx.SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); - ctx.SetDataType(out_var_name, ctx.GetDataType(in_var_name)); + ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); + ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name)); } }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 508c99b953..d0e1057c43 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -197,32 +197,32 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { class HierarchicalSigmoidGradOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { - auto w_grad_var_name = ctx.Output(framework::GradVarName("W")).front(); - auto bias_grad_var_name_vec = ctx.Output(framework::GradVarName("Bias")); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto w_grad_var_name = ctx->Output(framework::GradVarName("W")).front(); + auto bias_grad_var_name_vec = ctx->Output(framework::GradVarName("Bias")); std::string bias_grad_var_name; bool hasBias = false; if (bias_grad_var_name_vec.size()) { hasBias = true; - bias_grad_var_name = ctx.Output(framework::GradVarName("Bias")).front(); + bias_grad_var_name = ctx->Output(framework::GradVarName("Bias")).front(); } - auto attr = ctx.GetAttr("is_sparse"); + auto attr = ctx->GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - ctx.SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS); + ctx->SetType(w_grad_var_name, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - ctx.SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR); + ctx->SetType(w_grad_var_name, framework::proto::VarType::LOD_TENSOR); } if (hasBias) { VLOG(30) << "hierarchical_sigmoid_grad op " << framework::GradVarName("Bias") << " is set to LoDTensor"; - ctx.SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR); + ctx->SetType(bias_grad_var_name, framework::proto::VarType::LOD_TENSOR); } - ctx.SetDataType(w_grad_var_name, ctx.GetDataType(ctx.Input("W")[0])); + ctx->SetDataType(w_grad_var_name, ctx->GetDataType(ctx->Input("W")[0])); } }; diff --git a/paddle/fluid/operators/lod_rank_table_op.cc b/paddle/fluid/operators/lod_rank_table_op.cc index a7bbb49827..0a43ac0c52 100644 --- a/paddle/fluid/operators/lod_rank_table_op.cc +++ b/paddle/fluid/operators/lod_rank_table_op.cc @@ -64,9 +64,9 @@ class LoDRankTableInferShape : public framework::InferShapeBase { class LoDRankTableInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - for (auto &o : ctx.Output("Out")) { - ctx.SetType(o, framework::proto::VarType::LOD_RANK_TABLE); + void operator()(framework::InferVarTypeContext 
*ctx) const override { + for (auto &o : ctx->Output("Out")) { + ctx->SetType(o, framework::proto::VarType::LOD_RANK_TABLE); } } }; diff --git a/paddle/fluid/operators/lod_tensor_to_array_op.cc b/paddle/fluid/operators/lod_tensor_to_array_op.cc index 4fd45db67b..61e3427370 100644 --- a/paddle/fluid/operators/lod_tensor_to_array_op.cc +++ b/paddle/fluid/operators/lod_tensor_to_array_op.cc @@ -201,9 +201,9 @@ class LoDTensorToArrayInferShape : public framework::InferShapeBase { class LoDTensorToArrayInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - for (auto &out_var : ctx.Output("Out")) { - ctx.SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY); + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &out_var : ctx->Output("Out")) { + ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY); } } }; diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index a59ff23f93..8d1ebe6b1c 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -147,20 +147,20 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { - auto out_var_name = ctx.Output(framework::GradVarName("W")).front(); - auto attr = ctx.GetAttr("is_sparse"); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto out_var_name = ctx->Output(framework::GradVarName("W")).front(); + auto attr = ctx->GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") << " is set to SelectedRows"; - ctx.SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS); + ctx->SetType(out_var_name, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(3) << "lookup_table_grad op " << framework::GradVarName("W") << " is set to LoDTensor"; - ctx.SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); + ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); } - ctx.SetDataType(out_var_name, ctx.GetDataType(ctx.Input("W")[0])); + ctx->SetDataType(out_var_name, ctx->GetDataType(ctx->Input("W")[0])); } }; diff --git a/paddle/fluid/operators/nccl/nccl_op.cc b/paddle/fluid/operators/nccl/nccl_op.cc index 7df5a881f5..6a0ae0dede 100644 --- a/paddle/fluid/operators/nccl/nccl_op.cc +++ b/paddle/fluid/operators/nccl/nccl_op.cc @@ -60,9 +60,9 @@ class NCCLInitOp : public framework::OperatorBase { class NCCLInitOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - auto out_var_name = ctx.Output("Communicator").front(); - ctx.SetType(out_var_name, framework::proto::VarType::RAW); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto out_var_name = ctx->Output("Communicator").front(); + ctx->SetType(out_var_name, framework::proto::VarType::RAW); } }; diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 3c3d79cc7b..fa7cc58c08 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -237,21 +237,21 @@ class NCEOpGrad : public framework::OperatorWithKernel { class NCEOpGradVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext 
&ctx) const override { - auto weight_grad = ctx.Output(framework::GradVarName("Weight")).front(); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto weight_grad = ctx->Output(framework::GradVarName("Weight")).front(); - auto attr = ctx.GetAttr("is_sparse"); + auto attr = ctx->GetAttr("is_sparse"); bool is_sparse = boost::get(attr); if (is_sparse) { VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to SelectedRows"; - ctx.SetType(weight_grad, framework::proto::VarType::SELECTED_ROWS); + ctx->SetType(weight_grad, framework::proto::VarType::SELECTED_ROWS); } else { VLOG(3) << "nce_op_grad op " << weight_grad << " and " << " is set to LoDTensor"; - ctx.SetType(weight_grad, framework::proto::VarType::LOD_TENSOR); + ctx->SetType(weight_grad, framework::proto::VarType::LOD_TENSOR); } - ctx.SetDataType(weight_grad, ctx.GetDataType(ctx.Input("Input")[0])); + ctx->SetDataType(weight_grad, ctx->GetDataType(ctx->Input("Input")[0])); } }; diff --git a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc index a88ddf33a0..479c95ba08 100644 --- a/paddle/fluid/operators/ngraph/ngraph_engine_op.cc +++ b/paddle/fluid/operators/ngraph/ngraph_engine_op.cc @@ -37,7 +37,7 @@ class NgraphEngineOpMaker : public framework::OpProtoAndCheckerMaker { class NgraphEngineInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override {} + void operator()(framework::InferVarTypeContext *ctx) const override {} }; } // namespace operators diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc index 668fa889ac..126b665dd4 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -72,7 +72,7 @@ use L2 regularizers in case of using LARS. 
class LarsMomentumOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override {} + void operator()(framework::InferVarTypeContext* ctx) const override {} }; } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/optimizers/momentum_op.cc b/paddle/fluid/operators/optimizers/momentum_op.cc index 1be423da5b..7cf218c20f 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.cc +++ b/paddle/fluid/operators/optimizers/momentum_op.cc @@ -21,14 +21,14 @@ using Tensor = framework::Tensor; class MomentumOpInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { - auto& input_var = ctx.Input("Param")[0]; - for (auto& out_var : ctx.Output("ParamOut")) { - if (ctx.GetType(input_var) == framework::proto::VarType::SELECTED_ROWS) { - ctx.SetType(out_var, framework::proto::VarType::SELECTED_ROWS); - } else if (ctx.GetType(input_var) == + void operator()(framework::InferVarTypeContext* ctx) const override { + auto& input_var = ctx->Input("Param")[0]; + for (auto& out_var : ctx->Output("ParamOut")) { + if (ctx->GetType(input_var) == framework::proto::VarType::SELECTED_ROWS) { + ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS); + } else if (ctx->GetType(input_var) == framework::proto::VarType::LOD_TENSOR) { - ctx.SetType(out_var, framework::proto::VarType::LOD_TENSOR); + ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR); } else { PADDLE_THROW( "Only support LodTensor and SelectedRows, Unexpected Input Type."); diff --git a/paddle/fluid/operators/optimizers/sgd_op.cc b/paddle/fluid/operators/optimizers/sgd_op.cc index cac3d9b68f..34e99a14ff 100644 --- a/paddle/fluid/operators/optimizers/sgd_op.cc +++ b/paddle/fluid/operators/optimizers/sgd_op.cc @@ -50,18 +50,18 @@ class SGDOp : public framework::OperatorWithKernel { class SGDOpInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - auto &input_var_n = ctx.Input("Param")[0]; - auto in_var_type = ctx.GetType(input_var_n); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto &input_var_n = ctx->Input("Param")[0]; + auto in_var_type = ctx->GetType(input_var_n); PADDLE_ENFORCE(in_var_type == framework::proto::VarType::SELECTED_ROWS || in_var_type == framework::proto::VarType::LOD_TENSOR, "The input Var's type should be LoDtensor or SelectedRows," " but the received var(%s)'s type is %s", input_var_n, in_var_type); - for (auto &out_var_n : ctx.Output("ParamOut")) { - if (ctx.GetType(out_var_n) != in_var_type) { - ctx.SetType(out_var_n, in_var_type); + for (auto &out_var_n : ctx->Output("ParamOut")) { + if (ctx->GetType(out_var_n) != in_var_type) { + ctx->SetType(out_var_n, in_var_type); } } } diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 6472b9c163..67202c7f9d 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -96,10 +96,10 @@ static void CallPythonFunc(py::object *callable, class PyFuncOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - bool has_out = (ctx.HasOutput("Out") && !ctx.Output("Out").empty()); + void operator()(framework::InferVarTypeContext *ctx) const override { + bool has_out = (ctx->HasOutput("Out") && !ctx->Output("Out").empty()); - bool has_in = 
(ctx.HasInput("X") && !ctx.Input("Out").empty()); + bool has_in = (ctx->HasInput("X") && !ctx->Input("Out").empty()); /** * X or Out can be empty, so that py_func can be more flexible @@ -107,8 +107,8 @@ class PyFuncOpVarTypeInference : public framework::VarTypeInference { */ PADDLE_ENFORCE(has_in || has_out, "Input(X) or Output(Out) must exist"); - PADDLE_ENFORCE_GE(boost::get(ctx.GetAttr(kForwardPythonCallableId)), 0, - "Function id cannot be less than 0"); + PADDLE_ENFORCE_GE(boost::get(ctx->GetAttr(kForwardPythonCallableId)), + 0, "Function id cannot be less than 0"); if (!has_out) return; @@ -118,7 +118,7 @@ class PyFuncOpVarTypeInference : public framework::VarTypeInference { * the corresponding forward variable */ const std::string kGradVarSuffix = framework::kGradVarSuffix; - auto &out_var_names = ctx.Output("Out"); + auto &out_var_names = ctx->Output("Out"); for (auto &out_var_name : out_var_names) { if (out_var_name == framework::kEmptyVarName || out_var_name.size() < kGradVarSuffix.size()) { @@ -128,17 +128,17 @@ class PyFuncOpVarTypeInference : public framework::VarTypeInference { size_t len = out_var_name.size() - kGradVarSuffix.size(); if (out_var_name.substr(len) == kGradVarSuffix) { auto fwd_var_name = out_var_name.substr(0, len); - PADDLE_ENFORCE(ctx.HasVar(out_var_name), + PADDLE_ENFORCE(ctx->HasVar(out_var_name), "Backward variable %s not found", out_var_name); - PADDLE_ENFORCE(ctx.HasVar(fwd_var_name), + PADDLE_ENFORCE(ctx->HasVar(fwd_var_name), "Backward variable %s not found", fwd_var_name); VLOG(10) << "Infer var_desc of Output(" << out_var_name << ") as Input(" << fwd_var_name << ")"; - ctx.SetShape(out_var_name, ctx.GetShape(fwd_var_name)); - ctx.SetDataType(out_var_name, ctx.GetDataType(fwd_var_name)); - ctx.SetLoDLevel(out_var_name, ctx.GetLoDLevel(fwd_var_name)); - ctx.SetType(out_var_name, ctx.GetType(fwd_var_name)); + ctx->SetShape(out_var_name, ctx->GetShape(fwd_var_name)); + ctx->SetDataType(out_var_name, ctx->GetDataType(fwd_var_name)); + ctx->SetLoDLevel(out_var_name, ctx->GetLoDLevel(fwd_var_name)); + ctx->SetType(out_var_name, ctx->GetType(fwd_var_name)); } } } diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index b65e236856..fdc7b0f6a0 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -123,22 +123,22 @@ class CustomReaderInferShape : public framework::InferShapeBase { class CustomReaderInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { - auto& out_var_name = ctx.Output("Out")[0]; - PADDLE_ENFORCE(ctx.HasVar(out_var_name)); - ctx.SetType(out_var_name, framework::proto::VarType::READER); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto& out_var_name = ctx->Output("Out")[0]; + PADDLE_ENFORCE(ctx->HasVar(out_var_name)); + ctx->SetType(out_var_name, framework::proto::VarType::READER); auto sink_var_names = - boost::get>(ctx.GetAttr("sink_var_names")); + boost::get>(ctx->GetAttr("sink_var_names")); const auto* sub_block = - boost::get(ctx.GetAttr("sub_block")); + boost::get(ctx->GetAttr("sub_block")); std::vector res_data_types; for (const std::string& var_name : sink_var_names) { framework::VarDesc* var = sub_block->FindVar(var_name); PADDLE_ENFORCE_NOT_NULL(var); res_data_types.emplace_back(var->GetDataType()); } - ctx.SetDataTypes(out_var_name, res_data_types); + 
ctx->SetDataTypes(out_var_name, res_data_types); } }; diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index 40549ce54d..33a69ad5fe 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -51,16 +51,16 @@ class ReadInferShape : public framework::InferShapeBase { class ReadInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { - bool infer_out = boost::get(ctx.GetAttr("infer_out")); + void operator()(framework::InferVarTypeContext* ctx) const override { + bool infer_out = boost::get(ctx->GetAttr("infer_out")); if (infer_out) { - std::string reader_name = ctx.Input("Reader")[0]; - std::vector out_names = ctx.Output("Out"); - auto dtypes = ctx.GetDataTypes(reader_name); + std::string reader_name = ctx->Input("Reader")[0]; + std::vector out_names = ctx->Output("Out"); + auto dtypes = ctx->GetDataTypes(reader_name); PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); for (size_t i = 0; i < dtypes.size(); ++i) { - ctx.SetType(out_names[i], framework::proto::VarType::LOD_TENSOR); - ctx.SetDataType(out_names[i], dtypes[i]); + ctx->SetType(out_names[i], framework::proto::VarType::LOD_TENSOR); + ctx->SetDataType(out_names[i], dtypes[i]); } } } diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index 44772281be..64a1f6b687 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -99,9 +99,9 @@ void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { } void FileReaderInferVarType::operator()( - framework::InferVarTypeContext& ctx) const { - std::string reader_name = ctx.Output("Out")[0]; - ctx.SetType(reader_name, framework::proto::VarType::READER); + framework::InferVarTypeContext* ctx) const { + std::string reader_name = ctx->Output("Out")[0]; + ctx->SetType(reader_name, framework::proto::VarType::READER); } void DecoratedReaderInferShape::operator()( @@ -124,11 +124,11 @@ void DecoratedReaderInferShape::operator()( } void DecoratedReaderInferVarType::operator()( - framework::InferVarTypeContext& ctx) const { - const std::string& in_reader_name = ctx.Input("UnderlyingReader")[0]; - const std::string& out_reader_name = ctx.Output("Out")[0]; - ctx.SetType(out_reader_name, framework::proto::VarType::READER); - ctx.SetDataTypes(out_reader_name, ctx.GetDataTypes(in_reader_name)); + framework::InferVarTypeContext* ctx) const { + const std::string& in_reader_name = ctx->Input("UnderlyingReader")[0]; + const std::string& out_reader_name = ctx->Output("Out")[0]; + ctx->SetType(out_reader_name, framework::proto::VarType::READER); + ctx->SetDataTypes(out_reader_name, ctx->GetDataTypes(in_reader_name)); } void DecoratedReaderMakerBase::Make() { diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h index 5a775b82f5..795a580605 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.h +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -61,7 +61,7 @@ class FileReaderInferShape : public framework::InferShapeBase { class FileReaderInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override; + void operator()(framework::InferVarTypeContext* ctx) const override; }; // general infershape for decorated reader @@ -73,7 +73,7 @@ class 
DecoratedReaderInferShape : public framework::InferShapeBase { // general var type inference for decorated reader class DecoratedReaderInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override; + void operator()(framework::InferVarTypeContext* ctx) const override; }; class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/save_op.cc b/paddle/fluid/operators/save_op.cc index 45da2ac4c6..b02c098099 100644 --- a/paddle/fluid/operators/save_op.cc +++ b/paddle/fluid/operators/save_op.cc @@ -159,9 +159,9 @@ This operator will serialize and write LoDTensor / SelectedRows variable to file class SaveOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - auto out_var_name = ctx.Output(LOOKUP_TABLE_PATH).front(); - ctx.SetType(out_var_name, framework::proto::VarType::RAW); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto out_var_name = ctx->Output(LOOKUP_TABLE_PATH).front(); + ctx->SetType(out_var_name, framework::proto::VarType::RAW); } }; diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 208a6f8009..4e4a015e18 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -70,13 +70,13 @@ $$Out = scale*(X + bias)$$ class ScaleOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - auto &in_var_name = ctx.Input("X").front(); - auto out_var_name = ctx.Output("Out").front(); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto &in_var_name = ctx->Input("X").front(); + auto out_var_name = ctx->Output("Out").front(); if (in_var_name != out_var_name) { - ctx.SetType(out_var_name, ctx.GetType(in_var_name)); - ctx.SetDataType(out_var_name, ctx.GetDataType(in_var_name)); + ctx->SetType(out_var_name, ctx->GetType(in_var_name)); + ctx->SetDataType(out_var_name, ctx->GetDataType(in_var_name)); } } }; diff --git a/paddle/fluid/operators/split_selected_rows_op.cc b/paddle/fluid/operators/split_selected_rows_op.cc index f102b911b5..88dfebc0cf 100644 --- a/paddle/fluid/operators/split_selected_rows_op.cc +++ b/paddle/fluid/operators/split_selected_rows_op.cc @@ -62,9 +62,9 @@ class SplitSelectedRowsOp : public framework::OperatorWithKernel { class SplitSelectedRowsOpInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - for (auto &out_var : ctx.Output("Out")) { - ctx.SetType(out_var, framework::proto::VarType::SELECTED_ROWS); + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &out_var : ctx->Output("Out")) { + ctx->SetType(out_var, framework::proto::VarType::SELECTED_ROWS); } } }; diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 2405a74d2b..1391148ccf 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -160,20 +160,20 @@ the LoD information with the first input. 
class SumOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext& ctx) const override { - auto& inputs = ctx.Input("X"); + void operator()(framework::InferVarTypeContext* ctx) const override { + auto& inputs = ctx->Input("X"); auto var_type = framework::proto::VarType::SELECTED_ROWS; - for (auto& name : ctx.Input("X")) { - VLOG(10) << name << " " << ctx.GetType(name); + for (auto& name : ctx->Input("X")) { + VLOG(10) << name << " " << ctx->GetType(name); } bool any_input_is_lod_tensor = std::any_of( - inputs.begin(), inputs.end(), [&ctx](const std::string& name) { - return ctx.GetType(name) == framework::proto::VarType::LOD_TENSOR; + inputs.begin(), inputs.end(), [ctx](const std::string& name) { + return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR; }); - auto is_tensor_array = [&ctx](const std::string& name) { - return ctx.GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY; + auto is_tensor_array = [ctx](const std::string& name) { + return ctx->GetType(name) == framework::proto::VarType::LOD_TENSOR_ARRAY; }; bool any_input_is_tensor_array = @@ -185,7 +185,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference { if (!all_inputs_are_tensor_array) { std::ostringstream os; for (auto& each : inputs) { - os << " " << each << " type is " << ctx.GetType(each) << "\n"; + os << " " << each << " type is " << ctx->GetType(each) << "\n"; } PADDLE_ENFORCE(all_inputs_are_tensor_array, "Not all inputs are tensor array:\n%s", os.str()); @@ -195,9 +195,9 @@ class SumOpVarTypeInference : public framework::VarTypeInference { var_type = framework::proto::VarType::LOD_TENSOR; } - auto out_var_name = ctx.Output("Out").front(); - ctx.SetType(out_var_name, var_type); - ctx.SetDataType(out_var_name, ctx.GetDataType(inputs.front())); + auto out_var_name = ctx->Output("Out").front(); + ctx->SetType(out_var_name, var_type); + ctx->SetDataType(out_var_name, ctx->GetDataType(inputs.front())); } }; diff --git a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index d7f67ccb2f..2b83c42f20 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -177,9 +177,9 @@ class LoDTensorArray2TensorGradInferShape : public framework::InferShapeBase { class LoDTensorArray2TensorGradInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - for (auto &out_var : ctx.Output(framework::GradVarName("X"))) { - ctx.SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY); + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &out_var : ctx->Output(framework::GradVarName("X"))) { + ctx->SetType(out_var, framework::proto::VarType::LOD_TENSOR_ARRAY); } } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 845629d40f..6cf3e65e00 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -46,7 +46,7 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { class TensorRTEngineInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override {} + void operator()(framework::InferVarTypeContext *ctx) const override {} }; } // namespace operators diff --git 
a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index b3a8b6a141..bb6a1c5b16 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -112,15 +112,16 @@ uniform distribution. The random result is in set [min, max]. class UniformRandomOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - auto out_var_name = ctx.Output("Out").front(); + void operator()(framework::InferVarTypeContext *ctx) const override { + auto out_var_name = ctx->Output("Out").front(); auto var_data_type = static_cast( - boost::get(ctx.GetAttr("dtype"))); + boost::get(ctx->GetAttr("dtype"))); - if (ctx.GetType(out_var_name) != framework::proto::VarType::SELECTED_ROWS) { - ctx.SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); + if (ctx->GetType(out_var_name) != + framework::proto::VarType::SELECTED_ROWS) { + ctx->SetType(out_var_name, framework::proto::VarType::LOD_TENSOR); } - ctx.SetDataType(out_var_name, var_data_type); + ctx->SetDataType(out_var_name, var_data_type); } }; diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 21e7793e0a..e7d078d03a 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -38,7 +38,7 @@ void BindTracer(pybind11::module* m) { .def("trace", [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, - imperative::VarBasePtrMap& outputs, + imperative::VarBasePtrMap* outputs, framework::AttributeMap attrs_map, const platform::CPUPlace expected_place, const bool stop_gradient = false) { @@ -48,7 +48,7 @@ void BindTracer(pybind11::module* m) { .def("trace", [](imperative::Tracer& self, imperative::OpBase* op, const imperative::VarBasePtrMap& inputs, - imperative::VarBasePtrMap& outputs, + imperative::VarBasePtrMap* outputs, framework::AttributeMap attrs_map, const platform::CUDAPlace expected_place, const bool stop_gradient = false) { From 8364688c303c09e7284d9e63f395860ffccf52a6 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 18 Mar 2019 20:12:18 +0800 Subject: [PATCH 68/73] Fix py_func_op's problem --- paddle/fluid/operators/controlflow/get_places_op.cc | 6 +++--- paddle/fluid/operators/py_func_op.cc | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/controlflow/get_places_op.cc b/paddle/fluid/operators/controlflow/get_places_op.cc index 0258739d6d..fa77f97419 100644 --- a/paddle/fluid/operators/controlflow/get_places_op.cc +++ b/paddle/fluid/operators/controlflow/get_places_op.cc @@ -93,9 +93,9 @@ execution. 
class GetPlacesInferVarType : public framework::VarTypeInference { public: - void operator()(framework::InferVarTypeContext &ctx) const override { - for (auto &o_name : ctx.Output("Out")) { - ctx.SetType(o_name, framework::proto::VarType::PLACE_LIST); + void operator()(framework::InferVarTypeContext *ctx) const override { + for (auto &o_name : ctx->Output("Out")) { + ctx->SetType(o_name, framework::proto::VarType::PLACE_LIST); } } }; diff --git a/paddle/fluid/operators/py_func_op.cc b/paddle/fluid/operators/py_func_op.cc index 67202c7f9d..5300e80747 100644 --- a/paddle/fluid/operators/py_func_op.cc +++ b/paddle/fluid/operators/py_func_op.cc @@ -99,7 +99,7 @@ class PyFuncOpVarTypeInference : public framework::VarTypeInference { void operator()(framework::InferVarTypeContext *ctx) const override { bool has_out = (ctx->HasOutput("Out") && !ctx->Output("Out").empty()); - bool has_in = (ctx->HasInput("X") && !ctx->Input("Out").empty()); + bool has_in = (ctx->HasInput("X") && !ctx->Input("X").empty()); /** * X or Out can be empty, so that py_func can be more flexible From 565b19b7a50698894cc238b375dd73547548958c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 18 Mar 2019 20:39:41 +0800 Subject: [PATCH 69/73] fix set data type bug test=develop --- paddle/fluid/imperative/layer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h index f210cd1745..f409190a70 100644 --- a/paddle/fluid/imperative/layer.h +++ b/paddle/fluid/imperative/layer.h @@ -195,7 +195,7 @@ class VarBase { // data type. e.g.. FP32 inline void SetDataType(framework::proto::VarType::Type type) { auto tensor = var_->GetMutable(); - tensor->mutable_data(place_, type); + tensor->mutable_data(tensor->place(), type); } inline framework::proto::VarType::Type DataType() const { auto tensor = var_->Get(); From af030088903034eb2d6825bf6d34801b1f80faea Mon Sep 17 00:00:00 2001 From: Wojciech Uss Date: Tue, 19 Mar 2019 02:32:19 +0100 Subject: [PATCH 70/73] Add cpu_quantize_placement_pass for C-API quantization (#16265) * Add cpu_quantize_placement_pass for C-API quantization test=develop * added a comment on required pass attributes test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 2 + .../ir/cpu_quantize_placement_pass.cc | 58 ++++++++ .../ir/cpu_quantize_placement_pass.h | 34 +++++ .../ir/cpu_quantize_placement_pass_tester.cc | 129 ++++++++++++++++++ 4 files changed, 223 insertions(+) create mode 100644 paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc create mode 100644 paddle/fluid/framework/ir/cpu_quantize_placement_pass.h create mode 100644 paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 6974420db0..a79a53867d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -46,6 +46,7 @@ cc_library(fuse_pass_base SRCS fuse_pass_base.cc DEPS pass) pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(lock_free_optimize_pass base) +pass_library(cpu_quantize_placement_pass base) pass_library(cpu_quantize_pass inference) pass_library(cpu_quantize_squash_pass inference) pass_library(fc_fuse_pass inference) @@ -104,6 +105,7 @@ cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS g cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) cc_test(test_seqpool_concat_fuse_pass SRCS 
seqpool_concat_fuse_pass_tester.cc DEPS seqpool_concat_fuse_pass framework_proto) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) +cc_test(test_cpu_quantize_placement_pass SRCS cpu_quantize_placement_pass_tester.cc DEPS cpu_quantize_placement_pass) cc_test(test_cpu_quantize_pass SRCS cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) cc_test(test_cpu_quantize_squash_pass SRCS cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) if(NOT WIN32) diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc new file mode 100644 index 0000000000..50bbe4915b --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h" +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +std::unique_ptr CPUQuantizePlacementPass::ApplyImpl( + std::unique_ptr graph) const { + VLOG(3) << "Marks operators which are to be quantized."; + const auto& excluded_ids_list = + Get>("quantize_excluded_op_ids"); + const auto& op_types_list = + Get>("quantize_enabled_op_types"); + for (const Node* n : graph->Nodes()) { + if (n->IsOp()) { + if (std::find(excluded_ids_list.begin(), excluded_ids_list.end(), + n->id()) != excluded_ids_list.end()) + continue; + auto* op = n->Op(); + if (op->HasAttr("use_quantizer") || op->HasProtoAttr("use_quantizer")) { + if (op_types_list.empty()) { + op->SetAttr("use_quantizer", true); + } else if (std::find(op_types_list.begin(), op_types_list.end(), + n->Name()) != op_types_list.end()) { + op->SetAttr("use_quantizer", true); + } + } + } + } + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(cpu_quantize_placement_pass, + paddle::framework::ir::CPUQuantizePlacementPass) + // a vector of operator type names to be quantized ("conv2d" etc.) + .RequirePassAttr("quantize_enabled_op_types") + // a vector of operator ids that are to be excluded from quantization + .RequirePassAttr("quantize_excluded_op_ids"); diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h new file mode 100644 index 0000000000..ef3861b249 --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass.h @@ -0,0 +1,34 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { +/* + * Specifies which operators should be quantized. + */ +class CPUQuantizePlacementPass : public Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc b/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc new file mode 100644 index 0000000000..5a4d622645 --- /dev/null +++ b/paddle/fluid/framework/ir/cpu_quantize_placement_pass_tester.cc @@ -0,0 +1,129 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/cpu_quantize_placement_pass.h" + +#include +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, + const std::vector& inputs, + const std::vector& outputs, + boost::tribool use_quantizer) { + auto* op = prog->MutableBlock(0)->AppendOp(); + + op->SetType(type); + + if (!boost::indeterminate(use_quantizer)) + op->SetAttr("use_quantizer", use_quantizer); + + if (type == "conv2d") { + op->SetAttr("name", name); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + op->SetInput("Bias", {inputs[2]}); + } else if (type == "relu") { + op->SetInput("X", inputs); + } else if (type == "concat") { + op->SetAttr("axis", 1); + op->SetInput("X", {inputs[0], inputs[1]}); + } else if (type == "pool2d") { + op->SetInput("X", {inputs[0]}); + } else { + FAIL() << "Unexpected operator type."; + } + op->SetOutput("Out", {outputs[0]}); +} + +// operator use_quantizer +// --------------------------------------- +// (a,b)->concat->c none +// (c,weights,bias)->conv->f false +// f->relu->g none +// g->pool->h false +// (h,weights2,bias2)->conv->k false +// k->pool->l false +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g", + "h", "weights2", "bias2", "k", "l"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}, boost::indeterminate); + SetOp(&prog, "conv2d", "conv1", {"c", "weights", "bias"}, {"f"}, false); + SetOp(&prog, "relu", "relu1", {"f"}, {"g"}, boost::indeterminate); + SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}, false); + SetOp(&prog, "conv2d", "conv2", {"h", "weights2", "bias2"}, {"k"}, false); + SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}, false); + + return prog; +} + +void MainTest(std::initializer_list quantize_enabled_op_types, + std::initializer_list quantize_excluded_op_ids, + 
unsigned expected_use_quantizer_true_count) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("cpu_quantize_placement_pass"); + pass->Set("quantize_enabled_op_types", + new std::unordered_set(quantize_enabled_op_types)); + pass->Set("quantize_excluded_op_ids", + new std::unordered_set(quantize_excluded_op_ids)); + + graph = pass->Apply(std::move(graph)); + + unsigned use_quantizer_true_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + auto* op = node->Op(); + if (op->HasAttr("use_quantizer") && + boost::get(op->GetAttr("use_quantizer"))) { + ++use_quantizer_true_count; + } + } + } + + EXPECT_EQ(use_quantizer_true_count, expected_use_quantizer_true_count); +} + +TEST(QuantizerPlacementPass, enabled_pool) { MainTest({"pool2d"}, {}, 2); } + +TEST(QuantizerPlacementPass, enabled_conv_excluded_one) { + MainTest({"conv2d"}, {4}, 1); +} + +TEST(QuantizerPlacementPass, excluded_none) { + // 2 conv + 2 pool + MainTest({}, {}, 4); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(cpu_quantize_placement_pass); From 7e20e7691e3dda88a84d84a00be5dffcd9772ebf Mon Sep 17 00:00:00 2001 From: Yibing Liu Date: Tue, 19 Mar 2019 10:18:19 +0800 Subject: [PATCH 71/73] Fix the bug in fp16 backward kernel (#16269) test=develop --- paddle/fluid/operators/slice_op.cu | 14 +++++----- .../fluid/tests/unittests/test_slice_op.py | 26 +++++++++++++++++++ 2 files changed, 33 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/slice_op.cu b/paddle/fluid/operators/slice_op.cu index 1af57b89a3..24a564f9ef 100644 --- a/paddle/fluid/operators/slice_op.cu +++ b/paddle/fluid/operators/slice_op.cu @@ -31,18 +31,18 @@ __global__ void Padding(const paddle::platform::float16* d_out, paddle::platform::float16* d_in) { int64_t out_idx = threadIdx.x + blockDim.x * blockIdx.x; if (out_idx < n) { + int64_t out_idx_tmp = out_idx; int coords[D] = {0}; for (int i = D - 1; i >= 0; --i) { - coords[i] = out_idx % out_dims[i]; - out_idx /= out_dims[i]; + coords[i] = out_idx_tmp % out_dims[i]; + out_idx_tmp /= out_dims[i]; coords[i] += offsets[i]; } int64_t in_idx = 0; - for (int i = 0; i < D - 1; ++i) { - in_idx += coords[i] * in_dims[i + 1]; + for (int i = 0; i < D; ++i) { + in_idx = in_idx * in_dims[i] + coords[i]; } - in_idx += coords[D - 1]; d_in[in_idx] = d_out[out_idx]; } @@ -80,8 +80,8 @@ class SliceGradKernel(0)); int64_t numel = d_out->numel(); - dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1, 1, 1); - dim3 threads(PADDLE_CUDA_NUM_THREADS, 1, 1); + dim3 blocks((numel - 1) / PADDLE_CUDA_NUM_THREADS + 1); + dim3 threads(PADDLE_CUDA_NUM_THREADS); auto stream = ctx.cuda_device_context().stream(); auto out_shape = framework::vectorize2int(out_dims); diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index 5fdabbabed..aefd8cb6d3 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -87,5 +87,31 @@ class TestFP16(TestSliceOp): place, ['Input'], 'Out', max_relative_error=0.006) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestFP16_2(TestSliceOp): + def config(self): + self.dtype = "float16" + self.input = np.random.random([3, 4, 5]).astype(self.dtype) + self.starts = [0] + self.ends = [1] + self.axes = [1] + self.out = self.input[:, 0:1, :] + + def test_check_output(self): + place = 
core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-5) + + def test_check_grad_normal(self): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_grad_with_place( + place, ['Input'], + 'Out', + max_relative_error=0.006, + numeric_grad_delta=0.5) + + if __name__ == '__main__': unittest.main() From 13816dd4acdabd21a715b3b1c63fb43cdbac7622 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Tue, 19 Mar 2019 03:54:50 +0100 Subject: [PATCH 72/73] [MKL-DNN] Fix to crash of Transformer when mkldnn is to be used (#16233) * - Fix to crash of Transformer when mkldnn is to be used Desc: TensorCopy was not setting MKLDNN primitive descriptor when layout was to be kMKLDNN test=develop * - Enable transformer for mkl-dnn test=develo * - Compilation fix test=develop * - Removed manual selection of MKL-DNN ops to be used in Transformer test test=develop --- paddle/fluid/framework/tensor_util.cc | 5 +++++ .../fluid/inference/tests/api/CMakeLists.txt | 2 +- .../tests/api/analyzer_transformer_tester.cc | 20 +++++++++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index a7f09df491..5f21dae605 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -44,6 +44,11 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place, << dst_place; return; } +#ifdef PADDLE_WITH_MKLDNN + if (src.layout() == DataLayout::kMKLDNN) { + dst->set_mkldnn_prim_desc(src.get_mkldnn_prim_desc()); + } +#endif memory::Copy(boost::get(dst_place), dst_ptr, boost::get(src_place), src_ptr, size); } diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 8f7b6f31de..d9ac73b063 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -110,7 +110,7 @@ set(TRANSFORMER_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/transformer") download_model_and_data(${TRANSFORMER_INSTALL_DIR} "temp%2Ftransformer_model.tar.gz" "temp%2Ftransformer_data.txt.tar.gz") inference_analysis_test(test_analyzer_transformer SRCS analyzer_transformer_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8) + ARGS --infer_model=${TRANSFORMER_INSTALL_DIR}/model --infer_data=${TRANSFORMER_INSTALL_DIR}/data.txt --batch_size=8 SERIAL) # ocr set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc index 9d17f38ab7..f765f55611 100644 --- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc @@ -183,10 +183,13 @@ void SetInput(std::vector> *inputs) { } // Easy for profiling independently. 
-TEST(Analyzer_Transformer, profile) { +void profile(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); std::vector outputs; + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector> input_slots_all; SetInput(&input_slots_all); @@ -194,6 +197,11 @@ TEST(Analyzer_Transformer, profile) { input_slots_all, &outputs, FLAGS_num_threads); } +TEST(Analyzer_Transformer, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_Transformer, profile_mkldnn) { profile(true); } +#endif + // Check the fuse status TEST(Analyzer_Transformer, fuse_statis) { AnalysisConfig cfg; @@ -206,9 +214,12 @@ TEST(Analyzer_Transformer, fuse_statis) { } // Compare result of NativeConfig and AnalysisConfig -TEST(Analyzer_Transformer, compare) { +void compare(bool use_mkldnn = false) { AnalysisConfig cfg; SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } std::vector> input_slots_all; SetInput(&input_slots_all); @@ -216,5 +227,10 @@ TEST(Analyzer_Transformer, compare) { reinterpret_cast(&cfg), input_slots_all); } +TEST(Analyzer_Transformer, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + } // namespace inference } // namespace paddle From f8df9eb32ea1d7a41bf66bfc6553b33478516aef Mon Sep 17 00:00:00 2001 From: Zeng Jinle <32832641+sneaxiy@users.noreply.github.com> Date: Mon, 18 Mar 2019 23:00:44 -0500 Subject: [PATCH 73/73] fix api doc (#16201) test=develop --- paddle/fluid/API.spec | 12 ++++++------ python/paddle/fluid/data_feeder.py | 6 +++--- python/paddle/fluid/executor.py | 20 ++++++++++++++------ python/paddle/reader/__init__.py | 7 ++----- python/paddle/reader/creator.py | 20 ++++++++++++++------ python/paddle/reader/decorator.py | 28 +++++++++++++--------------- 6 files changed, 52 insertions(+), 41 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 66fc323e6b..9a6d0d1c08 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -12,7 +12,7 @@ paddle.fluid.program_guard (ArgSpec(args=['main_program', 'startup_program'], va paddle.fluid.name_scope (ArgSpec(args=['prefix'], varargs=None, keywords=None, defaults=(None,)), ('document', '0ef753f5cec69fef9ae6ad8b867b33a2')) paddle.fluid.Executor.__init__ (ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.Executor.close (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', 'f5369953dd0c443961cf79f7a00e1a03')) -paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'aba8093edebf2d5c869b735b92811e45')) +paddle.fluid.Executor.run (ArgSpec(args=['self', 'program', 'feed', 'fetch_list', 'feed_var_name', 'fetch_var_name', 'scope', 'return_numpy', 'use_program_cache'], varargs=None, keywords=None, defaults=(None, None, None, 'feed', 'fetch', None, True, False)), ('document', 'f482e93b38b4018796969a2e1dde479d')) paddle.fluid.global_scope (ArgSpec(args=[], varargs=None, keywords=None, defaults=None), ('document', 'e148d3ab1ed8edf3e928212a375959c0')) paddle.fluid.scope_guard (ArgSpec(args=['scope'], varargs=None, keywords=None, defaults=None), ('document', 'b94d1f6bcc29c4fb58fc0058561250c2')) paddle.fluid.DistributeTranspiler.__init__ (ArgSpec(args=['self', 'config'], varargs=None, 
keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -494,7 +494,7 @@ paddle.fluid.CUDAPinnedPlace.__init__ __init__(self: paddle.fluid.core.CUDAPinne paddle.fluid.ParamAttr.__init__ (ArgSpec(args=['self', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.WeightNormParamAttr.__init__ (ArgSpec(args=['self', 'dim', 'name', 'initializer', 'learning_rate', 'regularizer', 'trainable', 'gradient_clip', 'do_model_average'], varargs=None, keywords=None, defaults=(None, None, None, 1.0, None, True, None, False)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.DataFeeder.__init__ (ArgSpec(args=['self', 'feed_list', 'place', 'program'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', '0eed2f198dc73c08a41b61edbc755753')) +paddle.fluid.DataFeeder.decorate_reader (ArgSpec(args=['self', 'reader', 'multi_devices', 'num_places', 'drop_last'], varargs=None, keywords=None, defaults=(None, True)), ('document', 'f8f3df23c5633c614db781a91b81fb62')) paddle.fluid.DataFeeder.feed (ArgSpec(args=['self', 'iterable'], varargs=None, keywords=None, defaults=None), ('document', '459e316301279dfd82001b46f0b8ffca')) paddle.fluid.DataFeeder.feed_parallel (ArgSpec(args=['self', 'iterable', 'num_places'], varargs=None, keywords=None, defaults=(None,)), ('document', '543863d1f9d4853758adb613b8659e85')) paddle.fluid.clip.ErrorClipByValue.__init__ (ArgSpec(args=['self', 'max', 'min'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -518,11 +518,11 @@ paddle.reader.compose (ArgSpec(args=[], varargs='readers', keywords='kwargs', de paddle.reader.chain (ArgSpec(args=[], varargs='readers', keywords=None, defaults=None), ('document', 'd22c34e379a53901ae67a6bca7f4def4')) paddle.reader.shuffle (ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None), ('document', 'e42ea6fee23ce26b23cb142cd1d6522d')) paddle.reader.firstn (ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None), ('document', 'c5bb8f7dd4f917f1569a368aab5b8aad')) -paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '283bc0b8a0e26ae186b8b9bee4aec560')) +paddle.reader.xmap_readers (ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)), ('document', '9c804a42f8a4dbaa76b3c98e0ab7f796')) paddle.reader.PipeReader.__init__ (ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '5f80a7ed70052f01665e4c74acccfa69')) +paddle.reader.PipeReader.get_line (ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')), ('document', '9621ae612e595b6c34eb3bb5f3eb1a45')) paddle.reader.multiprocess_reader (ArgSpec(args=['readers', 'use_pipe', 
'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)), ('document', '7d8b3a96e592107c893d5d51ce968ba0')) paddle.reader.Fake.__init__ (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.reader.creator.np_array (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '28d457fbc9a71efa4ac91a3be179cada')) -paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', '44fe286ab6175a5464d3a961a68c266a')) -paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', '11b3704ea42cfd537953387a7e58dae8')) +paddle.reader.creator.text_file (ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None), ('document', 'f45fcb7add066c8e042c6774fc7c3db2')) +paddle.reader.creator.recordio (ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)), ('document', 'b4a94ee0e2cefb495619275c2f8c61d2')) diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index a24e1d1300..3dac41ce43 100644 --- a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -268,8 +268,8 @@ class DataFeeder(object): Args: reader(function): the reader is the function which can generate data. multi_devices(bool): whether to use multiple devices or not. - num_places(int): if the multi_devices is True, you can specify the number - of GPU to use, if 'num_places' is None, the function will use all the + num_places(int): if multi_devices is True, you can specify the number + of GPU to use, if multi_devices is None, the function will use all the GPU of the current machine. Default None. drop_last(bool): whether to drop the last batch if the size of the last batch is less than batch_size. Default True. @@ -278,7 +278,7 @@ class DataFeeder(object): dict: the result of conversion. Raises: - ValueError: If drop_last is False and the data batch which cannot fit for devices. + ValueError: If drop_last is False and the data batch cannot fit for devices. """ def __reader_creator__(): diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index cc3c0dd689..03aa9917f3 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -470,13 +470,21 @@ class Executor(object): program(Program|CompiledProgram): the program that need to run, if not provided, then default_main_program (not compiled) will be used. feed(dict): feed variable map, e.g. {"image": ImageData, "label": LabelData} - fetch_list(list): a list of variable or variable names that user want to get, run will return them according to this list. - feed_var_name(str): the name for the input variable of feed Operator. - fetch_var_name(str): the name for the output variable of fetch Operator. - scope(Scope): the scope used to run this program, you can switch it to different scope. default is global_scope + fetch_list(list): a list of variable or variable names that user + wants to get, this method will return them according to this list. + feed_var_name(str): the name for the input variable of + feed Operator. + fetch_var_name(str): the name for the output variable of + fetch Operator. + scope(Scope): the scope used to run this program, you can switch + it to different scope. 
Default is global_scope. return_numpy(bool): if convert the fetched tensor to numpy - use_program_cache(bool): set use_program_cache to true if program not changed compare to the last step. - + use_program_cache(bool): whether to use the cached program + settings across batches. Setting it to True is faster only + when (1) the program is not compiled with data parallelism, + and (2) the program, feed variable names and fetch_list + variable names have not changed compared to the last step. + Returns: list(numpy.array): fetch result according to fetch_list. diff --git a/python/paddle/reader/__init__.py b/python/paddle/reader/__init__.py index 678026cf95..b55a6298f6 100644 --- a/python/paddle/reader/__init__.py +++ b/python/paddle/reader/__init__.py @@ -38,9 +38,8 @@ items. It can be any function with no parameter that creates a iterable Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. -Item should be of `supported type `_ (e.g., numpy 1d -array of float32, int, list of int) +Item should be of supported type (e.g., numpy array or list/tuple of float +or int). An example implementation for single item data reader creator: @@ -62,8 +61,6 @@ An example implementation for multiple item data reader creator: yield numpy.random.uniform(-1, 1, size=width*height), label return reader - -TODO(yuyang18): Should we add whole design doc here? """ import paddle.reader.decorator diff --git a/python/paddle/reader/creator.py b/python/paddle/reader/creator.py index c861020225..353aca92f4 100644 --- a/python/paddle/reader/creator.py +++ b/python/paddle/reader/creator.py @@ -44,8 +44,11 @@ def text_file(path): Creates a data reader that outputs text line by line from given text file. Trailing new line ('\\\\n') of each line will be removed. - :path: path of the text file. - :returns: data reader of text file + Args: + path (str): path of the text file. + + Returns: + callable: data reader of text file. """ def reader(): @@ -59,10 +62,15 @@ def text_file(path): def recordio(paths, buf_size=100): """ - Creates a data reader from given RecordIO file paths separated by ",", - glob pattern is supported. - :path: path of recordio files, can be a string or a string list. - :returns: data reader of recordio files. + Creates a data reader from given RecordIO file paths separated + by ",", glob pattern is supported. + + Args: + paths (str|list(str)): paths of the recordio files. + buf_size (int): prefetched buffer size. + + Returns: + callable: data reader of recordio files. """ import recordio as rec diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index b2ef9f7580..685d08b9e0 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -242,20 +242,18 @@ class XmapEndSignal(): def xmap_readers(mapper, reader, process_num, buffer_size, order=False): """ - Use multiprocess to map samples from reader by a mapper defined by user. - And this function contains a buffered decorator. - :param mapper: a function to map sample. - :type mapper: callable - :param reader: the data reader to read from - :type reader: callable - :param process_num: process number to handle original sample - :type process_num: int - :param buffer_size: max buffer size - :type buffer_size: int - :param order: keep the order of reader - :type order: bool - :return: the decarated reader - :rtype: callable + Use multiple threads to map samples from the reader with a user-defined mapper. 
+ + Args: + mapper (callable): a function to map each sample from the reader. + reader (callable): a data reader that yields the data. + process_num (int): number of threads used to handle the original samples. + buffer_size (int): size of the queue used to buffer data. + order (bool): whether to keep the data order of the original reader. + Default False. + + Returns: + callable: a decorated reader with data mapping. """ end = XmapEndSignal() @@ -477,7 +475,7 @@ class PipeReader: """ :param cut_lines: cut buffer to lines :type cut_lines: bool - :param line_break: line break of the file, like \n or \r + :param line_break: line break of the file, like '\\\\n' or '\\\\r' :type line_break: string :return: one line or a buffer of bytes
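For reference, a minimal usage sketch of the xmap_readers decorator documented above. The toy reader, the square mapper, and the thread/queue sizes are illustrative assumptions and not part of this patch; the call itself follows the signature xmap_readers(mapper, reader, process_num, buffer_size, order=False) recorded in the API spec.

# Minimal usage sketch (illustrative only): decorate a toy reader with
# paddle.reader.xmap_readers so samples are mapped by worker threads.
import paddle.reader

def toy_reader():
    # A data reader creator: yields single samples, not mini-batches.
    for i in range(8):
        yield i

def square(sample):
    # The mapper receives one sample at a time and returns the mapped sample.
    return sample * sample

# Four worker threads, a buffer queue of size 16, and keep the sample order.
mapped = paddle.reader.xmap_readers(square, toy_reader, 4, 16, order=True)

for value in mapped():
    print(value)

With order=False the mapped samples may be yielded out of order, which is usually acceptable when the reader feeds training data.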