From 67308822f85a387433867ea330624b9c16ae029c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sun, 30 Sep 2018 19:45:05 +0800 Subject: [PATCH 01/15] Add selected_rows merge for clip_by_norm op test=develop --- paddle/fluid/operators/CMakeLists.txt | 3 ++- paddle/fluid/operators/clip_by_norm_op.h | 24 +++++++++++++++++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index b61bca8c3d..e10fc422fa 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -229,7 +229,7 @@ if(WITH_DISTRIBUTE) op_library(${dist_op} DEPS ${DISTRIBUTE_DEPS}) set_source_files_properties(${dist_op}.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) endforeach() - + #set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) #cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op # listen_and_serv_op sum_op executor SERIAL) @@ -267,6 +267,7 @@ if (WITH_GPU AND TENSORRT_FOUND) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() +op_library(clip_by_norm_op DEPS selected_rows_functor) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) op_library(print_op DEPS lod_tensor) diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 5af0eb0b2a..8346115913 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/transform.h" namespace paddle { @@ -31,10 +32,31 @@ class ClipByNormKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { auto max_norm = context.Attr("max_norm"); - auto* input = context.Input("X"); + auto in_var = context.InputVar("X"); auto* output = context.Output("Out"); output->mutable_data(context.GetPlace()); + const Tensor* input = nullptr; + if (in_var->IsType()) { + input = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); + + // merge ids in selected rows first + math::scatter::MergeAdd merge_func; + auto* merged_input = const_cast(context.scope()) + .Var() + ->GetMutable(); + merge_func(context.template device_context(), *x, + merged_input); + input = &(merged_input->value()); + } else { + PADDLE_THROW("Unexpected branch, input variable type is %s", + in_var->Type().name()); + } + + PADDLE_ENFORCE_NOT_NULL(input); + auto x = EigenVector::Flatten(*input); auto out = EigenVector::Flatten(*output); auto x_norm = x.square().sum().sqrt(); From f20fc955395a907e68136dd4fccce29660f5d140 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 6 Oct 2018 20:18:09 +0800 Subject: [PATCH 02/15] Resize output ddims and rows --- paddle/fluid/operators/clip_by_norm_op.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 8346115913..7144524a4c 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -33,12 +33,14 @@ class ClipByNormKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto max_norm = context.Attr("max_norm"); auto in_var 
= context.InputVar("X"); - auto* output = context.Output("Out"); - output->mutable_data(context.GetPlace()); + Tensor* output = nullptr; const Tensor* input = nullptr; if (in_var->IsType()) { input = context.Input("X"); + + output = context.Output("Out"); + output->mutable_data(context.GetPlace()); } else if (in_var->IsType()) { auto* x = context.Input("X"); @@ -50,6 +52,11 @@ class ClipByNormKernel : public framework::OpKernel { merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); + + auto* output_selected_rows = context.Output("Out"); + output_selected_rows->set_rows(merged_input.rows()); + output = output_selected_rows->mutable_data(); + output->Resize(framework::make_ddim(merged_input.value().dims())); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", in_var->Type().name()); From bcd8c2ccc35f48a6563715562f525d30ac498e6f Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 8 Oct 2018 15:51:36 +0800 Subject: [PATCH 03/15] Add unit test --- paddle/fluid/operators/CMakeLists.txt | 2 +- paddle/fluid/operators/clip_by_norm_op.h | 22 ++++++----- .../tests/unittests/test_clip_by_norm_op.py | 38 +++++++++++++++++++ 3 files changed, 52 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e10fc422fa..cafd7b11ae 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -267,7 +267,7 @@ if (WITH_GPU AND TENSORRT_FOUND) else() set(DEPS_OPS ${DEPS_OPS} tensorrt_engine_op) endif() -op_library(clip_by_norm_op DEPS selected_rows_functor) +op_library(clip_by_norm_op DEPS selected_rows_functor selected_rows) op_library(sum_op DEPS selected_rows_functor) op_library(sgd_op DEPS selected_rows_functor) op_library(print_op DEPS lod_tensor) diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 7144524a4c..9f99c8a3f9 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -16,6 +16,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/transform.h" @@ -23,6 +24,7 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using SelectedRows = framework::SelectedRows; template using EigenVector = framework::EigenVector; @@ -41,22 +43,24 @@ class ClipByNormKernel : public framework::OpKernel { output = context.Output("Out"); output->mutable_data(context.GetPlace()); - } else if (in_var->IsType()) { - auto* x = context.Input("X"); + } else if (in_var->IsType()) { + auto* x = context.Input("X"); // merge ids in selected rows first math::scatter::MergeAdd merge_func; - auto* merged_input = const_cast(context.scope()) - .Var() - ->GetMutable(); + SelectedRows* merged_input = + const_cast(context.scope()) + .Var() + ->GetMutable(); merge_func(context.template device_context(), *x, merged_input); input = &(merged_input->value()); - auto* output_selected_rows = context.Output("Out"); - output_selected_rows->set_rows(merged_input.rows()); - output = output_selected_rows->mutable_data(); - output->Resize(framework::make_ddim(merged_input.value().dims())); + SelectedRows* output_selected_rows = context.Output("Out"); + output_selected_rows->set_rows(merged_input->rows()); + output_selected_rows->set_height(merged_input->height()); + output = output_selected_rows->mutable_value(); + output->Resize(merged_input->value().dims()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", in_var->Type().name()); diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 6103c3aafc..6556c0875e 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -18,6 +18,8 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid.core as core + class TestClipByNormOp(OpTest): def setUp(self): @@ -62,5 +64,41 @@ class TestCase3(TestClipByNormOp): self.max_norm = 1.0 +class TestClipByNormOpWithSelectedRows(OpTest): + def setUp(self): + self.initTestCase() + + self.max_relative_error = 0.006 + + scope = core.Scope() + x_selected_rows = scope.var('X').get_selected_rows() + x_selected_rows.set_rows([1, 1, 2, 0]) + x_tensor = x_selected_rows.get_tensor() + x_tensor = np.random.random((4, 1)).astype("float32") + x_tensor[np.abs(x_tensor) < self.max_relative_error] = 0.5 + + self.op_type = "clip_by_norm" + self.inputs = {'X': x_selected_rows, } + self.attrs = {} + self.attrs['max_norm'] = self.max_norm + y_tensor = np.zeros((3, 1)) + y_tensor[0::1] = np.sum(x_tensor[0::1], x_tensor[1::1]) + y_tensor[1::1] = x_tensor[2::1] + y_tensor[2::1] = x_tensor[3::1] + norm = np.sqrt(np.sum(np.square(y_tensor))) + if norm > self.max_norm: + output = self.max_norm * y_tensor / norm + else: + output = y_tensor + self.outputs = {'Out': output} + + def test_check_output(self): + self.check_output() + + def initTestCase(self): + self.shape = (100, ) + self.max_norm = 1.0 + + if __name__ == '__main__': unittest.main() From 1456b8ec7dd7d1a13b7bf3e4d1c14e2a10fb0a38 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 10 Oct 2018 18:53:15 +0800 Subject: [PATCH 04/15] Add unittest for clip_by_norm_op with SelectedRows test=develop --- paddle/fluid/operators/clip_by_norm_op.h | 1 + .../tests/unittests/test_clip_by_norm_op.py 
| 69 ++++++++++++------- 2 files changed, 45 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/clip_by_norm_op.h b/paddle/fluid/operators/clip_by_norm_op.h index 9f99c8a3f9..855c4d7067 100644 --- a/paddle/fluid/operators/clip_by_norm_op.h +++ b/paddle/fluid/operators/clip_by_norm_op.h @@ -61,6 +61,7 @@ class ClipByNormKernel : public framework::OpKernel { output_selected_rows->set_height(merged_input->height()); output = output_selected_rows->mutable_value(); output->Resize(merged_input->value().dims()); + output->mutable_data(context.GetPlace()); } else { PADDLE_THROW("Unexpected branch, input variable type is %s", in_var->Type().name()); diff --git a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py index 6556c0875e..46433d7825 100644 --- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py @@ -18,6 +18,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle.fluid as fluid import paddle.fluid.core as core @@ -65,39 +66,57 @@ class TestCase3(TestClipByNormOp): class TestClipByNormOpWithSelectedRows(OpTest): - def setUp(self): - self.initTestCase() - - self.max_relative_error = 0.006 - + def check_with_place(self, place): + self.config_test_case() scope = core.Scope() + + # set input x_selected_rows = scope.var('X').get_selected_rows() - x_selected_rows.set_rows([1, 1, 2, 0]) + x_selected_rows.set_rows(self.grad_rows) x_tensor = x_selected_rows.get_tensor() - x_tensor = np.random.random((4, 1)).astype("float32") - x_tensor[np.abs(x_tensor) < self.max_relative_error] = 0.5 - - self.op_type = "clip_by_norm" - self.inputs = {'X': x_selected_rows, } - self.attrs = {} - self.attrs['max_norm'] = self.max_norm - y_tensor = np.zeros((3, 1)) - y_tensor[0::1] = np.sum(x_tensor[0::1], x_tensor[1::1]) - y_tensor[1::1] = x_tensor[2::1] - y_tensor[2::1] = x_tensor[3::1] - norm = np.sqrt(np.sum(np.square(y_tensor))) + x_np = np.random.random(self.grad_shape).astype("float32") + x_np[np.abs(x_np) < self.max_relative_error] = 0.5 + x_tensor.set(x_np, place) + + # set output + out_selected_rows = scope.var('Out').get_selected_rows() + + # run clip_by_norm_op + clip_by_norm_op = fluid.op.Operator( + "clip_by_norm", max_norm=self.max_norm, X='X', Out='Out') + clip_by_norm_op.run(scope, place) + + # check output + self.assertEqual(out_selected_rows.rows(), self.grad_clipped_rows) + out_tensor = out_selected_rows.get_tensor() + y_np = np.zeros(self.grad_clipped_shape) + y_np[0] = np.sum(x_np[0:2]) + y_np[1] = x_np[2] + y_np[2] = x_np[3] + norm = np.sqrt(np.sum(np.square(y_np))) if norm > self.max_norm: - output = self.max_norm * y_tensor / norm + output = self.max_norm * y_np / norm else: - output = y_tensor - self.outputs = {'Out': output} + output = y_np + self.assertTrue( + np.allclose( + np.array(out_tensor), output, atol=1e-5, equal_nan=False)) - def test_check_output(self): - self.check_output() + def test_clip_by_norm_with_selected_ros(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) - def initTestCase(self): - self.shape = (100, ) + for place in places: + self.check_with_place(place) + + def config_test_case(self): self.max_norm = 1.0 + self.max_relative_error = 0.006 + self.grad_shape = (4, 1) + self.grad_clipped_shape = (3, 1) + self.grad_rows = [0, 0, 1, 2] + self.grad_clipped_rows = [0, 1, 2] if __name__ == '__main__': From 
40b17be4b0523492435714536de8c329c09925f3 Mon Sep 17 00:00:00 2001 From: Michal Gallus Date: Mon, 1 Oct 2018 15:52:32 +0200 Subject: [PATCH 05/15] Pass: Fuse Conv + Bias test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 6 +- .../ir/conv_bias_mkldnn_fuse_pass.cc | 78 +++++++++++++ .../framework/ir/conv_bias_mkldnn_fuse_pass.h | 34 ++++++ .../ir/conv_bias_mkldnn_fuse_pass_tester.cc | 106 ++++++++++++++++++ .../framework/ir/graph_pattern_detector.cc | 32 ++++++ .../framework/ir/graph_pattern_detector.h | 21 ++++ paddle/fluid/inference/analysis/analyzer.h | 1 + 7 files changed, 276 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc create mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h create mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0076a8bece..fbfb0776a9 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -30,6 +30,7 @@ pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) if (WITH_MKLDNN) + pass_library(conv_bias_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference) endif () pass_library(attention_lstm_fuse_pass inference) @@ -51,6 +52,7 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) -if (WITH_MKLDNN) +if(WITH_MKLDNN) + cc_test(test_conv_bias_mkldnn_fuse_pass SRCS conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) -endif () +endif() diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc new file mode 100644 index 0000000000..d0bd09a4f6 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" +#include +#include +#include "paddle/fluid/platform/enforce.h" +namespace paddle { +namespace framework { +namespace ir { +std::unique_ptr ConvBiasFusePass::ApplyImpl( + std::unique_ptr graph) const { + PADDLE_ENFORCE(graph.get()); + FusePassBase::Init("conv_bias_mkldnn_fuse", graph.get()); + GraphPatternDetector gpd; + auto* conv_input = gpd.mutable_pattern() + ->NewNode("conv_bias_mkldnn_fuse/conv_input") + ->AsInput() + ->assert_is_op_input("conv2d", "Input"); + patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), + "conv_bias_mkldnn_fuse"); + conv_bias_pattern(conv_input); + int found_conv_bias_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "handle ConvBias fuse"; + GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, + conv_bias_pattern); // Filter + GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp + GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern); // CONV op + // bias + GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern); + // output + GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern); + // elementwise_add op + GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern); + // Create an ConvBias Node. + OpDesc desc; + std::string conv_bias_i_in = subgraph.at(conv_input)->Name(); + std::string conv_bias_w_in = conv_weight->Name(); + std::string conv_bias_b_in = eltwise_bias->Name(); + std::string conv_bias_out = eltwise_out->Name(); + desc.SetInput("Input", std::vector({conv_bias_i_in})); + desc.SetInput("Filter", std::vector({conv_bias_w_in})); + desc.SetInput("Bias", std::vector({conv_bias_b_in})); + desc.SetOutput("Output", std::vector({conv_bias_out})); + desc.SetType("conv2d"); + for (auto& attr : conv->Op()->GetAttrMap()) { + desc.SetAttr(attr.first, attr.second); + } + auto conv_bias_node = g->CreateOpNode(&desc); // OpDesc will be copied. + GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out}); + PADDLE_ENFORCE(subgraph.count(conv_input)); + IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node); + IR_NODE_LINK_TO(conv_weight, conv_bias_node); + IR_NODE_LINK_TO(eltwise_bias, conv_bias_node); + IR_NODE_LINK_TO(conv_bias_node, eltwise_out); + found_conv_bias_count++; + }; + gpd(graph.get(), handler); + AddStatis(found_conv_bias_count); + return graph; +} +} // namespace ir +} // namespace framework +} // namespace paddle +REGISTER_PASS(conv_bias_mkldnn_fuse_pass, + paddle::framework::ir::ConvBiasFusePass); diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h new file mode 100644 index 0000000000..187453b2a6 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h @@ -0,0 +1,34 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" +namespace paddle { +namespace framework { +namespace ir { +/* +* Fuse the Conv and Elementwise_add to a ConvBiasOp. +*/ +class ConvBiasFusePass : public FusePassBase { + public: + virtual ~ConvBiasFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc new file mode 100644 index 0000000000..50fc62c173 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" + +#include + +namespace paddle { +namespace framework { +namespace ir { + +void SetOp(ProgramDesc* prog, const std::string& type, + const std::vector& inputs, + const std::vector& outputs) { + auto* op = prog->MutableBlock(0)->AppendOp(); + op->SetType(type); + if (type == "conv2d") { + op->SetAttr("use_mkldnn", true); + op->SetInput("Input", {inputs[0]}); + op->SetInput("Filter", {inputs[1]}); + } else if (type == "elementwise_add") { + op->SetInput("X", {inputs[0]}); + op->SetInput("Y", {inputs[1]}); + } + op->SetOutput("Out", outputs); +} + +// a->OP0->b +// b->OP1->c +// (c, weights)->conv->f +// (f, bias)->elementwise_add->g +ProgramDesc BuildProgramDesc() { + ProgramDesc prog; + for (auto& v : + std::vector({"a", "b", "c", "weights", "bias", "f", "g"})) { + auto* var = prog.MutableBlock(0)->Var(v); + var->SetType(proto::VarType::SELECTED_ROWS); + if (v == "weights" || v == "bias") { + var->SetPersistable(true); + } + } + + SetOp(&prog, "OP0", std::vector({"a"}), + std::vector({"b"})); + SetOp(&prog, "OP1", std::vector({"b"}), + std::vector({"c"})); + SetOp(&prog, "conv2d", std::vector({"c", "weights"}), + std::vector({"f"})); + SetOp(&prog, "elementwise_add", std::vector({"f", "bias"}), + std::vector({"g"})); + + return prog; +} + +TEST(ConvBiasFusePass, basic) { + auto prog = BuildProgramDesc(); + + std::unique_ptr graph(new ir::Graph(prog)); + + auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass"); + + int original_nodes_num = graph->Nodes().size(); + + graph = pass->Apply(std::move(graph)); + + int current_nodes_num = graph->Nodes().size(); + + // Remove 3 Nodes: conv, elementwise_add, conv_out + // Add 1 Node: ConvBias + EXPECT_EQ(original_nodes_num - 2, current_nodes_num); + + // Assert conv_bias op in newly generated graph + int conv_bias_count = 0; + + for (auto* node : graph->Nodes()) { + if (node->IsOp() && node->Op()->Type() == "conv2d") { + if (node->Op()->HasAttr("use_mkldnn")) { + bool use_mkldnn = 
boost::get(node->Op()->GetAttr("use_mkldnn")); + if (use_mkldnn) { + auto names = node->Op()->InputNames(); + if (std::find(names.begin(), names.end(), "Bias") != names.end()) { + conv_bias_count++; + } + } + } + } + } + EXPECT_EQ(conv_bias_count, 1); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(conv_bias_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 46c6a52c09..4be1ead0d4 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -858,6 +858,38 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } +PDNode *patterns::ConvBias::operator()( + paddle::framework::ir::PDNode *conv_input) { + // Create Operators + conv_input->assert_is_op_input("conv2d", "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + auto *eltiwse_op = + pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); + // Create variables + // Filter + auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("conv2d", "Filter"); + // intermediate variable, will be removed in the IR after fuse. + auto *conv_out_var = pattern->NewNode(conv_out_repr()) + ->AsIntermediate() + ->assert_is_only_output_of_op("conv2d") + ->assert_is_op_input("elementwise_add"); + // Bias stored in elementwise_add + auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + // output + auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr()) + ->AsOutput() + ->assert_is_op_output("elementwise_add"); + conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); + eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var}) + .LinksTo({eltwise_out_var}); + return eltwise_out_var; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 508113bf4f..60fb13b4f6 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -540,6 +540,27 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(d_ele_y); PATTERN_DECL_NODE(ele_y); }; + +// Conv with Elementwise_add as bias +// op: conv + elementwise_add +// named nodes: +// conv_input, conv_weight, +// conv_out, conv, +// eltwise_bias, eltwise_out, +// elementwise_add +struct ConvBias : public PatternBase { + ConvBias(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_bias") {} + PDNode* operator()(PDNode* conv_input); + // declare operator node's name + PATTERN_DECL_NODE(conv); + PATTERN_DECL_NODE(eltwise); + // declare variable node's name + PATTERN_DECL_NODE(conv_weight); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(eltwise_bias); + PATTERN_DECL_NODE(eltwise_out); +}; } // namespace patterns // Link two ir::Nodes from each other. 
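// A rough sketch of the graph rewrite this ConvBias pattern enables, using the
// node names declared above (see the handler in conv_bias_mkldnn_fuse_pass.cc
// earlier in this patch):
//
//   before:  conv_out    = conv2d(Input=conv_input, Filter=conv_weight)
//            eltwise_out = elementwise_add(X=conv_out, Y=eltwise_bias)
//
//   after:   eltwise_out = conv2d(Input=conv_input, Filter=conv_weight,
//                                 Bias=eltwise_bias)
//
// The conv and eltwise op nodes and the intermediate conv_out variable are
// removed from the graph, and a single fused conv2d node is linked in their
// place.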
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index 0aa9367bf5..b2bab73d1b 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -74,6 +74,7 @@ class Analyzer : public OrderedRegistry { "seq_concat_fc_fuse_pass", // "fc_fuse_pass", // #ifdef PADDLE_WITH_MKLDNN + "conv_bias_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", // #endif }}; From 7e651c8641f8f197aa127a7eef63c2e8eb403a71 Mon Sep 17 00:00:00 2001 From: whs Date: Thu, 11 Oct 2018 10:09:44 +0800 Subject: [PATCH 06/15] Fix truncated norm (#13785) * Fix truncated normal. * test=develop --- paddle/fluid/operators/truncated_gaussian_random_op.cc | 2 +- paddle/fluid/operators/truncated_gaussian_random_op.cu | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cc b/paddle/fluid/operators/truncated_gaussian_random_op.cc index d854e28039..1e8708f264 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cc @@ -148,7 +148,7 @@ struct TruncatedNormal { T operator()(T value) const { auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return (std::sqrt(2.0) * Erfinv(2 * p - 1) + mean) * std; + return std::sqrt(2.0) * Erfinv(2 * p - 1) * std + mean; } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index ad2a9021bf..5a3510babe 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -42,7 +42,7 @@ struct TruncatedNormal { rng.discard(n); T value = dist(rng); auto p = a_normal_cdf + (b_normal_cdf - a_normal_cdf) * value; - return (std::sqrt(2.0) * erfinvf(2 * p - 1) + mean) * std; + return std::sqrt(2.0) * erfinvf(2 * p - 1) * std + mean; } }; @@ -52,6 +52,7 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* tensor = context.Output("Out"); T* data = tensor->mutable_data(context.GetPlace()); + unsigned int seed = static_cast(context.Attr("seed")); if (seed == 0) { std::random_device rd; From 9b11a175025cce59f0ba362f53f0c39f3bf35490 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Thu, 11 Oct 2018 11:54:59 +0800 Subject: [PATCH 07/15] Revert "[MKLDNN] Pass: Fuse Conv + Bias" --- paddle/fluid/framework/ir/CMakeLists.txt | 6 +- .../ir/conv_bias_mkldnn_fuse_pass.cc | 78 ------------- .../framework/ir/conv_bias_mkldnn_fuse_pass.h | 34 ------ .../ir/conv_bias_mkldnn_fuse_pass_tester.cc | 106 ------------------ .../framework/ir/graph_pattern_detector.cc | 32 ------ .../framework/ir/graph_pattern_detector.h | 21 ---- paddle/fluid/inference/analysis/analyzer.h | 1 - 7 files changed, 2 insertions(+), 276 deletions(-) delete mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc delete mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h delete mode 100644 paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 79390e9321..796ce1f91c 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -30,7 +30,6 @@ pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) pass_library(fc_fuse_pass inference) if (WITH_MKLDNN) - 
pass_library(conv_bias_mkldnn_fuse_pass inference) pass_library(conv_relu_mkldnn_fuse_pass inference) endif () pass_library(attention_lstm_fuse_pass inference) @@ -53,7 +52,6 @@ cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_r cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_fc_fuse_pass SRCS fc_fuse_pass_tester.cc DEPS fc_fuse_pass framework_proto) -if(WITH_MKLDNN) - cc_test(test_conv_bias_mkldnn_fuse_pass SRCS conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass) +if (WITH_MKLDNN) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) -endif() +endif () diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc deleted file mode 100644 index d0bd09a4f6..0000000000 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" -#include -#include -#include "paddle/fluid/platform/enforce.h" -namespace paddle { -namespace framework { -namespace ir { -std::unique_ptr ConvBiasFusePass::ApplyImpl( - std::unique_ptr graph) const { - PADDLE_ENFORCE(graph.get()); - FusePassBase::Init("conv_bias_mkldnn_fuse", graph.get()); - GraphPatternDetector gpd; - auto* conv_input = gpd.mutable_pattern() - ->NewNode("conv_bias_mkldnn_fuse/conv_input") - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), - "conv_bias_mkldnn_fuse"); - conv_bias_pattern(conv_input); - int found_conv_bias_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - VLOG(4) << "handle ConvBias fuse"; - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, - conv_bias_pattern); // Filter - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, conv_bias_pattern); // tmp - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, conv_bias_pattern); // CONV op - // bias - GET_IR_NODE_FROM_SUBGRAPH(eltwise_bias, eltwise_bias, conv_bias_pattern); - // output - GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_bias_pattern); - // elementwise_add op - GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_bias_pattern); - // Create an ConvBias Node. 
- OpDesc desc; - std::string conv_bias_i_in = subgraph.at(conv_input)->Name(); - std::string conv_bias_w_in = conv_weight->Name(); - std::string conv_bias_b_in = eltwise_bias->Name(); - std::string conv_bias_out = eltwise_out->Name(); - desc.SetInput("Input", std::vector({conv_bias_i_in})); - desc.SetInput("Filter", std::vector({conv_bias_w_in})); - desc.SetInput("Bias", std::vector({conv_bias_b_in})); - desc.SetOutput("Output", std::vector({conv_bias_out})); - desc.SetType("conv2d"); - for (auto& attr : conv->Op()->GetAttrMap()) { - desc.SetAttr(attr.first, attr.second); - } - auto conv_bias_node = g->CreateOpNode(&desc); // OpDesc will be copied. - GraphSafeRemoveNodes(graph.get(), {conv, eltwise, conv_out}); - PADDLE_ENFORCE(subgraph.count(conv_input)); - IR_NODE_LINK_TO(subgraph.at(conv_input), conv_bias_node); - IR_NODE_LINK_TO(conv_weight, conv_bias_node); - IR_NODE_LINK_TO(eltwise_bias, conv_bias_node); - IR_NODE_LINK_TO(conv_bias_node, eltwise_out); - found_conv_bias_count++; - }; - gpd(graph.get(), handler); - AddStatis(found_conv_bias_count); - return graph; -} -} // namespace ir -} // namespace framework -} // namespace paddle -REGISTER_PASS(conv_bias_mkldnn_fuse_pass, - paddle::framework::ir::ConvBiasFusePass); diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h deleted file mode 100644 index 187453b2a6..0000000000 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -#pragma once -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/pass.h" -namespace paddle { -namespace framework { -namespace ir { -/* -* Fuse the Conv and Elementwise_add to a ConvBiasOp. -*/ -class ConvBiasFusePass : public FusePassBase { - public: - virtual ~ConvBiasFusePass() {} - - protected: - std::unique_ptr ApplyImpl(std::unique_ptr graph) const; -}; -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc deleted file mode 100644 index 50fc62c173..0000000000 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass_tester.cc +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h" - -#include - -namespace paddle { -namespace framework { -namespace ir { - -void SetOp(ProgramDesc* prog, const std::string& type, - const std::vector& inputs, - const std::vector& outputs) { - auto* op = prog->MutableBlock(0)->AppendOp(); - op->SetType(type); - if (type == "conv2d") { - op->SetAttr("use_mkldnn", true); - op->SetInput("Input", {inputs[0]}); - op->SetInput("Filter", {inputs[1]}); - } else if (type == "elementwise_add") { - op->SetInput("X", {inputs[0]}); - op->SetInput("Y", {inputs[1]}); - } - op->SetOutput("Out", outputs); -} - -// a->OP0->b -// b->OP1->c -// (c, weights)->conv->f -// (f, bias)->elementwise_add->g -ProgramDesc BuildProgramDesc() { - ProgramDesc prog; - for (auto& v : - std::vector({"a", "b", "c", "weights", "bias", "f", "g"})) { - auto* var = prog.MutableBlock(0)->Var(v); - var->SetType(proto::VarType::SELECTED_ROWS); - if (v == "weights" || v == "bias") { - var->SetPersistable(true); - } - } - - SetOp(&prog, "OP0", std::vector({"a"}), - std::vector({"b"})); - SetOp(&prog, "OP1", std::vector({"b"}), - std::vector({"c"})); - SetOp(&prog, "conv2d", std::vector({"c", "weights"}), - std::vector({"f"})); - SetOp(&prog, "elementwise_add", std::vector({"f", "bias"}), - std::vector({"g"})); - - return prog; -} - -TEST(ConvBiasFusePass, basic) { - auto prog = BuildProgramDesc(); - - std::unique_ptr graph(new ir::Graph(prog)); - - auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass"); - - int original_nodes_num = graph->Nodes().size(); - - graph = pass->Apply(std::move(graph)); - - int current_nodes_num = graph->Nodes().size(); - - // Remove 3 Nodes: conv, elementwise_add, conv_out - // Add 1 Node: ConvBias - EXPECT_EQ(original_nodes_num - 2, current_nodes_num); - - // Assert conv_bias op in newly generated graph - int conv_bias_count = 0; - - for (auto* node : graph->Nodes()) { - if (node->IsOp() && node->Op()->Type() == "conv2d") { - if (node->Op()->HasAttr("use_mkldnn")) { - bool use_mkldnn = boost::get(node->Op()->GetAttr("use_mkldnn")); - if (use_mkldnn) { - auto names = node->Op()->InputNames(); - if (std::find(names.begin(), names.end(), "Bias") != names.end()) { - conv_bias_count++; - } - } - } - } - } - EXPECT_EQ(conv_bias_count, 1); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -USE_PASS(conv_bias_mkldnn_fuse_pass); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index a8364cc05f..8625b562e7 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -964,38 +964,6 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( return ele_add_grad; } -PDNode *patterns::ConvBias::operator()( - paddle::framework::ir::PDNode *conv_input) { - // Create Operators - conv_input->assert_is_op_input("conv2d", "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); - auto *eltiwse_op = - pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); - // Create variables - // Filter - auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) - ->AsInput() - ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Filter"); - // intermediate variable, will be removed in the IR after fuse. 
- auto *conv_out_var = pattern->NewNode(conv_out_repr()) - ->AsIntermediate() - ->assert_is_only_output_of_op("conv2d") - ->assert_is_op_input("elementwise_add"); - // Bias stored in elementwise_add - auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) - ->AsInput() - ->assert_is_op_input("elementwise_add", "Y"); - // output - auto *eltwise_out_var = pattern->NewNode(eltwise_out_repr()) - ->AsOutput() - ->assert_is_op_output("elementwise_add"); - conv_op->LinksFrom({conv_input, conv_weight_var}).LinksTo({conv_out_var}); - eltiwse_op->LinksFrom({conv_out_var, eltwise_bias_var}) - .LinksTo({eltwise_out_var}); - return eltwise_out_var; -} - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 9dfd7046ca..cdd6413d96 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -578,27 +578,6 @@ struct ElewiseAddActInplaceGrad : public PatternBase { PATTERN_DECL_NODE(d_ele_y); PATTERN_DECL_NODE(ele_y); }; - -// Conv with Elementwise_add as bias -// op: conv + elementwise_add -// named nodes: -// conv_input, conv_weight, -// conv_out, conv, -// eltwise_bias, eltwise_out, -// elementwise_add -struct ConvBias : public PatternBase { - ConvBias(PDPattern* pattern, const std::string& name_scope) - : PatternBase(pattern, name_scope, "conv_bias") {} - PDNode* operator()(PDNode* conv_input); - // declare operator node's name - PATTERN_DECL_NODE(conv); - PATTERN_DECL_NODE(eltwise); - // declare variable node's name - PATTERN_DECL_NODE(conv_weight); - PATTERN_DECL_NODE(conv_out); - PATTERN_DECL_NODE(eltwise_bias); - PATTERN_DECL_NODE(eltwise_out); -}; } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h index e7d9cb8994..765145cb7d 100644 --- a/paddle/fluid/inference/analysis/analyzer.h +++ b/paddle/fluid/inference/analysis/analyzer.h @@ -76,7 +76,6 @@ class Analyzer : public OrderedRegistry { "conv_bn_fuse_pass", // "conv_eltwiseadd_bn_fuse_pass", // #ifdef PADDLE_WITH_MKLDNN - "conv_bias_mkldnn_fuse_pass", // "conv_relu_mkldnn_fuse_pass", // #endif }}; From 63b2e98f3d8783624016f56e0000eabf3dbcd02f Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Wed, 10 Oct 2018 13:22:43 +0800 Subject: [PATCH 08/15] Explain LoD and a few other concepts test=develop --- paddle/fluid/pybind/pybind.cc | 45 +++++++++++++++++++++++++++- python/paddle/fluid/layers/io.py | 6 +++- python/paddle/fluid/layers/tensor.py | 2 +- 3 files changed, 50 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 311cd94460..a91894ba89 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -157,7 +157,50 @@ PYBIND11_PLUGIN(core) { .def("_get_double_element", TensorGetElement) .def("_dtype", [](Tensor &self) { return ToDataType(self.type()); }); - py::class_(m, "LoDTensor") + py::class_(m, "LoDTensor", R"DOC( + LoDTensor is a Tensor with optional LoD information. + + np.array(lod_tensor) can convert LoDTensor to numpy array. + lod_tensor.lod() can retrieve the LoD information. + + LoD is short for Level of Details and is usually used for varied sequence + length. You can skip the following comment if you don't need optional LoD. + + For example: + A LoDTensor X can look like the example below. It contains 2 sequences. 
+ The first has length 2 and the second has length 3, as described by x.lod. + + The first tensor dimension 6=2+3 is calculated from LoD if it's available. + It means the total number of sequence element. In X, each element has 2 + columns, hence [6, 2]. + + x.lod = [[2, 3]] + x.data = [[1, 2], [3, 4], + [5, 6], [7, 8], [9, 10], [11, 12]] + x.shape = [6, 2] + + LoD can have multiple levels (for example, a paragraph can have multiple + sentences and a sentence can have multiple words). In the following + LodTensor Y, the lod_level is 2. It means there are 2 sequence, the + first sequence length is 2 (has 2 sub-sequences), the second one's + length is 1. The first sequence's 2 sub-sequences have length 2 and 2, + respectively. And the second sequence's 1 sub-sequence has length 3. + + y.lod = [[2 1], [2 2 3]] + y.shape = [2+2+3, ...] + + Note: + In above description, LoD is length-based. In Paddle internal + implementation, lod is offset-based. Hence, internally, + y.lod is represented as [[0, 2, 3], [0, 2, 4, 7]] (length-based + equivlent would be [[2-0, 3-2], [2-0, 4-2, 7-4]]). + + Sometimes LoD is called recursive_sequence_length to be more + self-explanatory. In this case, it must be length-based. Due to history + reasons. when LoD is called lod in public API, it might be offset-based. + Users should be careful about it. + + )DOC") .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) .def("__init__", diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 25fde782b7..a06cd4982f 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -56,7 +56,11 @@ def data(name, Args: name(str): The name/alias of the function shape(list): Tuple declaring the shape. - append_batch_size(bool): Whether or not to append the data as a batch. + append_batch_size(bool): + 1. If true, it prepends -1 to the shape. + For example if shape=[1], the resulting shape is [-1, 1]. + 2. If shape contains -1, such as shape=[1, -1], + append_batch_size will be enforced to be be False (ineffective). dtype(int|float): The type of data : float32, float_16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 44b92af7ac..9c6a2112a6 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -100,7 +100,7 @@ def create_global_var(shape, force_cpu=False, name=None): """ - Create a new variable in the global block(block 0). + Create a new tensor variable with value in the global block(block 0). 
Args: shape(list[int]): shape of the variable From 5428cb9908740f2581112777e55a3f118b5b93d3 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 12 Oct 2018 09:25:14 +0800 Subject: [PATCH 09/15] Profiler support merge data of all thread (#13811) * profiler infor merge thread statistic information * update profiler * fix bug * add merge thread msg to report * optimize report * statistic the time of ops in each thread but not all * optimize report format * optimize profile report * optimize profile report test=develop --- paddle/fluid/platform/profiler.cc | 65 ++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 652a6ec7a4..612f3bc0e7 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -276,7 +276,7 @@ struct EventItem { // Print results void PrintProfiler(const std::vector>& events_table, const std::string& sorted_domain, const size_t name_width, - const size_t data_width, double total) { + const size_t data_width, bool merge_thread) { // Output header information std::cout << "\n------------------------->" << " Profiling Report " @@ -292,6 +292,10 @@ void PrintProfiler(const std::vector>& events_table, PADDLE_THROW("Invalid profiler state", g_state); } + if (merge_thread) { + std::cout << "Note! This Report merge all thread info into one." + << std::endl; + } std::cout << "Place: " << place << std::endl; std::cout << "Time unit: ms" << std::endl; std::cout << "Sorted by " << sorted_domain @@ -312,8 +316,7 @@ void PrintProfiler(const std::vector>& events_table, << std::setw(data_width) << event_item.min_time << std::setw(data_width) << event_item.max_time << std::setw(data_width) << event_item.ave_time - << std::setw(data_width) << event_item.total_time / total - << std::endl; + << std::setw(data_width) << event_item.ratio << std::endl; } } std::cout << std::endl; @@ -321,8 +324,10 @@ void PrintProfiler(const std::vector>& events_table, // Parse the event list and output the profiling report void ParseEvents(const std::vector>& events, + bool merge_thread, EventSortingKey sorted_by = EventSortingKey::kDefault) { if (g_state == ProfilerState::kDisabled) return; + if (merge_thread && events.size() < 2) return; std::string sorted_domain; std::function sorted_func; @@ -361,34 +366,55 @@ void ParseEvents(const std::vector>& events, sorted_domain = "event first end time"; } + const std::vector>* analyze_events; + std::vector> merged_events_list; + if (merge_thread) { + std::vector merged_events; + for (int i = 0; i < events.size(); ++i) { + for (int j = 0; j < events[i].size(); ++j) { + merged_events.push_back(events[i][j]); + } + } + merged_events_list.push_back(merged_events); + analyze_events = &merged_events_list; + } else { + analyze_events = &events; + } + std::vector> events_table; size_t max_name_width = 0; - double total = 0.; // the total time - for (size_t i = 0; i < events.size(); i++) { + for (size_t i = 0; i < (*analyze_events).size(); i++) { + double total = 0.; // the total time in one thread std::list pushed_events; std::vector event_items; std::unordered_map event_idx; - for (size_t j = 0; j < events[i].size(); j++) { - if (events[i][j].type() == EventType::kPushRange) { - pushed_events.push_back(events[i][j]); - } else if (events[i][j].type() == EventType::kPopRange) { + for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { + if ((*analyze_events)[i][j].type() == EventType::kPushRange) { + 
pushed_events.push_back((*analyze_events)[i][j]); + } else if ((*analyze_events)[i][j].type() == EventType::kPopRange) { std::list::reverse_iterator rit = pushed_events.rbegin(); while (rit != pushed_events.rend() && - rit->name() != events[i][j].name()) { + rit->name() != (*analyze_events)[i][j].name()) { ++rit; } if (rit != pushed_events.rend()) { double event_time = (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll) - ? rit->CudaElapsedMs(events[i][j]) - : rit->CpuElapsedMs(events[i][j]); + ? rit->CudaElapsedMs((*analyze_events)[i][j]) + : rit->CpuElapsedMs((*analyze_events)[i][j]); total += event_time; - std::string event_name = - "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); - max_name_width = std::max(max_name_width, event_name.size()); + std::string event_name; + if (merge_thread) { + event_name = rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + } else { + event_name = "thread" + std::to_string(rit->thread_id()) + "::" + + rit->name(); + max_name_width = std::max(max_name_width, event_name.size()); + } if (event_idx.find(event_name) == event_idx.end()) { event_idx[event_name] = event_items.size(); @@ -413,7 +439,7 @@ void ParseEvents(const std::vector>& events, pushed_events.erase((++rit).base()); } else { LOG(WARNING) << "Cannot find the push marker of event \'" - << events[i][j].name() + << (*analyze_events)[i][j].name() << "\', which will be ignored in profiling report."; } } @@ -421,6 +447,7 @@ void ParseEvents(const std::vector>& events, // average time for (auto& item : event_items) { item.ave_time = item.total_time / item.calls; + item.ratio = item.total_time / total; } // sort if (sorted_by != EventSortingKey::kDefault) { @@ -438,7 +465,8 @@ void ParseEvents(const std::vector>& events, } // Print report - PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, total); + PrintProfiler(events_table, sorted_domain, max_name_width + 4, 12, + merge_thread); } void DisableProfiler(EventSortingKey sorted_key, @@ -449,7 +477,8 @@ void DisableProfiler(EventSortingKey sorted_key, Mark("_stop_profiler_", nullptr); std::vector> all_events = GetAllEvents(); - ParseEvents(all_events, sorted_key); + ParseEvents(all_events, true, sorted_key); + ParseEvents(all_events, false, sorted_key); ResetProfiler(); DeviceTracer* tracer = GetDeviceTracer(); if (tracer->IsEnabled()) { From 5f2e837847db9fff219333e03f4867abaa75768c Mon Sep 17 00:00:00 2001 From: Dun Date: Fri, 12 Oct 2018 13:24:57 +0800 Subject: [PATCH 10/15] optimize depthwise conv by register memory (#13778) * optimize depthwise conv by register memory * test=develop --- paddle/fluid/operators/math/depthwise_conv.cu | 275 +++++++++++++----- 1 file changed, 210 insertions(+), 65 deletions(-) diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 3be3899123..66d37c3bf3 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -46,17 +46,20 @@ __forceinline__ __device__ unsigned warp_id() { return ret; } +#define ARG_DEFINE_KernelDepthwiseConv \ + const T *const input_data, const T *const filter_data, const int batch_size, \ + const int output_channels, const int output_height, \ + const int output_width, const int input_channels, \ + const int input_height, const int input_width, \ + const int filter_multiplier, const int filter_height, \ + const int filter_width, const int stride_height, const int stride_width, \ + const int padding_height, const 
int padding_width, \ + const int dilate_height, const int dilate_width, T *const output_data + // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. template -__device__ __inline__ void KernelDepthwiseConv( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { +__device__ __inline__ void KernelDepthwiseConv(ARG_DEFINE_KernelDepthwiseConv) { for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { const int batch = blockIdx.y; @@ -97,42 +100,105 @@ __device__ __inline__ void KernelDepthwiseConv( } } -template -__global__ void KernelDepthwiseConvSp( - const T* const input_data, const T* const filter_data, const int batch_size, - const int output_channels, const int output_height, const int output_width, - const int input_channels, const int input_height, const int input_width, - const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const output_data) { - if (c_filter_multiplier == 0) - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, filter_multiplier, - filter_height, filter_width, stride_height, - stride_width, padding_height, padding_width, - dilate_height, dilate_width, output_data); +template +__device__ __inline__ void KernelDepthwiseConvCFilter( + ARG_DEFINE_KernelDepthwiseConv) { + const int kWeghtSize = c_filter * c_filter; + T r_weight[kWeghtSize]; + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + const T* weight = filter_data + c_out * c_filter * c_filter; + for (int i = 0; i < c_filter * c_filter; i++) r_weight[i] = weight[i]; - else - KernelDepthwiseConv(input_data, filter_data, batch_size, output_channels, - output_height, output_width, input_channels, - input_height, input_width, c_filter_multiplier, - filter_height, filter_height, c_stride, c_stride, - padding_height, padding_width, dilate_height, - dilate_width, output_data); + for (int w_out = threadIdx.x; w_out < output_width; w_out += blockDim.x) { + for (int h_out = threadIdx.y; h_out < output_height; h_out += blockDim.y) { + const int batch = blockIdx.y; + const int c_out = blockIdx.x; + + const int c_in = c_out / filter_multiplier; + T value = 0; + const int h_in_start = -padding_height + h_out * stride_height; + const int w_in_start = -padding_width + w_out * stride_width; + const int h_in_end = h_in_start + c_filter * dilate_height; + const int w_in_end = w_in_start + c_filter * dilate_width; + + const int in_offset = + ((batch * input_channels + c_in) * input_height) * input_width; + + const int h_end = h_in_end < input_height ? h_in_end : input_height; + const int w_end = w_in_end < input_width ? w_in_end : input_width; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int w_start = w_in_start > 0 ? 
w_in_start : 0; + + for (int h_in = h_in_start, h_f = 0; h_f < c_filter; + h_in += dilate_height, h_f++) { + for (int w_in = w_in_start, w_f = 0; w_f < c_filter; + w_in += dilate_width, w_f++) { + if (h_in >= 0 && h_in < input_height && w_in >= 0 && + w_in < input_width) { + const int offset = in_offset + h_in * input_width + w_in; + value += r_weight[h_f * c_filter + w_f] * input_data[offset]; + } + } + } + int index = + ((batch * gridDim.x + c_out) * output_height + h_out) * output_width + + w_out; + output_data[index] = value; + } + } +} + +template +__global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { + if (c_filter_multiplier == 0) { + if (c_filter == -1) + KernelDepthwiseConv( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, output_data); + else + KernelDepthwiseConvCFilter( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + filter_multiplier, filter_height, filter_width, stride_height, + stride_width, padding_height, padding_width, dilate_height, + dilate_width, output_data); + } else { + if (c_filter == -1) + KernelDepthwiseConv(input_data, filter_data, batch_size, + output_channels, output_height, output_width, + input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, + c_stride, c_stride, padding_height, padding_width, + dilate_height, dilate_width, output_data); + else + KernelDepthwiseConvCFilter( + input_data, filter_data, batch_size, output_channels, output_height, + output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_height, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + output_data); + } } // CUDA kernel to compute the depthwise convolution backprop w.r.t input. 
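// In summary, the input-gradient kernels below invert the forward indexing:
// each input element (batch, c_in, h_in, w_in) accumulates contributions from
// its filter_multiplier output channels c_out = c_in * filter_multiplier + i
// and from every filter tap (h_f, w_f) for which some in-range output position
// (h_out, w_out) satisfies
//   h_in == h_out * stride_height - padding_height + h_f * dilate_height
//   w_in == w_out * stride_width  - padding_width  + w_f * dilate_width
// each match adding filter[c_out][h_f][w_f] * output_grad[batch][c_out][h_out][w_out]
// to input_grad[batch][c_in][h_in][w_in]. The CFilter variant additionally
// caches the (reversed) filter weights in registers, mirroring the forward
// optimization above.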
+#define ARG_DEFINE_KernelDepthwiseConvInputGrad \ + const T *const output_grad_data, const T *const filter_data, \ + const int batch_size, const int output_channels, \ + const int output_height, const int output_width, \ + const int input_channels, const int input_height, const int input_width, \ + const int filter_multiplier, const int filter_height, \ + const int filter_width, const int stride_height, const int stride_width, \ + const int padding_height, const int padding_width, \ + const int dilate_height, const int dilate_width, \ + T *const input_grad_data + template __device__ __inline__ void KernelDepthwiseConvInputGrad( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { + ARG_DEFINE_KernelDepthwiseConvInputGrad) { for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { const int batch = blockIdx.y; @@ -184,15 +250,67 @@ __device__ __inline__ void KernelDepthwiseConvInputGrad( } } -template +template +__device__ __inline__ void KernelDepthwiseConvInputGradCFilter( + ARG_DEFINE_KernelDepthwiseConvInputGrad) { + const int kWeghtSize = c_filter * c_filter * c_filter_multiplier + 1; + T r_weight[kWeghtSize]; + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + const T* weight = filter_data + c_out * c_filter * c_filter; + for (int i = 0; i < c_filter * c_filter; i++) + r_weight[i + c_i * c_filter * c_filter] = + weight[c_filter * c_filter - i - 1]; + } + + for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { + for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { + const int batch = blockIdx.y; + const int c_in = blockIdx.x; + + int h_out_start = h_in - (c_filter - 1) * dilate_height + padding_height; + + int w_out_start = w_in - (c_filter - 1) * dilate_width + padding_width; + + T value = 0; + + for (int c_i = 0; c_i < filter_multiplier; c_i++) { + int c_out = c_in * filter_multiplier + c_i; + for (int h_out = h_out_start, h_f = 0; h_f < c_filter; + h_out += dilate_height, h_f++) { + for (int w_out = w_out_start, w_f = 0; w_f < c_filter; + w_out += dilate_width, w_f++) { + int s_h_out = h_out / stride_height; + int s_w_out = w_out / stride_width; + if (h_out % stride_height == 0 && w_out % stride_width == 0 && + s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && + s_w_out < output_width) { + const int output_grad_offset = + ((batch * output_channels + c_out) * output_height + + s_h_out) * + output_width + + s_w_out; + value += + output_grad_data[output_grad_offset] * + r_weight[h_f * c_filter + w_f + c_i * c_filter * c_filter]; + } + } + } + } + int index = + ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + + w_in; + input_grad_data[index] = value; + } + } +} + +template __global__ void KernelDepthwiseConvInputGradSp( - const T* const output_grad_data, const T* const filter_data, - const int batch_size, const int output_channels, const int output_height, - const int output_width, const 
int input_channels, const int input_height, - const int input_width, const int filter_multiplier, const int filter_height, - const int filter_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, const int dilate_height, - const int dilate_width, T* const input_grad_data) { + ARG_DEFINE_KernelDepthwiseConvInputGrad) { if (c_filter_multiplier == 0) KernelDepthwiseConvInputGrad( output_grad_data, filter_data, batch_size, output_channels, @@ -200,13 +318,20 @@ __global__ void KernelDepthwiseConvInputGradSp( filter_multiplier, filter_height, filter_width, stride_height, stride_width, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); - else + else if (c_filter == -1) KernelDepthwiseConvInputGrad( output_grad_data, filter_data, batch_size, output_channels, output_height, output_width, input_channels, input_height, input_width, c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, padding_height, padding_width, dilate_height, dilate_width, input_grad_data); + else + KernelDepthwiseConvInputGradCFilter( + output_grad_data, filter_data, batch_size, output_channels, + output_height, output_width, input_channels, input_height, input_width, + c_filter_multiplier, filter_height, filter_width, c_stride, c_stride, + padding_height, padding_width, dilate_height, dilate_width, + input_grad_data); } // Cuda kernel to compute the depthwise convolution backprop w.r.t. filter. @@ -325,12 +450,14 @@ class DepthwiseConvFunctor { dim3 threads(std::min(output_width, thread), blocks, 1); dim3 grid(output_channels, batch_size, 1); int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ if (c_filter_multiplier == 0 || \ filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ - KernelDepthwiseConvSp<<>>( \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ + KernelDepthwiseConvSp<<>>( \ input_data, filter_data, batch_size, output_channels, output_height, \ output_width, input_channels, input_height, input_width, \ filter_multiplier, ksize_height, ksize_width, stride_height, \ @@ -338,11 +465,17 @@ class DepthwiseConvFunctor { dilate_width, output_data); \ return; \ } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. check_case(2^n,1) - check_case(0, 0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(0, 0, 3); + check_case(0, 0, 5); + check_case(0, 0, -1); +// NOTE(liangdun): 0,0 for other case +// add other case if needed, e.g. 
check_case(2^n,1) #undef check_case } }; @@ -384,13 +517,15 @@ class DepthwiseConvInputGradFunctor { dim3 grid(input_channels, batch_size, 1); int filter_multiplier = output_channels / input_channels; -#define check_case(c_filter_multiplier, c_stride) \ +#define check_case(c_filter_multiplier, c_stride, c_filter) \ if (c_filter_multiplier == 0 || \ filter_multiplier == c_filter_multiplier && \ - stride_height == stride_width && stride_height == c_stride) { \ + stride_height == stride_width && stride_height == c_stride && \ + (ksize_height == ksize_width && ksize_height == c_filter || \ + c_filter == -1)) { \ KernelDepthwiseConvInputGradSp< \ - T, c_filter_multiplier, \ - c_stride><<>>( \ + T, c_filter_multiplier, c_stride, \ + c_filter><<>>( \ output_grad_data, filter_data, batch_size, output_channels, \ output_height, output_width, input_channels, input_height, \ input_width, filter_multiplier, ksize_height, ksize_width, \ @@ -398,11 +533,21 @@ class DepthwiseConvInputGradFunctor { dilate_height, dilate_width, input_grad_data); \ return; \ } - check_case(1, 1); - check_case(1, 2); - // NOTE(liangdun): 0,0 for other case - // add other case if needed, e.g. check_case(2^n,1) - check_case(0, 0); + check_case(1, 1, 3); + check_case(1, 1, 5); + check_case(1, 1, -1); + check_case(1, 2, 3); + check_case(1, 2, 5); + check_case(1, 2, -1); + check_case(2, 1, 3); + check_case(2, 1, 5); + check_case(2, 1, -1); + check_case(2, 2, 3); + check_case(2, 2, 5); + check_case(2, 2, -1); + check_case(0, 0, -1); +// NOTE(liangdun): 0,0 for other case +// add other case if needed, e.g. check_case(2^n,1) #undef check_case } }; From 228506618b23845792d7f6de74cb6b97cd7bfb13 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 12 Oct 2018 14:41:57 +0800 Subject: [PATCH 11/15] Avoid GetMutable implicitly reset Var Type. This can cause a lot of problem: 1. Wrong operator implementation, Op can get a wrong type without failure. 2. Anytype can be Get without defined in VarType. Also fix wrong STEP_SCOPE usage. test=develop --- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/feed_fetch_method.cc | 3 +-- paddle/fluid/framework/naive_executor.cc | 2 +- paddle/fluid/framework/variable.h | 6 +++++- paddle/fluid/framework/variable_test.cc | 11 ++++++----- python/paddle/fluid/layers/io.py | 6 +++++- python/paddle/fluid/layers/tensor.py | 2 +- python/paddle/fluid/tests/book/test_word2vec.py | 16 ++-------------- 8 files changed, 22 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 70ec6e90a4..a070b8efb8 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -66,7 +66,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 8e1f93c5eb..3e9353f5cf 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,8 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. 
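// The guarded GetMutable this patch introduces boils down to a simple
// contract: the first GetMutable<T>() default-constructs the payload and
// fixes the variable's type, while any later access with a different type
// fails loudly instead of silently resetting the holder. A standalone
// sketch of that contract, with simplified hypothetical types rather than
// the actual paddle::framework::Variable:
#include <memory>
#include <stdexcept>
#include <typeindex>
#include <typeinfo>

class TypedVar {
 public:
  template <typename T>
  T* GetMutable() {
    if (!holder_) {
      holder_.reset(new Holder<T>());  // first use fixes the held type
    } else if (std::type_index(typeid(T)) != holder_->Type()) {
      throw std::runtime_error("variable already holds a different type");
    }
    return static_cast<T*>(holder_->Ptr());
  }

 private:
  struct HolderBase {
    virtual ~HolderBase() = default;
    virtual void* Ptr() = 0;
    virtual std::type_index Type() const = 0;
  };
  template <typename T>
  struct Holder : HolderBase {
    T value;
    void* Ptr() override { return &value; }
    std::type_index Type() const override { return typeid(T); }
  };
  std::unique_ptr<HolderBase> holder_;
};
// A second GetMutable with a different type now throws, which is the
// behavior the updated variable_test.cc in this patch checks for.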
VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = - *(g_feed_value->GetMutable>()); + auto& feed_inputs = *(g_feed_value->GetMutable()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index ba10687d65..2840d503f1 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 067e0c2b83..873e1b20a5 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,8 +38,12 @@ class Variable { template T* GetMutable() { - if (!IsType()) { + if (!holder_) { holder_.reset(new PlaceholderImpl(new T())); + } else { + PADDLE_ENFORCE(IsType(), + "Variable must be type %s, the holding type is %s", + typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index c5c1d215f4..003dcfd3df 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,9 +33,10 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - std::string* s = v->GetMutable(); - *s = "hello"; - - const std::string& ss = v->Get(); - EXPECT_EQ("hello", ss); + try { + v->GetMutable(); + } catch (std::exception& e) { + return; + } + EXPECT_TRUE(false); } diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 25fde782b7..a06cd4982f 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -56,7 +56,11 @@ def data(name, Args: name(str): The name/alias of the function shape(list): Tuple declaring the shape. - append_batch_size(bool): Whether or not to append the data as a batch. + append_batch_size(bool): + 1. If true, it prepends -1 to the shape. + For example if shape=[1], the resulting shape is [-1, 1]. + 2. If shape contains -1, such as shape=[1, -1], + append_batch_size will be enforced to be be False (ineffective). dtype(int|float): The type of data : float32, float_16, int etc type(VarType): The output type. By default it is LOD_TENSOR. lod_level(int): The LoD Level. 0 means the input data is not a sequence. diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 44b92af7ac..9c6a2112a6 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -100,7 +100,7 @@ def create_global_var(shape, force_cpu=False, name=None): """ - Create a new variable in the global block(block 0). + Create a new tensor variable with value in the global block(block 0). 
Args: shape(list[int]): shape of the variable diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 9191f0fc20..1f3a230048 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -17,7 +17,6 @@ from __future__ import print_function import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places -from paddle.fluid.layers.control_flow import ParallelDo import unittest import os import numpy as np @@ -84,18 +83,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - places = get_places() - pd = ParallelDo(places) - with pd.do(): - avg_cost, predict_word = __network__( - list( - map(pd.read_input, [ - first_word, second_word, third_word, forth_word, - next_word - ]))) - pd.write_output(avg_cost) - - avg_cost = fluid.layers.mean(pd()) + raise ValueError('is_parallel=True not implemented') sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) @@ -262,7 +250,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): for use_cuda in (False, True): for is_sparse in (False, True): - for is_parallel in (False, True): + for is_parallel in (False, ): # TODO(paddle-dev): Add parallel test. inject_test_method(use_cuda, is_sparse, is_parallel) if __name__ == '__main__': From efa5bac7ad4f1498a5ee4d340d550ea8b17fe9bf Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 12 Oct 2018 07:08:15 +0000 Subject: [PATCH 12/15] fix demo_ci bug in vis_demo.cc test=develop --- paddle/fluid/inference/api/demo_ci/run.sh | 8 +- .../api/demo_ci/trt_mobilenet_demo.cc | 88 +++++++++++++++ paddle/fluid/inference/api/demo_ci/utils.h | 59 ++++++++++ .../fluid/inference/api/demo_ci/vis_demo.cc | 105 +++--------------- 4 files changed, 164 insertions(+), 96 deletions(-) create mode 100644 paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 76238070cd..65c95f0834 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -100,19 +100,17 @@ for WITH_STATIC_LIB in ON OFF; do rm -rf * cmake .. -DPADDLE_LIB=${PADDLE_ROOT}/build/fluid_install_dir/ \ -DWITH_MKL=$TURN_ON_MKL \ - -DDEMO_NAME=vis_demo \ + -DDEMO_NAME=trt_mobilenet_demo \ -DWITH_GPU=$TEST_GPU_CPU \ -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ -DUSE_TENSORRT=$USE_TENSORRT \ -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR make -j - ./vis_demo \ + ./trt_mobilenet_demo \ --modeldir=$DATA_DIR/mobilenet/model \ --data=$DATA_DIR/mobilenet/data.txt \ - --refer=$DATA_DIR/mobilenet/result.txt \ - --use_gpu=true \ - --use_trt=true + --refer=$DATA_DIR/mobilenet/result.txt fi done set +x diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc new file mode 100644 index 0000000000..4377627859 --- /dev/null +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + * This file contains demo of mobilenet for tensorrt. + */ + +#include +#include // use glog instead of CHECK to avoid importing other paddle header files. +#include +#include + +// #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/inference/demo_ci/utils.h" + +#ifdef PADDLE_WITH_CUDA +DECLARE_double(fraction_of_gpu_memory_to_use); +#endif +DEFINE_string(modeldir, "", "Directory of the inference model."); +DEFINE_string(refer, "", "path to reference result for comparison."); +DEFINE_string( + data, "", + "path of data; each line is a record, format is " + "'\t predictor; + paddle::contrib::MixedRTConfig config; + config.param_file = FLAGS_modeldir + "/__params__"; + config.prog_file = FLAGS_modeldir + "/__model__"; + config.use_gpu = true; + config.device = 0; + config.max_batch_size = 1; + config.fraction_of_gpu_memory = 0.1; // set by yourself + predictor = CreatePaddlePredictor(config); + + VLOG(3) << "begin to process data"; + // Just a single batch of data. + std::string line; + std::ifstream file(FLAGS_data); + std::getline(file, line); + auto record = ProcessALine(line); + file.close(); + + // Inference. + PaddleTensor input; + input.shape = record.shape; + input.data = + PaddleBuf(record.data.data(), record.data.size() * sizeof(float)); + input.dtype = PaddleDType::FLOAT32; + + VLOG(3) << "run executor"; + std::vector output; + predictor->Run({input}, &output, 1); + + VLOG(3) << "output.size " << output.size(); + auto& tensor = output.front(); + VLOG(3) << "output: " << SummaryTensor(tensor); + + // compare with reference result + CheckOutput(FLAGS_refer, tensor); +} + +} // namespace demo +} // namespace paddle + +int main(int argc, char** argv) { + google::ParseCommandLineFlags(&argc, &argv, true); + paddle::demo::Main(); + return 0; +} diff --git a/paddle/fluid/inference/api/demo_ci/utils.h b/paddle/fluid/inference/api/demo_ci/utils.h index cb89906711..4792c97fe7 100644 --- a/paddle/fluid/inference/api/demo_ci/utils.h +++ b/paddle/fluid/inference/api/demo_ci/utils.h @@ -14,6 +14,8 @@ #pragma once #include +#include +#include #include #include #include "paddle/fluid/inference/paddle_inference_api.h" @@ -21,6 +23,11 @@ namespace paddle { namespace demo { +struct Record { + std::vector data; + std::vector shape; +}; + static void split(const std::string& str, char sep, std::vector* pieces) { pieces->clear(); @@ -39,6 +46,58 @@ static void split(const std::string& str, char sep, } } +Record ProcessALine(const std::string& line) { + VLOG(3) << "process a line"; + std::vector columns; + split(line, '\t', &columns); + CHECK_EQ(columns.size(), 2UL) + << "data format error, should be \t"; + + Record record; + std::vector data_strs; + split(columns[0], ' ', &data_strs); + for (auto& d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(columns[1], ' ', &shape_strs); + for (auto& s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + VLOG(3) << "data size " << record.data.size(); + VLOG(3) << "data shape size " << record.shape.size(); + return record; +} + +void CheckOutput(const std::string& referfile, const 
PaddleTensor& output) { + std::string line; + std::ifstream file(referfile); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + VLOG(3) << "predictor output numel " << numel; + VLOG(3) << "reference output numel " << refer.data.size(); + CHECK_EQ(numel, refer.data.size()); + switch (output.dtype) { + case PaddleDType::INT64: { + for (size_t i = 0; i < numel; ++i) { + CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); + } + break; + } + case PaddleDType::FLOAT32: + for (size_t i = 0; i < numel; ++i) { + CHECK_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); + } + break; + } +} + /* * Get a summary of a PaddleTensor content. */ diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc index b9d627b4a5..db61786e2f 100644 --- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc @@ -18,10 +18,6 @@ limitations under the License. */ #include #include // use glog instead of CHECK to avoid importing other paddle header files. -#include -#include - -// #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/inference/demo_ci/utils.h" #ifdef PADDLE_WITH_CUDA @@ -34,99 +30,28 @@ DEFINE_string( "path of data; each line is a record, format is " "'\t data; - std::vector shape; -}; - -void split(const std::string& str, char sep, std::vector* pieces); - -Record ProcessALine(const std::string& line) { - VLOG(3) << "process a line"; - std::vector columns; - split(line, '\t', &columns); - CHECK_EQ(columns.size(), 2UL) - << "data format error, should be \t"; - - Record record; - std::vector data_strs; - split(columns[0], ' ', &data_strs); - for (auto& d : data_strs) { - record.data.push_back(std::stof(d)); - } - - std::vector shape_strs; - split(columns[1], ' ', &shape_strs); - for (auto& s : shape_strs) { - record.shape.push_back(std::stoi(s)); - } - VLOG(3) << "data size " << record.data.size(); - VLOG(3) << "data shape size " << record.shape.size(); - return record; -} - -void CheckOutput(const std::string& referfile, const PaddleTensor& output) { - std::string line; - std::ifstream file(referfile); - std::getline(file, line); - auto refer = ProcessALine(line); - file.close(); - - size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); - VLOG(3) << "predictor output numel " << numel; - VLOG(3) << "reference output numel " << refer.data.size(); - CHECK_EQ(numel, refer.data.size()); - switch (output.dtype) { - case PaddleDType::INT64: { - for (size_t i = 0; i < numel; ++i) { - CHECK_EQ(static_cast(output.data.data())[i], refer.data[i]); - } - break; - } - case PaddleDType::FLOAT32: - for (size_t i = 0; i < numel; ++i) { - CHECK_LT( - fabs(static_cast(output.data.data())[i] - refer.data[i]), - 1e-5); - } - break; - } -} - /* * Use the native fluid engine to inference the demo. 
*/ -void Main(bool use_gpu, bool use_trt) { +void Main(bool use_gpu) { std::unique_ptr predictor; - if (!use_trt) { - NativeConfig config; - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.use_gpu = use_gpu; - config.device = 0; - if (FLAGS_use_gpu) { - config.fraction_of_gpu_memory = 0.1; // set by yourself - } - - VLOG(3) << "init predictor"; - predictor = - CreatePaddlePredictor(config); - } else { - paddle::contrib::MixedRTConfig config; - config.param_file = FLAGS_modeldir + "/__params__"; - config.prog_file = FLAGS_modeldir + "/__model__"; - config.use_gpu = true; - config.device = 0; - config.max_batch_size = 1; + NativeConfig config; + config.param_file = FLAGS_modeldir + "/__params__"; + config.prog_file = FLAGS_modeldir + "/__model__"; + config.use_gpu = use_gpu; + config.device = 0; + if (FLAGS_use_gpu) { config.fraction_of_gpu_memory = 0.1; // set by yourself - predictor = CreatePaddlePredictor(config); } + VLOG(3) << "init predictor"; + predictor = + CreatePaddlePredictor(config); + VLOG(3) << "begin to process data"; // Just a single batch of data. std::string line; @@ -159,12 +84,10 @@ void Main(bool use_gpu, bool use_trt) { int main(int argc, char** argv) { google::ParseCommandLineFlags(&argc, &argv, true); - if (FLAGS_use_gpu && FLAGS_use_trt) { - paddle::demo::Main(true /*use_gpu*/, true); - } else if (FLAGS_use_gpu) { - paddle::demo::Main(true /*use_gpu*/, false); + if (FLAGS_use_gpu) { + paddle::demo::Main(true /*use_gpu*/); } else { - paddle::demo::Main(false /*use_gpu*/, false /*use_tensorrt*/); + paddle::demo::Main(false /*use_gpu*/); } return 0; } From 320c78e16f96dd84e72db45d0a21f79581d9fcc5 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 12 Oct 2018 11:28:38 +0000 Subject: [PATCH 13/15] fix commets test=develop --- paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc index 4377627859..ffb12b5871 100644 --- a/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc +++ b/paddle/fluid/inference/api/demo_ci/trt_mobilenet_demo.cc @@ -18,15 +18,9 @@ limitations under the License. */ #include #include // use glog instead of CHECK to avoid importing other paddle header files. -#include -#include - -// #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/inference/demo_ci/utils.h" -#ifdef PADDLE_WITH_CUDA DECLARE_double(fraction_of_gpu_memory_to_use); -#endif DEFINE_string(modeldir, "", "Directory of the inference model."); DEFINE_string(refer, "", "path to reference result for comparison."); DEFINE_string( @@ -38,7 +32,7 @@ namespace paddle { namespace demo { /* - * Use the native fluid engine to inference the demo. + * Use the tensorrt fluid engine to inference the demo. 
*/ void Main() { std::unique_ptr predictor; From 9c77b65c06af0a2a95e244074f5ad7afcb9dc5e8 Mon Sep 17 00:00:00 2001 From: chengduo Date: Fri, 12 Oct 2018 19:39:44 +0800 Subject: [PATCH 14/15] Fix layers.uniform_random (#13823) * fix layers.uniform_random * fix uniform_random test=develop * remove var type set test=develop * fix similar error test=develop --- paddle/fluid/operators/uniform_random_op.cc | 32 ++++++++++----------- python/paddle/fluid/layers/ops.py | 17 +++++++---- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 763bb40358..aa907595cb 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -23,14 +23,14 @@ namespace operators { template class CPUUniformRandomKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& ctx) const override { - framework::Tensor* tensor = nullptr; + void Compute(const framework::ExecutionContext &ctx) const override { + framework::Tensor *tensor = nullptr; auto out_var = ctx.OutputVar("Out"); if (out_var->IsType()) { tensor = out_var->GetMutable(); } else if (out_var->IsType()) { auto shape = ctx.Attr>("shape"); - auto* selected_rows = out_var->GetMutable(); + auto *selected_rows = out_var->GetMutable(); tensor = selected_rows->mutable_value(); tensor->Resize(framework::make_ddim(shape)); selected_rows->mutable_rows()->reserve(shape[0]); @@ -39,7 +39,7 @@ class CPUUniformRandomKernel : public framework::OpKernel { "uniform_random_op's output only" "supports SelectedRows and LoDTensor"); } - T* data = tensor->mutable_data(ctx.GetPlace()); + T *data = tensor->mutable_data(ctx.GetPlace()); unsigned int seed = static_cast(ctx.Attr("seed")); std::minstd_rand engine; if (seed == 0) { @@ -60,14 +60,14 @@ class UniformRandomOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { + void InferShape(framework::InferShapeContext *ctx) const override { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of UniformRandomOp should not be null."); PADDLE_ENFORCE( ctx->Attrs().Get("min") < ctx->Attrs().Get("max"), "uniform_random's min must less then max"); - auto& shape = ctx->Attrs().Get>("shape"); + auto &shape = ctx->Attrs().Get>("shape"); std::vector temp; temp.reserve(shape.size()); for (auto dim : shape) { @@ -78,7 +78,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const override { + const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( static_cast(ctx.Attr("dtype")), ctx.GetPlace()); @@ -112,17 +112,17 @@ uniform distribution. The random result is in set [min, max]. 
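For reference, the sampling path of CPUUniformRandomKernel above reduces to a std::minstd_rand engine feeding a std::uniform_real_distribution between min and max, with seed == 0 meaning "pick a nondeterministic seed". A small self-contained sketch of that logic, with a hypothetical helper name rather than the operator itself:

#include <random>
#include <vector>

// Fill a buffer with uniformly distributed values the same way the CPU
// kernel does; seed == 0 selects a nondeterministic seed.
std::vector<float> UniformFill(size_t n, float min, float max,
                               unsigned int seed) {
  std::minstd_rand engine;
  if (seed == 0) seed = std::random_device()();
  engine.seed(seed);
  std::uniform_real_distribution<float> dist(min, max);
  std::vector<float> out(n);
  for (auto& v : out) v = dist(engine);
  return out;
}

The operator itself only differs in where the data lives: it writes either into a LoDTensor or into the value tensor of a SelectedRows output, as shown in the kernel above.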
class UniformRandomOpVarTypeInference : public framework::VarTypeInference { public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const override { auto out_var_name = op_desc.Output("Out").front(); - if (block->FindRecursiveOrCreateVar(out_var_name).GetType() == - framework::proto::VarType::SELECTED_ROWS) { - block->FindRecursiveOrCreateVar(out_var_name) - .SetType(framework::proto::VarType::SELECTED_ROWS); - } else { - block->FindRecursiveOrCreateVar(out_var_name) - .SetType(framework::proto::VarType::LOD_TENSOR); + auto var_data_type = static_cast( + boost::get(op_desc.GetAttr("dtype"))); + + auto out_var = block->FindRecursiveOrCreateVar(out_var_name); + if (out_var.GetType() != framework::proto::VarType::SELECTED_ROWS) { + out_var.SetType(framework::proto::VarType::LOD_TENSOR); } + out_var.SetDataType(var_data_type); } }; diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 9a8300524d..1ff40a26f2 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -14,6 +14,8 @@ from __future__ import print_function from .layer_function_generator import generate_layer_fn, generate_layer_fn_noattr +from .. import core +from ..framework import convert_np_dtype_to_dtype_ __activations_noattr__ = [ 'sigmoid', @@ -58,8 +60,11 @@ _uniform_random_ = generate_layer_fn('uniform_random') def uniform_random(shape, dtype=None, min=None, max=None, seed=None): + locals_var = locals().keys() + if not isinstance(dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(dtype) kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val @@ -78,8 +83,9 @@ _hard_shrink_ = generate_layer_fn('hard_shrink') def hard_shrink(x, threshold=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val @@ -99,12 +105,12 @@ _cum_sum_ = generate_layer_fn('cumsum') def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val - return _cum_sum_(**kwargs) @@ -121,8 +127,9 @@ _thresholded_relu_ = generate_layer_fn('thresholded_relu') def thresholded_relu(x, threshold=None): + locals_var = locals().keys() kwargs = dict() - for name in locals(): + for name in locals_var: val = locals()[name] if val is not None: kwargs[name] = val From d852be7c4886678d1fe12a6f37148cf7437d0a82 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Sat, 13 Oct 2018 19:25:23 +0800 Subject: [PATCH 15/15] Revert "Make variable::GetMutable robust" --- paddle/fluid/framework/executor.cc | 2 +- paddle/fluid/framework/feed_fetch_method.cc | 3 ++- paddle/fluid/framework/naive_executor.cc | 2 +- paddle/fluid/framework/variable.h | 6 +----- paddle/fluid/framework/variable_test.cc | 11 +++++------ python/paddle/fluid/tests/book/test_word2vec.py | 16 ++++++++++++++-- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index a070b8efb8..70ec6e90a4 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -66,7 +66,7 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) { } else if (var_type == 
proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3e9353f5cf..8e1f93c5eb 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -27,7 +27,8 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, // be created. VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index; Variable* g_feed_value = scope->Var(var_name); - auto& feed_inputs = *(g_feed_value->GetMutable()); + auto& feed_inputs = + *(g_feed_value->GetMutable>()); if (index >= feed_inputs.size()) { feed_inputs.resize(index + 1); } diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 2840d503f1..ba10687d65 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -37,7 +37,7 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) { } else if (var_type == proto::VarType::FETCH_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::STEP_SCOPES) { - var->GetMutable>(); + var->GetMutable>(); } else if (var_type == proto::VarType::LOD_RANK_TABLE) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { diff --git a/paddle/fluid/framework/variable.h b/paddle/fluid/framework/variable.h index 873e1b20a5..067e0c2b83 100644 --- a/paddle/fluid/framework/variable.h +++ b/paddle/fluid/framework/variable.h @@ -38,12 +38,8 @@ class Variable { template T* GetMutable() { - if (!holder_) { + if (!IsType()) { holder_.reset(new PlaceholderImpl(new T())); - } else { - PADDLE_ENFORCE(IsType(), - "Variable must be type %s, the holding type is %s", - typeid(T).name(), holder_->Type().name()); } return static_cast(holder_->Ptr()); } diff --git a/paddle/fluid/framework/variable_test.cc b/paddle/fluid/framework/variable_test.cc index 003dcfd3df..c5c1d215f4 100644 --- a/paddle/fluid/framework/variable_test.cc +++ b/paddle/fluid/framework/variable_test.cc @@ -33,10 +33,9 @@ TEST(Variable, GetMutable) { const Tensor& tt = v->Get(); EXPECT_EQ(1234, tt.content_); - try { - v->GetMutable(); - } catch (std::exception& e) { - return; - } - EXPECT_TRUE(false); + std::string* s = v->GetMutable(); + *s = "hello"; + + const std::string& ss = v->Get(); + EXPECT_EQ("hello", ss); } diff --git a/python/paddle/fluid/tests/book/test_word2vec.py b/python/paddle/fluid/tests/book/test_word2vec.py index 1f3a230048..9191f0fc20 100644 --- a/python/paddle/fluid/tests/book/test_word2vec.py +++ b/python/paddle/fluid/tests/book/test_word2vec.py @@ -17,6 +17,7 @@ from __future__ import print_function import paddle import paddle.fluid as fluid from paddle.fluid.layers.device import get_places +from paddle.fluid.layers.control_flow import ParallelDo import unittest import os import numpy as np @@ -83,7 +84,18 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True): avg_cost, predict_word = __network__( [first_word, second_word, third_word, forth_word, next_word]) else: - raise ValueError('is_parallel=True not implemented') + places = get_places() + pd = ParallelDo(places) + with pd.do(): + avg_cost, predict_word = __network__( + list( + map(pd.read_input, [ + first_word, second_word, third_word, forth_word, + 
next_word + ]))) + pd.write_output(avg_cost) + + avg_cost = fluid.layers.mean(pd()) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) sgd_optimizer.minimize(avg_cost) @@ -250,7 +262,7 @@ def inject_test_method(use_cuda, is_sparse, is_parallel): for use_cuda in (False, True): for is_sparse in (False, True): - for is_parallel in (False, ): # TODO(paddle-dev): Add parallel test. + for is_parallel in (False, True): inject_test_method(use_cuda, is_sparse, is_parallel) if __name__ == '__main__':