From 7ab5626dee0e08262d0b3eaf8f89f05217cccde8 Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Thu, 13 Sep 2018 14:26:52 +0200
Subject: [PATCH 01/13] - Added initial pass for embedding-fc-lstm

- Added draft of new operator

- Added fused embedding fc lstm files

- First time embedding_fc_lstm_fuse_pass was invoked in
  test_text_classification

- Added Embedding pattern

- Not crashing

- Enabled draft of embedding_fc_lstm pass (does its job)

- First working (SeqCompute only) version

- Removed diagnostic comment

- First enabling of BatchCompute

- Disabling pass for embedding with is_sparse and is_distributed

- Cosmetics

- Style

- Style
---
 paddle/fluid/framework/ir/CMakeLists.txt      |   1 +
 .../ir/embedding_fc_lstm_fuse_pass.cc         | 242 +++++++
 .../ir/embedding_fc_lstm_fuse_pass.h          |  40 ++
 .../framework/ir/graph_pattern_detector.cc    |  18 +
 .../framework/ir/graph_pattern_detector.h     |  17 +
 paddle/fluid/inference/analysis/analyzer.h    |  17 +-
 .../operators/fused_embedding_fc_lstm_op.cc   | 608 ++++++++++++++++++
 .../operators/fused_embedding_fc_lstm_op.h    |  41 ++
 8 files changed, 976 insertions(+), 8 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
 create mode 100644 paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
 create mode 100644 paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
 create mode 100644 paddle/fluid/operators/fused_embedding_fc_lstm_op.h

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 4dca3ceb45..01733fdda2 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -34,6 +34,7 @@ endif()
 pass_library(attention_lstm_fuse_pass inference)
 pass_library(infer_clean_graph_pass inference)
 pass_library(fc_lstm_fuse_pass inference)
+pass_library(embedding_fc_lstm_fuse_pass inference)
 pass_library(fc_gru_fuse_pass inference)
 pass_library(seq_concat_fc_fuse_pass inference)
 
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
new file mode 100644
index 0000000000..38495125c3
--- /dev/null
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -0,0 +1,242 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h"
+#include <string>
+#include "paddle/fluid/framework/lod_tensor.h"
+
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+static int BuildFusion(Graph* graph, const std::string& name_scope,
+                       Scope* scope, bool with_fc_bias) {
+  GraphPatternDetector gpd;
+  auto* pattern = gpd.mutable_pattern();
+
+  // Build pattern
+  PDNode* x = pattern->NewNode(patterns::PDNodeName(name_scope, "x"))
+                  ->assert_is_op_input("lookup_table")
+                  ->assert_var_not_persistable();
+  patterns::Embedding embedding_pattern(pattern, name_scope);
+  // TODO(jczaja): Intermediate can only be used for vars that are not used
+  //               anywhere else, but the lookup table output may go into
+  //               another LSTM (for the reverse direction)
+  auto* embedding_out = embedding_pattern(x);
+  patterns::FC fc_pattern(pattern, name_scope);
+
+  // fc_out is a tmp var, will be removed after fuse, so marked as intermediate.
+  auto* fc_out = fc_pattern(embedding_out, with_fc_bias)->AsIntermediate();
+  patterns::LSTM lstm_pattern(pattern, name_scope);
+  lstm_pattern(fc_out);
+
+  // Create New OpDesc
+  auto embedding_lstm_creator = [&](Node* embedding, Node* W, Node* lstm,
+                                    Node* input, Node* weight_x, Node* weight_h,
+                                    Node* bias, Node* hidden, Node* cell,
+                                    Node* xx, Node* fc_bias) {
+    OpDesc op_desc;
+    op_desc.SetType("fused_embedding_fc_lstm");
+#define SET_IN(Key, node__) op_desc.SetInput(#Key, {node__->Name()});
+    SET_IN(Ids, input);
+    SET_IN(WeightH, weight_h);
+    // Need to have this passed, as we need the Wc data for peephole connections
+    SET_IN(Bias, bias);
+#undef SET_IN
+
+    // Multiply embeddings with Weights
+    PADDLE_ENFORCE(scope);
+    const std::string& embeddings = patterns::UniqueKey("Embeddings");
+    auto* embeddings_var = scope->Var(embeddings);
+    PADDLE_ENFORCE(embeddings_var);
+    auto* embeddings_tensor =
+        embeddings_var->GetMutable<framework::LoDTensor>();
+    // Get WeightX size: [single_embedding, fc_size]
+    // and embedding size: [dict_size, single_embedding]
+    // and create new embeddings of size [dict_size, fc_size]
+    auto* embedding_var = scope->FindVar(W->Name());
+    PADDLE_ENFORCE(embedding_var);
+    const auto& embedding_tensor = embedding_var->Get<framework::LoDTensor>();
+
+    const auto& weightx_tensor =
+        scope->FindVar(weight_x->Name())->Get<framework::LoDTensor>();
+    embeddings_tensor->Resize(
+        {embedding_tensor.dims()[0], weightx_tensor.dims()[1]});
+
+    // Multiply embeddings by WeightX and add bias
+    auto embedding_data = embedding_tensor.data<float>();
+    auto weightx_data = weightx_tensor.data<float>();
+    auto embeddings_data =
+        embeddings_tensor->mutable_data<float>(platform::CPUPlace());
+
+    // Biases to be added into the upcoming GEMM result
+    auto* lstm_bias_var = scope->FindVar(bias->Name());
+    PADDLE_ENFORCE(lstm_bias_var);
+    const auto& lstm_bias_tensor = lstm_bias_var->Get<framework::LoDTensor>();
+
+    auto alpha = 1.0f;
+    auto beta = 1.0f;
+    int m = embedding_tensor.dims()[0];
+    int n = weightx_tensor.dims()[1];
+    int k = embedding_tensor.dims()[1];
+
+    // Copy only gate biases values (only actual bias data, not peephole
+    // weights)
+    std::vector<float> combined_biases(n, 0.0f);
+    memcpy(&combined_biases[0], lstm_bias_tensor.data<float>(),
+           n * sizeof(float));
+
+    if (with_fc_bias) {
+      // Add FC-bias with LSTM-bias (into GEMM result to be)
+      auto* fc_bias_var = scope->FindVar(fc_bias->Name());
+      const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
+      for (int i = 0; i < fc_bias_tensor.numel(); i++) {
+        combined_biases[i] =
+            lstm_bias_tensor.data<float>()[i] + fc_bias_tensor.data<float>()[i];
+      }
+    }
+
+    // broadcast biases
+    std::vector<float> ones(m, 1.0f);
+    paddle::operators::math::CBlas<float>::GEMM(
+        CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1,
+        &combined_biases[0], n, 0.0f, embeddings_data, n);
+
+    // Wx*embeddings
+    paddle::operators::math::CBlas<float>::GEMM(
+        CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
+        embedding_data, k, weightx_data, n, beta, embeddings_data, n);
+    op_desc.SetInput("Embeddings", {embeddings});
+
+    // Create temp variables.
+    const std::string BatchedInput = patterns::UniqueKey("BatchedInput");
+    const std::string BatchedCellPreAct =
+        patterns::UniqueKey("BatchedCellPreAct");
+    const std::string BatchedGate = patterns::UniqueKey("BatchedGate");
+
+    scope->Var(BatchedInput)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedCellPreAct)->GetMutable<framework::LoDTensor>();
+    scope->Var(BatchedGate)->GetMutable<framework::LoDTensor>();
+
+    op_desc.SetInput("H0", {});
+    op_desc.SetInput("C0", {});
+    op_desc.SetOutput("Hidden", {hidden->Name()});
+    op_desc.SetOutput("Cell", {cell->Name()});
+    op_desc.SetOutput("XX", {xx->Name()});
+    op_desc.SetOutput("BatchedGate", {BatchedGate});
+    op_desc.SetOutput("BatchCellPreAct", {BatchedCellPreAct});
+    op_desc.SetOutput("BatchedInput", {BatchedInput});
+    op_desc.SetAttr("is_reverse", lstm->Op()->GetAttr("is_reverse"));
+    op_desc.SetAttr("use_peepholes", lstm->Op()->GetAttr("use_peepholes"));
+    // TODO(TJ): get from attr
+    op_desc.SetAttr("use_seq", true);
+
+    PADDLE_ENFORCE(graph->Has(kParamScopeAttr));
+    auto* scope = graph->Get<Scope*>(kParamScopeAttr);
+#define OP_SET_OUT(x)                            \
+  const std::string x = patterns::UniqueKey(#x); \
+  op_desc.SetOutput(#x, {x});                    \
+  scope->Var(x)->GetMutable<LoDTensor>()
+    OP_SET_OUT(BatchedCell);
+    OP_SET_OUT(BatchedHidden);
+    OP_SET_OUT(ReorderedH0);
+    OP_SET_OUT(ReorderedC0);
+#undef OP_SET_OUT
+
+    auto* op = graph->CreateOpNode(&op_desc);
+    IR_NODE_LINK_TO(input, op);
+    IR_NODE_LINK_TO(weight_x, op);
+    IR_NODE_LINK_TO(weight_h, op);
+    IR_NODE_LINK_TO(bias, op);
+    IR_NODE_LINK_TO(op, hidden);
+    return op;
+  };
+
+  int fusion_count{0};
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_IR_NODE_FROM_SUBGRAPH(lstm, lstm, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Weight, Weight, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Bias, Bias, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Cell, Cell, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(Hidden, Hidden, lstm_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(lookup_table, lookup_table, embedding_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(W, W, embedding_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(w, w, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(mul, mul, fc_pattern);
+
+    // TODO(jczaja): Add support for is_sparse / is_distributed
+    auto is_sparse = boost::get<bool>(lookup_table->Op()->GetAttr("is_sparse"));
+    auto is_distributed =
+        boost::get<bool>(lookup_table->Op()->GetAttr("is_distributed"));
+
+    if (is_sparse == true || is_distributed == true) {
+      return;
+    }
+
+    if (with_fc_bias) {
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, Out, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(fc_bias, bias, fc_pattern);
+      GET_IR_NODE_FROM_SUBGRAPH(elementwise_add, elementwise_add, fc_pattern);
+      embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
+                             Bias, Hidden, Cell, fc_out, fc_bias);
+      // Remove unneeded nodes.
+      // TODO(jczaja): Proper removing of loopup table
+      std::unordered_set<const Node*> marked_nodes(
+          //{lookup_table, mul, lstm, elementwise_add, fc_bias, W});
+          {mul, lstm, elementwise_add, fc_bias});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    } else {
+      GET_IR_NODE_FROM_SUBGRAPH(fc_out, mul_out, fc_pattern);
+      embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
+                             Bias, Hidden, Cell, fc_out, nullptr);
+      // Remove unneeded nodes.
+      // TODO(jczaja): Proper removing of loopup table
+      // std::unordered_set<const Node*> marked_nodes({lookup_table, W, mul,
+      // lstm});
+      std::unordered_set<const Node*> marked_nodes({mul, lstm});
+      GraphSafeRemoveNodes(graph, marked_nodes);
+    }
+
+    ++fusion_count;
+  };
+
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+std::unique_ptr<ir::Graph> EmbeddingFCLSTMFusePass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  FusePassBase::Init(name_scope_, graph.get());
+
+  int fusion_count = BuildFusion(graph.get(), name_scope_, param_scope(),
+                                 true /*with_fc_bias*/);
+
+  AddStatis(fusion_count);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(embedding_fc_lstm_fuse_pass,
+              paddle::framework::ir::EmbeddingFCLSTMFusePass);
diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
new file mode 100644
index 0000000000..e5ad3067ec
--- /dev/null
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h
@@ -0,0 +1,40 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Fuses the Embedding (lookup_table), FC and LSTM ops into a single
+// fused_embedding_fc_lstm operator; the FC bias is folded into the
+// precomputed Embeddings input.
+class EmbeddingFCLSTMFusePass : public FusePassBase {
+ public:
+  virtual ~EmbeddingFCLSTMFusePass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
+
+  const std::string name_scope_{"embedding_fc_lstm_fuse"};
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 6d2c51b0e9..46c6a52c09 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -692,6 +692,24 @@ PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x,
   }
 }
 
+PDNode *patterns::Embedding::operator()(PDNode *x) {
+  x->assert_is_op_input("lookup_table", "Ids");
+  auto *lookup_table_op =
+      pattern->NewNode(lookup_table_repr())->assert_is_op("lookup_table");
+#define NEW_NODE(arg__, io__)                    \
+  auto *arg__ = pattern->NewNode(arg__##_repr()) \
+                    ->assert_is_op_##io__("lookup_table", #arg__);
+
+  NEW_NODE(W, input);
+
+  NEW_NODE(Out, output);
+#undef NEW_NODE
+
+  lookup_table_op->LinksFrom({x, W});
+  lookup_table_op->LinksTo({Out});
+  return Out;
+}
+
 PDNode *patterns::LSTM::operator()(PDNode *x) {
   x->assert_is_op_input("lstm", "Input");
   auto *lstm_op = pattern->NewNode(lstm_repr())->assert_is_op("lstm");
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 69b486c29d..508113bf4f 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -418,6 +418,23 @@ struct FC : public PatternBase {
   PATTERN_DECL_NODE(Out);
 };
 
+// Embedding
+struct Embedding : public PatternBase {
+  Embedding(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "embedding") {}
+
+  PDNode* operator()(PDNode* x);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(lookup_table);
+  // Inputs
+  //
+  PATTERN_DECL_NODE(Ids);
+  PATTERN_DECL_NODE(W);  // embeddings
+  // Outputs
+  PATTERN_DECL_NODE(Out);
+};
+
 struct LSTM : public PatternBase {
   LSTM(PDPattern* pattern, const std::string& name_scope)
       : PatternBase(pattern, name_scope, "lstm") {}
diff --git a/paddle/fluid/inference/analysis/analyzer.h b/paddle/fluid/inference/analysis/analyzer.h
index 9bdbefc07c..0aa9367bf5 100644
--- a/paddle/fluid/inference/analysis/analyzer.h
+++ b/paddle/fluid/inference/analysis/analyzer.h
@@ -64,14 +64,15 @@ class Analyzer : public OrderedRegistry<PassManager> {
   // larger fusion.
   const std::vector<std::string> all_ir_passes_{{
       // Manual update the passes here.
-      "infer_clean_graph_pass",    //
-      "attention_lstm_fuse_pass",  //
-      "fc_lstm_fuse_pass",         //
-      "mul_lstm_fuse_pass",        //
-      "fc_gru_fuse_pass",          //
-      "mul_gru_fuse_pass",         //
-      "seq_concat_fc_fuse_pass",   //
-      "fc_fuse_pass",              //
+      "infer_clean_graph_pass",       //
+      "attention_lstm_fuse_pass",     //
+      "embedding_fc_lstm_fuse_pass",  //
+      "fc_lstm_fuse_pass",            //
+      "mul_lstm_fuse_pass",           //
+      "fc_gru_fuse_pass",             //
+      "mul_gru_fuse_pass",            //
+      "seq_concat_fc_fuse_pass",      //
+      "fc_fuse_pass",                 //
 #ifdef PADDLE_WITH_MKLDNN
       "conv_relu_mkldnn_fuse_pass",  //
 #endif
diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
new file mode 100644
index 0000000000..3c4cc77452
--- /dev/null
+++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
@@ -0,0 +1,608 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/fused_embedding_fc_lstm_op.h"
+#include <string>
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/cpu_vec.h"
+#include "paddle/fluid/operators/math/fc_compute.h"
+#include "paddle/fluid/operators/math/sequence2batch.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+
+void FusedEmbeddingFCLSTMOp::InferShape(
+    framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Embeddings"),
+                 "Assert only one Input(Embeddings) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasInput("WeightH"),
+                 "Assert only one Input(WeightH) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasInput("Bias"), "Assert only one Input(Bias) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasOutput("XX"), "Assert only one Output(XX) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                 "Assert only one Output(Hidden) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                 "Assert only one Output(Cell) of LSTM.");
+  PADDLE_ENFORCE(ctx->HasInput("Ids"),
+                 "Input(Ids) of LookupTableOp should not be null.");
+
+  auto table_dims = ctx->GetInputDim("Embeddings");
+  auto ids_dims = ctx->GetInputDim("Ids");
+  int ids_rank = ids_dims.size();
+
+  PADDLE_ENFORCE_EQ(table_dims.size(), 2);
+  PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
+                    "The last dimension of the 'Ids' tensor must be 1.");
+
+  auto x_dims = ctx->GetInputDim("Ids");
+  PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(Ids)'s rank must be 2.");
+
+  if (ctx->HasInput("H0")) {
+    PADDLE_ENFORCE(ctx->HasInput("C0"),
+                   "Input(Cell) and Input(Hidden) of LSTM should not "
+                   "be null at the same time.");
+    auto h_dims = ctx->GetInputDim("H0");
+    auto c_dims = ctx->GetInputDim("C0");
+    PADDLE_ENFORCE(h_dims == c_dims,
+                   "The dimension of Input(H0) and Input(C0) "
+                   "should be the same.");
+  }
+
+  auto embeddings_dims = ctx->GetInputDim("Embeddings");
+  PADDLE_ENFORCE_EQ(embeddings_dims.size(), 2,
+                    "The rank of Input(Embeddings) should be 2.");
+  //  PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
+  //                    "The first dimension of Input(Embeddings) "
+  //                    "should be %d.",
+  //                    x_dims[1]);
+
+  auto wh_dims = ctx->GetInputDim("WeightH");
+  int frame_size = wh_dims[1] / 4;
+  PADDLE_ENFORCE_EQ(wh_dims.size(), 2,
+                    "The rank of Input(WeightH) should be 2.");
+  PADDLE_ENFORCE_EQ(wh_dims[0], frame_size,
+                    "The first dimension of Input(WeightH) "
+                    "should be %d.",
+                    frame_size);
+  PADDLE_ENFORCE_EQ(wh_dims[1], 4 * frame_size,
+                    "The second dimension of Input(WeightH) "
+                    "should be 4 * %d.",
+                    frame_size);
+
+  auto b_dims = ctx->GetInputDim("Bias");
+  PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
+  PADDLE_ENFORCE_EQ(b_dims[0], 1,
+                    "The first dimension of Input(Bias) should be 1.");
+  PADDLE_ENFORCE_EQ(
+      b_dims[1], (ctx->Attrs().Get<bool>("use_peepholes") ? 7 : 4) * frame_size,
+      "The second dimension of Input(Bias) should be "
+      "7 * %d if enable peepholes connection or"
+      "4 * %d if disable peepholes",
+      frame_size, frame_size);
+
+  framework::DDim out_dims({x_dims[0], frame_size});
+  ctx->SetOutputDim("Hidden", out_dims);
+  ctx->SetOutputDim("Cell", out_dims);
+  ctx->ShareLoD("Ids", "Hidden");
+  ctx->ShareLoD("Ids", "Cell");
+  int xx_width;
+  if (ctx->Attrs().Get<bool>("use_seq")) {
+    xx_width = wh_dims[1];
+  } else {
+    xx_width = x_dims[1] > wh_dims[1] ? wh_dims[1] : x_dims[1];
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedInput"),
+                   "Assert only one Output(BatchedInput) of LSTM.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedHidden"),
+                   "Assert only one Output(BatchedHidden) of LSTM.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchedCell"),
+                   "Assert only one Output(BatchedCell) of LSTM.");
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedH0"),
+                   "Assert only one Output(ReorderedH0) of LSTM");
+    PADDLE_ENFORCE(ctx->HasOutput("ReorderedC0"),
+                   "Assert only one Output(ReorderedC0) of LSTM.");
+    ctx->SetOutputDim("BatchedInput", {x_dims[0], wh_dims[1]});
+    ctx->SetOutputDim("BatchedHidden", out_dims);
+    ctx->SetOutputDim("BatchedCell", out_dims);
+  }
+  ctx->SetOutputDim("XX", {x_dims[0], xx_width});
+  ctx->ShareLoD("Ids", "XX");
+}
+
+framework::OpKernelType FusedEmbeddingFCLSTMOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  return framework::OpKernelType(
+      framework::ToDataType(
+          ctx.Input<framework::LoDTensor>("Embeddings")->type()),
+      ctx.device_context());
+}
+
+void FusedEmbeddingFCLSTMOpMaker::Make() {
+  AddInput("Ids",
+           "An input with type int32 or int64 "
+           "contains the ids to be looked up in W. "
+           "The last dimension size must be 1.");
+  AddInput("Embeddings",
+           "(Tensor) the learnable weights of X."
+           " - The shape is (M x 4D), where M is the dim size of x, D is the "
+           "hidden size. "
+           " - Weight = {W_cx, W_ix, W_fx, W_ox}");
+  AddInput("WeightH",
+           "(Tensor) same as LSTMOp, the learnable hidden-hidden weights."
+           " - The shape is (D x 4D), where D is the hidden size. "
+           " - Weight = {W_ch, W_ih, W_fh, W_oh}");
+  AddInput("Bias",
+           "(Tensor) the learnable weights. Almost same as LSTMOp"
+           "Note: we should add the fc bias into this (1x4D) in bias."
+           "input-hidden bias weight and peephole connections weight if "
+           "setting `use_peepholes` True. "
+           "1. `use_peepholes = False` "
+           " - The shape is (1 x 4D). "
+           " - Bias = {b_c, b_i, b_f, b_o}."
+           "2. `use_peepholes = True` "
+           " - The shape is (1 x 7D). "
+           " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
+  AddInput("H0",
+           "(Tensor, optional) (same as LSTMOp) the initial hidden state is an "
+           "optional "
+           "input. This is a tensor with shape (N x D), where N is the "
+           "batch size and D is the hidden size.")
+      .AsDispensable();
+  AddInput("C0",
+           "(Tensor, optional) (same as LSTMOp) (the initial cell state is an "
+           "optional "
+           "input. This is a tensor with shape (N x D), where N is the "
+           "batch size. `H0` and `C0` can be NULL but only at the same time.")
+      .AsDispensable();
+  AddOutput("Hidden",
+            "(LoDTensor) (same as LSTMOp) the hidden state of LSTM operator. "
+            "The shape is (T x D), and lod is the same with the `Input`.");
+  AddOutput("Cell",
+            "(LoDTensor) (same as LSTMOp) the cell state of LSTM operator. "
+            "The shape is (T x D), and lod is the same with the `Input`.");
+  AddOutput("XX",
+            "(LoDTensor) the result after X * WeightX (size is T x 4D)"
+            " or batched_X (size is T x M), this will be automatically chosen,"
+            " where T is the total time steps in this mini-batch,"
+            " D is the hidden size, M is the dim size of x input.")
+      .AsIntermediate();
+  AddOutput("BatchedInput", "(LoDTensor) (T x 4D).").AsIntermediate();
+  AddOutput("BatchedHidden", "(LoDTensor) (T x D).").AsIntermediate();
+  AddOutput("BatchedCell", "(LoDTensor) (T x D).").AsIntermediate();
+  AddOutput("ReorderedH0", "(LoDTensor) (N x D).").AsIntermediate();
+  AddOutput("ReorderedC0", "(LoDTensor) (N x D).").AsIntermediate();
+  AddAttr<bool>("use_peepholes",
+                "(bool, defalut: True) "
+                "whether to enable diagonal/peephole connections.")
+      .SetDefault(true);
+  AddAttr<bool>("is_reverse",
+                "(bool, defalut: False) "
+                "whether to compute reversed LSTM.")
+      .SetDefault(false);
+  AddAttr<bool>("use_seq",
+                "(bool, defalut: True) "
+                "whether to use seq mode to compute.")
+      .SetDefault(true);
+  AddAttr<std::string>("gate_activation",
+                       "(string, default: sigmoid)"
+                       "The activation for input gate, forget gate and output "
+                       "gate, `sigmoid` by default.")
+      .SetDefault("sigmoid")
+      .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddAttr<std::string>("cell_activation",
+                       "(string, default: tanh)"
+                       "The activation for cell output, `tanh` by defalut.")
+      .SetDefault("tanh")
+      .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddAttr<std::string>("candidate_activation",
+                       "(string, default: tanh)"
+                       "The activation for candidate hidden state, "
+                       "`tanh` by default.")
+      .SetDefault("tanh")
+      .InEnum({"sigmoid", "tanh", "relu", "identity"});
+  AddComment(R"DOC(
+Fused Embedding FC Long-Short Term Memory (LSTM) Operator.
+This operator fuses the embedding lookup and FC projection into the LSTM; for more details, refer to the LSTM op.
+)DOC");
+}
+
+template <typename T>
+class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
+ public:
+#define INIT_VEC_FUNC                                                          \
+  std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand; \
+  auto& act_gate_str = ctx.Attr<std::string>("gate_activation");               \
+  auto& act_cell_str = ctx.Attr<std::string>("cell_activation");               \
+  auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");          \
+  if (platform::jit::MayIUse(platform::jit::avx)) {                            \
+    math::VecActivations<T, platform::jit::avx> act_functor;                   \
+    act_gate = act_functor(act_gate_str);                                      \
+    act_cell = act_functor(act_cell_str);                                      \
+    act_cand = act_functor(act_cand_str);                                      \
+  } else {                                                                     \
+    math::VecActivations<T, platform::jit::isa_any> act_functor;               \
+    act_gate = act_functor(act_gate_str);                                      \
+    act_cell = act_functor(act_cell_str);                                      \
+    act_cand = act_functor(act_cand_str);                                      \
+  }
+
+#define INIT_BASE_INPUT_OUTPUT                        \
+  auto* ids = ctx.Input<LoDTensor>("Ids");            \
+  auto* h0 = ctx.Input<Tensor>("H0");                 \
+  auto* c0 = ctx.Input<Tensor>("C0");                 \
+  auto* embeddings = ctx.Input<Tensor>("Embeddings"); \
+  auto* wh = ctx.Input<Tensor>("WeightH");            \
+  auto* bias = ctx.Input<Tensor>("Bias");             \
+  auto* xx = ctx.Output<LoDTensor>("XX");             \
+  auto* hidden_out = ctx.Output<LoDTensor>("Hidden"); \
+  auto* cell_out = ctx.Output<LoDTensor>("Cell");     \
+  bool is_reverse = ctx.Attr<bool>("is_reverse");     \
+  bool use_peepholes = ctx.Attr<bool>("use_peepholes");
+
+#define INIT_BASE_SIZES                       \
+  auto ids_dims = ids->dims();   /* T x M*/   \
+  auto ids_numel = ids->numel(); /* T x 1*/   \
+  auto wh_dims = wh->dims();     /* D x 4D*/  \
+  const int D = wh_dims[0];                   \
+  const int D2 = D * 2;                       \
+  const int D3 = D * 3;                       \
+  int64_t row_number = embeddings->dims()[0]; \
+  int64_t row_width = embeddings->dims()[1];  \
+  const int D4 = wh_dims[1];
+
+#define INIT_BASE_INPUT_DATAS                                        \
+  const int64_t* ids_data = ids->data<int64_t>();                    \
+  const T* embeddings_data = embeddings->data<T>();                  \
+  const T* wh_data = wh->data<T>();                                  \
+  /* diagonal weight*/                                               \
+  const T* wc_data = bias->data<T>() + D4;                           \
+  /* for peephole only*/                                             \
+  Tensor checked_cell;                                               \
+  T* checked_cell_data = nullptr;                                    \
+  auto place = ctx.GetPlace();                                       \
+  if (use_peepholes) {                                               \
+    /* w_ic * Ct-1, w_fc * Ct-1  ; w_oc * Ct => ih*/                 \
+    checked_cell_data = checked_cell.mutable_data<T>({2, D}, place); \
+  }
+
+/// Compute LSTM
+#define GEMM_WH_ADDON(bs, prev, out)                                           \
+  blas.GEMM(CblasNoTrans, CblasNoTrans, bs, D4, D, static_cast<T>(1), prev, D, \
+            wh_data, D4, static_cast<T>(1), out, D4)
+
+// gates: W_ch, W_ih, W_fh, W_oh
+#define GET_Ct(ct_1, gates, ct)                   \
+  /* C_t = C_t-1 * fgated + cand_gated * igated*/ \
+  act_cand(D, gates, gates);                      \
+  blas.VMUL(D, gates, gates + D, gates + D);      \
+  blas.VMUL(D, ct_1, gates + D2, gates + D2);     \
+  blas.VADD(D, gates + D, gates + D2, ct)
+
+#define GET_Ht(ct, gates, ht)        \
+  /* H_t = act_cell(C_t) * ogated */ \
+  act_cell(D, ct, gates + D2);       \
+  blas.VMUL(D, gates + D2, gates + D3, ht)
+
+#define GET_Ct_NOH0C0(gates, ct)     \
+  /* C_t = igated * cgated*/         \
+  act_gate(D, gates + D, gates + D); \
+  act_cand(D, gates, gates);         \
+  blas.VMUL(D, gates, gates + D, ct)
+
+#define COMPUTE_CtHt_NOH0C0(gates, ct, ht) \
+  GET_Ct_NOH0C0(gates, ct);                \
+  act_gate(D, gates + D3, gates + D3);     \
+  GET_Ht(ct, gates, ht)
+
+#define COMPUTE_CtHt_PEEPHOLE_NOH0C0(gates, ct, ht) \
+  GET_Ct_NOH0C0(gates, ct);                         \
+  /* get outgated, put W_oc * C_t on igated */      \
+  blas.VMUL(D, wc_data + D2, ct, gates + D);        \
+  blas.VADD(D, gates + D, gates + D3, gates + D3);  \
+  act_gate(D, gates + D3, gates + D3);              \
+  GET_Ht(ct, gates, ht)
+
+#define COMPUTE_CtHt(gates, ct_1, ct, ht) \
+  act_gate(D3, gates + D, gates + D);     \
+  GET_Ct(ct_1, gates, ct);                \
+  GET_Ht(ct, gates, ht)
+
+#define COMPUTE_CtHt_PEEPHOLE(gates, ct_1, ct, ht)        \
+  /* get fgated and igated*/                              \
+  blas.VMUL(D, wc_data, ct_1, checked_cell_data);         \
+  blas.VMUL(D, wc_data + D, ct_1, checked_cell_data + D); \
+  blas.VADD(D2, checked_cell_data, gates + D, gates + D); \
+  act_gate(D2, gates + D, gates + D);                     \
+  GET_Ct(ct_1, gates, ct);                                \
+  /* get ogated*/                                         \
+  blas.VMUL(D, wc_data + D2, ct, gates + D);              \
+  blas.VADD(D, gates + D, gates + D3, gates + D3);        \
+  act_gate(D, gates + D3, gates + D3);                    \
+  GET_Ht(ct, gates, ht)
+
+  void SeqCompute(const framework::ExecutionContext& ctx) const {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
+    INIT_BASE_INPUT_OUTPUT
+    INIT_BASE_SIZES
+    INIT_VEC_FUNC
+    INIT_BASE_INPUT_DATAS
+
+    //  std::cout << "====> SeqCompute" << std::endl;
+    auto ids_lod = ids->lod();
+    const int total_T = ids_dims[0];
+    const int N = ids_lod[0].size() - 1;
+    const T* h0_data = h0 ? h0->data<T>() : nullptr;
+    const T* c0_data = c0 ? c0->data<T>() : nullptr;
+    T* xx_data = xx->mutable_data<T>(place);
+    T* h_out_data = hidden_out->mutable_data<T>(place);
+    T* c_out_data = cell_out->mutable_data<T>(place);
+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+
+    for (int64_t i = 0; i < ids_numel; ++i) {
+      PADDLE_ENFORCE_LT(ids_data[i], row_number);
+      PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
+      memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
+             row_width * sizeof(T));
+    }
+
+    int xx_offset = D4;
+    int gate_offset = D;
+    if (is_reverse) {
+      const int offset = (total_T - 1) * D;
+      xx_data = xx_data + offset * 4;
+      h_out_data = h_out_data + offset;
+      c_out_data = c_out_data + offset;
+      xx_offset = -D4;
+      gate_offset = -D;
+    }
+
+#define MOVE_ONE_STEP                    \
+  prev_h_data = h_out_data;              \
+  prev_c_data = c_out_data;              \
+  xx_data = xx_data + xx_offset;         \
+  h_out_data = h_out_data + gate_offset; \
+  c_out_data = c_out_data + gate_offset
+
+#define PROCESS_H0C0_DEFINES                           \
+  int bid = is_reverse ? N - 1 - i : i;                \
+  int seq_len = ids_lod[0][bid + 1] - ids_lod[0][bid]; \
+  const T* prev_c_data = nullptr;                      \
+  const T* prev_h_data = nullptr;                      \
+  int tstart = 0
+
+#define PROCESS_H0C0_PEEPHOLE                                      \
+  PROCESS_H0C0_DEFINES;                                            \
+  if (h0_data) {                                                   \
+    prev_h_data = h0_data + bid * D;                               \
+    prev_c_data = c0_data + bid * D;                               \
+  } else {                                                         \
+    COMPUTE_CtHt_PEEPHOLE_NOH0C0(xx_data, c_out_data, h_out_data); \
+    MOVE_ONE_STEP;                                                 \
+    tstart = 1;                                                    \
+  }
+
+#define PROCESS_H0C0                                      \
+  PROCESS_H0C0_DEFINES;                                   \
+  if (h0_data) {                                          \
+    prev_h_data = h0_data + bid * D;                      \
+    prev_c_data = c0_data + bid * D;                      \
+  } else {                                                \
+    COMPUTE_CtHt_NOH0C0(xx_data, c_out_data, h_out_data); \
+    MOVE_ONE_STEP;                                        \
+    tstart = 1;                                           \
+  }
+
+    if (use_peepholes) {
+      for (int i = 0; i < N; ++i) {
+        PROCESS_H0C0_PEEPHOLE
+        for (int step = tstart; step < seq_len; ++step) {
+          GEMM_WH_ADDON(1, prev_h_data, xx_data);
+          COMPUTE_CtHt_PEEPHOLE(xx_data, prev_c_data, c_out_data, h_out_data);
+          MOVE_ONE_STEP;
+        }
+      }
+    } else {
+      for (int i = 0; i < N; ++i) {
+        PROCESS_H0C0
+        for (int step = tstart; step < seq_len; ++step) {
+          GEMM_WH_ADDON(1, prev_h_data, xx_data);
+          COMPUTE_CtHt(xx_data, prev_c_data, c_out_data, h_out_data);
+          MOVE_ONE_STEP;
+        }
+      }
+    }
+#undef PROCESS_H0C0_DEFINES
+#undef PROCESS_H0C0_PEEPHOLE
+#undef PROCESS_H0C0
+#undef MOVE_ONE_STEP
+  }
+
+  void BatchCompute(const framework::ExecutionContext& ctx) const {
+    using DeviceContext = platform::CPUDeviceContext;
+    INIT_BASE_INPUT_OUTPUT
+    if (ids->lod()[0].size() == 2) {
+      SeqCompute(ctx);
+      return;
+    }
+    INIT_BASE_SIZES
+    INIT_VEC_FUNC
+    INIT_BASE_INPUT_DATAS
+
+    // std::cout << "===> Batch Compute" << std::endl;
+
+    auto* reordered_h0 = ctx.Output<Tensor>("ReorderedH0");
+    auto* reordered_c0 = ctx.Output<Tensor>("ReorderedC0");
+    auto* batched_input = ctx.Output<LoDTensor>("BatchedInput");
+    auto* batched_c_out = ctx.Output<LoDTensor>("BatchedCell");
+    auto* batched_h_out = ctx.Output<LoDTensor>("BatchedHidden");
+    T* xx_data = xx->mutable_data<T>(place);
+    T* batched_input_data = batched_input->mutable_data<T>(place);
+    T* batched_c_out_data = batched_c_out->mutable_data<T>(place);
+    T* batched_h_out_data = batched_h_out->mutable_data<T>(place);
+    hidden_out->mutable_data<T>(place);
+    cell_out->mutable_data<T>(place);
+
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+
+    for (int64_t i = 0; i < ids_numel; ++i) {
+      PADDLE_ENFORCE_LT(ids_data[i], row_number);
+      PADDLE_ENFORCE_GE(ids_data[i], 0, "ids %d", i);
+      memcpy(xx_data + i * row_width, embeddings_data + ids_data[i] * row_width,
+             row_width * sizeof(T));
+    }
+
+    to_batch(dev_ctx, *xx, batched_input, true, is_reverse);
+
+    auto batched_lod = batched_input->lod();
+    const auto& seq_order = batched_lod[2];
+    const int max_bs = seq_order.size();
+    reordered_h0->Resize({max_bs, D});
+    reordered_c0->Resize({max_bs, D});
+
+    int tstart = 0;
+    T* prev_h_data = nullptr;
+    T* prev_c_data = nullptr;
+    if (h0) {
+      // reorder h0, c0
+      T* reordered_h0_data = reordered_h0->mutable_data<T>(place);
+      T* reordered_c0_data = reordered_c0->mutable_data<T>(place);
+      const T* h0_data = h0->data<T>();
+      const T* c0_data = c0->data<T>();
+      prev_h_data = reordered_h0_data;
+      prev_c_data = reordered_c0_data;
+      size_t sz = sizeof(T) * D;
+      for (int i = 0; i < max_bs; ++i) {
+        std::memcpy(reordered_h0_data, h0_data + seq_order[i] * D, sz);
+        std::memcpy(reordered_c0_data, c0_data + seq_order[i] * D, sz);
+        reordered_h0_data += D;
+        reordered_c0_data += D;
+      }
+    } else {
+      // compute without h0, c0
+      T* cur_in_data = batched_input_data;
+      T* cur_h_out_data = batched_h_out_data;
+      T* cur_c_out_data = batched_c_out_data;
+      for (int i = 0; i < max_bs; ++i) {
+        GET_Ct_NOH0C0(cur_in_data, cur_c_out_data);
+        if (use_peepholes) {
+          blas.VMUL(D, wc_data + D2, cur_c_out_data, cur_in_data + D);
+          blas.VADD(D, cur_in_data + D, cur_in_data + D3, cur_in_data + D3);
+        }
+        act_gate(D, cur_in_data + D3, cur_in_data + D3);
+        GET_Ht(cur_c_out_data, cur_in_data, cur_h_out_data);
+        cur_in_data += D4;
+        cur_c_out_data += D;
+        cur_h_out_data += D;
+      }
+      tstart = 1;
+      prev_h_data = batched_h_out_data;
+      prev_c_data = batched_c_out_data;
+    }
+    const auto& batch_starts = batched_lod[0];
+    const int max_seq_len = batch_starts.size() - 1;
+    const int offset = tstart * max_bs * D;
+    batched_input_data = batched_input_data + offset * 4;
+    batched_h_out_data = batched_h_out_data + offset;
+    batched_c_out_data = batched_c_out_data + offset;
+
+#define DEFINE_CUR                        \
+  T* cur_in_data = batched_input_data;    \
+  T* cur_prev_c_data = prev_c_data;       \
+  T* cur_c_out_data = batched_c_out_data; \
+  T* cur_h_out_data = batched_h_out_data
+
+#define MOVE_ONE_BATCH  \
+  cur_in_data += D4;    \
+  cur_prev_c_data += D; \
+  cur_c_out_data += D;  \
+  cur_h_out_data += D
+
+#define MOVE_ONE_STEP                  \
+  prev_c_data = batched_c_out_data;    \
+  prev_h_data = batched_h_out_data;    \
+  batched_c_out_data = cur_c_out_data; \
+  batched_h_out_data = cur_h_out_data; \
+  batched_input_data = cur_in_data
+
+    if (use_peepholes) {
+      for (int step = tstart; step < max_seq_len; ++step) {
+        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
+        DEFINE_CUR;
+        for (int i = 0; i < cur_bs; ++i) {
+          COMPUTE_CtHt_PEEPHOLE(cur_in_data, cur_prev_c_data, cur_c_out_data,
+                                cur_h_out_data);
+          MOVE_ONE_BATCH;
+        }
+        MOVE_ONE_STEP;
+      }
+    } else {
+      for (int step = tstart; step < max_seq_len; ++step) {
+        const int cur_bs = batch_starts[step + 1] - batch_starts[step];
+        GEMM_WH_ADDON(cur_bs, prev_h_data, batched_input_data);
+        DEFINE_CUR;
+        for (int i = 0; i < cur_bs; ++i) {
+          COMPUTE_CtHt(cur_in_data, cur_prev_c_data, cur_c_out_data,
+                       cur_h_out_data);
+          MOVE_ONE_BATCH;
+        }
+        MOVE_ONE_STEP;
+      }
+    }
+#undef MOVE_ONE_STEP
+#undef MOVE_ONE_BATCH
+#undef DEFINE_CUR
+
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batched_h_out->set_lod(batched_lod);
+    to_seq(dev_ctx, *batched_h_out, hidden_out);
+    batched_c_out->set_lod(batched_lod);
+    to_seq(dev_ctx, *batched_c_out, cell_out);
+  }
+
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    if (ctx.Attr<bool>("use_seq")) {
+      SeqCompute(ctx);
+    } else {
+      BatchCompute(ctx);
+    }
+  }
+
+#undef COMPUTE_CtHt_PEEPHOLE
+#undef COMPUTE_CtHt
+#undef GET_Ct_NOH0C0
+#undef COMPUTE_CtHt_NOH0C0
+#undef COMPUTE_CtHt_PEEPHOLE_NOH0C0
+#undef GET_Ht
+#undef GET_Ct
+#undef GEMM_WH_ADDON
+#undef INIT_BASE_INPUT_DATAS
+#undef INIT_BASE_SIZES
+#undef INIT_BASE_INPUT_OUTPUT
+#undef INIT_VEC_FUNC
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fused_embedding_fc_lstm, ops::FusedEmbeddingFCLSTMOp,
+                  ops::FusedEmbeddingFCLSTMOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+
+REGISTER_OP_CPU_KERNEL(fused_embedding_fc_lstm,
+                       ops::FusedEmbeddingFCLSTMKernel<float>,
+                       ops::FusedEmbeddingFCLSTMKernel<double>);
diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.h b/paddle/fluid/operators/fused_embedding_fc_lstm_op.h
new file mode 100644
index 0000000000..2775b2ac04
--- /dev/null
+++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+using Tensor = framework::Tensor;
+
+class FusedEmbeddingFCLSTMOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
+class FusedEmbeddingFCLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
+}  // namespace operators
+}  // namespace paddle

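A note on the folding done in BuildFusion() above: the pass pre-multiplies the lookup table by WeightX and broadcasts the combined LSTM/FC gate biases into the product, so at run time the fused kernel only copies rows out of the new Embeddings table instead of doing a lookup followed by an FC. Below is a minimal standalone sketch of that precomputation (plain C++, no Paddle or BLAS dependency; the function name, toy sizes and values are illustrative only):

    #include <cstdio>
    #include <vector>

    // Fold the FC into the embedding table:
    //   new_emb[m x n] = old_emb[m x k] * weight_x[k x n] + broadcast(bias[n])
    // where m = dict size, k = original embedding width, n = 4 * hidden size.
    std::vector<float> FoldEmbeddingFC(const std::vector<float>& emb,
                                       const std::vector<float>& weight_x,
                                       const std::vector<float>& bias, int m,
                                       int k, int n) {
      std::vector<float> out(static_cast<size_t>(m) * n);
      for (int i = 0; i < m; ++i) {
        for (int j = 0; j < n; ++j) {
          float acc = bias[j];  // broadcast the combined (LSTM gate + FC) bias
          for (int p = 0; p < k; ++p) {
            acc += emb[i * k + p] * weight_x[p * n + j];
          }
          out[i * n + j] = acc;
        }
      }
      return out;
    }

    int main() {
      // Toy sizes: dict size m = 2, embedding width k = 3, 4 * hidden n = 4.
      std::vector<float> emb = {1, 0, 2, 0, 1, 1};
      std::vector<float> wx = {1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0};
      std::vector<float> bias = {0.5f, 0.5f, 0.5f, 0.5f};
      auto folded = FoldEmbeddingFC(emb, wx, bias, 2, 3, 4);
      for (float v : folded) std::printf("%g ", v);
      std::printf("\n");
      return 0;
    }

This mirrors the two CBlas GEMM calls in the pass: the first (against a column of ones, beta = 0) broadcasts combined_biases into embeddings_data, and the second accumulates old_embeddings * WeightX on top with beta = 1. The trade-off is a larger table ([dict_size, 4 * hidden] instead of [dict_size, embedding_width]) in exchange for replacing the per-token lookup + FC with a single row copy in SeqCompute/BatchCompute.
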
From d5114c60b098a3c5f778d48b70d0683b093b49db Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Tue, 25 Sep 2018 11:00:30 +0200
Subject: [PATCH 02/13] - Reviewer suggestions for fused_embedding_fc_lstm_op

---
 .../fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc | 11 ++++++-----
 paddle/fluid/operators/fused_embedding_fc_lstm_op.cc  |  4 ----
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
index 38495125c3..af3f23cbf9 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.h"
+#include <algorithm>
 #include <string>
 #include "paddle/fluid/framework/lod_tensor.h"
 
@@ -98,17 +99,17 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
 
     // Copy only gate biases values (only actual bias data, not peephole
     // weights)
-    std::vector<float> combined_biases(n, 0.0f);
-    memcpy(&combined_biases[0], lstm_bias_tensor.data<float>(),
-           n * sizeof(float));
+    std::vector<float> combined_biases;
+    combined_biases.reserve(n);
+    std::copy_n(lstm_bias_tensor.data<float>(), n,
+                std::back_inserter(combined_biases));
 
     if (with_fc_bias) {
       // Add FC-bias with LSTM-bias (into GEMM result to be)
       auto* fc_bias_var = scope->FindVar(fc_bias->Name());
       const auto& fc_bias_tensor = fc_bias_var->Get<framework::LoDTensor>();
       for (int i = 0; i < fc_bias_tensor.numel(); i++) {
-        combined_biases[i] =
-            lstm_bias_tensor.data<float>()[i] + fc_bias_tensor.data<float>()[i];
+        combined_biases[i] += fc_bias_tensor.data<float>()[i];
       }
     }
 
diff --git a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
index 3c4cc77452..0b917a4036 100644
--- a/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
+++ b/paddle/fluid/operators/fused_embedding_fc_lstm_op.cc
@@ -63,10 +63,6 @@ void FusedEmbeddingFCLSTMOp::InferShape(
   auto embeddings_dims = ctx->GetInputDim("Embeddings");
   PADDLE_ENFORCE_EQ(embeddings_dims.size(), 2,
                     "The rank of Input(Embeddings) should be 2.");
-  //  PADDLE_ENFORCE_EQ(wx_dims[0], x_dims[1],
-  //                    "The first dimension of Input(Embeddings) "
-  //                    "should be %d.",
-  //                    x_dims[1]);
 
   auto wh_dims = ctx->GetInputDim("WeightH");
   int frame_size = wh_dims[1] / 4;

From 910cd415f2147291f5cee83c103c1a1bd84e982f Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Thu, 27 Sep 2018 14:01:11 +0200
Subject: [PATCH 03/13] - Disabled embedding_fc_lstm_fuse by default and
 extended test_text_classification to use the new op

---
 paddle/fluid/inference/api/paddle_inference_api.h   |  2 +-
 .../api/analyzer_text_classification_tester.cc      | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h
index 984358b2bd..77b04bb6f5 100644
--- a/paddle/fluid/inference/api/paddle_inference_api.h
+++ b/paddle/fluid/inference/api/paddle_inference_api.h
@@ -216,7 +216,7 @@ struct AnalysisConfig : public NativeConfig {
   bool enable_ir_optim = true;
   // Manually determine the IR passes to run.
   IrPassMode ir_mode{IrPassMode::kExclude};
-  std::vector<std::string> ir_passes;
+  std::vector<std::string> ir_passes{"embedding_fc_lstm_fuse_pass"};
 
   // NOTE this is just for internal development, please not use it.
   bool _use_mkldnn{false};
diff --git a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
index 340ef152f0..ca19475bda 100644
--- a/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_text_classification_tester.cc
@@ -104,5 +104,18 @@ TEST(Analyzer_Text_Classification, compare) {
   CompareNativeAndAnalysis(cfg, input_slots_all);
 }
 
+TEST(Analyzer_Text_Classification, compare_against_embedding_fc_lstm_fused) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  // Enable embedding_fc_lstm_fuse_pass (disabled by default)
+  auto it = std::find(cfg.ir_passes.begin(), cfg.ir_passes.end(),
+                      "embedding_fc_lstm_fuse_pass");
+  if (it != cfg.ir_passes.end()) cfg.ir_passes.erase(it);
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(cfg, input_slots_all);
+}
+
 }  // namespace inference
 }  // namespace paddle

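A note on the enable/disable mechanics in this patch: AnalysisConfig keeps ir_mode at IrPassMode::kExclude, so ir_passes acts as an exclusion list; seeding it with "embedding_fc_lstm_fuse_pass" disables the new pass by default, and the test above re-enables it by erasing that entry. A minimal standalone sketch of the same idiom (a local vector stands in for cfg.ir_passes; no Paddle headers are used):

    #include <algorithm>
    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      // Stand-in for AnalysisConfig::ir_passes (an exclusion list under kExclude).
      std::vector<std::string> ir_passes{"embedding_fc_lstm_fuse_pass"};

      // Re-enable the fused embedding-FC-LSTM path by removing it from the list.
      ir_passes.erase(std::remove(ir_passes.begin(), ir_passes.end(),
                                  "embedding_fc_lstm_fuse_pass"),
                      ir_passes.end());

      std::cout << "excluded passes remaining: " << ir_passes.size() << std::endl;
      return 0;
    }
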
From 1df69f7c9dc53e317babc32d0d91842a11fedd97 Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Fri, 28 Sep 2018 09:42:13 +0200
Subject: [PATCH 04/13] - Fix to comment

test=develop
---
 paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
index af3f23cbf9..b155da375f 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -199,7 +199,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
       embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
                              Bias, Hidden, Cell, fc_out, fc_bias);
       // Remove unneeded nodes.
-      // TODO(jczaja): Proper removing of loopup table
+      // TODO(jczaja): Proper removing of lookup table
       std::unordered_set<const Node*> marked_nodes(
           //{lookup_table, mul, lstm, elementwise_add, fc_bias, W});
           {mul, lstm, elementwise_add, fc_bias});
@@ -209,7 +209,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
       embedding_lstm_creator(lookup_table, W, lstm, subgraph.at(x), w, Weight,
                              Bias, Hidden, Cell, fc_out, nullptr);
       // Remove unneeded nodes.
-      // TODO(jczaja): Proper removing of loopup table
+      // TODO(jczaja): Proper removing of lookup table
       // std::unordered_set<const Node*> marked_nodes({lookup_table, W, mul,
       // lstm});
       std::unordered_set<const Node*> marked_nodes({mul, lstm});

From 9ae5baebfa3939a3af07a3e4338a34bb5667c993 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 28 Sep 2018 07:52:24 +0000
Subject: [PATCH 05/13] test=develop

---
 paddle/legacy/trainer/tests/CMakeLists.txt       |  6 +++++-
 .../recognize_digits/CMakeLists.txt              | 16 +++++++++++++---
 2 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/paddle/legacy/trainer/tests/CMakeLists.txt b/paddle/legacy/trainer/tests/CMakeLists.txt
index 08548bea4c..fbefcced56 100644
--- a/paddle/legacy/trainer/tests/CMakeLists.txt
+++ b/paddle/legacy/trainer/tests/CMakeLists.txt
@@ -16,7 +16,11 @@ endfunction()
 trainer_test(test_Compare)
 trainer_test(test_PyDataProviderWrapper)
 trainer_test(test_recurrent_machine_generation)
-trainer_test(test_Trainer)
+if(NOT APPLE)
+  trainer_test(test_Trainer)
+else()
+  message(WARNING "These tests have been disabled on OSX due to random failures: \n test_Trainer")
+endif()
 
 ############### test_TrainerOnePass ##########################
 if(WITH_PYTHON)
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
index 673c965b66..ad056aaa7b 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt
@@ -2,6 +2,16 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
 string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
 
 # default test
-foreach(src ${TEST_OPS})
-    py_test(${src} SRCS ${src}.py)
-endforeach()
+if(NOT APPLE)
+    foreach(src ${TEST_OPS})
+        py_test(${src} SRCS ${src}.py)
+    endforeach()
+else()
+    foreach(src ${TEST_OPS})
+        if(${src} STREQUAL "test_recognize_digits_conv")
+            message(WARNING "These tests have been disabled on OSX due to random failures: \n" ${src})
+        else()
+            py_test(${src} SRCS ${src}.py)
+        endif()
+    endforeach()
+endif()

From 358b38695356226875aa7495244e2ea70e8224e9 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 28 Sep 2018 10:34:15 +0000
Subject: [PATCH 06/13] test=develop

---
 paddle/fluid/inference/api/api_impl_tester.cc | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc
index 106a941b29..bed7c87131 100644
--- a/paddle/fluid/inference/api/api_impl_tester.cc
+++ b/paddle/fluid/inference/api/api_impl_tester.cc
@@ -21,6 +21,12 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/tests/test_helper.h"
 
+#ifdef __clang__
+#define ACC_DIFF 4e-3
+#else
+#define ACC_DIFF 1e-3
+#endif
+
 DEFINE_string(dirname, "", "Directory of the inference model.");
 
 namespace paddle {
@@ -99,8 +105,8 @@ void MainWord2Vec(bool use_gpu) {
 
   float* lod_data = output1.data<float>();
   for (int i = 0; i < output1.numel(); ++i) {
-    EXPECT_LT(lod_data[i] - data[i], 1e-3);
-    EXPECT_GT(lod_data[i] - data[i], -1e-3);
+    EXPECT_LT(lod_data[i] - data[i], ACC_DIFF);
+    EXPECT_GT(lod_data[i] - data[i], -ACC_DIFF);
   }
 }
 
@@ -144,7 +150,7 @@ void MainImageClassification(bool use_gpu) {
   float* data = static_cast<float*>(outputs[0].data.data());
   float* lod_data = output1.data<float>();
   for (size_t j = 0; j < len / sizeof(float); ++j) {
-    EXPECT_NEAR(lod_data[j], data[j], 1e-3);
+    EXPECT_NEAR(lod_data[j], data[j], ACC_DIFF);
   }
 }
 
@@ -199,7 +205,7 @@ void MainThreadsWord2Vec(bool use_gpu) {
       float* ref_data = refs[tid].data<float>();
       EXPECT_EQ(refs[tid].numel(), static_cast<int64_t>(len / sizeof(float)));
       for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
       }
     });
   }
@@ -251,7 +257,7 @@ void MainThreadsImageClassification(bool use_gpu) {
       float* ref_data = refs[tid].data<float>();
       EXPECT_EQ((size_t)refs[tid].numel(), len / sizeof(float));
       for (int i = 0; i < refs[tid].numel(); ++i) {
-        EXPECT_NEAR(ref_data[i], data[i], 1e-3);
+        EXPECT_NEAR(ref_data[i], data[i], ACC_DIFF);
       }
     });
   }

From e202f33aa96ee8c44f9bac892881dce0fe5067be Mon Sep 17 00:00:00 2001
From: Jacek Czaja <jacek.czaja@intel.com>
Date: Fri, 28 Sep 2018 13:13:43 +0200
Subject: [PATCH 07/13] - Yet another clarification to comment

test=develop
---
 paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
index b155da375f..ba11f19c92 100644
--- a/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/embedding_fc_lstm_fuse_pass.cc
@@ -119,7 +119,7 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
         CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1,
         &combined_biases[0], n, 0.0f, embeddings_data, n);
 
-    // Wx*embeddings
+    // Wx*embeddings + biases
     paddle::operators::math::CBlas<float>::GEMM(
         CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
         embedding_data, k, weightx_data, n, beta, embeddings_data, n);

From ddd60581b7f442e8f232f83a760c3d4c537a16b1 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 28 Sep 2018 10:19:55 +0800
Subject: [PATCH 08/13] clean up channel

test=develop
---
 paddle/fluid/framework/CMakeLists.txt         |    7 -
 paddle/fluid/framework/channel.h              |  291 -----
 paddle/fluid/framework/channel_impl.h         |  369 ------
 paddle/fluid/framework/channel_test.cc        | 1008 -----------------
 paddle/fluid/framework/concurrency_test.cc    |  292 -----
 paddle/fluid/framework/executor.cc            |    5 +-
 paddle/fluid/framework/framework.proto        |    7 -
 paddle/fluid/framework/tuple.h                |    1 -
 paddle/fluid/framework/var_desc.cc            |   54 +-
 paddle/fluid/framework/var_desc.h             |    4 -
 paddle/fluid/framework/var_type.h             |    6 -
 .../fluid/inference/analysis/analysis_pass.h  |    6 -
 paddle/fluid/operators/CMakeLists.txt         |    5 -
 paddle/fluid/operators/channel_close_op.cc    |   70 --
 paddle/fluid/operators/channel_create_op.cc   |  113 --
 paddle/fluid/operators/channel_recv_op.cc     |   98 --
 paddle/fluid/operators/channel_send_op.cc     |   76 --
 .../operators/concurrency/CMakeLists.txt      |    1 -
 .../operators/concurrency/channel_util.cc     |  111 --
 .../operators/concurrency/channel_util.h      |   38 -
 paddle/fluid/operators/select_op.cc           |  419 -------
 paddle/fluid/pybind/protobuf.cc               |    2 -
 paddle/fluid/pybind/pybind.cc                 |    1 -
 python/paddle/fluid/concurrency.py            |  454 --------
 python/paddle/fluid/framework.py              |    3 +-
 .../paddle/fluid/tests/no_test_concurrency.py |  260 -----
 .../paddle/fluid/tests/notest_concurrency.py  |   41 -
 27 files changed, 4 insertions(+), 3738 deletions(-)
 delete mode 100644 paddle/fluid/framework/channel.h
 delete mode 100644 paddle/fluid/framework/channel_impl.h
 delete mode 100644 paddle/fluid/framework/channel_test.cc
 delete mode 100644 paddle/fluid/framework/concurrency_test.cc
 delete mode 100644 paddle/fluid/operators/channel_close_op.cc
 delete mode 100644 paddle/fluid/operators/channel_create_op.cc
 delete mode 100644 paddle/fluid/operators/channel_recv_op.cc
 delete mode 100644 paddle/fluid/operators/channel_send_op.cc
 delete mode 100644 paddle/fluid/operators/concurrency/CMakeLists.txt
 delete mode 100644 paddle/fluid/operators/concurrency/channel_util.cc
 delete mode 100644 paddle/fluid/operators/concurrency/channel_util.h
 delete mode 100644 paddle/fluid/operators/select_op.cc
 delete mode 100644 python/paddle/fluid/concurrency.py
 delete mode 100644 python/paddle/fluid/tests/no_test_concurrency.py
 delete mode 100644 python/paddle/fluid/tests/notest_concurrency.py

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 39898dd236..de960dba8f 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -169,15 +169,8 @@ cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
 cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
 cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
 
-# cc_test(channel_test SRCS channel_test.cc)
 cc_test(tuple_test SRCS tuple_test.cc )
 
 if (NOT WIN32)
 cc_test(rw_lock_test SRCS rw_lock_test.cc)
 endif (NOT WIN32)
-
-# disable test temporarily.
-# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
-# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
-#         channel_send_op channel_recv_op sum_op select_op elementwise_add_op compare_op
-#         conditional_block_op while_op assign_op print_op executor proto_desc)
diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h
deleted file mode 100644
index 722bf8e8ec..0000000000
--- a/paddle/fluid/framework/channel.h
+++ /dev/null
@@ -1,291 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>            // for size_t
-#include <condition_variable>  // NOLINT
-#include <typeindex>
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-enum class ChannelAction {
-  SEND = 0,
-  RECEIVE = 1,
-  CLOSE = 2,
-};
-
-// Channel is the abstract class of buffered and un-buffered channels.
-template <typename T>
-class Channel {
- public:
-  virtual bool CanSend() = 0;
-  virtual bool CanReceive() = 0;
-  virtual void Send(T*) = 0;
-  virtual bool Receive(T*) = 0;
-  virtual size_t Cap() = 0;
-  virtual void Lock() = 0;
-
-  virtual void Unlock() = 0;
-  virtual bool IsClosed() = 0;
-  virtual void Close() = 0;
-  virtual ~Channel() {}
-
-  virtual void AddToSendQ(const void* referrer, T* data,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(ChannelAction)> cb) = 0;
-  virtual void AddToReceiveQ(const void* referrer, T* data,
-                             std::shared_ptr<std::condition_variable_any> cond,
-                             std::function<bool(ChannelAction)> cb) = 0;
-  virtual void RemoveFromSendQ(const void* referrer) = 0;
-  virtual void RemoveFromReceiveQ(const void* referrer) = 0;
-};
-
-// Forward declaration of channel implementations.
-template <typename T>
-class ChannelImpl;
-
-template <typename T>
-Channel<T>* MakeChannel(size_t buffer_size) {
-  return new ChannelImpl<T>(buffer_size);
-}
-
-template <typename T>
-void CloseChannel(Channel<T>* ch) {
-  ch->Close();
-}
-
-/*
- * The ChannelHolder class serves two main purposes:
- * 1. It acts as a unified wrapper for the different kinds of
- *    channels, i.e. Buffered and Unbuffered channels. This is
- *    similar to the ReaderHolder class.
- * 2. It also helps us in TypeHiding. This is similar to the
- *    PlaceHolder implementations in variable.h and tensor.h.
- */
-class ChannelHolder {
- public:
-  template <typename T>
-  void Reset(size_t buffer_size) {
-    holder_.reset(new PlaceholderImpl<T>(buffer_size));
-  }
-
-  template <typename T>
-  void Send(T* data) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    PADDLE_ENFORCE_EQ(
-        holder_->Type(), std::type_index(typeid(T)),
-        "Channel type is not same as the type of the data being sent");
-    // Static cast should be safe because we have ensured that types are same
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
-    channel->Send(data);
-  }
-
-  template <typename T>
-  bool Receive(T* data) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    PADDLE_ENFORCE_EQ(
-        holder_->Type(), std::type_index(typeid(T)),
-        "Channel type is not same as the type of the data being sent");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
-    return channel->Receive(data);
-  }
-
-  bool IsClosed() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->IsClosed();
-  }
-
-  bool CanSend() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->CanSend();
-  }
-
-  bool CanReceive() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->CanReceive();
-  }
-
-  void close() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Close();
-  }
-
-  size_t Cap() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->Cap();
-  }
-
-  void Lock() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Lock();
-  }
-
-  void Unlock() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->Unlock();
-  }
-
-  template <typename T>
-  void AddToSendQ(const void* referrer, T* data,
-                  std::shared_ptr<std::condition_variable_any> cond,
-                  std::function<bool(ChannelAction)> cb) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    if (channel != nullptr) {
-      channel->AddToSendQ(referrer, data, cond, cb);
-    }
-  }
-
-  template <typename T>
-  void AddToReceiveQ(const void* referrer, T* data,
-                     std::shared_ptr<std::condition_variable_any> cond,
-                     std::function<bool(ChannelAction)> cb) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    if (channel != nullptr) {
-      channel->AddToReceiveQ(referrer, data, cond, cb);
-    }
-  }
-
-  void RemoveFromSendQ(const void* referrer) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->RemoveFromSendQ(referrer);
-  }
-
-  void RemoveFromReceiveQ(const void* referrer) {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    holder_->RemoveFromReceiveQ(referrer);
-  }
-
-  inline bool IsInitialized() const { return holder_ != nullptr; }
-
-  inline const std::type_index Type() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true,
-                      "The Channel hasn't been initialized");
-    return holder_->Type();
-  }
-
- private:
-  /**
-   * @note    Placeholder hides type T, so it doesn't appear as a template
-   *          parameter of ChannelHolder.
-   */
-  struct Placeholder {
-    virtual ~Placeholder() {}
-    virtual const std::type_index Type() const = 0;
-    virtual void* Ptr() const = 0;
-    virtual bool IsClosed() = 0;
-    virtual bool CanSend() = 0;
-    virtual bool CanReceive() = 0;
-    virtual void RemoveFromSendQ(const void* referrer) = 0;
-    virtual void RemoveFromReceiveQ(const void* referrer) = 0;
-    virtual void Close() = 0;
-    virtual void Lock() = 0;
-    virtual void Unlock() = 0;
-    virtual size_t Cap() = 0;
-  };
-
-  template <typename T>
-  struct PlaceholderImpl : public Placeholder {
-    explicit PlaceholderImpl(size_t buffer_size)
-        : type_(std::type_index(typeid(T))) {
-      channel_.reset(MakeChannel<T>(buffer_size));
-    }
-
-    virtual const std::type_index Type() const { return type_; }
-
-    virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
-
-    virtual bool IsClosed() {
-      if (channel_) {
-        return channel_->IsClosed();
-      }
-      return false;
-    }
-
-    virtual bool CanSend() {
-      if (channel_) {
-        return channel_->CanSend();
-      }
-      return false;
-    }
-
-    virtual bool CanReceive() {
-      if (channel_) {
-        return channel_->CanReceive();
-      }
-      return false;
-    }
-
-    virtual void RemoveFromSendQ(const void* referrer) {
-      if (channel_) {
-        channel_->RemoveFromSendQ(referrer);
-      }
-    }
-
-    virtual void RemoveFromReceiveQ(const void* referrer) {
-      if (channel_) {
-        channel_->RemoveFromReceiveQ(referrer);
-      }
-    }
-
-    virtual void Close() {
-      if (channel_) channel_->Close();
-    }
-
-    virtual size_t Cap() {
-      if (channel_)
-        return channel_->Cap();
-      else
-        return -1;
-    }
-
-    virtual void Lock() {
-      if (channel_) channel_->Lock();
-    }
-
-    virtual void Unlock() {
-      if (channel_) channel_->Unlock();
-    }
-
-    std::unique_ptr<Channel<T>> channel_;
-    const std::type_index type_;
-  };
-
-  // Pointer to a PlaceholderImpl object
-  std::unique_ptr<Placeholder> holder_;
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-#include "paddle/fluid/framework/channel_impl.h"
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
deleted file mode 100644
index 26d454534e..0000000000
--- a/paddle/fluid/framework/channel_impl.h
+++ /dev/null
@@ -1,369 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <stddef.h>  // for size_t
-#include <atomic>
-#include <condition_variable>  // NOLINT
-#include <deque>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/platform/enforce.h"
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-class ChannelImpl : public paddle::framework::Channel<T> {
-  friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
-  friend void paddle::framework::CloseChannel<T>(Channel<T> *);
-
- public:
-  virtual bool CanSend();
-  virtual bool CanReceive();
-  virtual void Send(T *);
-  virtual bool Receive(T *);
-  virtual size_t Cap() { return cap_; }
-  virtual void Lock();
-  virtual void Unlock();
-  virtual bool IsClosed();
-  virtual void Close();
-  explicit ChannelImpl(size_t);
-  virtual ~ChannelImpl();
-
-  virtual void AddToSendQ(const void *referrer, T *data,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(ChannelAction)> cb);
-  virtual void AddToReceiveQ(const void *referrer, T *data,
-                             std::shared_ptr<std::condition_variable_any> cond,
-                             std::function<bool(ChannelAction)> cb);
-
-  virtual void RemoveFromSendQ(const void *referrer);
-  virtual void RemoveFromReceiveQ(const void *referrer);
-
- private:
-  struct QueueMessage {
-    T *data;
-    std::shared_ptr<std::condition_variable_any> cond;
-    bool chan_closed = false;
-    bool completed = false;
-    const void *referrer;  // TODO(thuan): figure out better way to do this
-    std::function<bool(ChannelAction)> callback;
-
-    explicit QueueMessage(T *item)
-        : data(item), cond(std::make_shared<std::condition_variable_any>()) {}
-
-    QueueMessage(T *item, std::shared_ptr<std::condition_variable_any> cond)
-        : data(item), cond(cond) {}
-
-    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
-      cond->wait(lock, [this]() { return completed; });
-    }
-
-    void Notify() {
-      completed = true;
-      cond->notify_all();
-    }
-  };
-
-  void send_return() {
-    send_ctr--;
-    destructor_cond_.notify_all();
-  }
-
-  bool recv_return(bool value) {
-    recv_ctr--;
-    destructor_cond_.notify_all();
-    return value;
-  }
-
-  std::shared_ptr<QueueMessage> get_first_message(
-      std::deque<std::shared_ptr<QueueMessage>> *queue, ChannelAction action) {
-    while (!queue->empty()) {
-      // Check whether this message was added by Select
-      // If this was added by Select then execute the callback
-      // to check if you can execute this message. The callback
-      // can return false if some other case was executed in Select.
-      // In that case just discard this QueueMessage and process next.
-      std::shared_ptr<QueueMessage> m = queue->front();
-      queue->pop_front();
-      if (m->callback == nullptr || m->callback(action)) return m;
-    }
-    return nullptr;
-  }
-
-  size_t cap_;
-  std::recursive_mutex mu_;
-  bool closed_;
-  std::deque<T> buf_;
-  std::deque<std::shared_ptr<QueueMessage>> recvq;
-  std::deque<std::shared_ptr<QueueMessage>> sendq;
-  std::atomic<unsigned> send_ctr{0};
-  std::atomic<unsigned> recv_ctr{0};
-  std::condition_variable_any destructor_cond_;
-};
-
-template <typename T>
-ChannelImpl<T>::ChannelImpl(size_t capacity)
-    : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
-  PADDLE_ENFORCE_GE(capacity, 0);
-}
-
-template <typename T>
-bool ChannelImpl<T>::CanSend() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return !closed_ && (!recvq.empty() || buf_.size() < cap_);
-}
-
-template <typename T>
-bool ChannelImpl<T>::CanReceive() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return !(closed_ && buf_.empty()) && (!sendq.empty() || buf_.size() > 0);
-}
-
-template <typename T>
-void ChannelImpl<T>::Send(T *item) {
-  send_ctr++;
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-
-  // If channel is closed, throw exception
-  if (closed_) {
-    send_return();
-    lock.unlock();
-    PADDLE_THROW("Cannot send on closed channel");
-  }
-
-  // If there is a receiver, directly pass the value we want
-  // to send to the receiver, bypassing the channel buffer if any
-  if (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m =
-        get_first_message(&recvq, ChannelAction::SEND);
-
-    if (m != nullptr) {
-      *(m->data) = std::move(*item);
-      m->Notify();
-      send_return();
-      return;
-    } else {
-      Send(item);
-      send_return();
-      return;
-    }
-  }
-
-  // Unbuffered channel will always bypass this
-  // If buffered channel has space in buffer,
-  // write the element to the buffer.
-  if (buf_.size() < cap_) {
-    // Copy to buffer
-    buf_.push_back(std::move(*item));
-    send_return();
-    return;
-  }
-
-  // Block on channel, because some receiver will complete
-  // the operation for us
-  auto m = std::make_shared<QueueMessage>(item);
-  sendq.push_back(m);
-  m->Wait(lock);
-  if (m->chan_closed) {
-    send_return();
-    lock.unlock();
-    PADDLE_THROW("Cannot send on closed channel");
-  }
-  send_return();
-}
-
-template <typename T>
-bool ChannelImpl<T>::Receive(T *item) {
-  recv_ctr++;
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-
-  // If channel is closed and buffer is empty or
-  // channel is unbuffered
-  if (closed_ && buf_.empty()) return recv_return(false);
-
-  // If there is a sender, directly receive the value we want
-  // from the sender. In case of a buffered channel, read from
-  // buffer and move front of send queue to the buffer
-  if (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m =
-        get_first_message(&sendq, ChannelAction::RECEIVE);
-    if (buf_.size() > 0) {
-      // Case 1 : Channel is Buffered
-      // Do Data transfer from front of buffer
-      // and add a QueueMessage to the buffer
-      *item = std::move(buf_.front());
-      buf_.pop_front();
-      // If first message from sendq is not null
-      // add it to the buffer and notify it
-      if (m != nullptr) {
-        // Copy to buffer
-        buf_.push_back(std::move(*(m->data)));
-        m->Notify();
-      }  // Ignore if there is no first message
-    } else {
-      // Case 2: Channel is Unbuffered
-      // Do data transfer from front of SendQ
-      // If front is nullptr, then recursively call itself
-      if (m != nullptr) {
-        *item = std::move(*(m->data));
-        m->Notify();
-      } else {
-        return recv_return(Receive(item));
-      }
-    }
-    return recv_return(true);
-  }
-
-  // If this is a buffered channel and there are items in buffer
-  if (buf_.size() > 0) {
-    // Directly read from buffer
-    *item = std::move(buf_.front());
-    buf_.pop_front();
-    // return true
-    return recv_return(true);
-  }
-
-  // No sender available, block on this channel
-  // Some sender will complete the operation for us
-  auto m = std::make_shared<QueueMessage>(item);
-  recvq.push_back(m);
-  m->Wait(lock);
-
-  return recv_return(!m->chan_closed);
-}
-
-template <typename T>
-void ChannelImpl<T>::Lock() {
-  mu_.lock();
-}
-
-template <typename T>
-void ChannelImpl<T>::Unlock() {
-  mu_.unlock();
-}
-
-template <typename T>
-bool ChannelImpl<T>::IsClosed() {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  return closed_;
-}
-
-template <typename T>
-void ChannelImpl<T>::Close() {
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-
-  if (closed_) {
-    // TODO(abhinavarora): closing an already closed channel should panic
-    lock.unlock();
-    return;
-  }
-
-  closed_ = true;
-
-  // Empty the readers
-  while (!recvq.empty()) {
-    std::shared_ptr<QueueMessage> m = recvq.front();
-    recvq.pop_front();
-    m->chan_closed = true;
-
-    // Execute callback function (if any)
-    if (m->callback != nullptr) {
-      m->callback(ChannelAction::CLOSE);
-    }
-
-    m->Notify();
-  }
-
-  // Empty the senders
-  while (!sendq.empty()) {
-    std::shared_ptr<QueueMessage> m = sendq.front();
-    sendq.pop_front();
-    m->chan_closed = true;
-
-    // Execute callback function (if any)
-    if (m->callback != nullptr) {
-      m->callback(ChannelAction::CLOSE);
-    }
-
-    m->Notify();
-  }
-}
-
-template <typename T>
-void ChannelImpl<T>::AddToSendQ(
-    const void *referrer, T *data,
-    std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(ChannelAction)> cb) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  auto m = std::make_shared<QueueMessage>(data, cond);
-  m->referrer = referrer;
-  m->callback = cb;
-  sendq.push_back(m);
-}
-
-template <typename T>
-void ChannelImpl<T>::AddToReceiveQ(
-    const void *referrer, T *data,
-    std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(ChannelAction)> cb) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-  auto m = std::make_shared<QueueMessage>(data, cond);
-  m->referrer = referrer;
-  m->callback = cb;
-  recvq.push_back(m);
-}
-
-template <typename T>
-void ChannelImpl<T>::RemoveFromSendQ(const void *referrer) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-
-  for (auto it = sendq.begin(); it != sendq.end();) {
-    std::shared_ptr<QueueMessage> sendMsg = (std::shared_ptr<QueueMessage>)*it;
-
-    if (sendMsg->referrer == referrer) {
-      it = sendq.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-
-template <typename T>
-void ChannelImpl<T>::RemoveFromReceiveQ(const void *referrer) {
-  std::lock_guard<std::recursive_mutex> lock{mu_};
-
-  for (auto it = recvq.begin(); it != recvq.end();) {
-    std::shared_ptr<QueueMessage> recvMsg = (std::shared_ptr<QueueMessage>)*it;
-
-    if (recvMsg->referrer == referrer) {
-      it = recvq.erase(it);
-    } else {
-      ++it;
-    }
-  }
-}
-
-template <typename T>
-ChannelImpl<T>::~ChannelImpl() {
-  Close();
-  // The destructor must wait for all readers and writers to complete their task
-  // The channel has been closed, so we will not accept new readers and writers
-  std::unique_lock<std::recursive_mutex> lock{mu_};
-  destructor_cond_.wait(lock,
-                        [this]() { return send_ctr == 0 && recv_ctr == 0; });
-}
-
-}  // namespace framework
-}  // namespace paddle
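
The core semantics being deleted with ChannelImpl are Go-like: Send blocks when the buffer is full and throws once the channel is closed, Receive blocks when the buffer is empty and returns false once the channel is closed and drained, and Close wakes everyone still waiting. A self-contained sketch of just the buffered case, using a mutex and two condition variables instead of the removed QueueMessage machinery (names hypothetical; Select callbacks and the unbuffered rendezvous are omitted):

    // channel_sketch.cc - bounded, closable queue mirroring the removed semantics
    #include <condition_variable>
    #include <cstdio>
    #include <deque>
    #include <mutex>
    #include <stdexcept>
    #include <thread>

    template <typename T>
    class MiniChannel {
     public:
      explicit MiniChannel(size_t cap) : cap_(cap) {}

      void Send(T v) {
        std::unique_lock<std::mutex> lk(mu_);
        not_full_.wait(lk, [&] { return closed_ || buf_.size() < cap_; });
        if (closed_) throw std::runtime_error("send on closed channel");
        buf_.push_back(std::move(v));
        not_empty_.notify_one();
      }

      bool Receive(T* out) {  // false once the channel is closed and drained
        std::unique_lock<std::mutex> lk(mu_);
        not_empty_.wait(lk, [&] { return closed_ || !buf_.empty(); });
        if (buf_.empty()) return false;
        *out = std::move(buf_.front());
        buf_.pop_front();
        not_full_.notify_one();
        return true;
      }

      void Close() {
        std::lock_guard<std::mutex> lk(mu_);
        closed_ = true;
        not_empty_.notify_all();
        not_full_.notify_all();
      }

     private:
      const size_t cap_;
      bool closed_ = false;
      std::deque<T> buf_;
      std::mutex mu_;
      std::condition_variable not_empty_, not_full_;
    };

    int main() {
      MiniChannel<int> ch(2);
      std::thread producer([&] {
        for (int i = 0; i < 5; ++i) ch.Send(i);
        ch.Close();
      });
      int v = 0, sum = 0;
      while (ch.Receive(&v)) sum += v;  // drains 0..4, then Receive returns false
      producer.join();
      std::printf("sum = %d\n", sum);   // 10
      return 0;
    }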
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
deleted file mode 100644
index 542d791f6b..0000000000
--- a/paddle/fluid/framework/channel_test.cc
+++ /dev/null
@@ -1,1008 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/channel.h"
-
-#include <chrono>  // NOLINT
-#include <thread>  // NOLINT
-#include "gtest/gtest.h"
-
-using paddle::framework::Channel;
-using paddle::framework::ChannelHolder;
-using paddle::framework::MakeChannel;
-using paddle::framework::CloseChannel;
-
-TEST(Channel, ChannelCapacityTest) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  EXPECT_EQ(ch->Cap(), buffer_size);
-  CloseChannel(ch);
-  delete ch;
-
-  ch = MakeChannel<size_t>(0);
-  EXPECT_EQ(ch->Cap(), 0U);
-  CloseChannel(ch);
-  delete ch;
-}
-
-void RecevingOrderEqualToSendingOrder(Channel<int> *ch, int num_items) {
-  unsigned sum_send = 0;
-  std::thread t([&]() {
-    for (int i = 0; i < num_items; i++) {
-      ch->Send(&i);
-      sum_send += i;
-    }
-  });
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));
-  for (int i = 0; i < num_items; i++) {
-    int recv = -1;
-    EXPECT_EQ(ch->Receive(&recv), true);
-    EXPECT_EQ(recv, i);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));
-  CloseChannel(ch);
-  t.join();
-  unsigned expected_sum = (num_items * (num_items - 1)) / 2;
-  EXPECT_EQ(sum_send, expected_sum);
-  delete ch;
-}
-
-TEST(Channel, SufficientBufferSizeDoesntBlock) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Send(&i);
-  }
-
-  size_t out;
-  for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Receive(&out), true);  // should not block
-    EXPECT_EQ(out, i);
-  }
-  CloseChannel(ch);
-  delete ch;
-}
-
-// This tests that a channel must return false
-// on send and receive performed after closing the channel.
-// Receive will only return false after close when queue is empty.
-// By creating separate threads for sending and receiving, we make this
-// function able to test both buffered and unbuffered channels.
-void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
-  const size_t data = 5;
-  std::thread send_thread{[&]() {
-    size_t i = data;
-    ch->Send(&i);  // should not block
-  }};
-
-  std::thread recv_thread{[&]() {
-    size_t i;
-    EXPECT_EQ(ch->Receive(&i), true);  // should not block
-    EXPECT_EQ(i, data);
-  }};
-
-  send_thread.join();
-  recv_thread.join();
-
-  // After closing send should panic. Receive should
-  // also return false as there is no data in queue.
-  CloseChannel(ch);
-  send_thread = std::thread{[&]() {
-    size_t i = data;
-    bool is_exception = false;
-    try {
-      ch->Send(&i);
-    } catch (paddle::platform::EnforceNotMet e) {
-      is_exception = true;
-    }
-    EXPECT_EQ(is_exception, true);
-  }};
-  recv_thread = std::thread{[&]() {
-    size_t i;
-    // should return false because channel is closed and queue is empty
-    EXPECT_EQ(ch->Receive(&i), false);
-  }};
-
-  send_thread.join();
-  recv_thread.join();
-}
-
-TEST(Channel, SendReceiveClosedBufferedChannelPanics) {
-  size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  SendReceiveWithACloseChannelShouldPanic(ch);
-  delete ch;
-}
-
-TEST(Channel, SendReceiveClosedUnBufferedChannelPanics) {
-  auto ch = MakeChannel<size_t>(0);
-  SendReceiveWithACloseChannelShouldPanic(ch);
-  delete ch;
-}
-
-TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-
-  for (size_t i = 0; i < buffer_size; ++i) {
-    ch->Send(&i);  // sending should not block
-  }
-
-  size_t out;
-  for (size_t i = 0; i < buffer_size / 2; ++i) {
-    EXPECT_EQ(ch->Receive(&out), true);  // receiving should not block
-    EXPECT_EQ(out, i);
-  }
-
-  CloseChannel(ch);
-
-  for (size_t i = buffer_size / 2; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Receive(&out),
-              true);  // receiving should return residual values.
-    EXPECT_EQ(out, i);
-  }
-
-  for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Receive(&out),
-              false);  // receiving on closed channel should return false
-  }
-  delete ch;
-}
-
-TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
-  const size_t buffer_size = 10;
-  auto ch = MakeChannel<size_t>(buffer_size);
-  std::thread t([&]() {
-    // Try to write more than buffer size.
-    for (size_t i = 0; i < 2 * buffer_size; ++i) {
-      if (i < buffer_size) {
-        ch->Send(&i);  // should block after 10 iterations
-      } else {
-        bool is_exception = false;
-        try {
-          ch->Send(&i);
-        } catch (paddle::platform::EnforceNotMet e) {
-          is_exception = true;
-        }
-        EXPECT_EQ(is_exception, true);
-      }
-    }
-  });
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  CloseChannel(ch);
-  t.join();
-  delete ch;
-}
-
-TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
-  auto ch = MakeChannel<int>(0);
-  RecevingOrderEqualToSendingOrder(ch, 20);
-}
-
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) {
-  // Test that Receive Order is same as Send Order when number of items
-  // sent is less than size of buffer
-  auto ch = MakeChannel<int>(10);
-  RecevingOrderEqualToSendingOrder(ch, 5);
-}
-
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) {
-  // Test that Receive Order is same as Send Order when number of items
-  // sent is equal to size of buffer
-  auto ch = MakeChannel<int>(10);
-  RecevingOrderEqualToSendingOrder(ch, 10);
-}
-
-TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
-  // Test that Receive Order is same as Send Order when number of items
-  // sent is greater than the size of buffer
-  auto ch = MakeChannel<int>(10);
-  RecevingOrderEqualToSendingOrder(ch, 20);
-}
-
-void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-
-  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          EXPECT_EQ(ch->Receive(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-
-  // Verify that all the threads are blocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-
-  // Explicitly close the channel
-  // This should unblock all receivers
-  CloseChannel(ch);
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-
-void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  bool send_success[kNumThreads];
-
-  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    send_success[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended, bool *success) {
-          int data = 10;
-          bool is_exception = false;
-          try {
-            ch->Send(&data);
-          } catch (paddle::platform::EnforceNotMet e) {
-            is_exception = true;
-          }
-          *success = !is_exception;
-          *ended = true;
-        },
-        &thread_ended[i], &send_success[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-
-  if (isBuffered) {
-    // If ch is Buffered, at least 4 threads must be blocked.
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (!thread_ended[i]) ct++;
-    }
-    EXPECT_GE(ct, 4);
-  } else {
-    // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < kNumThreads; i++) {
-      EXPECT_EQ(thread_ended[i], false);
-    }
-  }
-  // Explicitly close the channel
-  // This should unblock all senders
-  CloseChannel(ch);
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  if (isBuffered) {
-    // Verify that only 1 send was successful
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (send_success[i]) ct++;
-    }
-    // Only 1 send must be successful
-    EXPECT_EQ(ct, 1);
-  }
-
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-
-// This tests that closing a buffered channel also unblocks
-//  any receivers waiting on the channel
-TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
-  auto ch = MakeChannel<int>(1);
-  ChannelCloseUnblocksReceiversTest(ch);
-  delete ch;
-}
-
-// This tests that closing a buffered channel also unblocks
-//  any senders waiting for channel to have write space
-TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
-  auto ch = MakeChannel<int>(1);
-  ChannelCloseUnblocksSendersTest(ch, true);
-  delete ch;
-}
-
-// This tests that closing an unbuffered channel also unblocks
-//  any receivers waiting for senders
-TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
-  auto ch = MakeChannel<int>(0);
-  ChannelCloseUnblocksReceiversTest(ch);
-  delete ch;
-}
-
-// This tests that closing an unbuffered channel also unblocks
-//  any senders waiting for receivers
-TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
-  auto ch = MakeChannel<int>(0);
-  ChannelCloseUnblocksSendersTest(ch, false);
-  delete ch;
-}
-
-TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
-  auto ch = MakeChannel<int>(0);
-  unsigned sum_send = 0;
-  // Send should block after three iterations
-  // since we only have three receivers.
-  std::thread t([&]() {
-    // Try to send more number of times
-    // than receivers
-    for (int i = 0; i < 4; i++) {
-      try {
-        ch->Send(&i);
-        sum_send += i;
-      } catch (paddle::platform::EnforceNotMet e) {
-      }
-    }
-  });
-  for (int i = 0; i < 3; i++) {
-    int recv;
-    ch->Receive(&recv);
-    EXPECT_EQ(recv, i);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  EXPECT_EQ(sum_send, 3U);
-
-  CloseChannel(ch);
-  t.join();
-  delete ch;
-}
-
-TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
-  auto ch = MakeChannel<int>(0);
-  unsigned sum_send = 0;
-  unsigned sum_receive = 0;
-  // The receiver should block after 5
-  // iterations, since there are only 5 senders.
-  std::thread t([&]() {
-    for (int i = 0; i < 8; i++) {
-      int recv;
-      ch->Receive(&recv);  // should block after the fifth iteration.
-      EXPECT_EQ(recv, i);
-      sum_receive += i;
-    }
-  });
-  for (int i = 0; i < 5; i++) {
-    ch->Send(&i);
-    sum_send += i;
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  EXPECT_EQ(sum_send, 10U);
-  EXPECT_EQ(sum_receive, 10U);
-  // send three more elements
-  for (int i = 5; i < 8; i++) {
-    ch->Send(&i);
-    sum_send += i;
-  }
-
-  CloseChannel(ch);
-  t.join();
-  EXPECT_EQ(sum_send, 28U);
-  EXPECT_EQ(sum_receive, 28U);
-  delete ch;
-}
-
-// This tests that destroying a channel unblocks
-//  any senders waiting for channel to have write space
-void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  bool send_success[kNumThreads];
-
-  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    send_success[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended, bool *success) {
-          int data = 10;
-          bool is_exception = false;
-          try {
-            ch->Send(&data);
-          } catch (paddle::platform::EnforceNotMet e) {
-            is_exception = true;
-          }
-          *success = !is_exception;
-          *ended = true;
-        },
-        &thread_ended[i], &send_success[i]);
-  }
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-
-  if (isBuffered) {
-    // If channel is buffered, verify that at least 4 threads are blocked
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (thread_ended[i] == false) ct++;
-    }
-    // At least 4 threads must be blocked
-    EXPECT_GE(ct, 4);
-  } else {
-    // Verify that all the threads are blocked
-    for (size_t i = 0; i < kNumThreads; i++) {
-      EXPECT_EQ(thread_ended[i], false);
-    }
-  }
-  // Explicitly destroy the channel
-  delete ch;
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  // Count number of successful sends
-  int ct = 0;
-  for (size_t i = 0; i < kNumThreads; i++) {
-    if (send_success[i]) ct++;
-  }
-
-  if (isBuffered) {
-    // Only 1 send must be successful
-    EXPECT_EQ(ct, 1);
-  } else {
-    // In unbuffered channel, no send should be successful
-    EXPECT_EQ(ct, 0);
-  }
-
-  // Join all threads
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-
-// This tests that destroying a channel also unblocks
-//  any receivers waiting on the channel
-void ChannelDestroyUnblockReceivers(Channel<int> *ch) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-
-  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          // All reads should return false
-          EXPECT_EQ(ch->Receive(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
-
-  // Verify that all threads are blocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-  // delete the channel
-  delete ch;
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-
-TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
-  size_t buffer_size = 1;
-  auto ch = MakeChannel<int>(buffer_size);
-  ChannelDestroyUnblockReceivers(ch);
-}
-
-TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
-  size_t buffer_size = 1;
-  auto ch = MakeChannel<int>(buffer_size);
-  ChannelDestroyUnblockSenders(ch, true);
-}
-
-// This tests that destroying an unbuffered channel also unblocks
-//  any receivers waiting for senders
-TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
-  auto ch = MakeChannel<int>(0);
-  ChannelDestroyUnblockReceivers(ch);
-}
-
-TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
-  auto ch = MakeChannel<int>(0);
-  ChannelDestroyUnblockSenders(ch, false);
-}
-
-TEST(ChannelHolder, ChannelHolderCapacityTest) {
-  const size_t buffer_size = 10;
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(buffer_size);
-  EXPECT_EQ(ch->Cap(), buffer_size);
-  delete ch;
-
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  EXPECT_EQ(ch->Cap(), 0U);
-  delete ch;
-}
-
-void ChannelHolderSendReceive(ChannelHolder *ch) {
-  unsigned sum_send = 0;
-  std::thread t([&]() {
-    for (int i = 0; i < 5; i++) {
-      ch->Send(&i);
-      sum_send += i;
-    }
-  });
-  for (int i = 0; i < 5; i++) {
-    int recv;
-    EXPECT_EQ(ch->Receive(&recv), true);
-    EXPECT_EQ(recv, i);
-  }
-
-  ch->close();
-  t.join();
-  EXPECT_EQ(sum_send, 10U);
-}
-
-TEST(ChannelHolder, ChannelHolderBufferedSendReceiveTest) {
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(10);
-  ChannelHolderSendReceive(ch);
-  delete ch;
-}
-
-TEST(ChannelHolder, ChannelHolderUnBufferedSendReceiveTest) {
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderSendReceive(ch);
-  delete ch;
-}
-
-TEST(ChannelHolder, ChannelUninitializedTest) {
-  ChannelHolder *ch = new ChannelHolder();
-  EXPECT_EQ(ch->IsInitialized(), false);
-  int i = 10;
-  bool send_exception = false;
-  try {
-    ch->Send(&i);
-  } catch (paddle::platform::EnforceNotMet e) {
-    send_exception = true;
-  }
-  EXPECT_EQ(send_exception, true);
-
-  bool recv_exception = false;
-  try {
-    ch->Receive(&i);
-  } catch (paddle::platform::EnforceNotMet e) {
-    recv_exception = true;
-  }
-  EXPECT_EQ(recv_exception, true);
-
-  bool is_exception = false;
-  try {
-    ch->Type();
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-}
-
-TEST(ChannelHolder, ChannelInitializedTest) {
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(2);
-  EXPECT_EQ(ch->IsInitialized(), true);
-  // Channel should remain initialized even after close
-  ch->close();
-  EXPECT_EQ(ch->IsInitialized(), true);
-  delete ch;
-}
-
-TEST(ChannelHolder, TypeMismatchSendTest) {
-  // Test with unbuffered channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  bool is_exception = false;
-  bool boolean_data = true;
-  try {
-    ch->Send(&boolean_data);
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-
-  // Test with Buffered Channel
-  ch = new ChannelHolder();
-  ch->Reset<float>(10);
-  is_exception = false;
-  int int_data = 23;
-  try {
-    ch->Send(&int_data);
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-}
-
-TEST(ChannelHolder, TypeMismatchReceiveTest) {
-  // Test with unbuffered channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  bool is_exception = false;
-  bool float_data;
-  try {
-    ch->Receive(&float_data);
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-
-  // Test with Buffered Channel
-  ch = new ChannelHolder();
-  ch->Reset<float>(10);
-  is_exception = false;
-  int int_data = 23;
-  try {
-    ch->Receive(&int_data);
-  } catch (paddle::platform::EnforceNotMet e) {
-    is_exception = true;
-  }
-  EXPECT_EQ(is_exception, true);
-  delete ch;
-}
-
-void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-
-  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          EXPECT_EQ(ch->Receive(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-
-  // Verify that all the threads are blocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-
-  // Explicitly close the channel
-  // This should unblock all receivers
-  ch->close();
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-
-void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  bool send_success[kNumThreads];
-
-  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    send_success[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended, bool *success) {
-          int data = 10;
-          bool is_exception = false;
-          try {
-            ch->Send(&data);
-          } catch (paddle::platform::EnforceNotMet e) {
-            is_exception = true;
-          }
-          *success = !is_exception;
-          *ended = true;
-        },
-        &thread_ended[i], &send_success[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-
-  if (isBuffered) {
-    // If ch is Buffered, at least 4 threads must be blocked.
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (!thread_ended[i]) ct++;
-    }
-    EXPECT_GE(ct, 4);
-  } else {
-    // If ch is UnBuffered, all the threads should be blocked.
-    for (size_t i = 0; i < kNumThreads; i++) {
-      EXPECT_EQ(thread_ended[i], false);
-    }
-  }
-  // Explicitly close the channel
-  // This should unblock all senders
-  ch->close();
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  if (isBuffered) {
-    // Verify that only 1 send was successful
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (send_success[i]) ct++;
-    }
-    // Only 1 send must be successful
-    EXPECT_EQ(ct, 1);
-  }
-
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-
-// This tests that closing a channelholder unblocks
-//  any receivers waiting on the channel
-TEST(ChannelHolder, ChannelHolderCloseUnblocksReceiversTest) {
-  // Check for buffered channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(1);
-  ChannelHolderCloseUnblocksReceiversTest(ch);
-  delete ch;
-
-  // Check for unbuffered channel
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderCloseUnblocksReceiversTest(ch);
-  delete ch;
-}
-
-// This tests that closing a channelholder unblocks
-//  any senders waiting for channel to have write space
-TEST(Channel, ChannelHolderCloseUnblocksSendersTest) {
-  // Check for buffered channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(1);
-  ChannelHolderCloseUnblocksSendersTest(ch, true);
-  delete ch;
-
-  // Check for unbuffered channel
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderCloseUnblocksSendersTest(ch, false);
-  delete ch;
-}
-
-// This tests that destroying a channelholder unblocks
-//  any senders waiting for channel
-void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-  bool send_success[kNumThreads];
-
-  // Launches threads that try to write and are blocked because of no readers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    send_success[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended, bool *success) {
-          int data = 10;
-          bool is_exception = false;
-          try {
-            ch->Send(&data);
-          } catch (paddle::platform::EnforceNotMet e) {
-            is_exception = true;
-          }
-          *success = !is_exception;
-          *ended = true;
-        },
-        &thread_ended[i], &send_success[i]);
-  }
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
-  if (isBuffered) {
-    // If channel is buffered, verify that at least 4 threads are blocked
-    int ct = 0;
-    for (size_t i = 0; i < kNumThreads; i++) {
-      if (thread_ended[i] == false) ct++;
-    }
-    // At least 4 threads must be blocked
-    EXPECT_GE(ct, 4);
-  } else {
-    // Verify that all the threads are blocked
-    for (size_t i = 0; i < kNumThreads; i++) {
-      EXPECT_EQ(thread_ended[i], false);
-    }
-  }
-  // Explicitly destroy the channel
-  delete ch;
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  // Count number of successful sends
-  int ct = 0;
-  for (size_t i = 0; i < kNumThreads; i++) {
-    if (send_success[i]) ct++;
-  }
-
-  if (isBuffered) {
-    // Only 1 send must be successful
-    EXPECT_EQ(ct, 1);
-  } else {
-    // In unbuffered channel, no send should be successful
-    EXPECT_EQ(ct, 0);
-  }
-
-  // Join all threads
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-
-// This tests that destroying a channelholder also unblocks
-//  any receivers waiting on the channel
-void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
-  const size_t kNumThreads = 5;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-
-  // Launches threads that try to read and are blocked because of no writers
-  for (size_t i = 0; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          // All reads should return false
-          EXPECT_EQ(ch->Receive(&data), false);
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-
-  // Verify that all threads are blocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], false);
-  }
-  // delete the channel
-  delete ch;
-  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait
-  // Verify that all threads got unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-
-TEST(ChannelHolder, ChannelHolderDestroyUnblocksReceiversTest) {
-  // Check for Buffered Channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(1);
-  ChannelHolderDestroyUnblockReceivers(ch);
-  // ch is already deleted in
-  // ChannelHolderDestroyUnblockReceivers
-
-  // Check for Unbuffered channel
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderDestroyUnblockReceivers(ch);
-}
-
-TEST(ChannelHolder, ChannelHolderDestroyUnblocksSendersTest) {
-  // Check for Buffered Channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(1);
-  ChannelHolderDestroyUnblockSenders(ch, true);
-  // ch is already deleted in
-  // ChannelHolderDestroyUnblockSenders
-
-  // Check for Unbuffered channel
-  ch = new ChannelHolder();
-  ch->Reset<int>(0);
-  ChannelHolderDestroyUnblockSenders(ch, false);
-}
-
-// This tests closing a channelholder many times.
-void ChannelHolderManyTimesClose(ChannelHolder *ch) {
-  const int kNumThreads = 15;
-  std::thread t[kNumThreads];
-  bool thread_ended[kNumThreads];
-
-  // Launches threads that try to send data to channel.
-  for (size_t i = 0; i < kNumThreads / 3; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *ended) {
-          int data = 10;
-          ch->Send(&data);
-          *ended = true;
-        },
-        &thread_ended[i]);
-  }
-
-  // Launches threads that try to receive data from the channel.
-  for (size_t i = kNumThreads / 3; i < 2 * kNumThreads / 3; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          int data;
-          if (ch->Receive(&data)) {
-            EXPECT_EQ(data, 10);
-          }
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-
-  // Launches threads that try to close the channel.
-  for (size_t i = 2 * kNumThreads / 3; i < kNumThreads; i++) {
-    thread_ended[i] = false;
-    t[i] = std::thread(
-        [&](bool *p) {
-          if (!ch->IsClosed()) {
-            ch->close();
-          }
-          *p = true;
-        },
-        &thread_ended[i]);
-  }
-
-  std::this_thread::sleep_for(std::chrono::milliseconds(100));  // wait
-
-  // Verify that all threads are unblocked
-  for (size_t i = 0; i < kNumThreads; i++) {
-    EXPECT_EQ(thread_ended[i], true);
-  }
-  EXPECT_TRUE(ch->IsClosed());
-  // delete the channel
-  delete ch;
-  for (size_t i = 0; i < kNumThreads; i++) t[i].join();
-}
-
-TEST(ChannelHolder, ChannelHolderManyTimesCloseTest) {
-  // Check for Buffered Channel
-  ChannelHolder *ch = new ChannelHolder();
-  ch->Reset<int>(10);
-  ChannelHolderManyTimesClose(ch);
-}
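
Most of the deleted tests above probe one invariant from several angles: a blocked receiver (or sender) must be released when the channel is closed or destroyed. A compact illustration of the receiver half of that property, reusing the MiniChannel sketch shown after the channel_impl.h hunk (so not standalone on its own; the sleep-based check is the same heuristic the deleted tests used):

    // close_unblocks_sketch.cc - append to channel_sketch.cc in place of its main()
    #include <atomic>
    #include <cassert>
    #include <chrono>

    int main() {
      MiniChannel<int> ch(1);
      std::atomic<bool> returned{false};
      std::thread receiver([&] {
        int v = 0;
        bool ok = ch.Receive(&v);  // blocks: nothing has been sent
        assert(!ok);               // after Close(), Receive must report false
        returned = true;
      });
      std::this_thread::sleep_for(std::chrono::milliseconds(50));
      assert(!returned);           // still blocked before the close
      ch.Close();                  // must wake the blocked receiver
      receiver.join();
      assert(returned.load());
      return 0;
    }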
diff --git a/paddle/fluid/framework/concurrency_test.cc b/paddle/fluid/framework/concurrency_test.cc
deleted file mode 100644
index bbf67f5ba9..0000000000
--- a/paddle/fluid/framework/concurrency_test.cc
+++ /dev/null
@@ -1,292 +0,0 @@
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <thread>  // NOLINT
-
-#include "gtest/gtest.h"
-#include "paddle/fluid/framework/block_desc.h"
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-USE_NO_KERNEL_OP(go);
-USE_NO_KERNEL_OP(channel_close);
-USE_NO_KERNEL_OP(channel_create);
-USE_NO_KERNEL_OP(channel_recv);
-USE_NO_KERNEL_OP(channel_send);
-USE_NO_KERNEL_OP(elementwise_add);
-USE_NO_KERNEL_OP(select);
-USE_NO_KERNEL_OP(conditional_block);
-USE_NO_KERNEL_OP(equal);
-USE_NO_KERNEL_OP(assign);
-USE_NO_KERNEL_OP(while);
-USE_NO_KERNEL_OP(print);
-
-namespace f = paddle::framework;
-namespace p = paddle::platform;
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-LoDTensor *CreateVariable(Scope *scope, const p::CPUPlace &place,
-                          std::string name, T value) {
-  // Create LoDTensor<int> of dim [1]
-  auto var = scope->Var(name);
-  auto tensor = var->GetMutable<LoDTensor>();
-  tensor->Resize({1});
-  T *expect = tensor->mutable_data<T>(place);
-  expect[0] = value;
-  return tensor;
-}
-
-void AddOp(const std::string &type, const VariableNameMap &inputs,
-           const VariableNameMap &outputs, AttributeMap attrs,
-           BlockDesc *block) {
-  // insert op
-  auto op = block->AppendOp();
-  op->SetType(type);
-  for (auto &kv : inputs) {
-    op->SetInput(kv.first, kv.second);
-  }
-  for (auto &kv : outputs) {
-    op->SetOutput(kv.first, kv.second);
-  }
-  op->SetAttrMap(attrs);
-}
-
-void AddCase(ProgramDesc *program, Scope *scope, p::CPUPlace *place,
-             BlockDesc *casesBlock, int caseId, int caseType,
-             std::string caseChannel, std::string caseVarName,
-             std::function<void(BlockDesc *, Scope *)> func) {
-  std::string caseCondName = std::string("caseCond") + std::to_string(caseId);
-  std::string caseCondXVarName =
-      std::string("caseCondX") + std::to_string(caseId);
-
-  BlockDesc *caseBlock = program->AppendBlock(*casesBlock);
-  func(caseBlock, scope);
-
-  CreateVariable(scope, *place, caseCondName, false);
-  CreateVariable(scope, *place, caseCondXVarName, caseId);
-  CreateVariable(scope, *place, caseVarName, caseId);
-
-  scope->Var("step_scope");
-
-  AddOp("equal", {{"X", {caseCondXVarName}}, {"Y", {"caseToExecute"}}},
-        {{"Out", {caseCondName}}}, {}, casesBlock);
-
-  AddOp("conditional_block", {{"X", {caseCondName}}, {"Params", {}}},
-        {{"Out", {}}, {"Scope", {"step_scope"}}},
-        {{"sub_block", caseBlock}, {"is_scalar_condition", true}}, casesBlock);
-}
-
-void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
-                        BlockDesc *parentBlock, std::string dataChanName,
-                        std::string quitChanName) {
-  BlockDesc *whileBlock = program->AppendBlock(*parentBlock);
-
-  CreateVariable(scope, *place, "whileExitCond", true);
-  CreateVariable(scope, *place, "caseToExecute", -1);
-  CreateVariable(scope, *place, "case1var", 0);
-
-  CreateVariable(scope, *place, "xtemp", 0);
-
-  // TODO(thuan): Need to create fibXToSend, since channel send moves the
-  // actual data, which makes it no longer accessible for the fib calculation
-  // TODO(abhinav): Change channel send to do a copy instead of a move!
-  CreateVariable(scope, *place, "fibXToSend", 0);
-
-  CreateVariable(scope, *place, "fibX", 0);
-  CreateVariable(scope, *place, "fibY", 1);
-  CreateVariable(scope, *place, "quitVar", 0);
-
-  BlockDesc *casesBlock = program->AppendBlock(*whileBlock);
-  std::function<void(BlockDesc * caseBlock)> f = [](BlockDesc *caseBlock) {};
-
-  // TODO(thuan): Remove this once we change channel send to do a copy instead
-  // of move
-  AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"fibXToSend"}}}, {}, whileBlock);
-
-  // Case 0: Send to dataChanName
-  std::function<void(BlockDesc * caseBlock, Scope * scope)> case0Func = [&](
-      BlockDesc *caseBlock, Scope *scope) {
-    AddOp("assign", {{"X", {"fibX"}}}, {{"Out", {"xtemp"}}}, {}, caseBlock);
-    AddOp("assign", {{"X", {"fibY"}}}, {{"Out", {"fibX"}}}, {}, caseBlock);
-    AddOp("elementwise_add", {{"X", {"xtemp"}}, {"Y", {"fibY"}}},
-          {{"Out", {"fibY"}}}, {}, caseBlock);
-  };
-  AddCase(program, scope, place, casesBlock, 0, 1, dataChanName, "fibXToSend",
-          case0Func);
-  std::string case0Config =
-      std::string("0,1,") + dataChanName + std::string(",fibXToSend");
-
-  // Case 1: Receive from quitChanName
-  std::function<void(BlockDesc * caseBlock, Scope * scope)> case2Func = [&](
-      BlockDesc *caseBlock, Scope *scope) {
-    // Exit the while loop after we receive from the quit channel.
-    // We assign false to the "whileExitCond" variable, which breaks
-    // out of the while_op loop
-    CreateVariable(scope, *place, "whileFalse", false);
-    AddOp("assign", {{"X", {"whileFalse"}}}, {{"Out", {"whileExitCond"}}}, {},
-          caseBlock);
-  };
-  AddCase(program, scope, place, casesBlock, 1, 2, quitChanName, "quitVar",
-          case2Func);
-  std::string case1Config =
-      std::string("1,2,") + quitChanName + std::string(",quitVar");
-
-  // Select block
-  AddOp("select", {{"X", {dataChanName, quitChanName}},
-                   {"case_to_execute", {"caseToExecute"}}},
-        {{"Out", {}}},
-        {{"sub_block", casesBlock},
-         {"cases", std::vector<std::string>{case0Config, case1Config}}},
-        whileBlock);
-
-  scope->Var("stepScopes");
-  AddOp("while",
-        {{"X", {dataChanName, quitChanName}}, {"Condition", {"whileExitCond"}}},
-        {{"Out", {}}, {"StepScopes", {"stepScopes"}}},
-        {{"sub_block", whileBlock}}, parentBlock);
-}
-
-TEST(Concurrency, Go_Op) {
-  Scope scope;
-  p::CPUPlace place;
-
-  // Initialize scope variables
-  p::CPUDeviceContext ctx(place);
-
-  // Create channel variable
-  scope.Var("Channel");
-
-  // Create Variables, x0 will be put into channel,
-  // result will be pulled from channel
-  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(&scope, place, "x0", 99);
-  CreateVariable(&scope, place, "result", 0);
-
-  framework::Executor executor(place);
-  ProgramDesc program;
-  BlockDesc *block = program.MutableBlock(0);
-
-  // Create channel OP
-  AddOp("channel_create", {}, {{"Out", {"Channel"}}},
-        {{"capacity", 10}, {"data_type", f::proto::VarType::LOD_TENSOR}},
-        block);
-
-  // Create Go Op routine
-  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
-  AddOp("channel_send", {{"Channel", {"Channel"}}, {"X", {"x0"}}},
-        {{"Status", {"Status"}}}, {}, goOpBlock);
-
-  // Create Go Op
-  AddOp("go", {{"X", {"Channel", "x0"}}}, {}, {{"sub_block", goOpBlock}},
-        block);
-
-  // Create Channel Receive Op
-  AddOp("channel_recv", {{"Channel", {"Channel"}}},
-        {{"Status", {"Status"}}, {"Out", {"result"}}}, {}, block);
-
-  // Create Channel Close Op
-  AddOp("channel_close", {{"Channel", {"Channel"}}}, {}, {}, block);
-
-  // Check the result tensor to make sure it is set to 0
-  const LoDTensor &tensor = (scope.FindVar("result"))->Get<LoDTensor>();
-  auto *initialData = tensor.data<int>();
-  EXPECT_EQ(initialData[0], 0);
-
-  executor.Run(program, &scope, 0, true, true);
-
-  // After we call executor.run, the Go operator should have sent "x0" through
-  // the channel, so channel_recv sets the "result" variable to 99.
-  auto *finalData = tensor.data<int>();
-  EXPECT_EQ(finalData[0], 99);
-}
-
-/**
- * This test implements the fibonacci function using go_op and select_op
- */
-TEST(Concurrency, Select) {
-  Scope scope;
-  p::CPUPlace place;
-
-  // Initialize scope variables
-  p::CPUDeviceContext ctx(place);
-
-  CreateVariable(&scope, place, "Status", false);
-  CreateVariable(&scope, place, "result", 0);
-  CreateVariable(&scope, place, "currentXFib", 0);
-
-  framework::Executor executor(place);
-  ProgramDesc program;
-  BlockDesc *block = program.MutableBlock(0);
-
-  // Create channel OP
-  std::string dataChanName = "Channel";
-  scope.Var(dataChanName);
-  AddOp("channel_create", {}, {{"Out", {dataChanName}}},
-        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
-
-  std::string quitChanName = "Quit";
-  scope.Var(quitChanName);
-  AddOp("channel_create", {}, {{"Out", {quitChanName}}},
-        {{"capacity", 0}, {"data_type", f::proto::VarType::LOD_TENSOR}}, block);
-
-  // Create Go Op routine, which loops 10 times over fibonacci sequence
-  CreateVariable(&scope, place, "xReceiveVar", 0);
-
-  BlockDesc *goOpBlock = program.AppendBlock(program.Block(0));
-  for (int i = 0; i < 10; ++i) {
-    AddOp("channel_recv", {{"Channel", {dataChanName}}},
-          {{"Status", {"Status"}}, {"Out", {"currentXFib"}}}, {}, goOpBlock);
-    AddOp("print", {{"In", {"currentXFib"}}}, {{"Out", {"currentXFib"}}},
-          {{"first_n", 100},
-           {"summarize", -1},
-           {"print_tensor_name", false},
-           {"print_tensor_type", true},
-           {"print_tensor_shape", false},
-           {"print_tensor_lod", false},
-           {"print_phase", std::string("FORWARD")},
-           {"message", std::string("X: ")}},
-          goOpBlock);
-  }
-
-  CreateVariable(&scope, place, "quitSignal", 0);
-  AddOp("channel_send", {{"Channel", {quitChanName}}, {"X", {"quitSignal"}}},
-        {{"Status", {"Status"}}}, {}, goOpBlock);
-
-  // Create Go Op
-  AddOp("go", {{"X", {dataChanName, quitChanName}}}, {},
-        {{"sub_block", goOpBlock}}, block);
-
-  AddFibonacciSelect(&scope, &place, &program, block, dataChanName,
-                     quitChanName);
-
-  // Create Channel Close Op
-  AddOp("channel_close", {{"Channel", {dataChanName}}}, {}, {}, block);
-  AddOp("channel_close", {{"Channel", {quitChanName}}}, {}, {}, block);
-
-  executor.Run(program, &scope, 0, true, true);
-
-  // After we call executor.run, "result" variable should be equal to 34
-  // (which is 10 loops through fibonacci sequence)
-  const LoDTensor &tensor = (scope.FindVar("currentXFib"))->Get<LoDTensor>();
-  auto *finalData = tensor.data<int>();
-  EXPECT_EQ(finalData[0], 34);
-}
-
-}  // namespace framework
-}  // namespace paddle
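
The deleted tests above exercised send, receive and close on buffered and unbuffered channels through the go/select/channel_* operators. For orientation only, here is a minimal sketch of the buffered send/receive/close semantics they relied on, written with standard C++11 primitives and a hypothetical SimpleChannel class (no Paddle APIs):

// Illustrative sketch, not part of this patch: a buffered channel built from
// std::mutex / std::condition_variable. Capacity must be >= 1; the unbuffered
// (capacity 0) case needs an explicit sender/receiver rendezvous.
#include <condition_variable>
#include <mutex>
#include <queue>

template <typename T>
class SimpleChannel {
 public:
  explicit SimpleChannel(size_t capacity) : capacity_(capacity) {}

  // Blocks while the buffer is full; returns false if the channel was closed.
  bool Send(T value) {
    std::unique_lock<std::mutex> lock(mu_);
    send_cv_.wait(lock, [&] { return closed_ || buf_.size() < capacity_; });
    if (closed_) return false;
    buf_.push(std::move(value));
    recv_cv_.notify_one();
    return true;
  }

  // Blocks while the buffer is empty; returns false once closed and drained.
  bool Receive(T* out) {
    std::unique_lock<std::mutex> lock(mu_);
    recv_cv_.wait(lock, [&] { return closed_ || !buf_.empty(); });
    if (buf_.empty()) return false;
    *out = std::move(buf_.front());
    buf_.pop();
    send_cv_.notify_one();
    return true;
  }

  void Close() {
    std::lock_guard<std::mutex> lock(mu_);
    closed_ = true;
    send_cv_.notify_all();
    recv_cv_.notify_all();
  }

 private:
  const size_t capacity_;
  bool closed_ = false;
  std::queue<T> buf_;
  std::mutex mu_;
  std::condition_variable send_cv_, recv_cv_;
};

The blocking hand-off used when no case is immediately ready is what ChannelHolder's AddToSendQ / AddToReceiveQ queues provided; that machinery is removed further down in this patch.
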
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 8d8042a056..70ec6e90a4 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/executor.h"
 
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
@@ -76,15 +75,13 @@ void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
     var->GetMutable<platform::PlaceList>();
   } else if (var_type == proto::VarType::READER) {
     var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::CHANNEL) {
-    var->GetMutable<ChannelHolder>();
   } else if (var_type == proto::VarType::RAW) {
     // GetMutable will be called in operator
   } else {
     PADDLE_THROW(
         "Variable type %d is not in "
         "[LOD_TENSOR, SELECTED_ROWS, FEED_MINIBATCH, FETCH_LIST, "
-        "LOD_RANK_TABLE, PLACE_LIST, READER, CHANNEL, RAW]",
+        "LOD_RANK_TABLE, PLACE_LIST, READER, RAW]",
         var_type);
   }
 }
diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto
index 460401df54..25f0ba4184 100644
--- a/paddle/fluid/framework/framework.proto
+++ b/paddle/fluid/framework/framework.proto
@@ -126,7 +126,6 @@ message VarType {
     LOD_TENSOR_ARRAY = 13;
     PLACE_LIST = 14;
     READER = 15;
-    CHANNEL = 16;
     // Any runtime decided variable type is raw
     // raw variables should manage their own allocations
     // in operators like nccl_op
@@ -158,12 +157,6 @@ message VarType {
   message ReaderDesc { repeated LoDTensorDesc lod_tensor = 1; }
   optional ReaderDesc reader = 5;
 
-  message ChannelDesc {
-    required Type data_type = 1;
-    required int64 capacity = 2;
-  }
-  optional ChannelDesc channel = 6;
-
   message Tuple { repeated Type element_type = 1; }
   optional Tuple tuple = 7;
 }
diff --git a/paddle/fluid/framework/tuple.h b/paddle/fluid/framework/tuple.h
index f6c6a1fec1..508ee931c6 100644
--- a/paddle/fluid/framework/tuple.h
+++ b/paddle/fluid/framework/tuple.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/var_desc.h"
diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc
index 1aa0ae0f7c..7e3f002b53 100644
--- a/paddle/fluid/framework/var_desc.cc
+++ b/paddle/fluid/framework/var_desc.cc
@@ -88,13 +88,7 @@ std::vector<std::vector<int64_t>> VarDesc::GetShapes() const {
 }
 
 void VarDesc::SetDataType(proto::VarType::Type data_type) {
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      mutable_channel_desc()->set_data_type(data_type);
-      break;
-    default:
-      mutable_tensor_desc()->set_data_type(data_type);
-  }
+  mutable_tensor_desc()->set_data_type(data_type);
 }
 
 void VarDesc::SetDataTypes(
@@ -115,13 +109,7 @@ void VarDesc::SetDataTypes(
 }
 
 proto::VarType::Type VarDesc::GetDataType() const {
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return channel_desc().data_type();
-      break;
-    default:
-      return tensor_desc().data_type();
-  }
+  return tensor_desc().data_type();
 }
 
 std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const {
@@ -134,17 +122,6 @@ std::vector<proto::VarType::Type> VarDesc::GetDataTypes() const {
   return res;
 }
 
-void VarDesc::SetCapacity(int64_t capacity) {
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      desc_.mutable_type()->mutable_channel()->set_capacity(capacity);
-      break;
-    default:
-      PADDLE_THROW("Setting 'capacity' is not supported by the type of var %s.",
-                   this->Name());
-  }
-}
-
 void VarDesc::SetLoDLevel(int32_t lod_level) {
   switch (desc_.type().type()) {
     case proto::VarType::LOD_TENSOR:
@@ -214,19 +191,6 @@ std::vector<int32_t> VarDesc::GetLoDLevels() const {
   }
 }
 
-const proto::VarType::ChannelDesc &VarDesc::channel_desc() const {
-  PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return desc_.type().channel();
-    default:
-      PADDLE_THROW(
-          "Getting 'channel_desc' is not supported by the type of var %s.",
-          this->Name());
-  }
-}
-
 const proto::VarType::TensorDesc &VarDesc::tensor_desc() const {
   PADDLE_ENFORCE(desc_.has_type(), "The var's type hasn't been set.");
   PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
@@ -262,20 +226,6 @@ std::vector<proto::VarType::TensorDesc> VarDesc::tensor_descs() const {
   }
 }
 
-proto::VarType::ChannelDesc *VarDesc::mutable_channel_desc() {
-  PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
-  PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
-  switch (desc_.type().type()) {
-    case proto::VarType::CHANNEL:
-      return desc_.mutable_type()->mutable_channel();
-    default:
-      PADDLE_THROW(
-          "Getting 'mutable_channel_desc' is not supported by the type of var "
-          "%s.",
-          this->Name());
-  }
-}
-
 proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() {
   PADDLE_ENFORCE(desc_.has_type(), "The var type hasn't been set.");
   PADDLE_ENFORCE(desc_.type().has_type(), "The var type hasn't been set.");
diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h
index 9f7a21ef42..e33849ef50 100644
--- a/paddle/fluid/framework/var_desc.h
+++ b/paddle/fluid/framework/var_desc.h
@@ -87,8 +87,6 @@ class VarDesc {
   void SetDataTypes(
       const std::vector<proto::VarType::Type> &multiple_data_type);
 
-  void SetCapacity(int64_t capacity);
-
   proto::VarType::Type GetDataType() const;
 
   std::vector<proto::VarType::Type> GetDataTypes() const;
@@ -110,10 +108,8 @@ class VarDesc {
   void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }
 
  private:
-  const proto::VarType::ChannelDesc &channel_desc() const;
   const proto::VarType::TensorDesc &tensor_desc() const;
   std::vector<proto::VarType::TensorDesc> tensor_descs() const;
-  proto::VarType::ChannelDesc *mutable_channel_desc();
   proto::VarType::TensorDesc *mutable_tensor_desc();
   std::vector<proto::VarType::TensorDesc *> mutable_tensor_descs();
 
diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h
index e9550dbfb9..3b6f1cdb8f 100644
--- a/paddle/fluid/framework/var_type.h
+++ b/paddle/fluid/framework/var_type.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -41,8 +40,6 @@ inline proto::VarType::Type ToVarType(std::type_index type) {
     return proto::VarType_Type_SELECTED_ROWS;
   } else if (IsType<ReaderHolder>(type)) {
     return proto::VarType_Type_READER;
-  } else if (IsType<ChannelHolder>(type)) {
-    return proto::VarType_Type_CHANNEL;
   } else {
     PADDLE_THROW("ToVarType:Unsupported type %s", type.name());
   }
@@ -66,9 +63,6 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
     case proto::VarType_Type_READER:
       visitor(var.Get<ReaderHolder>());
       return;
-    case proto::VarType_Type_CHANNEL:
-      visitor(var.Get<ChannelHolder>());
-      return;
     default:
       PADDLE_THROW("Not supported visit type, %d", ToVarType(var.Type()));
   }
diff --git a/paddle/fluid/inference/analysis/analysis_pass.h b/paddle/fluid/inference/analysis/analysis_pass.h
index b6edb5529a..13805ea4ac 100644
--- a/paddle/fluid/inference/analysis/analysis_pass.h
+++ b/paddle/fluid/inference/analysis/analysis_pass.h
@@ -41,12 +41,6 @@ class AnalysisPass {
   // all passes have run.
   virtual bool Finalize() { return false; }
 
-  // Get a Pass appropriate to print the Node this pass operates on.
-  virtual AnalysisPass *CreatePrinterPass(std::ostream &os,
-                                          const std::string &banner) const {
-    return nullptr;
-  }
-
   // Create a debugger Pass that draw the DFG by graphviz toolkit.
   virtual AnalysisPass *CreateGraphvizDebugerPass() const { return nullptr; }
 
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 9c67df7bdf..fa41266d62 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -313,11 +313,6 @@ op_library(save_combine_op DEPS lod_tensor)
 op_library(load_combine_op DEPS lod_tensor)
 op_library(concat_op DEPS concat)
 
-# FIXME(thuan): Move CSP operators to paddle/fluid/framework/operators/concurrency
-add_subdirectory(concurrency)
-op_library(channel_send_op DEPS concurrency)
-op_library(channel_recv_op DEPS concurrency)
-
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 
 foreach(src ${GENERAL_OPS})
diff --git a/paddle/fluid/operators/channel_close_op.cc b/paddle/fluid/operators/channel_close_op.cc
deleted file mode 100644
index 8e2db250a0..0000000000
--- a/paddle/fluid/operators/channel_close_op.cc
+++ /dev/null
@@ -1,70 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace pf = paddle::framework;
-static constexpr char kChannel[] = "Channel";
-
-namespace paddle {
-namespace operators {
-
-class ChannelCloseOp : public framework::OperatorBase {
- public:
-  ChannelCloseOp(const std::string &type,
-                 const framework::VariableNameMap &inputs,
-                 const framework::VariableNameMap &outputs,
-                 const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto &inp = *scope.FindVar(Input(kChannel));
-
-    // Get the mutable version of the channel variable and closes it.
-    pf::ChannelHolder *ch = inp.GetMutable<framework::ChannelHolder>();
-    ch->close();
-  }
-};
-
-class ChannelCloseOpOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasInput("Channel"),
-                   "The input of ChannelClose op must be set");
-  }
-};
-
-class ChannelCloseOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kChannel,
-             "The Channel Variable that should be closed by"
-             " the ChannelClose Op.");
-    AddComment(R"DOC(
-Channel Close Operator.
-
-This operator closes an open channel.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(channel_close, paddle::operators::ChannelCloseOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelCloseOpMaker);
diff --git a/paddle/fluid/operators/channel_create_op.cc b/paddle/fluid/operators/channel_create_op.cc
deleted file mode 100644
index a7f59e4088..0000000000
--- a/paddle/fluid/operators/channel_create_op.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/lod_rank_table.h"
-#include "paddle/fluid/framework/lod_tensor_array.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/reader.h"
-
-namespace pf = paddle::framework;
-
-static constexpr char kOutput[] = "Out";
-
-namespace paddle {
-namespace operators {
-
-class ChannelCreateOp : public framework::OperatorBase {
- public:
-  ChannelCreateOp(const std::string &type,
-                  const framework::VariableNameMap &inputs,
-                  const framework::VariableNameMap &outputs,
-                  const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    auto &out = *scope.FindVar(Output(kOutput));
-
-    // Determine the datatype and capacity of the channel to be created
-    // from the attributes provided.
-    auto dtype =
-        static_cast<framework::proto::VarType::Type>(Attr<int>("data_type"));
-    auto capacity = Attr<int>("capacity");
-
-    // Based on the datatype, create a new channel holder initialized with
-    // the given capacity. When capacity is 0, an unbuffered channel is
-    // created.
-    pf::ChannelHolder *ch = out.GetMutable<framework::ChannelHolder>();
-    if (dtype == framework::proto::VarType::LOD_TENSOR) {
-      ch->Reset<pf::LoDTensor>(capacity);
-    } else if (dtype == framework::proto::VarType::SELECTED_ROWS) {
-      ch->Reset<pf::SelectedRows>(capacity);
-    } else if (dtype == framework::proto::VarType::LOD_RANK_TABLE) {
-      ch->Reset<pf::LoDRankTable>(capacity);
-    } else if (dtype == framework::proto::VarType::LOD_TENSOR_ARRAY) {
-      ch->Reset<pf::LoDTensorArray>(capacity);
-    } else if (dtype == framework::proto::VarType::READER) {
-      ch->Reset<pf::ReaderHolder>(capacity);
-    } else if (dtype == framework::proto::VarType::CHANNEL) {
-      ch->Reset<pf::ChannelHolder>(capacity);
-    } else if (dtype == framework::proto::VarType::BOOL) {
-      ch->Reset<bool>(capacity);
-    } else if (dtype == framework::proto::VarType::INT32) {
-      ch->Reset<int>(capacity);
-    } else if (dtype == framework::proto::VarType::INT64) {
-      ch->Reset<int64_t>(capacity);
-    } else if (dtype == framework::proto::VarType::FP32) {
-      ch->Reset<float>(capacity);
-    } else if (dtype == framework::proto::VarType::FP64) {
-      ch->Reset<double>(capacity);
-    } else {
-      PADDLE_THROW(
-          "Data type %d is not in "
-          "[LOD_TENSOR, SELECTED_ROWS, LOD_RANK_TABLE, LOD_TENSOR_ARRAY, "
-          "READER, CHANNEL, BOOL, INT32, INT64, FP32, FP64]",
-          dtype);
-    }
-  }
-};
-
-class ChannelCreateOpOpInferShape : public framework::InferShapeBase {
- public:
-  void operator()(framework::InferShapeContext *context) const override {
-    PADDLE_ENFORCE(context->HasOutput(kOutput),
-                   "The output of ChannelCreate op must be set");
-    context->SetOutputDim(kOutput, {1});
-  }
-};
-
-class ChannelCreateOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddOutput(kOutput,
-              "The object of a Channel type created by ChannelCreate Op.");
-    AddAttr<int>("capacity", "The size of the buffer of Channel.")
-        .SetDefault(0);
-    AddAttr<int>("data_type", "The data type of elements inside the Channel.");
-    AddComment(R"DOC(
-Channel Create Operator.
-
-This operator creates an object of the VarType Channel and returns it.
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(channel_create, paddle::operators::ChannelCreateOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelCreateOpMaker);
diff --git a/paddle/fluid/operators/channel_recv_op.cc b/paddle/fluid/operators/channel_recv_op.cc
deleted file mode 100644
index 101015e837..0000000000
--- a/paddle/fluid/operators/channel_recv_op.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/channel.h"
-#include <paddle/fluid/framework/lod_rank_table.h>
-#include <paddle/fluid/framework/lod_tensor_array.h>
-#include <paddle/fluid/framework/reader.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-static constexpr char Channel[] = "Channel";
-static constexpr char Status[] = "Status";
-static constexpr char Out[] = "Out";
-
-namespace paddle {
-namespace operators {
-
-void SetReceiveStatus(const platform::Place &dev_place,
-                      framework::Variable *status_var, bool status) {
-  auto cpu = platform::CPUPlace();
-  auto status_tensor =
-      status_var->GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
-                                                                         cpu);
-  status_tensor[0] = status;
-}
-
-class ChannelRecvOp : public framework::OperatorBase {
- public:
-  ChannelRecvOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const {
-    PADDLE_ENFORCE(ctx->HasInput(Channel),
-                   "Input(Channel) of ChannelRecvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(Out),
-                   "Output(Out) of ChannelRecvOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput(Status),
-                   "Output(Status) of ChannelRecvOp should not be null.");
-    ctx->SetOutputDim("Status", {1});
-  }
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    // Get the channel holder created by channel_create op, passed as input.
-    framework::ChannelHolder *ch =
-        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
-    auto output_var = scope.FindVar(Output(Out));
-    // Receive the data from the channel.
-    bool ok = concurrency::ChannelReceive(ch, output_var);
-
-    // Set the status output of the `ChannelReceive` call.
-    SetReceiveStatus(dev_place, scope.FindVar(Output(Status)), ok);
-  }
-};
-
-class ChannelRecvOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(Channel,
-             "(Channel) A variable which \"receives\" the value sent "
-             "to it by a channel_send op.")
-        .AsDuplicable();
-    AddOutput(Out,
-              "(Variable) Output Variable that will hold the data received"
-              " from the Channel")
-        .AsDuplicable();
-    AddOutput(Status,
-              "(Tensor) An LoD Tensor that returns a boolean status of the "
-              "result of the receive operation.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(channel_recv, paddle::operators::ChannelRecvOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelRecvOpMaker);
diff --git a/paddle/fluid/operators/channel_send_op.cc b/paddle/fluid/operators/channel_send_op.cc
deleted file mode 100644
index 67d6deb511..0000000000
--- a/paddle/fluid/operators/channel_send_op.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/framework/channel.h"
-#include <paddle/fluid/framework/lod_rank_table.h>
-#include <paddle/fluid/framework/lod_tensor_array.h>
-#include <paddle/fluid/framework/reader.h>
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
-
-static constexpr char Channel[] = "Channel";
-static constexpr char X[] = "X";
-
-namespace paddle {
-namespace operators {
-
-class ChannelSendOp : public framework::OperatorBase {
- public:
-  ChannelSendOp(const std::string &type,
-                const framework::VariableNameMap &inputs,
-                const framework::VariableNameMap &outputs,
-                const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
-  void InferShape(framework::InferShapeContext *ctx) const {
-    PADDLE_ENFORCE(ctx->HasInput(Channel),
-                   "Input(Channel) of ChannelSendOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput(X),
-                   "Input(X) of ChannelSendOp should not be null.");
-  }
-
- private:
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    // Get the channel holder created by channel_create op, passed as input.
-    framework::ChannelHolder *ch =
-        scope.FindVar(Input(Channel))->GetMutable<framework::ChannelHolder>();
-    auto input_var = scope.FindVar(Input(X));
-
-    // Send the input data through the channel.
-    concurrency::ChannelSend(ch, input_var);
-  }
-};
-
-class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(Channel,
-             "(Channel) A variable which \"sends\" the passed in value to "
-             "a listening receiver.")
-        .AsDuplicable();
-    AddInput(X, "(Variable) The value which gets sent by the channel.")
-        .AsDuplicable();
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(channel_send, paddle::operators::ChannelSendOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::ChannelSendOpMaker);
diff --git a/paddle/fluid/operators/concurrency/CMakeLists.txt b/paddle/fluid/operators/concurrency/CMakeLists.txt
deleted file mode 100644
index e4617440d1..0000000000
--- a/paddle/fluid/operators/concurrency/CMakeLists.txt
+++ /dev/null
@@ -1 +0,0 @@
-cc_library(concurrency SRCS channel_util.cc DEPS device_context framework_proto boost eigen3)
diff --git a/paddle/fluid/operators/concurrency/channel_util.cc b/paddle/fluid/operators/concurrency/channel_util.cc
deleted file mode 100644
index fba4abf189..0000000000
--- a/paddle/fluid/operators/concurrency/channel_util.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-#include "paddle/fluid/framework/var_type.h"
-
-namespace poc = paddle::operators::concurrency;
-
-void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR)
-    ch->Send(var->GetMutable<framework::LoDTensor>());
-  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
-    ch->Send(var->GetMutable<framework::LoDRankTable>());
-  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
-    ch->Send(var->GetMutable<framework::LoDTensorArray>());
-  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
-    ch->Send(var->GetMutable<framework::SelectedRows>());
-  else if (type == framework::proto::VarType_Type_READER)
-    ch->Send(var->GetMutable<framework::ReaderHolder>());
-  else if (type == framework::proto::VarType_Type_CHANNEL)
-    ch->Send(var->GetMutable<framework::ChannelHolder>());
-  else
-    PADDLE_THROW("ChannelSend:Unsupported type");
-}
-
-bool poc::ChannelReceive(framework::ChannelHolder *ch,
-                         framework::Variable *var) {
-  // Get type of channel and use that to call mutable data for Variable
-  auto type = framework::ToVarType(ch->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR)
-    return ch->Receive(var->GetMutable<framework::LoDTensor>());
-  else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
-    return ch->Receive(var->GetMutable<framework::LoDRankTable>());
-  else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
-    return ch->Receive(var->GetMutable<framework::LoDTensorArray>());
-  else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
-    return ch->Receive(var->GetMutable<framework::SelectedRows>());
-  else if (type == framework::proto::VarType_Type_READER)
-    return ch->Receive(var->GetMutable<framework::ReaderHolder>());
-  else if (type == framework::proto::VarType_Type_CHANNEL)
-    return ch->Receive(var->GetMutable<framework::ChannelHolder>());
-  else
-    PADDLE_THROW("ChannelReceive:Unsupported type");
-}
-
-void poc::ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
-                            framework::Variable *var,
-                            std::shared_ptr<std::condition_variable_any> cond,
-                            std::function<bool(framework::ChannelAction)> cb) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensor>(), cond, cb);
-  } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDRankTable>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::LoDTensorArray>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::SelectedRows>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_READER) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::ReaderHolder>(), cond,
-                   cb);
-  } else if (type == framework::proto::VarType_Type_CHANNEL) {
-    ch->AddToSendQ(referrer, var->GetMutable<framework::ChannelHolder>(), cond,
-                   cb);
-  } else {
-    PADDLE_THROW("ChannelAddToSendQ:Unsupported type");
-  }
-}
-
-void poc::ChannelAddToReceiveQ(
-    framework::ChannelHolder *ch, const void *referrer,
-    framework::Variable *var, std::shared_ptr<std::condition_variable_any> cond,
-    std::function<bool(framework::ChannelAction)> cb) {
-  auto type = framework::ToVarType(var->Type());
-  if (type == framework::proto::VarType_Type_LOD_TENSOR) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensor>(), cond,
-                      cb);
-  } else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDRankTable>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::LoDTensorArray>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_SELECTED_ROWS) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::SelectedRows>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_READER) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::ReaderHolder>(),
-                      cond, cb);
-  } else if (type == framework::proto::VarType_Type_CHANNEL) {
-    ch->AddToReceiveQ(referrer, var->GetMutable<framework::ChannelHolder>(),
-                      cond, cb);
-  } else {
-    PADDLE_THROW("ChannelAddToReceiveQ:Unsupported type");
-  }
-}
diff --git a/paddle/fluid/operators/concurrency/channel_util.h b/paddle/fluid/operators/concurrency/channel_util.h
deleted file mode 100644
index cd18ca78c6..0000000000
--- a/paddle/fluid/operators/concurrency/channel_util.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/variable.h"
-
-namespace paddle {
-namespace operators {
-namespace concurrency {
-
-void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var);
-bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var);
-
-void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
-                       framework::Variable *var,
-                       std::shared_ptr<std::condition_variable_any> cond,
-                       std::function<bool(framework::ChannelAction)> cb);
-void ChannelAddToReceiveQ(framework::ChannelHolder *ch, const void *referrer,
-                          framework::Variable *var,
-                          std::shared_ptr<std::condition_variable_any> cond,
-                          std::function<bool(framework::ChannelAction)> cb);
-
-}  // namespace concurrency
-}  // namespace operators
-}  // namespace paddle
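
The removed channel_util helpers branch on the runtime VarType and forward the correctly typed payload to ChannelHolder. As a rough standalone analogue of that dispatch shape (assuming a hypothetical VarPayload variant and PrintingChannel stub in place of Paddle's Variable/ChannelHolder types), C++17's std::visit collapses the if/else-if ladder into a single visitation:

// Illustrative sketch only; VarPayload, typed_send and PrintingChannel are
// made-up names, not Paddle APIs.
#include <cstdint>
#include <iostream>
#include <string>
#include <variant>
#include <vector>

using VarPayload = std::variant<int64_t, float, std::vector<float>, std::string>;

// One generic visit replaces the per-type if/else-if chain over VarType values.
template <typename Channel>
void typed_send(Channel& ch, VarPayload& var) {
  std::visit([&](auto& value) { ch.Send(&value); }, var);
}

// A stub channel that only reports the static size of the payload it was given.
struct PrintingChannel {
  template <typename T>
  void Send(T* value) {
    std::cout << "sending payload of " << sizeof(*value) << " bytes\n";
  }
};

int main() {
  PrintingChannel ch;
  VarPayload v = std::vector<float>{1.f, 2.f, 3.f};
  typed_send(ch, v);  // resolves to Send(std::vector<float>*)
  return 0;
}

Paddle's Variable is not a std::variant, which is why the deleted code spells the dispatch out explicitly; the sketch only shows the shape of the logic being removed.
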
diff --git a/paddle/fluid/operators/select_op.cc b/paddle/fluid/operators/select_op.cc
deleted file mode 100644
index e71841d4d1..0000000000
--- a/paddle/fluid/operators/select_op.cc
+++ /dev/null
@@ -1,419 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <memory>
-#include <thread>  // NOLINT
-#include <vector>
-#include "paddle/fluid/framework/channel.h"
-#include "paddle/fluid/framework/executor.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/concurrency/channel_util.h"
-
-#include <boost/tokenizer.hpp>
-
-namespace paddle {
-namespace operators {
-
-static constexpr char kX[] = "X";
-static constexpr char kCaseToExecute[] = "case_to_execute";
-static constexpr char kOutputs[] = "Out";
-
-static constexpr char kCases[] = "cases";
-static constexpr char kCasesBlock[] = "sub_block";
-
-class SelectOp : public framework::OperatorBase {
- public:
-  SelectOp(const std::string &type, const framework::VariableNameMap &inputs,
-           const framework::VariableNameMap &outputs,
-           const framework::AttributeMap &attrs)
-      : framework::OperatorBase(type, inputs, outputs, attrs) {}
-
- private:
-  enum class SelectOpCaseType {
-    DEFAULT = 0,
-    SEND = 1,
-    RECEIVE = 2,
-  };
-
-  struct SelectOpCase {
-    int caseIndex;
-    SelectOpCaseType caseType;
-    std::string channelName;
-    std::string varName;
-
-    SelectOpCase() {}
-
-    SelectOpCase(int caseIndex, SelectOpCaseType caseType,
-                 std::string channelName, std::string varName)
-        : caseIndex(caseIndex),
-          caseType(caseType),
-          channelName(channelName),
-          varName(varName) {}
-  };
-
-  void RunImpl(const framework::Scope &scope,
-               const platform::Place &dev_place) const override {
-    std::vector<std::string> casesConfigs =
-        Attr<std::vector<std::string>>(kCases);
-
-    framework::BlockDesc *casesBlock =
-        Attr<framework::BlockDesc *>(kCasesBlock);
-
-    framework::Scope &casesBlockScope = scope.NewScope();
-
-    std::string caseToExecuteVarName = Input(kCaseToExecute);
-    framework::Variable *caseToExecuteVar =
-        casesBlockScope.FindVar(caseToExecuteVarName);
-
-    // Construct cases from "conditional_block_op"(s) in the casesBlock
-    std::vector<std::shared_ptr<SelectOpCase>> cases =
-        ParseAndShuffleCases(&casesConfigs);
-
-    // Get all unique channels involved in select
-    std::set<framework::ChannelHolder *> channelsSet;
-    for (auto c : cases) {
-      if (!c->channelName.empty()) {
-        auto channelVar = scope.FindVar(c->channelName);
-        framework::ChannelHolder *ch =
-            channelVar->GetMutable<framework::ChannelHolder>();
-
-        if (channelsSet.find(ch) == channelsSet.end()) {
-          channelsSet.insert(ch);
-        }
-      }
-    }
-
-    // Order all channels by their pointer address
-    std::vector<framework::ChannelHolder *> channels(channelsSet.begin(),
-                                                     channelsSet.end());
-    std::sort(channels.begin(), channels.end());
-
-    // Poll all cases
-    int32_t caseToExecute = pollCases(&scope, &cases, channels);
-
-    // At this point, the case to execute has already been determined,
-    // so we can proceed with executing the cases block
-    framework::LoDTensor *caseToExecuteTensor =
-        caseToExecuteVar->GetMutable<framework::LoDTensor>();
-    caseToExecuteTensor->data<int32_t>()[0] = caseToExecute;
-
-    // Execute the cases block, only one case will be executed since we set the
-    // case_to_execute value to the index of the case we want to execute
-    framework::Executor executor(dev_place);
-    framework::ProgramDesc *program = casesBlock->Program();
-    executor.Run(*program, &casesBlockScope, casesBlock->ID(),
-                 false /*create_local_scope*/);
-  }
-
-  /**
-   * Goes through the serialized case configurations in casesConfigs (one per
-   * "conditional_block" operator in the cases block) and maps each one to a
-   * SelectOpCase object.  We randomize the case order, and set the default
-   * case (if any exists) as the last case.
-   * @param casesConfigs
-   * @return
-   */
-  std::vector<std::shared_ptr<SelectOpCase>> ParseAndShuffleCases(
-      std::vector<std::string> *casesConfigs) const {
-    std::vector<std::shared_ptr<SelectOpCase>> cases;
-    std::shared_ptr<SelectOpCase> defaultCase;
-
-    if (casesConfigs != nullptr) {
-      boost::char_delimiters_separator<char> sep(false, ",", "");
-      for (std::vector<std::string>::iterator itr = casesConfigs->begin();
-           itr < casesConfigs->end(); ++itr) {
-        std::string caseConfig = *itr;
-        boost::tokenizer<> tokens(caseConfig, sep);
-
-        boost::tokenizer<>::iterator tok_iter = tokens.begin();
-        PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case index");
-        std::string caseIndexString = *tok_iter;
-        int caseIndex = std::stoi(caseIndexString);
-
-        ++tok_iter;
-        PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case type");
-        std::string caseTypeString = *tok_iter;
-        SelectOpCaseType caseType = (SelectOpCaseType)std::stoi(caseTypeString);
-
-        std::string caseChannel;
-        std::string caseChannelVar;
-
-        ++tok_iter;
-        if (caseType != SelectOpCaseType::DEFAULT) {
-          PADDLE_ENFORCE(tok_iter != tokens.end(), "Cannot get case channel");
-          caseChannel = *tok_iter;
-
-          ++tok_iter;
-          PADDLE_ENFORCE(tok_iter != tokens.end(),
-                         "Cannot get case channel variable");
-          caseChannelVar = *tok_iter;
-        }
-
-        auto c = std::make_shared<SelectOpCase>(caseIndex, caseType,
-                                                caseChannel, caseChannelVar);
-
-        if (caseType == SelectOpCaseType::DEFAULT) {
-          PADDLE_ENFORCE(defaultCase == nullptr,
-                         "Select can only contain one default case.");
-          defaultCase = c;
-        } else {
-          cases.push_back(c);
-        }
-      }
-    }
-
-    // Randomly sort cases, with default case being last
-    std::random_shuffle(cases.begin(), cases.end());
-    if (defaultCase != nullptr) {
-      cases.push_back(defaultCase);
-    }
-
-    return cases;
-  }
-
-  /**
-   * This method recursively polls the cases and determines whether any case
-   * condition is true. If none of the case conditions are true (and there is
-   * no default case), the thread blocks. It may be woken up by a channel
-   * operation, at which point we execute the corresponding case.
-   * @param scope
-   * @param cases
-   * @param channels
-   * @return
-   */
-  int32_t pollCases(const framework::Scope *scope,
-                    std::vector<std::shared_ptr<SelectOpCase>> *cases,
-                    std::vector<framework::ChannelHolder *> channels) const {
-    // Lock all involved channels
-    lockChannels(channels);
-
-    std::atomic<int> caseToExecute(-1);
-
-    std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
-    while (it != cases->end()) {
-      std::shared_ptr<SelectOpCase> c = *it;
-
-      auto chVar = scope->FindVar(c->channelName);
-      framework::ChannelHolder *ch =
-          chVar->GetMutable<framework::ChannelHolder>();
-
-      switch (c->caseType) {
-        case SelectOpCaseType::SEND:
-          PADDLE_ENFORCE(!ch->IsClosed(), "Cannot send to a closed channel");
-          if (ch->CanSend()) {
-            // We can send to the channel directly; send the data and
-            // execute the case
-            auto chVar = scope->FindVar(c->varName);
-            concurrency::ChannelSend(ch, chVar);
-            caseToExecute = c->caseIndex;
-          }
-          break;
-        case SelectOpCaseType::RECEIVE:
-          if (ch->CanReceive()) {
-            // We can receive from the channel directly; receive the data
-            // and execute the case
-            auto chVar = scope->FindVar(c->varName);
-            concurrency::ChannelReceive(ch, chVar);
-            caseToExecute = c->caseIndex;
-          }
-          break;
-        case SelectOpCaseType::DEFAULT:
-          caseToExecute = c->caseIndex;
-          break;
-      }
-
-      if (caseToExecute != -1) {
-        // We found a case to execute, stop looking at other case statements
-        break;
-      }
-
-      ++it;
-    }
-
-    if (caseToExecute == -1) {
-      // None of the cases are eligible to execute, enqueue current thread
-      // into all the sending/receiving queue of each involved channel
-      std::atomic<bool> completed(false);
-      std::recursive_mutex mutex;
-      std::unique_lock<std::recursive_mutex> lock{mutex};
-      // std::condition_variable_any selectCond;
-      auto selectCond = std::make_shared<std::condition_variable_any>();
-
-      std::recursive_mutex callbackMutex;
-      pushThreadOnChannelQueues(scope, cases, selectCond, &caseToExecute,
-                                &completed, &callbackMutex);
-
-      // TODO(thuan): Atomically unlock all channels and sleep current thread
-      unlockChannels(channels);
-      selectCond->wait(lock, [&completed]() { return completed.load(); });
-
-      // Select has been woken up by case operation
-      lockChannels(channels);
-      removeThreadOnChannelQueues(scope, cases);
-
-      if (caseToExecute == -1) {
-        // Recursively poll cases, since we were woken up by a channel close
-        // TODO(thuan): Need to test if this is a valid case
-        unlockChannels(channels);
-        return pollCases(scope, cases, channels);
-      }
-    }
-
-    // At this point, caseToExecute != -1, and we can proceed with executing
-    // the case block
-    unlockChannels(channels);
-
-    return caseToExecute;
-  }
-
-  void lockChannels(std::vector<framework::ChannelHolder *> chs) const {
-    std::vector<framework::ChannelHolder *>::iterator it = chs.begin();
-    while (it != chs.end()) {
-      framework::ChannelHolder *ch = *it;
-      ch->Lock();
-      ++it;
-    }
-  }
-
-  void unlockChannels(std::vector<framework::ChannelHolder *> chs) const {
-    std::vector<framework::ChannelHolder *>::reverse_iterator it = chs.rbegin();
-    while (it != chs.rend()) {
-      framework::ChannelHolder *ch = *it;
-      ch->Unlock();
-      ++it;
-    }
-  }
-
-  void pushThreadOnChannelQueues(
-      const framework::Scope *scope,
-      std::vector<std::shared_ptr<SelectOpCase>> *cases,
-      std::shared_ptr<std::condition_variable_any> rCond,
-      std::atomic<int> *caseToExecute, std::atomic<bool> *completed,
-      std::recursive_mutex *callbackMutex) const {
-    std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
-    while (it != cases->end()) {
-      std::shared_ptr<SelectOpCase> c = *it;
-
-      auto chVar = scope->FindVar(c->channelName);
-      framework::ChannelHolder *ch =
-          chVar->GetMutable<framework::ChannelHolder>();
-
-      std::function<bool(framework::ChannelAction channelAction)> cb =
-          [&caseToExecute, &completed, &callbackMutex,
-           c](framework::ChannelAction channelAction) {
-            std::lock_guard<std::recursive_mutex> lock{*callbackMutex};
-
-            bool canProcess = false;
-            if (!(*completed)) {
-              // If the channel wasn't closed, we set the caseToExecute index
-              // as this current case
-              if (channelAction != framework::ChannelAction::CLOSE) {
-                *caseToExecute = c->caseIndex;
-              }
-              // This will allow our conditional variable to break out of wait
-              *completed = true;
-              canProcess = true;
-            }
-
-            return canProcess;
-          };
-
-      switch (c->caseType) {
-        case SelectOpCaseType::SEND: {
-          auto chOutputVar = scope->FindVar(c->varName);
-          concurrency::ChannelAddToSendQ(ch, this, chOutputVar, rCond, cb);
-          break;
-        }
-        case SelectOpCaseType::RECEIVE: {
-          auto chOutputVar = scope->FindVar(c->varName);
-          concurrency::ChannelAddToReceiveQ(ch, this, chOutputVar, rCond, cb);
-          break;
-        }
-        default:
-          break;
-      }
-      ++it;
-    }
-  }
-
-  void removeThreadOnChannelQueues(
-      const framework::Scope *scope,
-      std::vector<std::shared_ptr<SelectOpCase>> *cases) const {
-    std::vector<std::shared_ptr<SelectOpCase>>::iterator it = cases->begin();
-    while (it != cases->end()) {
-      std::shared_ptr<SelectOpCase> c = *it;
-
-      auto chVar = scope->FindVar(c->channelName);
-      framework::ChannelHolder *ch =
-          chVar->GetMutable<framework::ChannelHolder>();
-      switch (c->caseType) {
-        case SelectOpCaseType::SEND: {
-          ch->RemoveFromSendQ(this);
-          break;
-        }
-        case SelectOpCaseType::RECEIVE: {
-          ch->RemoveFromReceiveQ(this);
-          break;
-        }
-        default:
-          break;
-      }
-      ++it;
-    }
-  }
-};
-
-class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput(kX,
-             "A set of variables, which are required by operators inside the "
-             "cases of Select Op")
-        .AsDuplicable();
-    AddInput(kCaseToExecute,
-             "(Int) The variable that sets the index of the case to execute, "
-             "after evaluating the channels being sent to and received from")
-        .AsDuplicable();
-    AddOutput(kOutputs,
-              "A set of variables, which will be assigned with values "
-              "generated by the operators inside the cases of Select Op.")
-        .AsDuplicable();
-    AddAttr<std::vector<std::string>>(kCases,
-                                      "(String vector) Serialized list of "
-                                      "all cases in the select op. Each "
-                                      "case is serialized as: "
-                                      "'<index>,<type>,<channel>,<value>' "
-                                      "where type is 0 for default, 1 for "
-                                      "send, and 2 for receive. "
-                                      "No channel and values are needed for "
-                                      "default cases.");
-    AddAttr<framework::BlockDesc *>(kCasesBlock,
-                                    "The cases block inside select_op");
-    AddComment(R"DOC(
-)DOC");
-  }
-};
-
-// TODO(thuan): Implement Gradient Operator for SELECT_OP
-
-}  // namespace operators
-}  // namespace paddle
-
-REGISTER_OPERATOR(select, paddle::operators::SelectOp,
-                  paddle::framework::EmptyGradOpMaker,
-                  paddle::operators::SelectOpMaker);
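
With select_op gone, the on-the-wire case format is easy to lose track of: each case is serialized as '<index>,<type>,<channel>,<value>' with type 0 = default, 1 = send, 2 = receive; non-default cases are polled in a randomized order and the default case, if any, is kept last. A minimal standalone sketch of just that parsing and ordering step, using a hypothetical Case struct and ParseAndShuffle helper in place of SelectOpCase:

// Illustrative sketch only; Case and ParseAndShuffle are made-up helpers that
// mirror the parsing/ordering described in the deleted SelectOp.
#include <algorithm>
#include <random>
#include <sstream>
#include <string>
#include <vector>

struct Case {
  int index = 0;
  int type = 0;  // 0 = default, 1 = send, 2 = receive
  std::string channel;
  std::string var;
};

std::vector<Case> ParseAndShuffle(const std::vector<std::string>& configs) {
  std::vector<Case> cases, defaults;
  for (const auto& cfg : configs) {
    std::istringstream ss(cfg);
    std::string field;
    Case c;
    std::getline(ss, field, ',');
    c.index = std::stoi(field);
    std::getline(ss, field, ',');
    c.type = std::stoi(field);
    std::getline(ss, c.channel, ',');  // empty for default cases
    std::getline(ss, c.var, ',');
    (c.type == 0 ? defaults : cases).push_back(c);
  }
  // Randomize the polling order of non-default cases (mirroring the deleted
  // std::random_shuffle call), keeping any default case last.
  std::shuffle(cases.begin(), cases.end(),
               std::mt19937{std::random_device{}()});
  cases.insert(cases.end(), defaults.begin(), defaults.end());
  return cases;
}

int main() {
  auto cases = ParseAndShuffle({"0,1,Channel,fibXToSend", "1,2,Quit,quitVar"});
  return cases.size() == 2 ? 0 : 1;
}
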
diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc
index a5bc441220..3b22718a8c 100644
--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -214,7 +214,6 @@ void BindVarDsec(pybind11::module *m) {
       .def("set_shapes", &pd::VarDesc::SetShapes)
       .def("set_dtype", &pd::VarDesc::SetDataType)
       .def("set_dtypes", &pd::VarDesc::SetDataTypes)
-      .def("set_capacity", &pd::VarDesc::SetCapacity)
       .def("shape", &pd::VarDesc::GetShape,
            pybind11::return_value_policy::reference)
       .def("shapes", &pd::VarDesc::GetShapes,
@@ -251,7 +250,6 @@ void BindVarDsec(pybind11::module *m) {
       .value("STEP_SCOPES", pd::proto::VarType::STEP_SCOPES)
       .value("LOD_RANK_TABLE", pd::proto::VarType::LOD_RANK_TABLE)
       .value("LOD_TENSOR_ARRAY", pd::proto::VarType::LOD_TENSOR_ARRAY)
-      .value("CHANNEL", pd::proto::VarType::CHANNEL)
       .value("PLACE_LIST", pd::proto::VarType::PLACE_LIST)
       .value("READER", pd::proto::VarType::READER)
       .value("RAW", pd::proto::VarType::RAW);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index ef2f1f2a20..295af1c583 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -21,7 +21,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"
diff --git a/python/paddle/fluid/concurrency.py b/python/paddle/fluid/concurrency.py
deleted file mode 100644
index e375fdef9c..0000000000
--- a/python/paddle/fluid/concurrency.py
+++ /dev/null
@@ -1,454 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-from .layers.control_flow import BlockGuard, equal
-from .framework import Operator
-from .layer_helper import LayerHelper, unique_name
-from .layers import fill_constant
-from . import core
-
-__all__ = [
-    'make_channel', 'channel_send', 'channel_recv', 'channel_close', 'Select'
-]
-
-
-class Go(BlockGuard):
-    def __init__(self, name=None):
-        self.helper = LayerHelper("go", name=name)
-        super(Go, self).__init__(self.helper.main_program)
-
-    def __enter__(self):
-        super(Go, self).__enter__()
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-        self._construct_go_op()
-        return super(Go, self).__exit__(exc_type, exc_val, exc_tb)
-
-    def _construct_go_op(self):
-        main_program = self.helper.main_program
-        go_block = main_program.current_block()
-        parent_block = main_program.block(main_program.current_block()
-                                          .parent_idx)
-
-        inner_outputs = set()
-        x_name_list = set()
-        for op in go_block.ops:
-            # Iterate over all operators, get all the inputs
-            # and add as input to the Go operator.
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in inner_outputs:
-                        x_name_list.add(in_var_name)
-
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    inner_outputs.add(out_var_name)
-
-        # Iterate over all operators, get all the outputs and
-        # add them to the output list of the Go operator only if
-        # they exist in the parent block.
-        out_vars = []
-        for inner_out_name in inner_outputs:
-            if inner_out_name in parent_block.vars:
-                out_vars.append(parent_block.var(inner_out_name))
-
-        parent_block.append_op(
-            type='go',
-            inputs={
-                'X': [
-                    parent_block._var_recursive(x_name)
-                    for x_name in x_name_list
-                ]
-            },
-            outputs={},
-            attrs={'sub_block': go_block})
-
-
-class SelectCase(object):
-    DEFAULT = 0
-    SEND = 1
-    RECEIVE = 2
-
-    def __init__(self,
-                 select,
-                 case_idx,
-                 case_to_execute,
-                 channel_action_fn=None,
-                 channel=None,
-                 value=None,
-                 is_copy=False):
-        self.select = select
-        self.helper = LayerHelper('conditional_block')
-        self.main_program = self.helper.main_program
-        self.is_scalar_condition = True
-
-        self.case_to_execute = case_to_execute
-        self.idx = case_idx
-
-        # Since we aren't going to use the `channel_send` or `channel_recv`
-        # functions directly, we just need to capture the name.
-        self.action = (self.SEND
-                       if channel_action_fn.__name__ == ('channel_send') else
-                       self.RECEIVE) if channel_action_fn else self.DEFAULT
-
-        X = value
-        if self.action == self.SEND and is_copy:
-            # We create a copy of the data we want to send
-            copied_X = self.select.parent_block.create_var(
-                name=unique_name.generate(value.name + '_copy'),
-                type=value.type,
-                dtype=value.dtype,
-                shape=value.shape,
-                lod_level=value.lod_level,
-                capacity=value.capacity
-                if hasattr(value, 'capacity') else None, )
-
-            self.select.parent_block.append_op(
-                type="assign", inputs={"X": value}, outputs={"Out": copied_X})
-            X = copied_X
-
-        self.value = X
-        self.channel = channel
-
-    def __enter__(self):
-        self.block = self.main_program._create_block()
-
-    def construct_op(self):
-        main_program = self.helper.main_program
-        cases_block = main_program.current_block()
-
-        inner_outputs = set()
-        input_set = set()
-        params = set()
-
-        for op in self.block.ops:
-            # Iterate over all operators, get all the inputs
-            # and add as input to the SelectCase operator.
-            for iname in op.input_names:
-                for in_var_name in op.input(iname):
-                    if in_var_name not in inner_outputs:
-                        input_set.add(in_var_name)
-
-            for oname in op.output_names:
-                for out_var_name in op.output(oname):
-                    inner_outputs.add(out_var_name)
-
-        param_list = [
-            cases_block.var(each_name) for each_name in params
-            if each_name not in input_set
-        ]
-
-        # Iterate over all operators, get all the outputs and
-        # add them to the output list of the SelectCase operator only if
-        # they exist in the parent block.
-        out_vars = []
-        for inner_out_name in inner_outputs:
-            if inner_out_name in cases_block.vars:
-                out_vars.append(cases_block.var(inner_out_name))
-
-        # First, create an op that determines whether or not this is the
-        # case selected to execute.
-        should_execute_block = equal(
-            fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
-            self.case_to_execute)
-
-        step_scope = cases_block.create_var(
-            type=core.VarDesc.VarType.STEP_SCOPES)
-
-        cases_block.append_op(
-            type='conditional_block',
-            inputs={'X': [should_execute_block],
-                    'Params': param_list},
-            outputs={'Out': out_vars,
-                     'Scope': [step_scope]},
-            attrs={
-                'sub_block': self.block,
-                'is_scalar_condition': self.is_scalar_condition
-            })
-
-        return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
-                                if self.channel else '', self.value.name
-                                if self.value else '')
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        self.main_program._rollback()
-        if exc_type is not None:
-            return False  # re-raise exception
-        return True
-
-
-class Select(BlockGuard):
-    def __init__(self, name=None):
-        self.helper = LayerHelper('select', name=name)
-        self.parent_block = self.helper.main_program.current_block()
-        self.cases = []
-
-        super(Select, self).__init__(self.helper.main_program)
-        self.case_to_execute = fill_constant(
-            shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
-
-    def __enter__(self):
-        super(Select, self).__enter__()
-        return self
-
-    def case(self, channel_action_fn, channel, value, is_copy=False):
-        """Create a new block for this condition.
-        """
-        select_case = SelectCase(self,
-                                 len(self.cases), self.case_to_execute,
-                                 channel_action_fn, channel, value, is_copy)
-
-        self.cases.append(select_case)
-
-        return select_case
-
-    def default(self):
-        """Create a default case block for this condition.
-        """
-        default_case = SelectCase(self, len(self.cases), self.case_to_execute)
-
-        self.cases.append(default_case)
-
-        return default_case
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        if exc_type is not None:
-            return False
-
-        # Create a select op and another block to wrap its
-        # case blocks.
-        select_block = self.helper.main_program.current_block()
-        parent_block = self.helper.main_program.block(select_block.parent_idx)
-
-        # Construct each case op, inside the newly created select block.
-        serialized_cases = []
-        for case in self.cases:
-            serialized_cases.append(case.construct_op())
-
-        intermediate = set()
-        params = set()
-
-        for case_block in select_block.ops:
-            if case_block.attrs and 'sub_block' in case_block.attrs:
-                for each_op in case_block.attrs['sub_block'].ops:
-                    assert isinstance(each_op, Operator)
-                    for iname in each_op.input_names:
-                        for in_var_name in each_op.input(iname):
-                            if in_var_name not in intermediate:
-                                params.add(in_var_name)
-
-                    for oname in each_op.output_names:
-                        for out_var_name in each_op.output(oname):
-                            intermediate.add(out_var_name)
-
-        out_list = [
-            parent_block.var(var_name) for var_name in parent_block.vars
-            if var_name in intermediate
-        ]
-
-        X = [select_block._var_recursive(x_name) for x_name in params]
-
-        # Needs to be used by `equal` inside the cases block.
-        X.append(self.case_to_execute)
-
-        # Construct the select op.
-        parent_block.append_op(
-            type='select',
-            inputs={'X': X,
-                    'case_to_execute': self.case_to_execute},
-            attrs={'sub_block': select_block,
-                   'cases': serialized_cases},
-            outputs={'Out': out_list})
-
-        return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
-
-
-def make_channel(dtype, capacity=0):
-    """
-    Helps implement a concurrent program by creating a "channel" of
-    a defined data type. Channels allow for the passing of data in
-    concurrent scenarios - such as when using threads to divide computation.
-    Channels can be used to "send" and "receive" such data concurrently.
-
-    There are two kinds of channels: unbuffered and buffered. Unbuffered
-    channels have no capacity - and thus block on send, only unblocking
-    once what they have sent has been received.
-
-    On the other hand, buffered channels are initialized with a capacity -
-    and do not block on sends.
-
-    Use this method in combination with `channel_send`, `channel_recv`,
-    `channel_close`, and `Go` to design a concurrent Paddle program.
-
-    Args:
-        dtype (ParamAttr|string): Data type of the data sent in the channel.
-        This data type should be the string name of a numpy data type.
-        capacity (ParamAttr|int): Size of the channel. Defaults to 0,
-        which creates an unbuffered channel.
-
-    Returns:
-        Variable: The channel variable that can be used to send and receive data
-                  of the defined dtype.
-
-    Examples:
-        .. code-block:: python
-
-          ch = fluid.make_channel(dtype='int32', capacity=10)
-          ...
-          # Code to execute in a Go block, which receives the channel data.
-          fluid.channel_send(ch, 100)
-          fluid.channel_close(ch)
-    """
-    helper = LayerHelper('channel_create', **locals())
-    main_program = helper.main_program
-    make_channel_block = main_program.current_block()
-
-    # Make a channel variable (using the channel data type) and make sure it
-    # persists into the global scope.
-    channel = helper.create_variable(
-        name=unique_name.generate('channel'),
-        type=core.VarDesc.VarType.CHANNEL,
-        persistable=True)
-
-    create_channel_op = make_channel_block.append_op(
-        type="channel_create",
-        outputs={"Out": channel},
-        attrs={"data_type": dtype,
-               "capacity": capacity})
-
-    return channel
-
-
-def channel_send(channel, value, is_copy=False):
-    """
-    Sends a value through a channel variable. Used by an unbuffered or buffered
-    channel to pass data from within or to a concurrent Go block, where
-    `channel_recv` is used to get the passed value.
-
-    Args:
-        channel (Variable|Channel): Channel variable created using
-        `make_channel`.
-        value (Variable): Value to send to the channel.
-        is_copy (bool): Copy the data when sending. If False, the data is
-        moved and the input cannot be used after the move. (default False)
-    Returns:
-        Variable: The boolean status on whether or not the channel
-                  successfully sent the passed value.
-
-    Examples:
-        .. code-block:: python
-
-          ch = fluid.make_channel(dtype='int32', capacity=10)
-          ...
-          # Code to execute in a Go block, which receives the channel data.
-          fluid.channel_send(ch, 100)
-    """
-    helper = LayerHelper('channel_send', **locals())
-    main_program = helper.main_program
-    channel_send_block = main_program.current_block()
-
-    X = value
-
-    if is_copy:
-        copied_X = helper.create_variable(
-            name=unique_name.generate(value.name + '_copy'),
-            type=value.type,
-            dtype=value.dtype,
-            shape=value.shape,
-            lod_level=value.lod_level,
-            capacity=value.capacity if hasattr(value, 'capacity') else None)
-
-        assign_op = channel_send_block.append_op(
-            type="assign", inputs={"X": value}, outputs={"Out": copied_X})
-        X = copied_X
-
-    channel_send_block.append_op(
-        type="channel_send", inputs={
-            "Channel": channel,
-            "X": X,
-        })
-
-
-def channel_recv(channel, return_value):
-    """
-    Receives a value through a channel variable. Used by an unbuffered or
-    buffered channel within a concurrent Go block to get data originally
-    sent using `channel_send`, or from outside such a block where
-    `channel_send` is used to send the value.
-
-    Args:
-        channel (Variable|Channel): Channel variable created using
-        `make_channel`.
-        return_value (Variable): Variable to set as a result of running channel_recv_op
-
-    Returns:
-        Variable: The received value from the channel.
-        Variable: The boolean status on whether or not the channel
-                  successfully received the passed value.
-
-    Examples:
-        .. code-block:: python
-
-          ch = fluid.make_channel(dtype='int32', capacity=10)
-          with fluid.Go():
-            returned_value, return_status = fluid.channel_recv(ch, 'int32')
-
-          # Code to send data through the channel.
-    """
-    helper = LayerHelper('channel_recv', **locals())
-    main_program = helper.main_program
-    channel_recv_block = main_program.current_block()
-
-    status = helper.create_variable(
-        name=unique_name.generate('status'),
-        type=core.VarDesc.VarType.LOD_TENSOR,
-        dtype=core.VarDesc.VarType.BOOL)
-
-    channel_recv_op = channel_recv_block.append_op(
-        type="channel_recv",
-        inputs={"Channel": channel},
-        outputs={"Out": return_value,
-                 "Status": status})
-
-    return return_value, status
-
-
-def channel_close(channel):
-    """
-    Closes a channel created using `make_channel`.
-
-    Args:
-        channel (Variable|Channel): Channel variable created using
-        `make_channel`.
-
-    Examples:
-        .. code-block:: python
-
-          ch = fluid.make_channel(dtype='int32', capacity=10)
-          ...
-          # Code to receive and send data through a channel
-          ...
-          fluid.channel_close(ch)
-    """
-    helper = LayerHelper('channel_close', **locals())
-    main_program = helper.main_program
-    channel_close_block = main_program.current_block()
-
-    channel_close_op = channel_close_block.append_op(
-        type="channel_close", inputs={"Channel": channel})
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index d795b92d79..63988af993 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -541,8 +541,7 @@ class Operator(object):
         'feed', 'fetch', 'save', 'load', 'recurrent', 'go',
         'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'recv',
         'listen_and_serv', 'parallel_do', 'save_combine', 'load_combine',
-        'ncclInit', 'channel_create', 'channel_close', 'channel_send',
-        'channel_recv', 'select', 'checkpoint_notify', 'gen_nccl_id'
+        'ncclInit', 'select', 'checkpoint_notify', 'gen_nccl_id'
     }
 
     def __init__(self,
diff --git a/python/paddle/fluid/tests/no_test_concurrency.py b/python/paddle/fluid/tests/no_test_concurrency.py
deleted file mode 100644
index b5d7676f4a..0000000000
--- a/python/paddle/fluid/tests/no_test_concurrency.py
+++ /dev/null
@@ -1,260 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid import framework, unique_name, layer_helper
-from paddle.fluid.executor import Executor
-from paddle.fluid.layers import fill_constant, assign, While, elementwise_add, Print
-
-
-class TestRoutineOp(unittest.TestCase):
-    def test_simple_routine(self):
-        ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-
-        # Create LOD_TENSOR<INT64> and put it into the scope.  This placeholder
-        # variable will be filled in and returned by fluid.channel_recv
-        result = self._create_tensor('return_value',
-                                     core.VarDesc.VarType.LOD_TENSOR,
-                                     core.VarDesc.VarType.INT64)
-
-        with fluid.Go():
-            input_value = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.FP64, value=1234)
-            fluid.channel_send(ch, input_value)
-
-        result, status = fluid.channel_recv(ch, result)
-        fluid.channel_close(ch)
-
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-
-        outs = exe.run(fetch_list=[result])
-        self.assertEqual(outs[0], 1234)
-
-    def test_daisy_chain(self):
-        '''
-        Mimics classic Daisy-chain test:  https://talks.golang.org/2012/concurrency.slide#39
-        '''
-        n = 100
-
-        leftmost = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-        left = leftmost
-
-        # TODO(thuan): Use fluid.While() after scope capture is implemented.
-        # https://github.com/PaddlePaddle/Paddle/issues/8502
-        for i in range(n):
-            right = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-            with fluid.Go():
-                one_tensor = self._create_one_dim_tensor(1)
-                result = self._create_tensor('return_value',
-                                             core.VarDesc.VarType.LOD_TENSOR,
-                                             core.VarDesc.VarType.INT64)
-
-                result, status = fluid.channel_recv(right, result)
-                one_added = fluid.layers.elementwise_add(x=one_tensor, y=result)
-                fluid.channel_send(left, one_added)
-            left = right
-
-        # Trigger the channel propagation by sending a "1" to rightmost channel
-        with fluid.Go():
-            one_tensor = self._create_one_dim_tensor(1)
-            fluid.channel_send(right, one_tensor)
-
-        leftmost_result = self._create_tensor('return_value',
-                                              core.VarDesc.VarType.LOD_TENSOR,
-                                              core.VarDesc.VarType.INT64)
-        leftmost_result, status = fluid.channel_recv(leftmost, leftmost_result)
-
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-        leftmost_data = exe.run(fetch_list=[leftmost_result])
-
-        # The leftmost_data should be equal to the number of channels + 1
-        self.assertEqual(leftmost_data[0][0], n + 1)
-
-    def _create_one_dim_tensor(self, value):
-        one_dim_tensor = fill_constant(shape=[1], dtype='int', value=value)
-        one_dim_tensor.stop_gradient = True
-        return one_dim_tensor
-
-    def _create_tensor(self, name, type, dtype):
-        return framework.default_main_program().current_block().create_var(
-            name=unique_name.generate(name), type=type, dtype=dtype)
-
-    def _create_persistable_tensor(self, name, type, dtype):
-        return framework.default_main_program().current_block().create_var(
-            name=unique_name.generate(name),
-            type=type,
-            dtype=dtype,
-            persistable=True)
-
-    def test_select(self):
-        with framework.program_guard(framework.Program()):
-            ch1 = fluid.make_channel(
-                dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
-
-            result1 = self._create_tensor('return_value',
-                                          core.VarDesc.VarType.LOD_TENSOR,
-                                          core.VarDesc.VarType.FP64)
-
-            input_value = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.FP64, value=10)
-
-            with fluid.Select() as select:
-                with select.case(fluid.channel_send, ch1, input_value):
-                    # Execute something.
-                    pass
-
-                with select.default():
-                    pass
-
-            # This should not block because we are using a buffered channel.
-            result1, status = fluid.channel_recv(ch1, result1)
-            fluid.channel_close(ch1)
-
-            cpu = core.CPUPlace()
-            exe = Executor(cpu)
-
-            result = exe.run(fetch_list=[result1])
-            self.assertEqual(result[0][0], 10)
-
-    def test_fibonacci(self):
-        """
-        Mimics Fibonacci Go example: https://tour.golang.org/concurrency/5
-        """
-        with framework.program_guard(framework.Program()):
-            quit_ch_input_var = self._create_persistable_tensor(
-                'quit_ch_input', core.VarDesc.VarType.LOD_TENSOR,
-                core.VarDesc.VarType.INT32)
-            quit_ch_input = fill_constant(
-                shape=[1],
-                dtype=core.VarDesc.VarType.INT32,
-                value=0,
-                out=quit_ch_input_var)
-
-            result = self._create_persistable_tensor(
-                'result', core.VarDesc.VarType.LOD_TENSOR,
-                core.VarDesc.VarType.INT32)
-            fill_constant(
-                shape=[1],
-                dtype=core.VarDesc.VarType.INT32,
-                value=0,
-                out=result)
-
-            x = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
-            y = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.INT32, value=1)
-
-            while_cond = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.BOOL, value=True)
-
-            while_false = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.BOOL, value=False)
-
-            x_tmp = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
-
-            def fibonacci(channel, quit_channel):
-                while_op = While(cond=while_cond)
-                with while_op.block():
-                    result2 = fill_constant(
-                        shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
-
-                    with fluid.Select() as select:
-                        with select.case(
-                                fluid.channel_send, channel, x, is_copy=True):
-                            assign(input=x, output=x_tmp)
-                            assign(input=y, output=x)
-                            assign(elementwise_add(x=x_tmp, y=y), output=y)
-
-                        with select.case(fluid.channel_recv, quit_channel,
-                                         result2):
-                            # Quit
-                            helper = layer_helper.LayerHelper('assign')
-                            helper.append_op(
-                                type='assign',
-                                inputs={'X': [while_false]},
-                                outputs={'Out': [while_cond]})
-
-            ch1 = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-            quit_ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
-
-            with fluid.Go():
-                for i in range(10):
-                    fluid.channel_recv(ch1, result)
-                    Print(result)
-
-                fluid.channel_send(quit_ch, quit_ch_input)
-
-            fibonacci(ch1, quit_ch)
-
-            fluid.channel_close(ch1)
-            fluid.channel_close(quit_ch)
-
-            cpu = core.CPUPlace()
-            exe = Executor(cpu)
-
-            exe_result = exe.run(fetch_list=[result])
-            self.assertEqual(exe_result[0][0], 34)
-
-    def test_ping_pong(self):
-        """
-        Mimics Ping Pong example: https://gobyexample.com/channel-directions
-        """
-        with framework.program_guard(framework.Program()):
-            result = self._create_tensor('return_value',
-                                         core.VarDesc.VarType.LOD_TENSOR,
-                                         core.VarDesc.VarType.FP64)
-
-            ping_result = self._create_tensor('ping_return_value',
-                                              core.VarDesc.VarType.LOD_TENSOR,
-                                              core.VarDesc.VarType.FP64)
-
-            def ping(ch, message):
-                fluid.channel_send(ch, message, is_copy=True)
-
-            def pong(ch1, ch2):
-                fluid.channel_recv(ch1, ping_result)
-                fluid.channel_send(ch2, ping_result, is_copy=True)
-
-            pings = fluid.make_channel(
-                dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
-            pongs = fluid.make_channel(
-                dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
-
-            msg = fill_constant(
-                shape=[1], dtype=core.VarDesc.VarType.FP64, value=9)
-
-            ping(pings, msg)
-            pong(pings, pongs)
-
-            fluid.channel_recv(pongs, result)
-
-            fluid.channel_close(pings)
-            fluid.channel_close(pongs)
-
-            cpu = core.CPUPlace()
-            exe = Executor(cpu)
-
-            exe_result = exe.run(fetch_list=[result])
-            self.assertEqual(exe_result[0][0], 9)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/fluid/tests/notest_concurrency.py b/python/paddle/fluid/tests/notest_concurrency.py
deleted file mode 100644
index fd9da4cce0..0000000000
--- a/python/paddle/fluid/tests/notest_concurrency.py
+++ /dev/null
@@ -1,41 +0,0 @@
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from __future__ import print_function
-
-import unittest
-import paddle.fluid as fluid
-import paddle.fluid.core as core
-from paddle.fluid.executor import Executor
-
-
-class TestRoutineOp(unittest.TestCase):
-    def test_simple_routine(self):
-        ch = fluid.make_channel(
-            dtype=core.VarDesc.VarType.BOOL, name="CreateChannel")
-        with fluid.Go():
-            fluid.channel_send(ch, True)
-
-        result = fluid.channel_recv(ch)
-        fluid.channel_close(ch)
-
-        cpu = core.CPUPlace()
-        exe = Executor(cpu)
-
-        outs = exe.run(fetch_list=[result])
-        self.assertEqual(outs[0], True)
-
-
-if __name__ == '__main__':
-    unittest.main()

From 5fb72d840a7f6e1cb2edb0129a5ec3c3d06aae0d Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Fri, 28 Sep 2018 13:37:51 +0800
Subject: [PATCH 09/13] add header

test=develop
---
 paddle/fluid/operators/distributed/request_handler.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h
index 3dbbd75b1e..5be7095acd 100644
--- a/paddle/fluid/operators/distributed/request_handler.h
+++ b/paddle/fluid/operators/distributed/request_handler.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <time.h>
+#include <condition_variable>  // NOLINT
 
 #include <functional>
 #include <string>

From 6746b1fdf3b8fc8426fd5c74032cbe9a97dc6377 Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Sat, 29 Sep 2018 09:47:08 +0800
Subject: [PATCH 10/13] add missing header

test=develop
---
 paddle/fluid/framework/naive_executor.cc         | 7 ++++---
 paddle/fluid/operators/distributed/rpc_server.cc | 1 +
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index f681d4ecef..2171213d4d 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -12,11 +12,14 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/framework/naive_executor.h"
+#include <string>
+#include <vector>
+
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
+#include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/string/pretty_log.h"
@@ -44,8 +47,6 @@ static void InitializeVariable(Variable *var, proto::VarType::Type var_type) {
     var->GetMutable<platform::PlaceList>();
   } else if (var_type == proto::VarType::READER) {
     var->GetMutable<ReaderHolder>();
-  } else if (var_type == proto::VarType::CHANNEL) {
-    var->GetMutable<ChannelHolder>();
   } else if (var_type == proto::VarType::RAW) {
     // GetMutable will be called in operator
   } else {
diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 084480ae48..4758dff96c 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <atomic>
 #include <fstream>
 #include <iostream>
 #include <limits>

From 748be49e778f328f2ee7f5c5864ceeaefb2db840 Mon Sep 17 00:00:00 2001
From: qingqing01 <dangqingqing@baidu.com>
Date: Sat, 29 Sep 2018 10:21:43 +0800
Subject: [PATCH 11/13] Fix random fail in Python3 (#13666)

---
 python/paddle/fluid/contrib/tests/test_quantize_transpiler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
index 9af3a6c9fd..095e78c053 100644
--- a/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
+++ b/python/paddle/fluid/contrib/tests/test_quantize_transpiler.py
@@ -238,7 +238,7 @@ class TestQuantizeTranspiler(unittest.TestCase):
             test_loss2, = exe.run(program=test_program,
                                   feed=feeder.feed(test_data),
                                   fetch_list=[loss])
-            self.assertAlmostEqual(test_loss1, test_loss2, delta=1e-3)
+            self.assertAlmostEqual(test_loss1, test_loss2, delta=5e-3)
             w_freeze = np.array(fluid.global_scope().find_var('conv2d_1.w_0')
                                 .get_tensor())
             self.assertEqual(np.sum(w_freeze), np.sum(w_quant))

From 33b68fdf25a408024e0d1f196327df3c68a029bb Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Sat, 29 Sep 2018 10:41:57 +0800
Subject: [PATCH 12/13] fix compile error

test=develop
---
 paddle/fluid/operators/distributed/rpc_server.cc | 1 -
 paddle/fluid/operators/distributed/rpc_server.h  | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc
index 4758dff96c..084480ae48 100644
--- a/paddle/fluid/operators/distributed/rpc_server.cc
+++ b/paddle/fluid/operators/distributed/rpc_server.cc
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <atomic>
 #include <fstream>
 #include <iostream>
 #include <limits>
diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h
index d88e8c640f..f3e61e1575 100644
--- a/paddle/fluid/operators/distributed/rpc_server.h
+++ b/paddle/fluid/operators/distributed/rpc_server.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <atomic>
 #include <set>
 #include <string>
 #include <thread>  // NOLINT

From 642905958aca0c39f561a7ac623a9e2144d2fb0f Mon Sep 17 00:00:00 2001
From: Xin Pan <panxin.grad@gmail.com>
Date: Sat, 29 Sep 2018 10:56:07 +0800
Subject: [PATCH 13/13] fix compile error

test=develop
---
 paddle/fluid/framework/naive_executor.cc         | 1 -
 paddle/fluid/operators/distributed/grpc_client.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc
index 2171213d4d..53d39513f3 100644
--- a/paddle/fluid/framework/naive_executor.cc
+++ b/paddle/fluid/framework/naive_executor.cc
@@ -15,7 +15,6 @@
 #include <string>
 #include <vector>
 
-#include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/lod_rank_table.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h
index 75a3662316..d8e9cee85b 100644
--- a/paddle/fluid/operators/distributed/grpc_client.h
+++ b/paddle/fluid/operators/distributed/grpc_client.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <time.h>
+#include <atomic>
 
 #include <chrono>              // NOLINT
 #include <condition_variable>  // NOLINT