Merge upstream to branch wrap_squeezes

infer2
Yibing Liu 7 years ago
commit 13509da69e

File diff suppressed because one or more lines are too long

@ -64,6 +64,7 @@ static DataTypeMap* InitDataTypeMap() {
RegType(size_t, proto::VarType::SIZE_T);
RegType(int16_t, proto::VarType::INT16);
RegType(uint8_t, proto::VarType::UINT8);
RegType(int8_t, proto::VarType::INT8);
#undef RegType
return retv;

@ -54,6 +54,9 @@ inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
case proto::VarType::INT16:
visitor.template operator()<int16_t>();
break;
case proto::VarType::INT8:
visitor.template operator()<int8_t>();
break;
default:
PADDLE_THROW("Not supported %d", type);
}
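
Note (illustration only, not part of this commit): the switch above dispatches visitor.template operator()<T>() for each registered proto type, so adding INT8 makes an int8_t instantiation reachable wherever VisitDataType is used. A minimal standalone sketch of the same dispatch pattern, with a hypothetical SizeVisitor:

#include <cstdint>
#include <cstdio>

enum class VarType { INT8, INT16, FP32 };

struct SizeVisitor {
  // Called with the concrete C++ type selected by VisitDataType.
  template <typename T>
  void operator()() const {
    std::printf("element size: %zu bytes\n", sizeof(T));
  }
};

template <typename Visitor>
void VisitDataType(VarType type, Visitor visitor) {
  switch (type) {
    case VarType::INT8:  visitor.template operator()<int8_t>();  break;
    case VarType::INT16: visitor.template operator()<int16_t>(); break;
    case VarType::FP32:  visitor.template operator()<float>();   break;
  }
}

int main() {
  VisitDataType(VarType::INT8, SizeVisitor{});  // prints "element size: 1 bytes"
  return 0;
}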

@ -754,17 +754,26 @@ void MultiDevSSAGraphBuilder::CreateDistTrainOp(ir::Graph *result,
node->Op()->Type());
CreateComputationalOp(result, node, op_dev_id);
if (node->Op()->Type() == "concat") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
"fetch_barrier");
}
void SetOpInputsAllPlaces(ir::Graph *result, ir::Node *node, int num_places) {
auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
for (ir::Node *input : node->inputs) {
VarHandle *var = nullptr;
for (int place_offset = 0; place_offset < num_places; ++place_offset) {
auto &var_holders = result->Get<GraphVars>(kGraphVars)[place_offset];
auto &var_holder = var_holders[input->Name()];
if (!var_holder.empty()) {
var = var_holder.rbegin()->get();
op_handle->AddInput(var);
}
}
}
}
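
Note (sketch, not part of the commit): SetOpInputsAllPlaces walks every place and, when the holder for an input name is non-empty, wires the most recently generated version of that variable (var_holder.rbegin()) into the op handle. The same "latest version per place" selection over plain containers, with hypothetical names:

#include <iostream>
#include <map>
#include <string>
#include <vector>

// One holder per (place, variable name); every regeneration of the variable
// appends a new version, and consumers want the most recent one.
using VarVersions = std::vector<std::string>;
using PlaceVars = std::map<std::string, VarVersions>;

int main() {
  std::vector<PlaceVars> places(2);
  places[0]["w@GRAD"] = {"w@GRAD.v0", "w@GRAD.v1"};
  places[1]["w@GRAD"] = {"w@GRAD.v0"};

  std::vector<std::string> op_inputs;
  for (const auto& place : places) {
    auto it = place.find("w@GRAD");
    if (it != place.end() && !it->second.empty()) {
      op_inputs.push_back(it->second.back());  // latest version on this place
    }
  }
  for (const auto& in : op_inputs) std::cout << in << "\n";  // w@GRAD.v1, w@GRAD.v0
  return 0;
}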
// Create RPC related op handles that connects its in ops and out ops.
void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
ir::Node *node) const {
// FIXME(typhoonzero): Cleanup this deps for both sync mode and async mode
// put them into transpiler.
int op_dev_id = -1;
if (node->Op()->Type() == "send") {
// TODO(paddle-dev): getting the first var is not safe.
@ -799,8 +808,6 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
}
auto recv_param_grad = boost::get<std::vector<std::string>>(
node->Op()->GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
// FIXME(typhoonzero): assume each recv op output one param
// Use the same place as send.
if (recv_param_grad.size() == 2U) {
op_dev_id = GetVarDeviceID(*result, recv_param_grad[1]);
VLOG(10) << "recv param " << recv_param_grad[0]
@ -814,34 +821,44 @@ void MultiDevSSAGraphBuilder::CreateRPCOp(ir::Graph *result,
.emplace(varname, op_dev_id);
}
} else {
// send_barrier and fetch_barrier op can be scheduled on device 0
// send_barrier, fetch_barrier will run on place 0;
op_dev_id = 0;
}
PADDLE_ENFORCE(op_dev_id != -1, "can not find the right place for rpc op: %s",
node->Op()->Type());
result->Get<GraphOps>(kGraphOps).emplace_back(new RPCOpHandle(
result->CreateOpNode(node->Op()), *node->Op(), local_scopes_[op_dev_id],
node->Op()->Type(), places_[op_dev_id]));
// TODO(panyx0718): This might not be needed anymore.
if (node->Op()->Type() == "send_barrier") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "send");
} else if (node->Op()->Type() == "recv") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(),
"send_barrier");
} else if (node->Op()->Type() == "fetch_barrier") {
ConnectOp(result, result->Get<GraphOps>(kGraphOps).back().get(), "recv");
} else if (node->Op()->Type() == "send") {
// do nothing
if (node->Op()->Type() == "send") {
CreateOpHandleIOs(result, node, op_dev_id);
} else {
PADDLE_THROW(
"rpc op should be in ["
"send, send_barrier. recv, fetch_barrier]");
}
// send_barrier, recv and fetch_barrier's inputs are dep vars; collect them
// from all places.
auto p = places_[op_dev_id];
auto *op_handle = result->Get<GraphOps>(kGraphOps).back().get();
op_handle->SetDeviceContext(p,
platform::DeviceContextPool::Instance().Get(p));
CreateOpHandleIOs(result, node, op_dev_id);
SetOpInputsAllPlaces(result, node, places_.size());
for (ir::Node *output : node->outputs) {
int outvar_dev_id = op_dev_id;
if (node->Op()->Type() == "fetch_barrier") {
outvar_dev_id = GetVarDeviceID(*result, output->Name());
PADDLE_ENFORCE_NE(outvar_dev_id, -1);
}
p = places_[outvar_dev_id];
ir::Node *new_node = nullptr;
if (output->Var()) {
new_node = result->CreateVarNode(output->Var());
} else {
new_node =
result->CreateEmptyNode(output->Name(), ir::Node::Type::kVariable);
}
CreateOpOutput(result, op_handle, new_node, p, outvar_dev_id);
}
}
}
bool MultiDevSSAGraphBuilder::IsScaleLossOp(ir::Node *node) const {

@ -107,6 +107,7 @@ message VarType {
// Tensor<size_t> is used in C++.
SIZE_T = 19;
UINT8 = 20;
INT8 = 21;
// Other types that may need additional descriptions
LOD_TENSOR = 7;

@ -132,63 +132,6 @@ Graph::Graph(const ProgramDesc &program) : program_(program) {
}
}
std::vector<ir::Node *> send_ops;
ir::Node *send_bar = nullptr;
std::vector<ir::Node *> recv_ops;
ir::Node *fetch_bar = nullptr;
for (ir::Node *node : Nodes()) {
if (node->Name() == "send") {
send_ops.push_back(node);
} else if (node->Name() == "send_barrier") {
PADDLE_ENFORCE(!send_bar, "only has one send barrier");
send_bar = node;
} else if (node->Name() == "recv") {
recv_ops.push_back(node);
} else if (node->Name() == "fetch_barrier") {
PADDLE_ENFORCE(!fetch_bar, "only has one fetch barrier");
fetch_bar = node;
}
}
if (send_bar) {
for (ir::Node *send : send_ops) {
ir::Node *dep_var = CreateControlDepVar();
send->outputs.push_back(dep_var);
dep_var->inputs.push_back(send);
send_bar->inputs.push_back(dep_var);
dep_var->outputs.push_back(send_bar);
}
for (ir::Node *recv : recv_ops) {
ir::Node *dep_var = CreateControlDepVar();
recv->inputs.push_back(dep_var);
dep_var->outputs.push_back(recv);
send_bar->outputs.push_back(dep_var);
dep_var->inputs.push_back(send_bar);
}
}
if (fetch_bar) {
for (ir::Node *recv : recv_ops) {
ir::Node *dep_var = CreateControlDepVar();
recv->outputs.push_back(dep_var);
dep_var->inputs.push_back(recv);
fetch_bar->inputs.push_back(dep_var);
dep_var->outputs.push_back(fetch_bar);
}
}
std::vector<std::string> send_vars = FindDistTrainSendVars(send_ops);
std::vector<std::string> recv_vars = FindDistTrainRecvVars(recv_ops);
for (ir::Node *node : Nodes()) {
if (IsDistTrainOp(node, send_vars, recv_vars)) {
if (fetch_bar && node->Name() == "concat") {
ir::Node *dep_var = CreateControlDepVar();
fetch_bar->outputs.push_back(dep_var);
dep_var->inputs.push_back(fetch_bar);
node->inputs.push_back(dep_var);
dep_var->outputs.push_back(node);
}
}
}
/**
* We should handle write after read(WAR) and write after write(WAW) here.
* Because some of the operators of the program can be executed parallelly.

@ -40,7 +40,11 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
"When calling this method, the Tensor's numel must be "
"equal or larger than zero. "
"Please check Tensor::Resize has been called first.");
size_t size = requested_size ? requested_size : numel() * SizeOfType(type);
size_t size = numel() * SizeOfType(type);
if (requested_size) {
PADDLE_ENFORCE_GE(requested_size, size);
size = requested_size;
}
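
Note: the new guard only honors requested_size when it is at least numel() * SizeOfType(type); anything smaller would under-allocate the tensor. A standalone sketch of the check (names hypothetical):

#include <cassert>
#include <cstddef>

// Bytes to allocate: the caller may request extra space,
// but never less than the tensor itself needs.
std::size_t AllocationSize(std::size_t numel, std::size_t elem_size,
                           std::size_t requested_size) {
  std::size_t size = numel * elem_size;
  if (requested_size != 0) {
    assert(requested_size >= size && "requested_size must cover the tensor");
    size = requested_size;
  }
  return size;
}

int main() {
  assert(AllocationSize(16, 4, 0) == 64);     // no explicit request
  assert(AllocationSize(16, 4, 128) == 128);  // over-allocation is allowed
  return 0;
}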
/* some versions of boost::variant don't have operator!= */
if (holder_ == nullptr || !(holder_->place() == place) ||
holder_->size() < size + offset_) {

@ -72,7 +72,7 @@ class DfgPassManagerImpl final : public DfgPassManager {
auto trt_teller = [&](const Node* node) {
std::unordered_set<std::string> teller_set(
{"elementwise_add", "mul", "conv2d", "pool2d", "relu", "softmax",
"depthwise_conv2d", "batch_norm"});
"depthwise_conv2d", "batch_norm", "concat"});
if (!node->IsFunction()) return false;
const auto* func = static_cast<const Function*>(node);

@ -32,6 +32,7 @@ class TensorRTSubgraphPredictor : public NativePaddlePredictor {
: NativePaddlePredictor(config), config_(config) {}
bool Init(const std::shared_ptr<framework::Scope>& parent_scope) {
FLAGS_IA_enable_tensorrt_subgraph_engine = true;
VLOG(3) << "Predictor::init()";
FLAGS_tensorrt_max_batch_size = config_.max_batch_size;
FLAGS_tensorrt_workspace_size = config_.workspace_size;
@ -161,3 +162,4 @@ USE_TRT_CONVERTER(fc);
USE_TRT_CONVERTER(pool2d);
USE_TRT_CONVERTER(softmax);
USE_TRT_CONVERTER(batch_norm);
USE_TRT_CONVERTER(concat);

@ -37,6 +37,7 @@ void CompareTensorRTWithFluid(bool enable_tensorrt) {
config1.use_gpu = true;
config1.fraction_of_gpu_memory = 0.3;
config1.device = 0;
config1.max_batch_size = 10;
auto predictor0 =
CreatePaddlePredictor<NativeConfig, PaddleEngineKind::kNative>(config0);

@ -23,9 +23,11 @@ include_directories("${PADDLE_LIB}")
include_directories("${PADDLE_LIB}/third_party/install/protobuf/include")
include_directories("${PADDLE_LIB}/third_party/install/glog/include")
include_directories("${PADDLE_LIB}/third_party/install/gflags/include")
if (NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/install/snappy/include")
include_directories("${PADDLE_LIB}/third_party/install/snappystream/include")
include_directories("${PADDLE_LIB}/third_party/install/zlib/include")
endif(NOT WIN32)
include_directories("${PADDLE_LIB}/third_party/boost")
include_directories("${PADDLE_LIB}/third_party/eigen3")

@ -1,7 +1,7 @@
# Add TRT tests
nv_library(tensorrt_converter
SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc
batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc
DEPS tensorrt_engine operator scope framework_proto op_registry)
nv_test(test_op_converter SRCS test_op_converter.cc DEPS
@ -18,12 +18,12 @@ nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine softmax_op SERIAL)
nv_test(test_trt_batch_norm_op SRCS test_batch_norm_op.cc batch_norm_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine batch_norm_op SERIAL)
nv_test(test_trt_concat_op SRCS test_concat_op.cc concat_op.cc
DEPS ${FLUID_CORE_MODULES} tensorrt_engine concat_op SERIAL)

@ -0,0 +1,57 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
namespace paddle {
namespace inference {
namespace tensorrt {
/*
* ConcatOp, IConcatenationLayer in TRT. This layer doesn't have weights.
*/
class ConcatOpConverter : public OpConverter {
public:
void operator()(const framework::proto::OpDesc& op,
const framework::Scope& scope, bool test_mode) override {
VLOG(4) << "convert a fluid mul op to tensorrt mul layer without bias";
framework::OpDesc op_desc(op, nullptr);
// Declare inputs
std::vector<nvinfer1::ITensor*> itensors;
for (auto& input_name : op_desc.Input("X")) {
itensors.push_back(engine_->GetITensor(input_name));
}
int axis = boost::get<int>(op_desc.GetAttr("axis"));
PADDLE_ENFORCE(axis > 0,
"The axis attr of Concat op should be large than 0 for trt");
auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Concatenation, itensors.data(),
itensors.size());
axis = axis - 1; // Remove batch dim
layer->setAxis(axis);
auto output_name = op_desc.Output("Out")[0];
engine_->SetITensor(output_name, layer->getOutput(0));
if (test_mode) { // the test framework can not determine which is the
// output, so place the declaration inside.
engine_->DeclareOutput(output_name);
}
}
};
} // namespace tensorrt
} // namespace inference
} // namespace paddle
REGISTER_TRT_OP_CONVERTER(concat, ConcatOpConverter);
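
Note (sketch, not from this commit): in the implicit-batch workflow used here, ITensor shapes exclude the batch dimension, so a fluid concat axis of 1 (the first non-batch dim) becomes TensorRT axis 0; that is why the converter requires axis > 0 and subtracts 1. A tiny helper capturing the mapping:

#include <cassert>
#include <stdexcept>

// Map a fluid concat axis (batch dim included) to a TensorRT axis
// (batch dim excluded). Concatenating over the batch dim itself is
// not representable in this mode.
int FluidToTrtConcatAxis(int fluid_axis) {
  if (fluid_axis <= 0) {
    throw std::invalid_argument("concat axis must be > 0 for TensorRT");
  }
  return fluid_axis - 1;
}

int main() {
  assert(FluidToTrtConcatAxis(1) == 0);  // channel axis of an NCHW tensor
  assert(FluidToTrtConcatAxis(3) == 2);
  return 0;
}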

@ -79,6 +79,14 @@ class OpConverter {
it =
Registry<OpConverter>::Lookup("elementwise_" + op_type + "_tensor");
}
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
}
if (op_desc.Type() == "depthwise_conv2d") {
it = Registry<OpConverter>::Lookup("conv2d");
PADDLE_ENFORCE_NOT_NULL(it, "no OpConverter for optype [%s]",
op_desc.Type());
}
if (!it) {
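
Note (sketch with hypothetical names): depthwise_conv2d has no converter of its own, so the lookup above falls back to the conv2d converter. The general pattern is an alias table consulted when the primary registry lookup misses:

#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>

using Converter = std::function<void()>;

// Primary registry plus an alias table for op types that reuse an
// existing converter (e.g. depthwise_conv2d -> conv2d).
Converter* Lookup(std::unordered_map<std::string, Converter>& registry,
                  const std::string& op_type) {
  static const std::unordered_map<std::string, std::string> aliases = {
      {"depthwise_conv2d", "conv2d"},
  };
  auto it = registry.find(op_type);
  if (it == registry.end()) {
    auto alias = aliases.find(op_type);
    if (alias != aliases.end()) it = registry.find(alias->second);
  }
  return it == registry.end() ? nullptr : &it->second;
}

int main() {
  std::unordered_map<std::string, Converter> registry;
  registry["conv2d"] = [] { std::cout << "convert conv2d\n"; };
  if (Converter* conv = Lookup(registry, "depthwise_conv2d")) (*conv)();
  return 0;
}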

@ -0,0 +1,49 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
namespace paddle {
namespace inference {
namespace tensorrt {
TEST(concat_op, test) {
std::unordered_set<std::string> parameters({""});
framework::Scope scope;
TRTConvertValidation validator(10, parameters, scope, 1000);
validator.DeclInputVar("concat_x1", nvinfer1::DimsCHW(10, 3, 1));
validator.DeclInputVar("concat_x2", nvinfer1::DimsCHW(3, 3, 1));
validator.DeclInputVar("concat_x3", nvinfer1::DimsCHW(7, 3, 1));
validator.DeclOutputVar("concat_out", nvinfer1::DimsCHW(20, 3, 1));
// Prepare Op description
framework::OpDesc desc;
desc.SetType("concat");
desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
desc.SetOutput("Out", {"concat_out"});
int axis = 1;
desc.SetAttr("axis", axis);
validator.SetOp(*desc.Proto());
validator.Execute(5);
}
} // namespace tensorrt
} // namespace inference
} // namespace paddle
USE_OP(concat);
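
Note: the test concatenates channel sizes 10, 3 and 7 along fluid axis 1 (the channel axis once the batch dim is removed), so the declared output is (20, 3, 1). A small illustration of how concat output dims can be derived:

#include <cassert>
#include <vector>

// Output dims of a concat: all dims match the inputs except the concat
// axis, which is the sum over the inputs.
std::vector<int> ConcatOutDims(const std::vector<std::vector<int>>& ins,
                               int axis) {
  std::vector<int> out = ins.front();
  out[axis] = 0;
  for (const auto& dims : ins) out[axis] += dims[axis];
  return out;
}

int main() {
  // CHW dims from the test above: (10,3,1), (3,3,1), (7,3,1) -> (20,3,1).
  auto out = ConcatOutDims({{10, 3, 1}, {3, 3, 1}, {7, 3, 1}}, /*axis=*/0);
  assert((out == std::vector<int>{20, 3, 1}));
  return 0;
}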

@ -11,12 +11,18 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define GLOG_NO_ABBREVIATED_SEVERITIES
#include "paddle/fluid/memory/detail/system_allocator.h"
#include <stdlib.h> // for malloc and free
#ifdef _WIN32
#include <malloc.h>
#include <windows.h> // VirtualLock/VirtualUnlock
#else
#include <sys/mman.h> // for mlock and munlock
#include <algorithm> // for std::max
#endif
#include <stdlib.h> // for malloc and free
#include <algorithm> // for std::max
#include "gflags/gflags.h"
#include "paddle/fluid/platform/assert.h"
@ -35,31 +41,42 @@ namespace paddle {
namespace memory {
namespace detail {
void* CPUAllocator::Alloc(size_t* index, size_t size) {
// According to http://www.cplusplus.com/reference/cstdlib/malloc/,
// malloc might not return nullptr if size is zero, but the returned
// pointer shall not be dereferenced -- so we make it nullptr.
if (size <= 0) return nullptr;
*index = 0; // unlock memory
void* AlignedMalloc(size_t size) {
void* p = nullptr;
size_t alignment = 32ul;
#ifdef PADDLE_WITH_MKLDNN
// refer to https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp
// memory alignment
PADDLE_ENFORCE_EQ(posix_memalign(&p, 4096ul, size), 0, "Alloc %ld error!",
size);
alignment = 4096ul;
#endif
#ifdef _WIN32
p = _aligned_malloc(size, alignment);
#else
PADDLE_ENFORCE_EQ(posix_memalign(&p, 32ul, size), 0, "Alloc %ld error!",
PADDLE_ENFORCE_EQ(posix_memalign(&p, alignment, size), 0, "Alloc %ld error!",
size);
#endif
PADDLE_ENFORCE(p, "Fail to allocate CPU memory: size = %d .", size);
return p;
}
void* CPUAllocator::Alloc(size_t* index, size_t size) {
// According to http://www.cplusplus.com/reference/cstdlib/malloc/,
// malloc might not return nullptr if size is zero, but the returned
// pointer shall not be dereferenced -- so we make it nullptr.
if (size <= 0) return nullptr;
*index = 0; // unlock memory
void* p = AlignedMalloc(size);
if (p != nullptr) {
if (FLAGS_use_pinned_memory) {
*index = 1;
#ifdef _WIN32
VirtualLock(p, size);
#else
mlock(p, size); // lock memory
#endif
}
}
@ -68,7 +85,11 @@ void* CPUAllocator::Alloc(size_t* index, size_t size) {
void CPUAllocator::Free(void* p, size_t size, size_t index) {
if (p != nullptr && index == 1) {
#ifdef _WIN32
VirtualUnlock(p, size);
#else
munlock(p, size);
#endif
}
free(p);
}
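
Note (sketch, not part of this commit): memory from _aligned_malloc must be released with _aligned_free on Windows, while posix_memalign memory is released with plain free. A small helper that keeps allocation and release paired, assuming only standard CRT/POSIX calls:

#include <cstddef>
#include <cstdlib>
#ifdef _WIN32
#include <malloc.h>
#endif

// Allocate `size` bytes aligned to `alignment` (a power of two), or
// return nullptr on failure.
void* AlignedAlloc(std::size_t size, std::size_t alignment) {
#ifdef _WIN32
  return _aligned_malloc(size, alignment);
#else
  void* p = nullptr;
  if (posix_memalign(&p, alignment, size) != 0) return nullptr;
  return p;
#endif
}

// Release memory obtained from AlignedAlloc with the matching deallocator.
void AlignedFree(void* p) {
#ifdef _WIN32
  _aligned_free(p);
#else
  std::free(p);
#endif
}

int main() {
  void* p = AlignedAlloc(1024, 64);
  AlignedFree(p);
  return 0;
}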

@ -291,6 +291,8 @@ op_library(unsqueeze_op DEPS reshape_op)
op_library(squeeze_op DEPS reshape_op)
op_library(extract_rows_op DEPS memory)
op_library(flatten_op DEPS reshape_op)
op_library(sequence_pad_op DEPS sequence_padding)
op_library(unstack_op DEPS stack_op)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv im2col)

@ -60,6 +60,20 @@ class AucKernel : public framework::OpKernel<T> {
const T* inference_data = predict->data<T>();
const auto* label_data = label->data<int64_t>();
// check if states are inited.
auto* tp_in = ctx.Input<Tensor>("TP");
auto* fp_in = ctx.Input<Tensor>("FP");
auto* tn_in = ctx.Input<Tensor>("TN");
auto* fn_in = ctx.Input<Tensor>("FN");
PADDLE_ENFORCE(tp_in->IsInitialized(), "true_positive is not inited!");
PADDLE_ENFORCE(fp_in->IsInitialized(), "false_positive is not inited!");
PADDLE_ENFORCE(tn_in->IsInitialized(), "true_negative is not inited!");
PADDLE_ENFORCE(fn_in->IsInitialized(), "false_negative is not inited!");
PADDLE_ENFORCE_EQ(tp_in->numel(), num_thresholds, "");
PADDLE_ENFORCE_EQ(fp_in->numel(), num_thresholds, "");
PADDLE_ENFORCE_EQ(tn_in->numel(), num_thresholds, "");
PADDLE_ENFORCE_EQ(fn_in->numel(), num_thresholds, "");
auto* tp_data = true_positive->mutable_data<int64_t>(ctx.GetPlace());
auto* fn_data = false_negative->mutable_data<int64_t>(ctx.GetPlace());
auto* tn_data = true_negative->mutable_data<int64_t>(ctx.GetPlace());

File diff suppressed because it is too large

@ -18,15 +18,32 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename T>
struct DequantizeFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
auto in_e = framework::EigenVector<T>::Flatten(*in);
const T* scale_factor = scale->data<T>();
auto out_e = framework::EigenVector<T>::Flatten(*out);
auto& dev = *dev_ctx.eigen_device();
out_e.device(dev) = (scale_factor[0] / max_range) * in_e;
}
};
template struct DequantizeFunctor<platform::CPUDeviceContext, float>;
template struct DequantizeFunctor<platform::CPUDeviceContext, double>;
class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
public:
FakeDequantizeMaxAbsOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
FakeDequantizeMaxAbsOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of FakeDequantizeMaxAbsOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
@ -42,21 +59,17 @@ class FakeDequantizeMaxAbsOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X",
"(Tensor) The input with float-32/64 type is the "
"low precision tensor.");
AddInput("Scale", "(float) The scale in quantization stage.");
AddOutput("Out",
"(Tensor) The output is the dequantized high "
"precision tensor.");
AddAttr<int>("num_bits",
"(int) `num_bits` is the quantization level bits, "
"such as 2, 5, 8.");
AddAttr<float>("scale",
"(float) The maximum absolute value of low precision tensor."
"It is usually calculated by the fake_quantize_max_abs_op.");
AddAttr<float>("max_range", "(float) The max range in quantization stage.");
AddComment(R"DOC(
FakeDequantizeMaxAbsOp operator.
This calculation is the inverse operation of FakeQuantizeMaxAbsOp:
$$Out = \frac{scale*X}{2^{num_bits} - 1}$$
$$Out = \frac{scale*X}{ max_range }$$
)DOC");
}
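
Note: with this change the kernel computes Out = Scale * X / max_range element-wise, where Scale is now an input tensor and max_range an attribute (replacing the old 2^num_bits - 1). A tiny numeric sketch with hypothetical 8-bit values:

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Out = scale * x / max_range, applied element-wise.
std::vector<float> Dequantize(const std::vector<float>& x, float scale,
                              float max_range) {
  std::vector<float> out(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) out[i] = scale * x[i] / max_range;
  return out;
}

int main() {
  // 8-bit style example: max_range = 2^8 - 1 = 255, scale (max abs value) = 12.75.
  // Quantized 255 dequantizes back to 12.75, and 51 to 2.55.
  auto out = Dequantize({255.f, 51.f}, 12.75f, 255.f);
  assert(std::fabs(out[0] - 12.75f) < 1e-5f);
  assert(std::fabs(out[1] - 2.55f) < 1e-5f);
  return 0;
}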

@ -14,6 +14,42 @@ limitations under the License. */
#include "paddle/fluid/operators/fake_dequantize_op.h"
namespace paddle {
namespace operators {
template <typename T>
__global__ void KeDequantize(const T* in, const T* scale, T max_range, int num,
T* out) {
const int idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < num) {
out[idx] = in[idx] * scale[0] / max_range;
}
}
template <typename T>
struct DequantizeFunctor<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& dev_ctx,
const framework::Tensor* in, const framework::Tensor* scale,
T max_range, framework::Tensor* out) {
const T* in_data = in->data<T>();
const T* scale_factor = scale->data<T>();
T* out_data = out->mutable_data<T>(dev_ctx.GetPlace());
int num = in->numel();
int block = 512;
int grid = (num + block - 1) / block;
KeDequantize<T><<<grid, block, 0, dev_ctx.stream()>>>(
in_data, scale_factor, max_range, num, out_data);
}
};
template struct DequantizeFunctor<platform::CUDADeviceContext, float>;
template struct DequantizeFunctor<platform::CUDADeviceContext, double>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CUDA = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(fake_dequantize_max_abs,

@ -19,22 +19,29 @@ limitations under the License. */
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
struct DequantizeFunctor {
void operator()(const DeviceContext& dev_ctx, const framework::Tensor* in,
const framework::Tensor* scale, T max_range,
framework::Tensor* out);
};
template <typename DeviceContext, typename T>
class FakeDequantizeMaxAbsKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
auto* in = ctx.Input<framework::Tensor>("X");
auto* scale = ctx.Input<framework::Tensor>("Scale");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(in->place());
int num_bits = ctx.Attr<int>("num_bits");
T scale = static_cast<T>(ctx.Attr<float>("scale"));
int range = std::pow(2, num_bits) - 1;
float max_range = ctx.Attr<float>("max_range");
auto& dev_ctx = ctx.template device_context<DeviceContext>();
out->mutable_data<T>(dev_ctx.GetPlace());
auto eigen_out = framework::EigenVector<T>::Flatten(*out);
auto eigen_in = framework::EigenVector<T>::Flatten(*in);
auto& dev = *ctx.template device_context<DeviceContext>().eigen_device();
eigen_out.device(dev) = (scale / range) * eigen_in;
DequantizeFunctor<DeviceContext, T>()(dev_ctx, in, scale,
static_cast<T>(max_range), out);
}
};
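
Note (illustration only): the header now only declares DequantizeFunctor; the .cc and .cu files supply the CPUDeviceContext and CUDADeviceContext specializations plus explicit instantiations for float/double, and the kernel stays device-agnostic. A minimal sketch of that declare-in-header, specialize-per-backend pattern with hypothetical tag types:

#include <iostream>

// --- header: the primary template is only declared ---
struct CPUContext {};

template <typename DeviceContext, typename T>
struct ScaleFunctor;  // no generic definition on purpose

// --- CPU source file: partial specialization + explicit instantiations ---
template <typename T>
struct ScaleFunctor<CPUContext, T> {
  T operator()(const CPUContext&, T x, T s) const { return x * s; }
};
template struct ScaleFunctor<CPUContext, float>;
template struct ScaleFunctor<CPUContext, double>;

// --- the kernel only refers to the generic name ---
template <typename DeviceContext, typename T>
T RunKernel(const DeviceContext& ctx, T x, T s) {
  return ScaleFunctor<DeviceContext, T>()(ctx, x, s);
}

int main() {
  std::cout << RunKernel(CPUContext{}, 2.0f, 3.0f) << "\n";  // prints 6
  return 0;
}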

@ -52,6 +52,8 @@ class FetchBarrierOp : public framework::OperatorBase {
class FetchBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC(
SendBarrier operator

@ -15,10 +15,14 @@ limitations under the License. */
#include "paddle/fluid/operators/fusion_lstm_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h"
DEFINE_bool(seq_mode, true, "Use sequence mode");
namespace paddle {
namespace operators {
@ -98,7 +102,12 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
ctx->ShareLoD("X", "Hidden");
ctx->ShareLoD("X", "Cell");
int xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
int xx_width;
if (FLAGS_seq_mode) {
xx_width = wx_dims[1];
} else {
xx_width = x_dims[1] > wx_dims[1] ? wx_dims[1] : x_dims[1];
}
ctx->SetOutputDim("XX", {x_dims[0], xx_width});
ctx->ShareLoD("X", "XX");
}
@ -205,10 +214,138 @@ inline void ReorderInitState(const DeviceContext& ctx,
row_shuffle(ctx, src, index_lod, dst, indexed_src);
}
template <typename DeviceContext, typename T>
template <typename T>
class FuisonLSTMKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
void SeqCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* h0 = ctx.Input<Tensor>("H0");
auto* c0 = ctx.Input<Tensor>("C0");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
auto* bias = ctx.Input<Tensor>("Bias");
auto* xx = ctx.Output<LoDTensor>("XX");
auto* hidden_out = ctx.Output<LoDTensor>("Hidden");
auto* cell_out = ctx.Output<LoDTensor>("Cell");
bool is_reverse = ctx.Attr<bool>("is_reverse");
std::function<void(const int, const T *, T *)> act_gate, act_cell, act_cand;
auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
}
auto x_lod = x->lod();
auto x_dims = x->dims(); // T x M
auto wh_dims = wh->dims(); // D x 4D
const int total_T = x_dims[0];
const int N = x_lod[0].size() - 1; // batch size
const int M = x_dims[1]; // x frame size
const int D = wh_dims[0];
const int D2 = D * 2;
const int D3 = D * 3;
const int D4 = wh_dims[1];
const T* x_data = x->data<T>();
const T* h0_data = h0 ? h0->data<T>() : NULL;
const T* c0_data = c0 ? c0->data<T>() : NULL;
const T* wx_data = wx->data<T>();
const T* wh_data = wh->data<T>();
T* xx_data = xx->mutable_data<T>(ctx.GetPlace());
T* hidden_out_data = hidden_out->mutable_data<T>(ctx.GetPlace());
T* cell_out_data = cell_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D4, M, x_data, wx_data,
xx_data, bias->data<T>());
int xx_offset = D4;
int gate_offset = D;
if (is_reverse) {
const int offset = (total_T - 1) * D;
xx_data = xx_data + offset * 4;
hidden_out_data = hidden_out_data + offset;
cell_out_data = cell_out_data + offset;
xx_offset = -D4;
gate_offset = -D;
}
auto move_step = [&]() {
xx_data = xx_data + xx_offset;
hidden_out_data = hidden_out_data + gate_offset;
cell_out_data = cell_out_data + gate_offset;
};
for (int i = 0; i < N; ++i) {
int bid = is_reverse ? N - 1 - i : i;
int seq_len = x_lod[0][bid + 1] - x_lod[0][bid];
const T* prev_cell_data = NULL;
const T* prev_hidden_data = NULL;
int tstart = 0;
if (h0_data) {
prev_hidden_data = h0_data + bid * D;
prev_cell_data = c0_data + bid * D;
} else {
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// cell out= input*tilde
blas.VMUL(D, xx_data, xx_data + D, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
tstart = 1;
move_step();
}
for (int step = tstart; step < seq_len; ++step) {
blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D4, D, static_cast<T>(1),
prev_hidden_data, D, wh_data, D4, static_cast<T>(1), xx_data,
D4);
// W_ch, W_ih, W_fh, W_oh
act_gate(D3, xx_data + D, xx_data + D);
act_cand(D, xx_data, xx_data);
// a = forget * prev_cell
blas.VMUL(D, xx_data + D2, prev_cell_data, xx_data + D2);
// b = input * tilde
blas.VMUL(D, xx_data, xx_data + D, xx_data + D);
// cell out= a+b
blas.VADD(D, xx_data + D, xx_data + D2, cell_out_data);
// hidden out= act_state(cellout) * outgate
act_cell(D, cell_out_data, xx_data + D2);
blas.VMUL(D, xx_data + D2, xx_data + D3, hidden_out_data);
// prev
prev_hidden_data = hidden_out_data;
prev_cell_data = cell_out_data;
move_step();
}
}
}
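
Note (sketch, not from this commit): each iteration of the sequence loop above is the standard LSTM recurrence on gate pre-activations laid out as candidate, input, forget, output blocks of width D (the W_ch, W_ih, W_fh, W_oh comment): c_t = f * c_{t-1} + i * cand and h_t = o * act_cell(c_t). A scalar sketch with fixed sigmoid/tanh activations (the real kernel picks them from the *_activation attributes):

#include <cassert>
#include <cmath>

// Gate pre-activations for a single LSTM unit, in the order used above.
struct Gates {
  float cand, in, forget, out;
};

float Sigmoid(float x) { return 1.f / (1.f + std::exp(-x)); }

void LstmStep(const Gates& g, float prev_c, float* c, float* h) {
  float i = Sigmoid(g.in);
  float f = Sigmoid(g.forget);
  float o = Sigmoid(g.out);
  float cand = std::tanh(g.cand);
  *c = f * prev_c + i * cand;  // "cell out = a + b" in the code above
  *h = o * std::tanh(*c);      // "hidden out = act_state(cellout) * outgate"
}

int main() {
  float c = 0.f, h = 0.f;
  LstmStep({0.5f, 0.1f, -0.2f, 0.3f}, /*prev_c=*/0.f, &c, &h);
  assert(c > 0.f && h > 0.f);
  return 0;
}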
void BatchCompute(const framework::ExecutionContext& ctx) const {
using DeviceContext = platform::CPUDeviceContext;
auto* x = ctx.Input<LoDTensor>("X");
auto* wx = ctx.Input<Tensor>("WeightX");
auto* wh = ctx.Input<Tensor>("WeightH");
@ -339,6 +476,13 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
// restore the output cell state in LoDTensor from the batch cell
to_seq(dev_ctx, batch_cell, cell_out);
}
void Compute(const framework::ExecutionContext& ctx) const override {
if (FLAGS_seq_mode) {
SeqCompute(ctx);
} else {
BatchCompute(ctx);
}
}
};
} // namespace operators
@ -348,7 +492,5 @@ namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_lstm, ops::FusionLSTMOp, ops::FusionLSTMOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(
fusion_lstm,
ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, float>,
ops::FuisonLSTMKernel<paddle::platform::CPUDeviceContext, double>);
REGISTER_OP_CPU_KERNEL(fusion_lstm, ops::FuisonLSTMKernel<float>,
ops::FuisonLSTMKernel<double>);

@ -0,0 +1,201 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fusion_seqexpand_concat_fc_op.h"
#include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc_compute.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
void FusionSeqExpandConcatFCOp::InferShape(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE_GT(
ctx->Inputs("X").size(), 1UL,
"Inputs(X) of FusionSeqExpandConcatFCOp should larger than 1.");
PADDLE_ENFORCE(
ctx->HasInput("FCWeight"),
"Input(FCWeight) of FusionSeqExpandConcatFCOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("Out"),
"Output(Out) of FusionSeqExpandConcatFCOp should not be null.");
PADDLE_ENFORCE(
ctx->HasOutput("FCOut"),
"Output(FCOut) of FusionSeqExpandConcatFCOp should not be null.");
auto ins_dims = ctx->GetInputsDim("X");
auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2.");
const int D = w_dims[1];
int sum = ins_dims[0][1];
for (size_t i = 1; i < ins_dims.size(); ++i) {
sum += ins_dims[i][1];
}
PADDLE_ENFORCE_EQ(sum, w_dims[0],
"FC height should be sum of all inputs width.");
if (ctx->HasInput("FCBias")) {
auto b_dims = ctx->GetInputDim("FCBias");
PADDLE_ENFORCE_EQ(b_dims.size(), 2, "Input(FCBias)'s rank must be 2.");
PADDLE_ENFORCE_EQ(b_dims[0], 1, "FCBias shapes must be 1 * %d.", D);
PADDLE_ENFORCE_EQ(b_dims[1], D, "FCBias shapes must be 1 * %d.", D);
}
ctx->SetOutputDim("Out", {ins_dims[0][0], D});
// FCOut should be reshaped at runtime since the lod is not available in InferShape;
// explicitly share the ref lod here.
ctx->ShareLoD("X", "Out", 0);
}
framework::OpKernelType FusionSeqExpandConcatFCOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
return framework::OpKernelType(
framework::ToDataType(ctx.MultiInput<LoDTensor>("X")[0]->type()),
ctx.device_context());
}
void FusionSeqExpandConcatFCOpMaker::Make() {
AddInput("X",
"(LoDTensor) input LodDTensors, the first one must be have ref lod "
"for sequence expand, and the rest input should have same lod.")
.AsDuplicable();
AddInput("FCWeight", "(Tensor) the weights of fc.");
AddInput("FCBias", "(Tensor, optional) the bias of fc.").AsDispensable();
AddOutput("Out", "(LoDTensor) Output LodTensor.");
AddOutput(
"FCOut",
"(Tensor) the intermediate tensor to keep the result of fc."
"Shape is (N x D), where N is the batch size, D is the output dim of fc")
.AsIntermediate();
AddAttr<std::string>("fc_activation",
"(string, default: identity)"
"The activation for the result of fc."
"`identity` by default.")
.SetDefault("identity")
.InEnum({"sigmoid", "tanh", "relu", "identity"});
AddComment(R"DOC(
Fusion Sequence expand + concat + fc Operator.
All conditions below should be met:
The ref_level of seq_expand should be 0.
The ref lod of seq_expand level is the first input of concat.
The other inputs should have the same lod and the same batch size as the ref lod.
The seq len of other inputs should be 1.
The concat axis should be 1.
)DOC");
}
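
Note (illustration only, hypothetical data; activation omitted): the fused op is equivalent to expanding every non-ref input along the ref lod, concatenating along axis 1, and applying a single FC. The kernel below instead computes fc over the ref input row by row and adds the per-sequence fc of the remaining inputs to every step of that sequence. A naive sketch of that equivalence with output width D = 1:

#include <cassert>
#include <cstddef>
#include <vector>

// Fused seq_expand + concat + fc with output width D = 1.
// `ref` has one row per time step; `other` has one row per sequence and is
// broadcast ("expanded") over all steps of its sequence before the FC add.
std::vector<float> FusedSeqExpandConcatFC(const std::vector<float>& ref,
                                          const std::vector<float>& other,
                                          const std::vector<std::size_t>& lod,
                                          float w_ref, float w_other) {
  std::vector<float> out(ref.size());
  for (std::size_t i = 0; i + 1 < lod.size(); ++i) {
    float per_seq = other[i] * w_other;          // fc of the expanded input
    for (std::size_t t = lod[i]; t < lod[i + 1]; ++t) {
      out[t] = ref[t] * w_ref + per_seq;         // fc(ref) + broadcast add
    }
  }
  return out;
}

int main() {
  // Two sequences with lengths 2 and 1 (lod = {0, 2, 3}).
  auto out = FusedSeqExpandConcatFC({1.f, 2.f, 3.f}, {10.f, 20.f}, {0, 2, 3},
                                    /*w_ref=*/1.f, /*w_other=*/0.5f);
  assert((out == std::vector<float>{6.f, 7.f, 13.f}));
  return 0;
}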
template <typename T>
class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using DeviceContext = paddle::platform::CPUDeviceContext;
auto ins = ctx.MultiInput<LoDTensor>("X");
auto* w = ctx.Input<Tensor>("FCWeight");
auto* b = ctx.Input<Tensor>("FCBias");
auto* out = ctx.Output<LoDTensor>("Out");
auto* fc_out = ctx.Output<Tensor>("FCOut");
auto* ref_in = ins[0];
auto ref_lod = ref_in->lod();
auto in1_lod = ins[1]->lod();
auto ref_dims = ref_in->dims(); // T x M0
auto in1_dims = ins[1]->dims(); // N x M1
auto w_dims = w->dims();
const int N = ref_lod[0].size() - 1;
const int total_T = ref_dims[0];
const int M0 = ref_dims[1];
const int M1 = in1_dims[1];
const int D = w_dims[1];
// some check and fcout should be reshape here
// since infershape can not get lod info
PADDLE_ENFORCE_EQ(ref_lod.size(), 1UL, "Only an input lod size of 1 is supported.");
PADDLE_ENFORCE_EQ(in1_lod.size(), 1UL, "Only an input lod size of 1 is supported.");
PADDLE_ENFORCE_EQ(in1_lod[0].size() - 1, N,
"Batch size of all inputs should be equal.");
PADDLE_ENFORCE_EQ(in1_lod[0][N], N,
"Seq_length of other inputs should be 1.");
PADDLE_ENFORCE_EQ(in1_dims[0], N, "input height should be batch size.");
for (size_t i = 2; i < ins.size(); ++i) {
PADDLE_ENFORCE_EQ(ins[i]->dims()[0], N,
"All other inputs height should be equal");
PADDLE_ENFORCE_EQ(ins[i]->lod(), in1_lod,
"All other inputs should have same lod");
}
fc_out->Resize({N, D});
std::function<void(const int, const T*, T*)> fc_act;
auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
if (platform::jit::MayIUse(platform::jit::avx)) {
math::VecActivations<T, platform::jit::avx> act_functor;
fc_act = act_functor(fc_act_str);
} else {
math::VecActivations<T, platform::jit::isa_any> act_functor;
fc_act = act_functor(fc_act_str);
}
const T* ref_in_data = ref_in->data<T>();
const T* in1_data = ins[1]->data<T>();
const T* w_data = w->data<T>();
T* out_data = out->mutable_data<T>(ctx.GetPlace());
T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx);
math::FCCompute<DeviceContext, T>(blas, total_T, D, M0, ref_in_data, w_data,
out_data, b ? b->data<T>() : NULL);
w_data = w_data + M0 * D;
// first write on
blas.MatMul(N, D, M1, in1_data, w_data, fc_out_data);
w_data = w_data + M1 * D;
for (size_t i = 2; i < ins.size(); ++i) {
// add on
const T* in_data = ins[i]->data<T>();
const int K = ins[i]->dims()[1];
blas.GEMM(CblasNoTrans, CblasNoTrans, N, D, K, static_cast<T>(1), in_data,
K, w_data, D, static_cast<T>(1), fc_out_data, D);
w_data = w_data + K * D;
}
T* cur_out_data = out_data;
for (int i = 0; i < N; ++i) {
int seq_len = ref_lod[0][i + 1] - ref_lod[0][i];
T* src = fc_out_data + i * D;
for (int step = 0; step < seq_len; ++step) {
blas.VADD(D, cur_out_data, src, cur_out_data);
cur_out_data = cur_out_data + D;
}
}
fc_act(total_T * D, out_data, out_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fusion_seqexpand_concat_fc, ops::FusionSeqExpandConcatFCOp,
ops::FusionSeqExpandConcatFCOpMaker,
paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OP_CPU_KERNEL(fusion_seqexpand_concat_fc,
ops::FusionSeqExpandConcatFCOpKernel<float>,
ops::FusionSeqExpandConcatFCOpKernel<double>);

Some files were not shown because too many files have changed in this diff.
