commit ff052c0e6f

paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc
@@ -0,0 +1,175 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include <string>
#include <vector>
#include "paddle/fluid/framework/details/fetch_op_handle.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"

namespace paddle {
namespace framework {
namespace details {

FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
    const std::vector<platform::Place> &places,
    std::unique_ptr<ir::Graph> &&graph)
    : strategy_(strategy),
      local_scopes_(local_scopes),
      places_(places),
      graph_(std::move(graph)),
      pool_(strategy.num_threads_ +
            1),  // add one more thread to generate op_deps
      fetch_ctxs_(places) {
  auto &ops = graph_->Get<details::GraphOps>("ops");

  // Record how many inputs each op is still waiting for; ops with no pending
  // inputs bootstrap the execution.
  for (auto &op : ops) {
    int dep = static_cast<int>(op->NotReadyInputSize());
    op_deps_.emplace(op.get(), dep);
    if (dep == 0) {
      bootstrap_ops_.emplace_back(op.get());
    }
  }

  PrepareAtomicOpDeps();
}

FeedFetchList FastThreadedSSAGraphExecutor::Run(
    const std::vector<std::string> &fetch_tensors) {
  // Take the atomic dependency map prepared in the background and immediately
  // start preparing the one for the next Run().
  std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>
      op_deps = atomic_op_deps_.get();
  PrepareAtomicOpDeps();

  paddle::framework::FeedFetchList fetches;
  fetches.resize(fetch_tensors.size());
  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
  std::vector<std::unique_ptr<ir::Node>> fetch_nodes;
  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;

  for (auto &fetch_var_name : fetch_tensors) {
    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
      auto it = var_map.find(fetch_var_name);
      if (it != var_map.end()) {
        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
      }
    }
  }

  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
    auto &var_name = fetch_tensors[i];
    auto fetched_var_it = fetched_vars.find(var_name);
    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
                   "Cannot find fetched variable (perhaps the main_program "
                   "is not set to ParallelExecutor).");

    auto &vars = fetched_var_it->second;

    fetch_nodes.emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
    auto *op = new FetchOpHandle(fetch_nodes.back().get(), &fetches, i,
                                 &local_scopes_);
    fetch_ops.emplace_back(op);

    for (auto &p : places_) {
      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
    }

    for (auto *var : vars) {
      op->AddInput(var);
    }

    (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
  }

  size_t num_complete = 0;
  remaining_ = 0;
  BlockingQueue<size_t> complete_q;
  for (auto op : bootstrap_ops_) {
    RunOpAsync(op_deps.get(), op, &complete_q);
  }

  while (num_complete != op_deps->size()) {
    size_t num_comp = complete_q.Pop();
    // A worker pushes -1UL when an op throws. Wait for the ops that are still
    // in flight to report, then rethrow the captured exception.
    if (num_comp == -1UL) {
      int remaining = 0;
      while (true) {
        remaining = remaining_;
        if (remaining == 0) {
          break;
        }
        for (int i = 0; i < remaining; ++i) {
          complete_q.Pop();
        }
      }
      exception_.ReThrow();
    }
    num_complete += num_comp;
  }
  // Wait for FetchOps.
  if (!fetch_ops.empty()) {
    fetch_ops.clear();
  }
  return fetches;
}

void FastThreadedSSAGraphExecutor::RunOpAsync(
    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
    OpHandleBase *op, BlockingQueue<size_t> *complete_q) {
  ++remaining_;
  this->pool_.enqueue([=] {
    OpHandleBase *op_to_run = op;
    size_t complete = 0;
    while (op_to_run != nullptr) {
      try {
        op_to_run->Run(strategy_.use_cuda_);
        ++complete;
      } catch (...) {
        exception_.Catch(std::current_exception());
        --remaining_;
        complete_q->Push(-1UL);
        return;
      }
      // Decrement the dependency counters of the ops that consume this op's
      // outputs; run one newly ready op on this thread and enqueue the rest.
      auto &outputs = op_to_run->Outputs();
      op_to_run = nullptr;
      for (auto &output : outputs) {
        for (auto &pending_op : output->PendingOps()) {
          std::atomic<int> &deps = op_deps->at(pending_op);
          if (deps.fetch_sub(1) == 1) {  // pending_op ready
            if (op_to_run == nullptr) {
              op_to_run = pending_op;
            } else {
              this->RunOpAsync(op_deps, pending_op, complete_q);
            }
          }
        }
      }
    }
    --remaining_;
    complete_q->Push(complete);
  });
}

void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
  // Copy the static dependency counts into a fresh map of atomics on a pool
  // thread, so the next Run() can consume it without rebuilding it inline.
  atomic_op_deps_ = pool_.enqueue([&] {
    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps =
        new std::unordered_map<OpHandleBase *, std::atomic<int>>;
    for (auto &pair : op_deps_) {
      (*op_deps)[pair.first] = pair.second;
    }
    return std::unique_ptr<
        std::unordered_map<OpHandleBase *, std::atomic<int>>>(op_deps);
  });
}

const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }

}  // namespace details
}  // namespace framework
}  // namespace paddle
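
The heart of RunOpAsync above is a dependency-countdown scheduler: every op carries an atomic count of unfinished inputs, and the worker that finishes an op decrements its consumers' counters, keeps one newly ready consumer on the current thread, and hands any others off. What follows is a minimal, self-contained sketch of that pattern only, not PaddlePaddle code; Node, g_done, and the yield loop are stand-ins for OpHandleBase, the BlockingQueue, and the thread pool.

#include <atomic>
#include <iostream>
#include <string>
#include <thread>
#include <unordered_map>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node *> pending;  // nodes that consume this node's output
};

std::atomic<int> g_done{0};  // total nodes finished (crude BlockingQueue stand-in)

void RunAsync(Node *node, std::unordered_map<Node *, std::atomic<int>> *deps) {
  std::thread([=] {
    Node *to_run = node;
    while (to_run != nullptr) {
      std::cout << "run " + to_run->name + "\n";  // the "op" body
      Node *next = nullptr;
      for (Node *p : to_run->pending) {
        if (deps->at(p).fetch_sub(1) == 1) {  // p just became ready
          if (next == nullptr) {
            next = p;           // run one successor on this thread
          } else {
            RunAsync(p, deps);  // spawn the rest
          }
        }
      }
      to_run = next;
      ++g_done;  // report completion last, so main never outlives our accesses
    }
  }).detach();
}

int main() {
  Node a{"a"}, b{"b"}, c{"c"}, d{"d"};  // diamond DAG: a -> {b, c} -> d
  a.pending = {&b, &c};
  b.pending = {&d};
  c.pending = {&d};
  std::unordered_map<Node *, std::atomic<int>> deps;
  deps[&a] = 0;
  deps[&b] = 1;
  deps[&c] = 1;
  deps[&d] = 2;
  RunAsync(&a, &deps);  // bootstrap node: no unfinished inputs
  while (g_done.load() < 4) std::this_thread::yield();
  return 0;
}

Running one ready successor inline keeps linear chains of ops on a single thread instead of bouncing every op through the queue; only genuine fan-out spawns extra work.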

paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -0,0 +1,64 @@
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <atomic>
#include <future>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/blocking_queue.h"
#include "paddle/fluid/framework/details/exception_holder.h"
#include "paddle/fluid/framework/details/execution_strategy.h"
#include "paddle/fluid/framework/details/ssa_graph_executor.h"

namespace paddle {
namespace framework {
class Scope;
namespace details {

class OpHandleBase;

class FastThreadedSSAGraphExecutor : public SSAGraphExecutor {
 public:
  FastThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                               const std::vector<Scope *> &local_scopes,
                               const std::vector<platform::Place> &places,
                               std::unique_ptr<ir::Graph> &&graph);
  FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
  const ir::Graph &Graph() const override;

 private:
  ExecutionStrategy strategy_;
  std::vector<Scope *> local_scopes_;
  std::vector<platform::Place> places_;
  std::unique_ptr<ir::Graph> graph_;

  // Static count of not-ready inputs per op, and the ops that start with none.
  std::unordered_map<OpHandleBase *, int> op_deps_;
  std::vector<OpHandleBase *> bootstrap_ops_;

  ::ThreadPool pool_;
  platform::DeviceContextPool fetch_ctxs_;
  std::atomic<int> remaining_;

  void RunOpAsync(std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
                  OpHandleBase *op, BlockingQueue<size_t> *complete_q);

  void PrepareAtomicOpDeps();

  // Atomic copy of op_deps_ prepared asynchronously for the next Run().
  std::future<
      std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>>
      atomic_op_deps_;
  ExceptionHolder exception_;
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
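
The atomic_op_deps_ future declared above is a simple double-buffering scheme: each Run() consumes a dependency map that was copied on a background thread during the previous run, then immediately schedules the copy for the next run, so rebuilding the counters overlaps with executing the graph. Below is a minimal sketch of just that pattern, with assumed names and plain int counters standing in for the std::atomic<int> values; it is not PaddlePaddle code.

#include <future>
#include <memory>
#include <unordered_map>

class Executor {
 public:
  explicit Executor(std::unordered_map<int, int> deps)
      : op_deps_(std::move(deps)) {
    PrepareNext();
  }

  void Run() {
    auto deps = next_deps_.get();  // map prepared during the previous Run()
    PrepareNext();                 // start building the map for the next Run()
    // ... execute the graph, decrementing counters in *deps ...
  }

 private:
  void PrepareNext() {
    next_deps_ = std::async(std::launch::async, [this] {
      // Copy the immutable per-op input counts into a fresh map.
      return std::make_unique<std::unordered_map<int, int>>(op_deps_);
    });
  }

  std::unordered_map<int, int> op_deps_;  // static per-op input counts
  std::future<std::unique_ptr<std::unordered_map<int, int>>> next_deps_;
};

// Usage: Executor e({{0, 0}, {1, 2}}); e.Run(); e.Run();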

@@ -1,4 +1,4 @@
-nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto)
+nv_library(tensorrt_engine SRCS engine.cc DEPS framework_proto device_context)
 nv_test(test_tensorrt SRCS test_tensorrt.cc DEPS dynload_cuda device_context dynamic_loader)
 nv_test(test_tensorrt_engine SRCS test_engine.cc DEPS dynload_cuda tensorrt_engine)
 add_subdirectory(convert)
Some files were not shown because too many files have changed in this diff.