Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize_hs_op

, test=develop
6 years ago · 8c75705984
parent b387a19410 6d04a9cf47
commit 8c75705984
40 changed files with 1840 additions and 244 deletions
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
    return()
 ENDIF()
 INCLUDE(GNUInstallDirs)
 INCLUDE(ExternalProject)
 SET(NGRAPH_PROJECT         "extern_ngraph")
@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG         "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
 SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
 SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
 SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
 SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
 SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
 SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
 ExternalProject_Add(
    ${NGRAPH_PROJECT}
@ -63,18 +69,6 @@ ExternalProject_Add(
    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
 )
 if(UNIX AND NOT APPLE)
    include(GNUInstallDirs)
    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
 else()
    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
 endif()
 MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
 SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
 SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
 SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
 # Workaround for nGraph expecting mklml to be in mkldnn install directory.
 ExternalProject_Add_Step(
    ${NGRAPH_PROJECT}
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -129,6 +129,15 @@ if (WITH_MKLDNN)
            )
 endif ()
 if (WITH_NGRAPH)
    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
    copy(ngraph_lib
            SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
            DSTS ${dst_dir} ${dst_dir}
            DEPS ngraph
            )
 endif ()
 if (NOT WIN32)
    if (NOT MOBILE_INFERENCE AND NOT RPI)
        set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -182,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
 paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
-paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
@ -299,6 +299,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i
 paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -127,8 +127,9 @@ cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
+
 if(NOT WIN32)
 cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
 cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
  shape_inference data_transform lod_tensor profiler)
 endif(NOT WIN32)
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@ -32,9 +32,7 @@ enum OpInfoFillType {
  kOpProtoAndCheckerMaker = 1,
  kGradOpDescMaker = 2,
  kVarTypeInference = 3,
-  kShapeInference = 4,
+  kShapeInference = 4
  kEstimateFlops = 5,
  kUnknown = -1
 };
 template <typename T>
@ -50,10 +48,8 @@ struct OpInfoFillTypeID {
                                    ? kVarTypeInference
                                    : (std::is_base_of<InferShapeBase, T>::value
                                           ? kShapeInference
-                                           : (std::is_base_of<EstimateFlopsBase,
+                                           : static_cast<OpInfoFillType>(
-                                                              T>::value
+                                                 -1)))));
                                                  ? kEstimateFlops
                                                  : kUnknown)))));
  }
 };
@ -143,16 +139,6 @@ struct OpInfoFiller<T, kShapeInference> {
  }
 };
 template <typename T>
 struct OpInfoFiller<T, kEstimateFlops> {
  void operator()(const char* op_tpe, OpInfo* info) const {
    info->estimate_flops_ = [](InferShapeContext* ctx) {
      T estimate_flops;
      return estimate_flops(ctx);
    };
  }
 };
 }  // namespace details
 }  // namespace framework
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
  for (const Node* n : graph->Nodes()) {
    if (n->IsOp()) {
      auto* op = n->Op();
-      if (op->HasAttr("is_test")) {
+      if (n->RuntimeHasAttr("is_test")) {
        op->SetAttr("is_test", true);
      } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
                 end(op_list)) {
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
      auto* op = node->Op();
      auto op_name = boost::get<std::string>(op->GetAttr("name"));
      if (op_name == "conv3") {
-        ASSERT_FALSE(op->HasAttr("is_test"));
+        ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
      } else {
-        ASSERT_TRUE(op->HasAttr("is_test"));
+        ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
        EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
      }
    }
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  VLOG(3) << "Aplies MKL-DNN placement strategy.";
  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
+    if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
      n->Op()->SetAttr("use_mkldnn", true);
    }
  }
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_info.h"
 namespace paddle {
 namespace framework {
@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[];
 const char Node::kControlDepVarName[] = "__control_var";
 #endif
-std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
+std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
                                        Node::Type type) {
  return std::unique_ptr<Node>(new Node(name, type));
 }
 bool Node::RuntimeHasAttr(const std::string &name) const {
  if (Op()->HasAttr(name)) {
    return true;
  } else {
    auto &op_info = OpInfoMap::Instance();
    auto op_type = Op()->Type();
    if (op_info.Has(op_type)) {
      auto op_info_ptr = op_info.Get(op_type);
      if (op_info_ptr.HasOpProtoAndChecker()) {
        const proto::OpProto &proto = op_info_ptr.Proto();
        for (int i = 0; i != proto.attrs_size(); ++i) {
          const proto::OpProto::Attr &attr = proto.attrs(i);
          if (attr.name() == name) {
            return true;
          }
        }
      }
    }
  }
  return false;
 }
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@ -108,6 +108,18 @@ class Node {
           Name().find(ir::Node::kControlDepVarName) != std::string::npos;
  }
  // RuntimeHasAttr is different with HasAttr now.
  // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
  // thus, if stored program_desc_ are old which don't have an attr, a new
  // library which adds the attr already will fail on this function.
  // Details:
  // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
  // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
  // problem.
  // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
  // RuntimeHasAttr.
  bool RuntimeHasAttr(const std::string& name) const;
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;
--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@ -15,23 +15,105 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <functional>
 #include <vector>
 #include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "ngraph/ngraph.hpp"
 namespace paddle {
 namespace framework {
 static std::shared_ptr<ngraph::Node> GetNode(
    const std::shared_ptr<OperatorBase>& op, const std::string prm,
    const VariableNameMap& var_map,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto& var_names = var_map.at(prm);
  PADDLE_ENFORCE_EQ(var_names.size(), 1,
                    "op %s prm %s expects one associated var", op->Type(), prm);
  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
    return (*ngb_node_map)[var_names[0]];
  } else {
    return nullptr;
  }
 }
 static std::shared_ptr<ngraph::Node> GetInputNode(
    const std::shared_ptr<OperatorBase>& op, const std::string prm,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  return GetNode(op, prm, op->Inputs(), ngb_node_map);
 }
 static std::shared_ptr<ngraph::Node> GetOutputNode(
    const std::shared_ptr<OperatorBase>& op, const std::string prm,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  return GetNode(op, prm, op->Outputs(), ngb_node_map);
 }
 static void SetOutputNode(
    const std::shared_ptr<OperatorBase>& op, const std::string prm,
    std::shared_ptr<ngraph::Node> node,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto& var_names = op->Outputs().at(prm);
  if (var_names.size() == 1) {
    (*ngb_node_map)[var_names[0]] = node;
  } else if (var_names.size() == 0) {
    (*ngb_node_map)[""] = node;
  } else {
    PADDLE_THROW("prm %s has more than 1 var_names.", prm);
  }
 }
 static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
                      const std::string prm) {
  auto& outputs = op->Outputs();
  if (outputs.find(prm) == outputs.end()) return false;
  return outputs.at(prm).size() > 0;
 }
 template <typename T>
 static void BuildBinaryNode(
    const std::shared_ptr<OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto x = GetInputNode(op, "X", ngb_node_map);
  auto y = GetInputNode(op, "Y", ngb_node_map);
  auto out = std::make_shared<T>(x, y);
  SetOutputNode(op, "Out", out, ngb_node_map);
 }
 template <typename T>
 static void BuildUnaryNode(
    const std::shared_ptr<OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto input = GetInputNode(op, "X", ngb_node_map);
  auto out = std::make_shared<T>(input);
  SetOutputNode(op, "Out", out, ngb_node_map);
 }
 std::map<std::string,
         std::function<void(const std::shared_ptr<OperatorBase>&,
                            std::shared_ptr<std::unordered_map<
                                std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {};
+    NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
                                 {"tanh", BuildUnaryNode<ngraph::op::Tanh>}};
-void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
+void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
  auto& op_type = op->Type();
-  NG_NODE_MAP[op_type](op, ngb_node_map);
+  NG_NODE_MAP[op_type](op, ngb_node_map_);
 }
 }  // namespace framework
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
@ -20,16 +20,14 @@ limitations under the License. */
 #include <map>
 #include <string>
 #include <unordered_map>
 #include <vector>
-#include "paddle/fluid/framework/operator.h"
+#include "ngraph/node.hpp"
 #include "paddle/fluid/platform/enforce.h"
 #include "ngraph/ngraph.hpp"
 namespace paddle {
 namespace framework {
 class OperatorBase;
 class NgraphBridge {
 public:
  static std::map<
@ -43,14 +41,14 @@ class NgraphBridge {
      std::shared_ptr<
          std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
          var_node_map)
-      : ngb_node_map(var_node_map) {}
+      : ngb_node_map_(var_node_map) {}
-  void build_graph(const std::shared_ptr<OperatorBase>& op);
+  void BuildNgNode(const std::shared_ptr<OperatorBase>& op);
 private:
  std::shared_ptr<
      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      ngb_node_map;
+      ngb_node_map_;
 };
 }  // namespace framework
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
@ -17,24 +17,19 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <atomic>
 #include <string>
 #include <unordered_map>
 #include <vector>
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/variant.h"
-#include "ngraph/ngraph.hpp"
+#include "ngraph/type/element_type.hpp"
 namespace paddle {
 namespace framework {
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@ -31,12 +31,6 @@ class InferShapeBase {
  virtual void operator()(InferShapeContext*) const = 0;
 };
 class EstimateFlopsBase {
 public:
  virtual ~EstimateFlopsBase() = default;
  virtual size_t operator()(InferShapeContext*) const = 0;
 };
 struct OpInfo {
  OpCreator creator_;
  GradOpMakerFN grad_op_maker_;
@ -44,7 +38,6 @@ struct OpInfo {
  OpAttrChecker* checker_{nullptr};
  InferVarTypeFN infer_var_type_;
  InferShapeFN infer_shape_;
  EstimateFlopsFN estimate_flops_;
  bool HasOpProtoAndChecker() const {
    return proto_ != nullptr && checker_ != nullptr;
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
                 "Tensor %s contains NAN", name);
 }
 void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
                                           const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
  this->InferShape(&infer_shape_ctx);
 }
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@ -128,6 +128,8 @@ class OperatorBase {
  virtual std::vector<std::string> OutputVars(bool has_intermediate) const;
  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
  virtual void RuntimeInferShape(const Scope& scope,
                                 const platform::Place& place) const {}
 protected:
  std::string type_;
@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
    OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
  }
  void RuntimeInferShape(const Scope& scope,
                         const platform::Place& place) const override;
 protected:
  virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
  virtual OpKernelType GetKernelTypeForVar(
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@ -54,7 +54,5 @@ using InferVarTypeFN =
 using InferShapeFN = std::function<void(InferShapeContext*)>;
 using EstimateFlopsFN = std::function<void(InferShapeContext*)>;
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
 link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")
 if (NOT WIN32)
    set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
    if(EXISTS ${NGRAPH_PATH})
        include(GNUInstallDirs)
        include_directories("${NGRAPH_PATH}/include")
        link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
        set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
    endif()
 endif()
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)
 if(WITH_MKL)
@ -106,7 +116,7 @@ endif()
 if (NOT WIN32)
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
-    ${MATH_LIB} ${MKLDNN_LIB}
+    ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
    glog gflags protobuf snappystream snappy z xxhash
    ${EXTERNAL_LIB})
 else()
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
@ -36,12 +36,10 @@ class SequenceMaskOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");
-    auto maxlen = ctx->Attrs().Get<int>("maxlen");
+    int maxlen = ctx->Attrs().Get<int>("maxlen");
-    if (maxlen > 0) {  // We can only infershape when maxlen > 0
+    auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
-      auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
+    dim.push_back(maxlen > 0 ? maxlen : -1);
-      dim.push_back(maxlen);
+    ctx->SetOutputDim("Y", framework::make_ddim(dim));
      ctx->SetOutputDim("Y", framework::make_ddim(dim));
    }
  }
 };
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {
 using framework::Tensor;
 const int kIgnoreIndex = -100;
 class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
 public:
@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker
    AddOutput("Out",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
              " of elementwise logistic losses.");
    AddAttr<int>("ignore_index",
                 "(int, default kIgnoreIndex), Specifies a target value that "
                 "is ignored and"
                 "does not contribute to the input gradient.")
        .SetDefault(kIgnoreIndex);
    AddComment(R"DOC(
 SigmoidCrossEntropyWithLogits Operator.
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@ -15,33 +15,72 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/hostdevice.h"
 #include "paddle/legacy/utils/Logging.h"
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
 struct SigmoidCrossEntropyWithLogitsForward {
  HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index)
      : ignore_index(ignore_index) {}
  HOSTDEVICE T operator()(const T &x, const T &label) const {
    if (static_cast<int>(label) == ignore_index) {
      return static_cast<T>(0.);
    }
    T term1 = (x > 0) ? x : 0;
    T term2 = x * label;
    T term3 = std::log(static_cast<T>(1) + std::exp(-(std::abs(x))));
    return term1 - term2 + term3;
  }
  int ignore_index;
 };
 template <typename T>
 struct SigmoidCrossEntropyWithLogitsBackward {
  HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index)
      : ignore_index(ignore_index) {}
  HOSTDEVICE T operator()(const T &x, const T &label) const {
    if (static_cast<int>(label) == ignore_index) {
      return static_cast<T>(0.);
    }
    T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
    return simoid_x - label;
  }
  int ignore_index;
 };
 // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
 template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    const Tensor *X = context.Input<Tensor>("X");
-    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
+    const Tensor *Labels = context.Input<Tensor>("Label");
-    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    Tensor *Out = context.Output<Tensor>("Out");
    Out->mutable_data<T>(context.GetPlace());
    int ignore_index = context.Attr<int>("ignore_index");
-    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto x = EigenVector<T>::Flatten(*X);
-    auto labels = framework::EigenVector<T>::Flatten(*Labels);
+    auto labels = EigenVector<T>::Flatten(*Labels);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto out = EigenVector<T>::Flatten(*Out);
    auto &place = *context.device_context<DeviceContext>().eigen_device();
-    // term1 = max(x, 0)
+    out.device(place) = x.binaryExpr(
-    auto term1 = x.cwiseMax(static_cast<T>(0));
+        labels, SigmoidCrossEntropyWithLogitsForward<T>(ignore_index));
    // term2 = x * labels
    auto term2 = x * labels;
    // term3 = log(1 + exp(-abs(x)))
    auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
    out.device(place) = term1 - term2 + term3;
  }
 };
@ -50,23 +89,23 @@ template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    const Tensor *X = context.Input<Tensor>("X");
-    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
+    const Tensor *Labels = context.Input<Tensor>("Label");
-    const framework::Tensor *dOut =
+    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
    framework::Tensor *dX =
        context.Output<framework::Tensor>(framework::GradVarName("X"));
    dX->mutable_data<T>(context.GetPlace());
-    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto ignore_index = context.Attr<int>("ignore_index");
-    auto labels = framework::EigenVector<T>::Flatten(*Labels);
+    auto x = EigenVector<T>::Flatten(*X);
-    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+    auto labels = EigenVector<T>::Flatten(*Labels);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto dout = EigenVector<T>::Flatten(*dOut);
    auto dx = EigenVector<T>::Flatten(*dX);
    auto &place =
        *context.template device_context<DeviceContext>().eigen_device();
-    auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+    auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward<T>(
-    dx.device(place) = dout * (sigmoid_x - labels);
+                                         static_cast<int>(ignore_index)));
    dx.device(place) = dout * diff;
  }
 };
--- a/paddle/fluid/operators/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/yolov3_loss_op.cc
@ -0,0 +1,221 @@
 /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at
   http://www.apache.org/licenses/LICENSE-2.0
   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
 #include "paddle/fluid/operators/yolov3_loss_op.h"
 #include "paddle/fluid/framework/op_registry.h"
 namespace paddle {
 namespace operators {
 using framework::Tensor;
 class Yolov3LossOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"),
                   "Input(X) of Yolov3LossOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("GTBox"),
                   "Input(GTBox) of Yolov3LossOp should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("GTLabel"),
                   "Input(GTLabel) of Yolov3LossOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
                   "Output(Loss) of Yolov3LossOp should not be null.");
    auto dim_x = ctx->GetInputDim("X");
    auto dim_gtbox = ctx->GetInputDim("GTBox");
    auto dim_gtlabel = ctx->GetInputDim("GTLabel");
    auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
    auto class_num = ctx->Attrs().Get<int>("class_num");
    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
    PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3],
                      "Input(X) dim[3] and dim[4] should be euqal.");
    PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num),
                      "Input(X) dim[1] should be equal to (anchor_number * (5 "
                      "+ class_num)).");
    PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3,
                      "Input(GTBox) should be a 3-D tensor");
    PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5");
    PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2,
                      "Input(GTBox) should be a 2-D tensor");
    PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0],
                      "Input(GTBox) and Input(GTLabel) dim[0] should be same");
    PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1],
                      "Input(GTBox) and Input(GTLabel) dim[1] should be same");
    PADDLE_ENFORCE_GT(anchors.size(), 0,
                      "Attr(anchors) length should be greater then 0.");
    PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
                      "Attr(anchors) length should be even integer.");
    PADDLE_ENFORCE_GT(class_num, 0,
                      "Attr(class_num) should be an integer greater then 0.");
    std::vector<int64_t> dim_out({1});
    ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
        platform::CPUPlace());
  }
 };
 class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "The input tensor of YOLO v3 loss operator, "
             "This is a 4-D tensor with shape of [N, C, H, W]."
             "H and W should be same, and the second dimention(C) stores"
             "box locations, confidence score and classification one-hot"
             "key of each anchor box");
    AddInput("GTBox",
             "The input tensor of ground truth boxes, "
             "This is a 3-D tensor with shape of [N, max_box_num, 5], "
             "max_box_num is the max number of boxes in each image, "
             "In the third dimention, stores x, y, w, h coordinates, "
             "x, y is the center cordinate of boxes and w, h is the "
             "width and height and x, y, w, h should be divided by "
             "input image height to scale to [0, 1].");
    AddInput("GTLabel",
             "The input tensor of ground truth label, "
             "This is a 2-D tensor with shape of [N, max_box_num], "
             "and each element shoudl be an integer to indicate the "
             "box class id.");
    AddOutput("Loss",
              "The output yolov3 loss tensor, "
              "This is a 1-D tensor with shape of [1]");
    AddAttr<int>("class_num", "The number of classes to predict.");
    AddAttr<std::vector<int>>("anchors",
                              "The anchor width and height, "
                              "it will be parsed pair by pair.");
    AddAttr<float>("ignore_thresh",
                   "The ignore threshold to ignore confidence loss.");
    AddAttr<float>("loss_weight_xy", "The weight of x, y location loss.")
        .SetDefault(1.0);
    AddAttr<float>("loss_weight_wh", "The weight of w, h location loss.")
        .SetDefault(1.0);
    AddAttr<float>(
        "loss_weight_conf_target",
        "The weight of confidence score loss in locations with target object.")
        .SetDefault(1.0);
    AddAttr<float>("loss_weight_conf_notarget",
                   "The weight of confidence score loss in locations without "
                   "target object.")
        .SetDefault(1.0);
    AddAttr<float>("loss_weight_class", "The weight of classification loss.")
        .SetDefault(1.0);
    AddComment(R"DOC(
         This operator generate yolov3 loss by given predict result and ground
         truth boxes.
         The output of previous network is in shape [N, C, H, W], while H and W
         should be the same, specify the grid size, each grid point predict given
         number boxes, this given number is specified by anchors, it should be 
         half anchors length, which following will be represented as S. In the 
         second dimention(the channel dimention), C should be S * (class_num + 5),
         class_num is the box categoriy number of source dataset(such as coco), 
         so in the second dimention, stores 4 box location coordinates x, y, w, h 
         and confidence score of the box and class one-hot key of each anchor box.
         While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
         correspnd to:
         $$
         b_x = \sigma(t_x) + c_x
         b_y = \sigma(t_y) + c_y
         b_w = p_w e^{t_w}
         b_h = p_h e^{t_h}
         $$
         While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
         is specified by anchors.
         As for confidence score, it is the logistic regression value of IoU between
         anchor boxes and ground truth boxes, the score of the anchor box which has 
         the max IoU should be 1, and if the anchor box has IoU bigger then ignore 
         thresh, the confidence score loss of this anchor box will be ignored.
         Therefore, the yolov3 loss consist of three major parts, box location loss,
         confidence score loss, and classification loss. The MSE loss is used for 
         box location, and binary cross entropy loss is used for confidence score 
         loss and classification loss.
         Final loss will be represented as follow.
         $$
         loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh}
              + \loss_weight_{conf_target} * loss_{conf_target}
              + \loss_weight_{conf_notarget} * loss_{conf_notarget}
              + \loss_weight_{class} * loss_{class}
         $$
         )DOC");
  }
 };
 class Yolov3LossOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
                   "Input(Loss@GRAD) should not be null");
    auto dim_x = ctx->GetInputDim("X");
    if (ctx->HasOutput(framework::GradVarName("X"))) {
      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
    }
  }
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
        platform::CPUPlace());
  }
 };
 class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
 public:
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 protected:
  std::unique_ptr<framework::OpDesc> Apply() const override {
    auto* op = new framework::OpDesc();
    op->SetType("yolov3_loss_grad");
    op->SetInput("X", Input("X"));
    op->SetInput("GTBox", Input("GTBox"));
    op->SetInput("GTLabel", Input("GTLabel"));
    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
    op->SetAttrMap(Attrs());
    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
    op->SetOutput(framework::GradVarName("GTBox"), {});
    op->SetOutput(framework::GradVarName("GTLabel"), {});
    return std::unique_ptr<framework::OpDesc>(op);
  }
 };
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker,
                  ops::Yolov3LossGradMaker);
 REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad);
 REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel<float>,
                       ops::Yolov3LossKernel<double>);
 REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel<float>,
                       ops::Yolov3LossGradKernel<double>);
--- a/paddle/fluid/operators/yolov3_loss_op.h
+++ b/paddle/fluid/operators/yolov3_loss_op.h
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@ -442,6 +442,8 @@ EOF
        make install -j 8
        if [ "$1" == "cp27-cp27m" ]; then
            pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
            set -e
            python -c "import paddle.fluid"
        elif [ "$1" == "cp35-cp35m" ]; then
            pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
        elif [ "$1" == "cp36-cp36m" ]; then
@ -449,7 +451,7 @@ EOF
        elif [ "$1" == "cp37-cp37m" ]; then
            pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
        fi
-
+      
        if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
            paddle version
        fi
--- a/Show More
+++ b/Show More