Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into optimize_hs_op

, test=develop
7 years ago · 8c75705984
parent b387a19410 6d04a9cf47
commit 8c75705984
40 changed files with 1840 additions and 244 deletions
--- a/cmake/external/ngraph.cmake
+++ b/cmake/external/ngraph.cmake
@ -32,6 +32,8 @@ IF(NOT ${WITH_NGRAPH})
    return()
 ENDIF()

+INCLUDE(GNUInstallDirs)
+
 INCLUDE(ExternalProject)

 SET(NGRAPH_PROJECT         "extern_ngraph")
@ -40,10 +42,14 @@ SET(NGRAPH_GIT_TAG         "f9fd9d4cc318dc59dd4b68448e7fbb5f67a28bd0")
 SET(NGRAPH_SOURCES_DIR     ${THIRD_PARTY_PATH}/ngraph)
 SET(NGRAPH_INSTALL_DIR     ${THIRD_PARTY_PATH}/install/ngraph)
 SET(NGRAPH_INC_DIR         ${NGRAPH_INSTALL_DIR}/include)
+SET(NGRAPH_LIB_DIR         ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
 SET(NGRAPH_SHARED_LIB_NAME libngraph.so.${NGRAPH_VERSION})
 SET(NGRAPH_CPU_LIB_NAME    libcpu_backend.so)
 SET(NGRAPH_TBB_LIB_NAME    libtbb.so.2)
 SET(NGRAPH_GIT_REPO        "https://github.com/NervanaSystems/ngraph.git")
+SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
+SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
+SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})

 ExternalProject_Add(
    ${NGRAPH_PROJECT}
@ -63,18 +69,6 @@ ExternalProject_Add(
    CMAKE_ARGS          -DMKLDNN_LIB_DIR=${MKLDNN_INSTALL_DIR}/lib
 )

-if(UNIX AND NOT APPLE)
-    include(GNUInstallDirs)
-    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/${CMAKE_INSTALL_LIBDIR})
-else()
-    SET(NGRAPH_LIB_DIR ${NGRAPH_INSTALL_DIR}/lib)
-endif()
-MESSAGE(STATUS "nGraph lib will be installed at: ${NGRAPH_LIB_DIR}")
-
-SET(NGRAPH_SHARED_LIB      ${NGRAPH_LIB_DIR}/${NGRAPH_SHARED_LIB_NAME})
-SET(NGRAPH_CPU_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_CPU_LIB_NAME})
-SET(NGRAPH_TBB_LIB         ${NGRAPH_LIB_DIR}/${NGRAPH_TBB_LIB_NAME})
-
 # Workaround for nGraph expecting mklml to be in mkldnn install directory.
 ExternalProject_Add_Step(
    ${NGRAPH_PROJECT}
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@ -129,6 +129,15 @@ if (WITH_MKLDNN)
            )
 endif ()

+if (WITH_NGRAPH)
+    set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/ngraph")
+    copy(ngraph_lib
+            SRCS ${NGRAPH_INC_DIR} ${NGRAPH_LIB_DIR}
+            DSTS ${dst_dir} ${dst_dir}
+            DEPS ngraph
+            )
+endif ()
+
 if (NOT WIN32)
    if (NOT MOBILE_INFERENCE AND NOT RPI)
        set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -182,7 +182,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None,
 paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
-paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
@ -299,6 +299,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i
 paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None))
 paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None))
 paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None))
 paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1))
 paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,))
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@ -127,8 +127,9 @@ cc_library(version SRCS version.cc)
 cc_test(version_test SRCS version_test.cc DEPS version)

 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog version)
-cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto)
+
 if(NOT WIN32)
+cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
 cc_library(ngraph_operator SRCS ngraph_operator.cc DEPS ngraph_bridge operator op_info device_context tensor scope glog
  shape_inference data_transform lod_tensor profiler)
 endif(NOT WIN32)
--- a/paddle/fluid/framework/details/op_registry.h
+++ b/paddle/fluid/framework/details/op_registry.h
@ -32,9 +32,7 @@ enum OpInfoFillType {
  kOpProtoAndCheckerMaker = 1,
  kGradOpDescMaker = 2,
  kVarTypeInference = 3,
-  kShapeInference = 4,
-  kEstimateFlops = 5,
-  kUnknown = -1
+  kShapeInference = 4
 };

 template <typename T>
@ -50,10 +48,8 @@ struct OpInfoFillTypeID {
                                    ? kVarTypeInference
                                    : (std::is_base_of<InferShapeBase, T>::value
                                           ? kShapeInference
-                                           : (std::is_base_of<EstimateFlopsBase,
-                                                              T>::value
-                                                  ? kEstimateFlops
-                                                  : kUnknown)))));
+                                           : static_cast<OpInfoFillType>(
+                                                 -1)))));
  }
 };

@ -143,16 +139,6 @@ struct OpInfoFiller<T, kShapeInference> {
  }
 };

-template <typename T>
-struct OpInfoFiller<T, kEstimateFlops> {
-  void operator()(const char* op_tpe, OpInfo* info) const {
-    info->estimate_flops_ = [](InferShapeContext* ctx) {
-      T estimate_flops;
-      return estimate_flops(ctx);
-    };
-  }
-};
-
 }  // namespace details

 }  // namespace framework
--- a/paddle/fluid/framework/ir/is_test_pass.cc
+++ b/paddle/fluid/framework/ir/is_test_pass.cc
@ -38,7 +38,7 @@ std::unique_ptr<ir::Graph> IsTestPass::ApplyImpl(
  for (const Node* n : graph->Nodes()) {
    if (n->IsOp()) {
      auto* op = n->Op();
-      if (op->HasAttr("is_test")) {
+      if (n->RuntimeHasAttr("is_test")) {
        op->SetAttr("is_test", true);
      } else if (std::find(begin(op_list), end(op_list), op->Type()) !=
                 end(op_list)) {
--- a/paddle/fluid/framework/ir/is_test_pass_tester.cc
+++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc
@ -104,9 +104,9 @@ TEST(IsTestPass, basic) {
      auto* op = node->Op();
      auto op_name = boost::get<std::string>(op->GetAttr("name"));
      if (op_name == "conv3") {
-        ASSERT_FALSE(op->HasAttr("is_test"));
+        ASSERT_FALSE(node->RuntimeHasAttr("is_test"));
      } else {
-        ASSERT_TRUE(op->HasAttr("is_test"));
+        ASSERT_TRUE(node->RuntimeHasAttr("is_test"));
        EXPECT_TRUE(boost::get<bool>(op->GetAttr("is_test")));
      }
    }
--- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc
@ -22,7 +22,7 @@ std::unique_ptr<ir::Graph> MKLDNNPlacementPass::ApplyImpl(
    std::unique_ptr<ir::Graph> graph) const {
  VLOG(3) << "Aplies MKL-DNN placement strategy.";
  for (const Node* n : graph->Nodes()) {
-    if (n->IsOp() && n->Op()->HasAttr("use_mkldnn")) {
+    if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) {
      n->Op()->SetAttr("use_mkldnn", true);
    }
  }
--- a/paddle/fluid/framework/ir/node.cc
+++ b/paddle/fluid/framework/ir/node.cc
@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/op_info.h"

 namespace paddle {
 namespace framework {
@ -24,10 +25,33 @@ constexpr char Node::kControlDepVarName[];
 const char Node::kControlDepVarName[] = "__control_var";
 #endif

-std::unique_ptr<Node> CreateNodeForTest(const std::string& name,
+std::unique_ptr<Node> CreateNodeForTest(const std::string &name,
                                        Node::Type type) {
  return std::unique_ptr<Node>(new Node(name, type));
 }
+
+bool Node::RuntimeHasAttr(const std::string &name) const {
+  if (Op()->HasAttr(name)) {
+    return true;
+  } else {
+    auto &op_info = OpInfoMap::Instance();
+    auto op_type = Op()->Type();
+    if (op_info.Has(op_type)) {
+      auto op_info_ptr = op_info.Get(op_type);
+      if (op_info_ptr.HasOpProtoAndChecker()) {
+        const proto::OpProto &proto = op_info_ptr.Proto();
+        for (int i = 0; i != proto.attrs_size(); ++i) {
+          const proto::OpProto::Attr &attr = proto.attrs(i);
+          if (attr.name() == name) {
+            return true;
+          }
+        }
+      }
+    }
+  }
+  return false;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@ -108,6 +108,18 @@ class Node {
           Name().find(ir::Node::kControlDepVarName) != std::string::npos;
  }

+  // RuntimeHasAttr is different with HasAttr now.
+  // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr,
+  // thus, if stored program_desc_ are old which don't have an attr, a new
+  // library which adds the attr already will fail on this function.
+  // Details:
+  // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087
+  // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above
+  // problem.
+  // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding
+  // RuntimeHasAttr.
+  bool RuntimeHasAttr(const std::string& name) const;
+
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;

--- a/paddle/fluid/framework/ngraph_bridge.cc
+++ b/paddle/fluid/framework/ngraph_bridge.cc
@ -15,23 +15,105 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NGRAPH
 #include <algorithm>
 #include <functional>
+#include <vector>

 #include "paddle/fluid/framework/ngraph_bridge.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"

 #include "ngraph/ngraph.hpp"

 namespace paddle {
 namespace framework {

+static std::shared_ptr<ngraph::Node> GetNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    const VariableNameMap& var_map,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = var_map.at(prm);
+  PADDLE_ENFORCE_EQ(var_names.size(), 1,
+                    "op %s prm %s expects one associated var", op->Type(), prm);
+  if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) {
+    return (*ngb_node_map)[var_names[0]];
+  } else {
+    return nullptr;
+  }
+}
+
+static std::shared_ptr<ngraph::Node> GetInputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, prm, op->Inputs(), ngb_node_map);
+}
+
+static std::shared_ptr<ngraph::Node> GetOutputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  return GetNode(op, prm, op->Outputs(), ngb_node_map);
+}
+
+static void SetOutputNode(
+    const std::shared_ptr<OperatorBase>& op, const std::string prm,
+    std::shared_ptr<ngraph::Node> node,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto& var_names = op->Outputs().at(prm);
+  if (var_names.size() == 1) {
+    (*ngb_node_map)[var_names[0]] = node;
+  } else if (var_names.size() == 0) {
+    (*ngb_node_map)[""] = node;
+  } else {
+    PADDLE_THROW("prm %s has more than 1 var_names.", prm);
+  }
+}
+
+static bool HasOutput(const std::shared_ptr<OperatorBase>& op,
+                      const std::string prm) {
+  auto& outputs = op->Outputs();
+  if (outputs.find(prm) == outputs.end()) return false;
+  return outputs.at(prm).size() > 0;
+}
+
+template <typename T>
+static void BuildBinaryNode(
+    const std::shared_ptr<OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto x = GetInputNode(op, "X", ngb_node_map);
+  auto y = GetInputNode(op, "Y", ngb_node_map);
+  auto out = std::make_shared<T>(x, y);
+  SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
+template <typename T>
+static void BuildUnaryNode(
+    const std::shared_ptr<OperatorBase>& op,
+    std::shared_ptr<
+        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
+        ngb_node_map) {
+  auto input = GetInputNode(op, "X", ngb_node_map);
+  auto out = std::make_shared<T>(input);
+  SetOutputNode(op, "Out", out, ngb_node_map);
+}
+
 std::map<std::string,
         std::function<void(const std::shared_ptr<OperatorBase>&,
                            std::shared_ptr<std::unordered_map<
                                std::string, std::shared_ptr<ngraph::Node>>>)>>
-    NgraphBridge::NG_NODE_MAP = {};
+    NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode<ngraph::op::Relu>},
+                                 {"tanh", BuildUnaryNode<ngraph::op::Tanh>}};

-void NgraphBridge::build_graph(const std::shared_ptr<OperatorBase>& op) {
+void NgraphBridge::BuildNgNode(const std::shared_ptr<OperatorBase>& op) {
  auto& op_type = op->Type();
-  NG_NODE_MAP[op_type](op, ngb_node_map);
+  NG_NODE_MAP[op_type](op, ngb_node_map_);
 }

 }  // namespace framework
--- a/paddle/fluid/framework/ngraph_bridge.h
+++ b/paddle/fluid/framework/ngraph_bridge.h
@ -20,16 +20,14 @@ limitations under the License. */
 #include <map>
 #include <string>
 #include <unordered_map>
-#include <vector>

-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/platform/enforce.h"
-
-#include "ngraph/ngraph.hpp"
+#include "ngraph/node.hpp"

 namespace paddle {
 namespace framework {

+class OperatorBase;
+
 class NgraphBridge {
 public:
  static std::map<
@ -43,14 +41,14 @@ class NgraphBridge {
      std::shared_ptr<
          std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
          var_node_map)
-      : ngb_node_map(var_node_map) {}
+      : ngb_node_map_(var_node_map) {}

-  void build_graph(const std::shared_ptr<OperatorBase>& op);
+  void BuildNgNode(const std::shared_ptr<OperatorBase>& op);

 private:
  std::shared_ptr<
      std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
-      ngb_node_map;
+      ngb_node_map_;
 };

 }  // namespace framework
--- a/paddle/fluid/framework/ngraph_operator.cc
+++ b/paddle/fluid/framework/ngraph_operator.cc
--- a/paddle/fluid/framework/ngraph_operator.h
+++ b/paddle/fluid/framework/ngraph_operator.h
@ -17,24 +17,19 @@ limitations under the License. */
 #ifdef PADDLE_WITH_NGRAPH

 #include <algorithm>
-#include <atomic>
 #include <string>
 #include <unordered_map>
 #include <vector>

 #include "paddle/fluid/framework/attribute.h"
-#include "paddle/fluid/framework/framework.pb.h"
-#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/framework/ngraph_bridge.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/op_kernel_type.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/variant.h"

-#include "ngraph/ngraph.hpp"
+#include "ngraph/type/element_type.hpp"

 namespace paddle {
 namespace framework {
--- a/paddle/fluid/framework/op_info.h
+++ b/paddle/fluid/framework/op_info.h
@ -31,12 +31,6 @@ class InferShapeBase {
  virtual void operator()(InferShapeContext*) const = 0;
 };

-class EstimateFlopsBase {
- public:
-  virtual ~EstimateFlopsBase() = default;
-  virtual size_t operator()(InferShapeContext*) const = 0;
-};
-
 struct OpInfo {
  OpCreator creator_;
  GradOpMakerFN grad_op_maker_;
@ -44,7 +38,6 @@ struct OpInfo {
  OpAttrChecker* checker_{nullptr};
  InferVarTypeFN infer_var_type_;
  InferShapeFN infer_shape_;
-  EstimateFlopsFN estimate_flops_;

  bool HasOpProtoAndChecker() const {
    return proto_ != nullptr && checker_ != nullptr;
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@ -695,6 +695,12 @@ static void CheckTensorNANOrInf(const std::string& name,
                 "Tensor %s contains NAN", name);
 }

+void OperatorWithKernel::RuntimeInferShape(const Scope& scope,
+                                           const platform::Place& place) const {
+  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
+  this->InferShape(&infer_shape_ctx);
+}
+
 void OperatorWithKernel::RunImpl(const Scope& scope,
                                 const platform::Place& place) const {
  RuntimeInferShapeContext infer_shape_ctx(*this, scope);
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@ -128,6 +128,8 @@ class OperatorBase {
  virtual std::vector<std::string> OutputVars(bool has_intermediate) const;

  void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; }
+  virtual void RuntimeInferShape(const Scope& scope,
+                                 const platform::Place& place) const {}

 protected:
  std::string type_;
@ -348,6 +350,9 @@ class OperatorWithKernel : public OperatorBase {
    OpInfoMap::Instance().Get(Type()).infer_shape_(ctx);
  }

+  void RuntimeInferShape(const Scope& scope,
+                         const platform::Place& place) const override;
+
 protected:
  virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
  virtual OpKernelType GetKernelTypeForVar(
--- a/paddle/fluid/framework/type_defs.h
+++ b/paddle/fluid/framework/type_defs.h
@ -54,7 +54,5 @@ using InferVarTypeFN =

 using InferShapeFN = std::function<void(InferShapeContext*)>;

-using EstimateFlopsFN = std::function<void(InferShapeContext*)>;
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
+++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt
@ -79,6 +79,16 @@ link_directories("${PADDLE_LIB}/third_party/install/gflags/lib")
 link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib")
 link_directories("${PADDLE_LIB}/paddle/lib")

+if (NOT WIN32)
+    set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph")
+    if(EXISTS ${NGRAPH_PATH})
+        include(GNUInstallDirs)
+        include_directories("${NGRAPH_PATH}/include")
+        link_directories("${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}")
+        set(NGRAPH_LIB ${NGRAPH_PATH}/${CMAKE_INSTALL_LIBDIR}/libngraph${CMAKE_SHARED_LIBRARY_SUFFIX})
+    endif()
+endif()
+
 add_executable(${DEMO_NAME} ${DEMO_NAME}.cc)

 if(WITH_MKL)
@ -106,7 +116,7 @@ endif()
 if (NOT WIN32)
 set(EXTERNAL_LIB "-lrt -ldl -lpthread")
 set(DEPS ${DEPS}
-    ${MATH_LIB} ${MKLDNN_LIB}
+    ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB}
    glog gflags protobuf snappystream snappy z xxhash
    ${EXTERNAL_LIB})
 else()
--- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
+++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h
@ -36,13 +36,11 @@ class SequenceMaskOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist");
    PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist");

-    auto maxlen = ctx->Attrs().Get<int>("maxlen");
-    if (maxlen > 0) {  // We can only infershape when maxlen > 0
+    int maxlen = ctx->Attrs().Get<int>("maxlen");
    auto dim = framework::vectorize2int(ctx->GetInputDim("X"));
-      dim.push_back(maxlen);
+    dim.push_back(maxlen > 0 ? maxlen : -1);
    ctx->SetOutputDim("Y", framework::make_ddim(dim));
  }
-  }
 };

 class SequenceMaskOpMaker : public framework::OpProtoAndCheckerMaker {
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {

 using framework::Tensor;
+const int kIgnoreIndex = -100;

 class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
 public:
@ -100,6 +101,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker
    AddOutput("Out",
              "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
              " of elementwise logistic losses.");
+    AddAttr<int>("ignore_index",
+                 "(int, default kIgnoreIndex), Specifies a target value that "
+                 "is ignored and"
+                 "does not contribute to the input gradient.")
+        .SetDefault(kIgnoreIndex);
    AddComment(R"DOC(
 SigmoidCrossEntropyWithLogits Operator.

--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h
@ -15,33 +15,72 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/platform/hostdevice.h"
+#include "paddle/legacy/utils/Logging.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+template <typename T>
+struct SigmoidCrossEntropyWithLogitsForward {
+  HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index)
+      : ignore_index(ignore_index) {}
+
+  HOSTDEVICE T operator()(const T &x, const T &label) const {
+    if (static_cast<int>(label) == ignore_index) {
+      return static_cast<T>(0.);
+    }
+    T term1 = (x > 0) ? x : 0;
+    T term2 = x * label;
+    T term3 = std::log(static_cast<T>(1) + std::exp(-(std::abs(x))));
+    return term1 - term2 + term3;
+  }
+
+  int ignore_index;
+};
+
+template <typename T>
+struct SigmoidCrossEntropyWithLogitsBackward {
+  HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index)
+      : ignore_index(ignore_index) {}
+
+  HOSTDEVICE T operator()(const T &x, const T &label) const {
+    if (static_cast<int>(label) == ignore_index) {
+      return static_cast<T>(0.);
+    }
+    T simoid_x = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-x));
+    return simoid_x - label;
+  }
+
+  int ignore_index;
+};
+
 // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
 template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
-    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    Tensor *Out = context.Output<Tensor>("Out");
    Out->mutable_data<T>(context.GetPlace());
+    int ignore_index = context.Attr<int>("ignore_index");

-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto labels = framework::EigenVector<T>::Flatten(*Labels);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto x = EigenVector<T>::Flatten(*X);
+    auto labels = EigenVector<T>::Flatten(*Labels);
+    auto out = EigenVector<T>::Flatten(*Out);
    auto &place = *context.device_context<DeviceContext>().eigen_device();

-    // term1 = max(x, 0)
-    auto term1 = x.cwiseMax(static_cast<T>(0));
-    // term2 = x * labels
-    auto term2 = x * labels;
-    // term3 = log(1 + exp(-abs(x)))
-    auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
-
-    out.device(place) = term1 - term2 + term3;
+    out.device(place) = x.binaryExpr(
+        labels, SigmoidCrossEntropyWithLogitsForward<T>(ignore_index));
  }
 };

@ -50,23 +89,23 @@ template <typename DeviceContext, typename T>
 class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
-    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
-    const framework::Tensor *dOut =
-        context.Input<framework::Tensor>(framework::GradVarName("Out"));
-    framework::Tensor *dX =
-        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Labels = context.Input<Tensor>("Label");
+    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
    dX->mutable_data<T>(context.GetPlace());

-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto labels = framework::EigenVector<T>::Flatten(*Labels);
-    auto dout = framework::EigenVector<T>::Flatten(*dOut);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto ignore_index = context.Attr<int>("ignore_index");
+    auto x = EigenVector<T>::Flatten(*X);
+    auto labels = EigenVector<T>::Flatten(*Labels);
+    auto dout = EigenVector<T>::Flatten(*dOut);
+    auto dx = EigenVector<T>::Flatten(*dX);
    auto &place =
        *context.template device_context<DeviceContext>().eigen_device();

-    auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
-    dx.device(place) = dout * (sigmoid_x - labels);
+    auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward<T>(
+                                         static_cast<int>(ignore_index)));
+    dx.device(place) = dout * diff;
  }
 };

--- a/paddle/fluid/operators/yolov3_loss_op.cc
+++ b/paddle/fluid/operators/yolov3_loss_op.cc
@ -0,0 +1,221 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/fluid/operators/yolov3_loss_op.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class Yolov3LossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("GTBox"),
+                   "Input(GTBox) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("GTLabel"),
+                   "Input(GTLabel) of Yolov3LossOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Loss"),
+                   "Output(Loss) of Yolov3LossOp should not be null.");
+
+    auto dim_x = ctx->GetInputDim("X");
+    auto dim_gtbox = ctx->GetInputDim("GTBox");
+    auto dim_gtlabel = ctx->GetInputDim("GTLabel");
+    auto anchors = ctx->Attrs().Get<std::vector<int>>("anchors");
+    auto class_num = ctx->Attrs().Get<int>("class_num");
+    PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor.");
+    PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3],
+                      "Input(X) dim[3] and dim[4] should be euqal.");
+    PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num),
+                      "Input(X) dim[1] should be equal to (anchor_number * (5 "
+                      "+ class_num)).");
+    PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3,
+                      "Input(GTBox) should be a 3-D tensor");
+    PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5");
+    PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2,
+                      "Input(GTBox) should be a 2-D tensor");
+    PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0],
+                      "Input(GTBox) and Input(GTLabel) dim[0] should be same");
+    PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1],
+                      "Input(GTBox) and Input(GTLabel) dim[1] should be same");
+    PADDLE_ENFORCE_GT(anchors.size(), 0,
+                      "Attr(anchors) length should be greater then 0.");
+    PADDLE_ENFORCE_EQ(anchors.size() % 2, 0,
+                      "Attr(anchors) length should be even integer.");
+    PADDLE_ENFORCE_GT(class_num, 0,
+                      "Attr(class_num) should be an integer greater then 0.");
+
+    std::vector<int64_t> dim_out({1});
+    ctx->SetOutputDim("Loss", framework::make_ddim(dim_out));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of YOLO v3 loss operator, "
+             "This is a 4-D tensor with shape of [N, C, H, W]."
+             "H and W should be same, and the second dimention(C) stores"
+             "box locations, confidence score and classification one-hot"
+             "key of each anchor box");
+    AddInput("GTBox",
+             "The input tensor of ground truth boxes, "
+             "This is a 3-D tensor with shape of [N, max_box_num, 5], "
+             "max_box_num is the max number of boxes in each image, "
+             "In the third dimention, stores x, y, w, h coordinates, "
+             "x, y is the center cordinate of boxes and w, h is the "
+             "width and height and x, y, w, h should be divided by "
+             "input image height to scale to [0, 1].");
+    AddInput("GTLabel",
+             "The input tensor of ground truth label, "
+             "This is a 2-D tensor with shape of [N, max_box_num], "
+             "and each element shoudl be an integer to indicate the "
+             "box class id.");
+    AddOutput("Loss",
+              "The output yolov3 loss tensor, "
+              "This is a 1-D tensor with shape of [1]");
+
+    AddAttr<int>("class_num", "The number of classes to predict.");
+    AddAttr<std::vector<int>>("anchors",
+                              "The anchor width and height, "
+                              "it will be parsed pair by pair.");
+    AddAttr<float>("ignore_thresh",
+                   "The ignore threshold to ignore confidence loss.");
+    AddAttr<float>("loss_weight_xy", "The weight of x, y location loss.")
+        .SetDefault(1.0);
+    AddAttr<float>("loss_weight_wh", "The weight of w, h location loss.")
+        .SetDefault(1.0);
+    AddAttr<float>(
+        "loss_weight_conf_target",
+        "The weight of confidence score loss in locations with target object.")
+        .SetDefault(1.0);
+    AddAttr<float>("loss_weight_conf_notarget",
+                   "The weight of confidence score loss in locations without "
+                   "target object.")
+        .SetDefault(1.0);
+    AddAttr<float>("loss_weight_class", "The weight of classification loss.")
+        .SetDefault(1.0);
+    AddComment(R"DOC(
+         This operator generate yolov3 loss by given predict result and ground
+         truth boxes.
+         
+         The output of previous network is in shape [N, C, H, W], while H and W
+         should be the same, specify the grid size, each grid point predict given
+         number boxes, this given number is specified by anchors, it should be 
+         half anchors length, which following will be represented as S. In the 
+         second dimention(the channel dimention), C should be S * (class_num + 5),
+         class_num is the box categoriy number of source dataset(such as coco), 
+         so in the second dimention, stores 4 box location coordinates x, y, w, h 
+         and confidence score of the box and class one-hot key of each anchor box.
+
+         While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions
+         correspnd to:
+
+         $$
+         b_x = \sigma(t_x) + c_x
+         b_y = \sigma(t_y) + c_y
+         b_w = p_w e^{t_w}
+         b_h = p_h e^{t_h}
+         $$
+
+         While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$
+         is specified by anchors.
+
+         As for confidence score, it is the logistic regression value of IoU between
+         anchor boxes and ground truth boxes, the score of the anchor box which has 
+         the max IoU should be 1, and if the anchor box has IoU bigger then ignore 
+         thresh, the confidence score loss of this anchor box will be ignored.
+
+         Therefore, the yolov3 loss consist of three major parts, box location loss,
+         confidence score loss, and classification loss. The MSE loss is used for 
+         box location, and binary cross entropy loss is used for confidence score 
+         loss and classification loss.
+
+         Final loss will be represented as follow.
+
+         $$
+         loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh}
+              + \loss_weight_{conf_target} * loss_{conf_target}
+              + \loss_weight_{conf_notarget} * loss_{conf_notarget}
+              + \loss_weight_{class} * loss_{class}
+         $$
+         )DOC");
+  }
+};
+
+class Yolov3LossOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null");
+    auto dim_x = ctx->GetInputDim("X");
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), dim_x);
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("X")->type()),
+        platform::CPUPlace());
+  }
+};
+
+class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<framework::OpDesc> Apply() const override {
+    auto* op = new framework::OpDesc();
+    op->SetType("yolov3_loss_grad");
+    op->SetInput("X", Input("X"));
+    op->SetInput("GTBox", Input("GTBox"));
+    op->SetInput("GTLabel", Input("GTLabel"));
+    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+
+    op->SetAttrMap(Attrs());
+
+    op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    op->SetOutput(framework::GradVarName("GTBox"), {});
+    op->SetOutput(framework::GradVarName("GTLabel"), {});
+    return std::unique_ptr<framework::OpDesc>(op);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker,
+                  ops::Yolov3LossGradMaker);
+REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad);
+REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel<float>,
+                       ops::Yolov3LossKernel<double>);
+REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel<float>,
+                       ops::Yolov3LossGradKernel<double>);
--- a/paddle/fluid/operators/yolov3_loss_op.h
+++ b/paddle/fluid/operators/yolov3_loss_op.h
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@ -442,6 +442,8 @@ EOF
        make install -j 8
        if [ "$1" == "cp27-cp27m" ]; then
            pip install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
+            set -e
+            python -c "import paddle.fluid"
        elif [ "$1" == "cp35-cp35m" ]; then
            pip3.5 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
        elif [ "$1" == "cp36-cp36m" ]; then
--- a/Show More
+++ b/Show More