From 2811ea4440c8dd3ecaedc7476f9a7c0cb1519a2c Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Wed, 28 Mar 2018 10:45:52 +0200 Subject: [PATCH 1/4] Implementation of MKLDNN FC --- paddle/fluid/operators/CMakeLists.txt | 29 +- paddle/fluid/operators/fc_mkldnn_op.cc | 410 ++++++++++++++++++ paddle/fluid/operators/fc_mkldnn_op.h | 47 ++ python/paddle/fluid/layers/nn.py | 32 +- .../fluid/tests/unittests/test_fc_op.py | 99 +++++ 5 files changed, 603 insertions(+), 14 deletions(-) create mode 100644 paddle/fluid/operators/fc_mkldnn_op.cc create mode 100644 paddle/fluid/operators/fc_mkldnn_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_fc_op.py diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9ed79453b9..6c79998f07 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -1,6 +1,14 @@ file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") -string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") +if(WITH_MKLDNN) + string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}") +else() + foreach(item ${GENERAL_OPS}) + if(${item} MATCHES ".*_mkldnn_op") + list(REMOVE_ITEM GENERAL_OPS ${item}) + endif() + endforeach(item) +endif() list(REMOVE_DUPLICATES GENERAL_OPS) set(DEPS_OPS "") set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h) @@ -80,7 +88,12 @@ function(op_library TARGET) endif() list(LENGTH cc_srcs cc_srcs_len) - if (${cc_srcs_len} EQUAL 0) + if(WITH_MKLDNN) + list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) + if (${cc_srcs_len} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0) + message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") + endif() + elseif(${cc_srcs_len} EQUAL 0) message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") endif() @@ -109,7 +122,16 @@ function(op_library TARGET) # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h. # Note that it's enough to just adding one operator to pybind in a *_op.cc file. # And for detail pybind information, please see generated paddle/pybind/pybind.h. - file(READ ${TARGET}.cc TARGET_CONTENT) + # This replacing is needed, when the CPU's kernel doesn't exist. + string(REPLACE "_op" "_mkldnn_op" target_mkldnn_file "${TARGET}") + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + file(READ ${TARGET}.cc TARGET_CONTENT) + elseif(WITH_MKLDNN AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${target_mkldnn_file}.cc) + file(READ ${target_mkldnn_file}.cc TARGET_CONTENT) + else() + message(FATAL_ERROR "Cannot read the ${TARGET} file from ${CMAKE_CURRENT_SOURCE_DIR}") + endif() + string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") @@ -224,7 +246,6 @@ op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) - if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) else() diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc new file mode 100644 index 0000000000..48655d36fc --- /dev/null +++ b/paddle/fluid/operators/fc_mkldnn_op.cc @@ -0,0 +1,410 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fc_mkldnn_op.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; +using paddle::platform::MKLDNNDeviceContext; + +void FCOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "X(Input) of Fully Connected should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of Fully Connected should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "W(Input) of Fully Connected should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto w_dims = ctx->GetInputDim("W"); + std::vector output_shape({in_dims[0], w_dims[1]}); + + PADDLE_ENFORCE(in_dims.size() == 4, + "Fully Connected input should be 4-D tensor."); + + PADDLE_ENFORCE(w_dims.size() == 2, + "Fully Connected input should be 2-D tensor."); + + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("Input", "Out"); +} + +framework::OpKernelType FCOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library{framework::LibraryType::kMKLDNN}; + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout = framework::StringToDataLayout(data_format); + + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout, library); +} + +void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto w_dims = ctx->GetInputDim("W"); + + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("W"))) { + ctx->SetOutputDim(framework::GradVarName("W"), w_dims); + } +} + +framework::OpKernelType FCOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library{framework::LibraryType::kMKLDNN}; + + std::string data_format = ctx.Attr("data_format"); + framework::DataLayout layout = framework::StringToDataLayout(data_format); + + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout, library); +} + +class FCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FCOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of fully connected operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the feature, " + "and W is the width of the feature."); + AddInput("W", "(Tensor), The second input tensor of fc op."); + AddOutput("Out", + "(Tensor) The output tensor of pooling operator. 
" + "The format of output tensor is also NCHW, " + "where N is batch size, C is the number of channels, " + "H is the height of the feature, " + "and W is the width of the feature."); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("with_bias", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "data_format", + "(string, default NCHW) Only used in " + "An optional string from: \"NHWC\", \"NCHW\". " + "Defaults to \"NHWC\". Specify the data format of the output data, " + "the input will be transformed automatically. ") + .SetDefault("AnyLayout"); + AddComment(R"DOC( +)DOC"); + } +}; + +struct MKLDNNMatrixSize final { + explicit MKLDNNMatrixSize(const std::vector& in, + const std::vector& w) + : mb{in[0]}, ic{in[1]}, oc{w[1]}, h{in[2]}, w{in[3]} {} + + bool is_spatial() const { return h > 1 && w > 1; } + + const int mb; + const int ic; + const int oc; + const int h, w; +}; + +template +class MKLDNNMD { + public: + explicit MKLDNNMD(const T* in, const T* w, bool bias) + : sz_(std::unique_ptr(new MKLDNNMatrixSize( + paddle::framework::vectorize2int(in->dims()), + paddle::framework::vectorize2int(w->dims())))) { + with_bias_ = bias; + } + + mkldnn::memory::desc dst() const { + return platform::MKLDNNMemDesc({sz_->mb, sz_->oc}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::nc); + } + + mkldnn::memory::desc src() const { + return sz_->is_spatial() + ? platform::MKLDNNMemDesc({sz_->mb, sz_->ic, sz_->h, sz_->w}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::nchw) + : platform::MKLDNNMemDesc({sz_->mb, sz_->ic}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::nc); + } + + mkldnn::memory::desc weights() const { + return sz_->is_spatial() + ? platform::MKLDNNMemDesc({sz_->oc, sz_->ic, sz_->h, sz_->w}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw) + : platform::MKLDNNMemDesc({sz_->oc, sz_->ic}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::oi); + } + + mkldnn::memory::desc bias() const { + return with_bias_ + ? 
platform::MKLDNNMemDesc({sz_->oc}, + mkldnn::memory::data_type::f32, + mkldnn::memory::format::format_undef) + : platform::MKLDNNMemDesc({}, mkldnn::memory::data_type::f32, + mkldnn::memory::format::format_undef); + } + + private: + std::unique_ptr sz_; + bool with_bias_; +}; + +class MKLDNNMemory { + public: + MKLDNNMemory(MKLDNNMD* t, const mkldnn::engine& e) + : md_{t}, engine_{e} {} + virtual ~MKLDNNMemory() = default; + + template + mkldnn::memory dst(const Output* out) { + return mkldnn::memory({md_->dst(), engine_}, + static_cast(const_cast(out))); + } + + template + mkldnn::memory dst(Output* out) { + return mkldnn::memory({md_->dst(), engine_}, out); + } + + template + mkldnn::memory src(const Input* in) { + return mkldnn::memory({md_->src(), engine_}, + static_cast(const_cast(in))); + } + + template + mkldnn::memory weights(const Weight* w) { + return mkldnn::memory({md_->weights(), engine_}, + static_cast(const_cast(w))); + } + + mkldnn::memory bias() { + return mkldnn::memory(mkldnn::memory::primitive_desc(md_->bias(), engine_)); + } + + private: + MKLDNNMD* md_; + const mkldnn::engine& engine_; +}; + +template +class FCMKLDNNOpKernel : public paddle::framework::OpKernel { + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto input = ctx.Input("Input"); + auto w = ctx.Input("W"); + + PADDLE_ENFORCE(input->dims().size() == 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(w->dims().size() == 2, + "Weights must be with 2 dimensions, i.e. NC"); + + bool with_bias = ctx.Attr("with_bias"); + MKLDNNMD md(input, w, with_bias); + + std::shared_ptr pd = + FcFwdPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(), + with_bias, mkldnn_engine); + + const std::string key = ctx.op().Output("Out"); + const std::string key_fc_pd = key + "@fc_pd"; + + dev_ctx.SetBlob(key_fc_pd, pd); + + MKLDNNMemory mem(&md, mkldnn_engine); + + const T* input_data = input->data(); + const T* w_data = w->data(); + + auto output = ctx.Output("Out"); + T* output_data = output->mutable_data(ctx.GetPlace()); + + auto dst_memory = mem.dst(output_data); + auto src_memory = mem.src(input_data); + auto weights_memory = mem.weights(w_data); + auto bias_memory = mem.bias(); + + auto forward = with_bias ? mkldnn::inner_product_forward( + *pd, src_memory, weights_memory, bias_memory, + dst_memory) + : mkldnn::inner_product_forward( + *pd, src_memory, weights_memory, dst_memory); + + std::vector pipeline = {forward}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + private: + std::unique_ptr + FcFwdPrimitiveDesc(const mkldnn::memory::desc& src, + const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& dst, + const mkldnn::memory::desc& bias, const bool with_bias, + const mkldnn::engine& engine) const { + auto desc = with_bias + ? 
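+        // With a bias: desc(src, weights, bias, dst); without: desc(src, weights, dst).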
mkldnn::inner_product_forward::desc( + mkldnn::prop_kind::forward, src, weights, bias, dst) + : mkldnn::inner_product_forward::desc( + mkldnn::prop_kind::forward, src, weights, dst); + + auto pd = new mkldnn::inner_product_forward::primitive_desc(desc, engine); + return std::unique_ptr(pd); + } +}; + +template +class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + T* input_grad_data = nullptr; + T* w_grad_data = nullptr; + + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* w_grad = ctx.Output(framework::GradVarName("W")); + + if (input_grad) { + input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + } + if (w_grad) { + w_grad_data = w_grad->mutable_data(ctx.GetPlace()); + } + + const Tensor* input = ctx.Input("Input"); + const T* input_data = input->data(); + + const Tensor* w = ctx.Input("W"); + const T* w_data = w->data(); + + const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); + const T* out_grad_data = out_grad->data(); + + bool with_bias = ctx.Attr("with_bias"); + + MKLDNNMD md(input, w, with_bias); + MKLDNNMemory mem(&md, mkldnn_engine); + + auto dst_memory = mem.dst(out_grad_data); + auto src_memory = mem.src(input_data); + auto weights_memory = mem.weights(w_data); + auto bias_memory = mem.bias(); + + const std::string key = ctx.op().Input("Out"); + const std::string key_fc_pd = key + "@fc_pd"; + + auto pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_fc_pd)); + + PADDLE_ENFORCE(pd != nullptr, "Fail to find key_fc_pd in device context"); + + if (w_grad) { + auto weights_grad_memory = mem.weights(w_grad_data); + + mkldnn::inner_product_backward_weights::primitive_desc bwd_weight_pd = + FcBwdWeightsPrimitiveDesc(md.src(), md.weights(), md.dst(), md.bias(), + with_bias, *pd, mkldnn_engine); + + auto bwd_weights_prim = mkldnn::inner_product_backward_weights( + bwd_weight_pd, src_memory, dst_memory, weights_grad_memory, + bias_memory); + + std::vector pipeline{bwd_weights_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + if (input_grad) { + auto src_grad_memory = mem.src(input_grad_data); + + mkldnn::inner_product_backward_data::primitive_desc bwd_data_pd = + FcBwdDataPrimitiveDesc(md.src(), md.weights(), md.dst(), *pd, + mkldnn_engine); + + auto bwd_data_prim = mkldnn::inner_product_backward_data( + bwd_data_pd, dst_memory, weights_memory, src_grad_memory); + + std::vector pipeline{bwd_data_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + } + + private: + mkldnn::inner_product_backward_weights::primitive_desc + FcBwdWeightsPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, + const mkldnn::memory::desc& diff_dst, const mkldnn::memory::desc& bias, + const bool with_bias, + const mkldnn::inner_product_forward::primitive_desc& pd, + const mkldnn::engine& engine) const { + auto bwd_weight_desc = with_bias + ? 
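+        // Backward-weights descriptor; as written, both branches below pass the bias md.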
mkldnn::inner_product_backward_weights::desc( + src, diff_weights, bias, diff_dst) + : mkldnn::inner_product_backward_weights::desc( + src, diff_weights, bias, diff_dst); + + return mkldnn::inner_product_backward_weights::primitive_desc( + bwd_weight_desc, engine, pd); + } + + mkldnn::inner_product_backward_data::primitive_desc FcBwdDataPrimitiveDesc( + const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& diff_dst, + const mkldnn::inner_product_forward::primitive_desc& pd, + const mkldnn::engine& engine) const { + auto bwd_data_desc = + mkldnn::inner_product_backward_data::desc(diff_src, weights, diff_dst); + return mkldnn::inner_product_backward_data::primitive_desc(bwd_data_desc, + engine, pd); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad, + paddle::operators::FCOpGrad); + +REGISTER_OP_KERNEL(fc, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::FCMKLDNNOpKernel); + +REGISTER_OP_KERNEL(fc_grad, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::FCMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/fc_mkldnn_op.h b/paddle/fluid/operators/fc_mkldnn_op.h new file mode 100644 index 0000000000..9e6c66491d --- /dev/null +++ b/paddle/fluid/operators/fc_mkldnn_op.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FCOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +class FCOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override; +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 3d13133bf2..bfae205bcf 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -86,6 +86,7 @@ def fc(input, param_attr=None, bias_attr=None, use_mkldnn=False, + with_bias=False, act=None, name=None): """ @@ -133,6 +134,8 @@ def fc(input, bias_attr (ParamAttr|list of ParamAttr, default None): The parameter attribute for the bias of this layer. If it is set to None, no bias will be added to the output units. act (str, default None): Activation to be applied to the output of this layer. + use_mkldnn(bool): Use mkldnn kernel or not, it is valid only when the mkldnn + library is installed. 
Default: False
     name (str, default None): The name of this layer.
 
 Returns:
@@ -162,16 +165,25 @@ def fc(input,
         w = helper.create_parameter(
             attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
         tmp = helper.create_tmp_variable(dtype)
-        helper.append_op(
-            type="mul",
-            inputs={"X": input_var,
-                    "Y": w},
-            outputs={"Out": tmp},
-            attrs={
-                "x_num_col_dims": num_flatten_dims,
-                "y_num_col_dims": 1,
-                'use_mkldnn': use_mkldnn
-            })
+        if use_mkldnn == False:
+            helper.append_op(
+                type="mul",
+                inputs={"X": input_var,
+                        "Y": w},
+                outputs={"Out": tmp},
+                attrs={
+                    "x_num_col_dims": num_flatten_dims,
+                    "y_num_col_dims": 1,
+                    'use_mkldnn': use_mkldnn
+                })
+        else:
+            helper.append_op(
+                type="fc",
+                inputs={"Input": input_var,
+                        "W": w},
+                outputs={"Out": tmp},
+                attrs={"use_mkldnn": use_mkldnn,
+                       "with_bias": with_bias})
         mul_results.append(tmp)
 
     # sum
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py
new file mode 100644
index 0000000000..3f547f3c48
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def fully_connected_naive(input, weights, bias_data=None):
+    in_n, in_c, in_h, in_w = input.shape
+    w_h, w_c = weights.shape
+
+    x_data = np.reshape(input, [in_n, in_c * in_h * in_w])
+    # Interpret the (ic * h * w, oc) buffer the way the MKLDNN kernel's
+    # oihw/oi memory descriptor does before multiplying.
+    w_data = np.transpose(np.reshape(weights, (w_c, in_c * in_h * in_w)))
+    result = None
+
+    if bias_data is None:
+        result = np.dot(x_data, w_data)
+    else:
+        result = np.dot(x_data, w_data) + bias_data
+
+    return result
+
+
+class MatrixGenerate:
+    def __init__(self, mb, ic, oc, h, w):
+        self.input = np.random.random((mb, ic, h, w)).astype("float32")
+        self.weights = np.random.random((ic * h * w, oc)).astype("float32")
+
+
+class TestFCMKLDNNOp(OpTest):
+    def setUp(self):
+        self.op_type = "fc"
+        self.use_mkldnn = True
+        self.with_bias = True
+        self.matrix = MatrixGenerate(1, 10, 15, 3, 3)
+        # Let subclasses adjust the configuration before the inputs,
+        # attributes and expected outputs are materialized.
+        self.init_op_type()
+
+        self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}
+
+        self.attrs = {
+            'use_mkldnn': self.use_mkldnn,
+            'with_bias': self.with_bias
+        }
+
+        self.outputs = {
+            'Out': fully_connected_naive(self.matrix.input,
+                                         self.matrix.weights)
+        }
+
+    def init_op_type(self):
+        pass
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(set(['Input', 'W']), 'Out', max_relative_error=0.9)
+
+    def test_check_grad_no_weight(self):
+        self.check_grad(
+            ['Input'], 'Out', max_relative_error=0.5, no_grad_set=set('W'))
+
+
+class TestFCMKLDNNOp1(TestFCMKLDNNOp):
+    def init_op_type(self):
+        self.matrix = MatrixGenerate(2, 15, 48, 2, 2)
+
+
+class TestFCMKLDNNOp2(TestFCMKLDNNOp):
+    def init_op_type(self):
+        self.matrix = MatrixGenerate(2, 32, 40, 1, 1)
+
+
+class TestFCMKLDNNOp3(TestFCMKLDNNOp):
+    def init_op_type(self):
+        self.matrix = MatrixGenerate(2, 2, 4, 1, 1)
+
+
+class TestFCMKLDNNOp4(TestFCMKLDNNOp):
+    def init_op_type(self):
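+        # Bias-free case with a larger output dimension.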
+        self.with_bias = False
+        self.matrix = MatrixGenerate(2, 32, 48, 2, 2)
+
+
+class TestFCMKLDNNOp5(TestFCMKLDNNOp):
+    def init_op_type(self):
+        self.with_bias = False
+        self.matrix = MatrixGenerate(2, 32, 1000, 6, 6)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 34a8084328921d4d043fc3c8308063d38087e62f Mon Sep 17 00:00:00 2001
From: mozga-intel
Date: Thu, 29 Mar 2018 20:31:59 +0200
Subject: [PATCH 2/4] Added new fc files, register fc kernel

---
 paddle/fluid/operators/CMakeLists.txt     |  29 +----
 paddle/fluid/operators/fc_mkldnn_op.cc    | 108 +---------------
 paddle/fluid/operators/fc_op.cc           | 122 ++++++++++++++++++
 .../operators/{fc_mkldnn_op.h => fc_op.h} |   5 +
 python/paddle/fluid/layers/nn.py          |  67 ++++++----
 5 files changed, 178 insertions(+), 153 deletions(-)
 create mode 100644 paddle/fluid/operators/fc_op.cc
 rename paddle/fluid/operators/{fc_mkldnn_op.h => fc_op.h} (91%)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 6c79998f07..9ed79453b9 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -1,14 +1,6 @@
 file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
+string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
 string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
-if(WITH_MKLDNN)
-  string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
-else()
-  foreach(item ${GENERAL_OPS})
-    if(${item} MATCHES ".*_mkldnn_op")
-      list(REMOVE_ITEM GENERAL_OPS ${item})
-    endif()
-  endforeach(item)
-endif()
 list(REMOVE_DUPLICATES GENERAL_OPS)
 set(DEPS_OPS "")
 set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
@@ -88,12 +80,7 @@ function(op_library TARGET)
   endif()
 
   list(LENGTH cc_srcs cc_srcs_len)
-  if(WITH_MKLDNN)
-    list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
-    if (${cc_srcs_len} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0)
-      message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
-    endif()
-  elseif(${cc_srcs_len} EQUAL 0)
+  if (${cc_srcs_len} EQUAL 0)
     message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
   endif()
 
@@ -122,16 +109,7 @@ function(op_library TARGET)
   # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
   # Note that it's enough to just adding one operator to pybind in a *_op.cc file.
   # And for detail pybind information, please see generated paddle/pybind/pybind.h.
-  # This replacing is needed, when the CPU's kernel doesn't exist.
- string(REPLACE "_op" "_mkldnn_op" target_mkldnn_file "${TARGET}") - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) - file(READ ${TARGET}.cc TARGET_CONTENT) - elseif(WITH_MKLDNN AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${target_mkldnn_file}.cc) - file(READ ${target_mkldnn_file}.cc TARGET_CONTENT) - else() - message(FATAL_ERROR "Cannot read the ${TARGET} file from ${CMAKE_CURRENT_SOURCE_DIR}") - endif() - + file(READ ${TARGET}.cc TARGET_CONTENT) string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}") string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}") if (one_register STREQUAL "") @@ -246,6 +224,7 @@ op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) + if (WITH_GPU) op_library(conv_op DEPS vol2col depthwise_conv im2col) else() diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc index 48655d36fc..3e006189ef 100644 --- a/paddle/fluid/operators/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/fc_mkldnn_op.cc @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/fc_mkldnn_op.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/fc_op.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" @@ -23,105 +23,12 @@ namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; -void FCOp::InferShape(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(ctx->HasInput("Input"), - "X(Input) of Fully Connected should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Out(Output) of Fully Connected should not be null."); - PADDLE_ENFORCE(ctx->HasInput("W"), - "W(Input) of Fully Connected should not be null."); - - auto in_dims = ctx->GetInputDim("Input"); - auto w_dims = ctx->GetInputDim("W"); - std::vector output_shape({in_dims[0], w_dims[1]}); - - PADDLE_ENFORCE(in_dims.size() == 4, - "Fully Connected input should be 4-D tensor."); - - PADDLE_ENFORCE(w_dims.size() == 2, - "Fully Connected input should be 2-D tensor."); - - ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); - ctx->ShareLoD("Input", "Out"); -} - -framework::OpKernelType FCOp::GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - framework::LibraryType library{framework::LibraryType::kMKLDNN}; - - std::string data_format = ctx.Attr("data_format"); - framework::DataLayout layout = framework::StringToDataLayout(data_format); - - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout, library); -} - -void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { - auto in_dims = ctx->GetInputDim("Input"); - auto w_dims = ctx->GetInputDim("W"); - - if (ctx->HasOutput(framework::GradVarName("Input"))) { - ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); - } - if (ctx->HasOutput(framework::GradVarName("W"))) { - ctx->SetOutputDim(framework::GradVarName("W"), w_dims); - } -} - -framework::OpKernelType FCOpGrad::GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - framework::LibraryType library{framework::LibraryType::kMKLDNN}; - - std::string data_format = ctx.Attr("data_format"); - 
framework::DataLayout layout = framework::StringToDataLayout(data_format); - - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout, library); -} - -class FCOpMaker : public framework::OpProtoAndCheckerMaker { - public: - FCOpMaker(OpProto* proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput( - "Input", - "(Tensor) The input tensor of fully connected operator. " - "The format of input tensor is NCHW, where N is batch size, C is the " - "number of channels, H is the height of the feature, " - "and W is the width of the feature."); - AddInput("W", "(Tensor), The second input tensor of fc op."); - AddOutput("Out", - "(Tensor) The output tensor of pooling operator. " - "The format of output tensor is also NCHW, " - "where N is batch size, C is the number of channels, " - "H is the height of the feature, " - "and W is the width of the feature."); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddAttr("with_bias", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout"); - AddComment(R"DOC( -)DOC"); - } -}; - struct MKLDNNMatrixSize final { explicit MKLDNNMatrixSize(const std::vector& in, const std::vector& w) : mb{in[0]}, ic{in[1]}, oc{w[1]}, h{in[2]}, w{in[3]} {} - bool is_spatial() const { return h > 1 && w > 1; } + bool is_spatial() const { return h > 2 && w > 2; } const int mb; const int ic; @@ -229,12 +136,12 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel { auto input = ctx.Input("Input"); auto w = ctx.Input("W"); - PADDLE_ENFORCE(input->dims().size() == 4, - "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 2, + "Input must be with 2 or 4 dimensions, i.e. NCHW"); PADDLE_ENFORCE(w->dims().size() == 2, "Weights must be with 2 dimensions, i.e. NC"); - bool with_bias = ctx.Attr("with_bias"); + bool with_bias = ctx.Attr("bias_attr"); MKLDNNMD md(input, w, with_bias); std::shared_ptr pd = @@ -319,7 +226,7 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel { const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); const T* out_grad_data = out_grad->data(); - bool with_bias = ctx.Attr("with_bias"); + bool with_bias = ctx.Attr("bias_attr"); MKLDNNMD md(input, w, with_bias); MKLDNNMemory mem(&md, mkldnn_engine); @@ -400,9 +307,6 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad, - paddle::operators::FCOpGrad); - REGISTER_OP_KERNEL(fc, MKLDNN, ::paddle::platform::CPUPlace, paddle::operators::FCMKLDNNOpKernel); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc new file mode 100644 index 0000000000..93b59854db --- /dev/null +++ b/paddle/fluid/operators/fc_op.cc @@ -0,0 +1,122 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fc_op.h" + +namespace paddle { +namespace operators { + +void FCOp::InferShape(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("Input"), + "X(Input) of Fully Connected should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Out(Output) of Fully Connected should not be null."); + PADDLE_ENFORCE(ctx->HasInput("W"), + "W(Input) of Fully Connected should not be null."); + + auto in_dims = ctx->GetInputDim("Input"); + auto w_dims = ctx->GetInputDim("W"); + std::vector output_shape({in_dims[0], w_dims[1]}); + + PADDLE_ENFORCE(in_dims.size() == 4, + "Fully Connected input should be 4-D tensor."); + + PADDLE_ENFORCE(w_dims.size() == 2, + "Fully Connected input should be 2-D tensor."); + + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("Input", "Out"); +} + +framework::OpKernelType FCOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library{framework::LibraryType::kMKLDNN}; + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout, library); +} + +void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const { + auto in_dims = ctx->GetInputDim("Input"); + auto w_dims = ctx->GetInputDim("W"); + + if (ctx->HasOutput(framework::GradVarName("Input"))) { + ctx->SetOutputDim(framework::GradVarName("Input"), in_dims); + } + if (ctx->HasOutput(framework::GradVarName("W"))) { + ctx->SetOutputDim(framework::GradVarName("W"), w_dims); + } +} + +framework::OpKernelType FCOpGrad::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library{framework::LibraryType::kMKLDNN}; + framework::DataLayout layout{framework::DataLayout::kAnyLayout}; + + return framework::OpKernelType( + framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), + layout, library); +} + +FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "Input", + "(Tensor) The input tensor of fully connected operator. " + "The format of input tensor is NCHW, where N is batch size, C is the " + "number of channels, H is the height of the feature, " + "and W is the width of the feature."); + AddInput("W", "(Tensor), The second input tensor of fc op."); + AddOutput("Out", + "(Tensor) The output tensor of fully connected operator. " + "The format of output tensor is also NCHW, " + "where N is batch size, C is the number of channels, " + "H is the height of the feature, " + "and W is the width of the feature."); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("bias_attr", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddComment(R"DOC( + Fully Connected Operator. + + The fully connected operation calculates the output based on the input, weights and bias attribute. + The size of each dimension of the parameters checked in the infer-shape. 
+ Input(Input) is NCHW or NC format. Where N is batch size, C is the number of channels, + H is the height of the feature, and W is the width of the feature. + Weights(W) is OIHW or OI format. Where H is the height of the feature, W is the width of the feature, + O is the height of output, and I is the number of channels. + Output(Out) is NC format. Where N is batch size, and C is the number of channels. + The matrix of bias is generated by the mkldnn framework, when the bias_attr is True. + Additional parametrs are use_mkldnn and bias_attr. + The input(X) size and output(Out) size may be diffrent. + +Example: + Input: + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Weight shape: $(O_{out}, I_{in}, H_{in}, W_{in})$ + Bias shape: $(O_{out})$ + Output: + Output shape: $(N, C_{out})$ +)DOC"); +} + +} // namespace operators +} // namespace paddle + +REGISTER_OP(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker, fc_grad, + paddle::operators::FCOpGrad); diff --git a/paddle/fluid/operators/fc_mkldnn_op.h b/paddle/fluid/operators/fc_op.h similarity index 91% rename from paddle/fluid/operators/fc_mkldnn_op.h rename to paddle/fluid/operators/fc_op.h index 9e6c66491d..70fa96440d 100644 --- a/paddle/fluid/operators/fc_mkldnn_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -43,5 +43,10 @@ class FCOpGrad : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override; }; +class FCOpMaker : public framework::OpProtoAndCheckerMaker { + public: + FCOpMaker(OpProto* proto, OpAttrChecker* op_checker); +}; + } // namespace operators } // namespace paddle diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index bfae205bcf..40809b304f 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -86,7 +86,6 @@ def fc(input, param_attr=None, bias_attr=None, use_mkldnn=False, - with_bias=False, act=None, name=None): """ @@ -156,16 +155,39 @@ def fc(input, dtype = helper.input_dtype() mul_results = [] - for input_var, param_attr in helper.iter_inputs_and_params(): - input_shape = input_var.shape + if use_mkldnn: + tmp = helper.create_tmp_variable(dtype) + input_shape = input.shape param_shape = [ reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) ] + [size] w = helper.create_parameter( - attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) - tmp = helper.create_tmp_variable(dtype) - if use_mkldnn == False: + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + is_bias=False) + bias_attr = False + if bias_attr is not None: + bias_attr = True + helper.append_op( + type="fc", + inputs={"Input": input, + "W": w}, + outputs={"Out": tmp}, + attrs={"use_mkldnn": use_mkldnn, + "bias_attr": bias_attr}) + return helper.append_activation(tmp) + else: + for input_var, param_attr in helper.iter_inputs_and_params(): + input_shape = input_var.shape + param_shape = [ + reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) + ] + [size] + + w = helper.create_parameter( + attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) + tmp = helper.create_tmp_variable(dtype) helper.append_op( type="mul", inputs={"X": input_var, @@ -174,29 +196,22 @@ def fc(input, attrs={ "x_num_col_dims": num_flatten_dims, "y_num_col_dims": 1, - 'use_mkldnn': use_mkldnn }) + mul_results.append(tmp) + + if len(mul_results) == 1: + pre_bias = mul_results[0] else: + pre_bias = helper.create_tmp_variable(dtype) helper.append_op( - type="fc", - inputs={"Input": input_var, - "W": w}, - outputs={"Out": tmp}, - 
attrs={"use_mkldnn": use_mkldnn, - "with_bias": with_bias}) - mul_results.append(tmp) - - # sum - if len(mul_results) == 1: - pre_bias = mul_results[0] - else: - pre_bias = helper.create_tmp_variable(dtype) - helper.append_op( - type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) - # add bias - pre_activation = helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) - # add activation - return helper.append_activation(pre_activation) + type="sum", + inputs={"X": mul_results}, + outputs={"Out": pre_bias}) + # add bias + pre_activation = helper.append_bias_op( + pre_bias, dim_start=num_flatten_dims) + # add activation + return helper.append_activation(pre_activation) def embedding(input, From 32f8ac7d3ba1c13ceb8e8bc186ed7c65e6099be6 Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Fri, 30 Mar 2018 20:09:33 +0200 Subject: [PATCH 3/4] Remove additional message --- paddle/fluid/operators/fc_mkldnn_op.cc | 41 ++++++++++---------------- paddle/fluid/operators/fc_op.cc | 17 ++--------- python/paddle/fluid/layers/nn.py | 5 ++-- 3 files changed, 21 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc index 3e006189ef..d0023713b4 100644 --- a/paddle/fluid/operators/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/fc_mkldnn_op.cc @@ -23,67 +23,56 @@ namespace operators { using paddle::framework::Tensor; using paddle::platform::MKLDNNDeviceContext; -struct MKLDNNMatrixSize final { - explicit MKLDNNMatrixSize(const std::vector& in, - const std::vector& w) - : mb{in[0]}, ic{in[1]}, oc{w[1]}, h{in[2]}, w{in[3]} {} - - bool is_spatial() const { return h > 2 && w > 2; } - - const int mb; - const int ic; - const int oc; - const int h, w; -}; - template class MKLDNNMD { public: explicit MKLDNNMD(const T* in, const T* w, bool bias) - : sz_(std::unique_ptr(new MKLDNNMatrixSize( - paddle::framework::vectorize2int(in->dims()), - paddle::framework::vectorize2int(w->dims())))) { + : in{paddle::framework::vectorize2int(in->dims())}, + w{paddle::framework::vectorize2int(w->dims())} { with_bias_ = bias; } mkldnn::memory::desc dst() const { - return platform::MKLDNNMemDesc({sz_->mb, sz_->oc}, + return platform::MKLDNNMemDesc({in[0], w[1]}, mkldnn::memory::data_type::f32, mkldnn::memory::format::nc); } mkldnn::memory::desc src() const { - return sz_->is_spatial() - ? platform::MKLDNNMemDesc({sz_->mb, sz_->ic, sz_->h, sz_->w}, + return is_spatial() + ? platform::MKLDNNMemDesc({in[0], in[1], in[2], in[3]}, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw) - : platform::MKLDNNMemDesc({sz_->mb, sz_->ic}, + : platform::MKLDNNMemDesc({in[0], in[1]}, mkldnn::memory::data_type::f32, mkldnn::memory::format::nc); } mkldnn::memory::desc weights() const { - return sz_->is_spatial() - ? platform::MKLDNNMemDesc({sz_->oc, sz_->ic, sz_->h, sz_->w}, + return is_spatial() + ? platform::MKLDNNMemDesc({w[1], in[1], in[2], in[3]}, mkldnn::memory::data_type::f32, mkldnn::memory::format::oihw) - : platform::MKLDNNMemDesc({sz_->oc, sz_->ic}, + : platform::MKLDNNMemDesc({w[1], in[1]}, mkldnn::memory::data_type::f32, mkldnn::memory::format::oi); } mkldnn::memory::desc bias() const { return with_bias_ - ? platform::MKLDNNMemDesc({sz_->oc}, - mkldnn::memory::data_type::f32, + ? 
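+               // w[1] is the output-channel count (oc) in the flattened 2-D weight dims.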
platform::MKLDNNMemDesc({w[1]}, mkldnn::memory::data_type::f32, mkldnn::memory::format::format_undef) : platform::MKLDNNMemDesc({}, mkldnn::memory::data_type::f32, mkldnn::memory::format::format_undef); } private: - std::unique_ptr sz_; + bool is_spatial() const { return in.size() > 1 && w.size() > 1; } + + std::vector in; + std::vector w; bool with_bias_; + bool is_spatial_; }; class MKLDNNMemory { diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 93b59854db..41f9c0b8a9 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -29,8 +29,8 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { auto w_dims = ctx->GetInputDim("W"); std::vector output_shape({in_dims[0], w_dims[1]}); - PADDLE_ENFORCE(in_dims.size() == 4, - "Fully Connected input should be 4-D tensor."); + PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 2, + "Fully Connected input should be 2-D or 4-D tensor."); PADDLE_ENFORCE(w_dims.size() == 2, "Fully Connected input should be 2-D tensor."); @@ -96,22 +96,11 @@ FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker) The fully connected operation calculates the output based on the input, weights and bias attribute. The size of each dimension of the parameters checked in the infer-shape. - Input(Input) is NCHW or NC format. Where N is batch size, C is the number of channels, - H is the height of the feature, and W is the width of the feature. - Weights(W) is OIHW or OI format. Where H is the height of the feature, W is the width of the feature, - O is the height of output, and I is the number of channels. - Output(Out) is NC format. Where N is batch size, and C is the number of channels. The matrix of bias is generated by the mkldnn framework, when the bias_attr is True. Additional parametrs are use_mkldnn and bias_attr. The input(X) size and output(Out) size may be diffrent. -Example: - Input: - Input shape: $(N, C_{in}, H_{in}, W_{in})$ - Weight shape: $(O_{out}, I_{in}, H_{in}, W_{in})$ - Bias shape: $(O_{out})$ - Output: - Output shape: $(N, C_{out})$ + The fully connected layer only supports MKLDNN version )DOC"); } diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 40809b304f..d2e7d58524 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -167,8 +167,9 @@ def fc(input, shape=param_shape, dtype=dtype, is_bias=False) - bias_attr = False - if bias_attr is not None: + if bias_attr is None or bias_attr is False: + bias_attr = False + else: bias_attr = True helper.append_op( type="fc", From 46e14bbcbb046780f62aa0a41a9ce083ca9677bc Mon Sep 17 00:00:00 2001 From: mozga-intel Date: Tue, 3 Apr 2018 14:57:37 +0200 Subject: [PATCH 4/4] Enforce: 2 and 4 dims, remove information about out in format --- paddle/fluid/operators/fc_mkldnn_op.cc | 6 +++--- paddle/fluid/operators/fc_op.cc | 21 ++++++--------------- 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/fc_mkldnn_op.cc b/paddle/fluid/operators/fc_mkldnn_op.cc index d0023713b4..9c704a2949 100644 --- a/paddle/fluid/operators/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/fc_mkldnn_op.cc @@ -125,10 +125,10 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel { auto input = ctx.Input("Input"); auto w = ctx.Input("W"); - PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 2, + PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4, "Input must be with 2 or 4 dimensions, i.e. 
NCHW"); - PADDLE_ENFORCE(w->dims().size() == 2, - "Weights must be with 2 dimensions, i.e. NC"); + PADDLE_ENFORCE(w->dims().size() == 2 || w->dims().size() == 4, + "Weights must be with 2 or 4 dimensions, i.e. OI or OIHW"); bool with_bias = ctx.Attr("bias_attr"); MKLDNNMD md(input, w, with_bias); diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 41f9c0b8a9..381771f157 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" +#include namespace paddle { namespace operators { @@ -29,11 +30,11 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { auto w_dims = ctx->GetInputDim("W"); std::vector output_shape({in_dims[0], w_dims[1]}); - PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 2, + PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, "Fully Connected input should be 2-D or 4-D tensor."); - PADDLE_ENFORCE(w_dims.size() == 2, - "Fully Connected input should be 2-D tensor."); + PADDLE_ENFORCE(w_dims.size() == 2 || w_dims.size() == 4, + "Fully Connected input should be 2-D or 4-D tensor."); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->ShareLoD("Input", "Out"); @@ -73,19 +74,9 @@ framework::OpKernelType FCOpGrad::GetExpectedKernelType( FCOpMaker::FCOpMaker(OpProto* proto, OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput( - "Input", - "(Tensor) The input tensor of fully connected operator. " - "The format of input tensor is NCHW, where N is batch size, C is the " - "number of channels, H is the height of the feature, " - "and W is the width of the feature."); + AddInput("Input", "(Tensor) The input tensor of fully connected operator. "); AddInput("W", "(Tensor), The second input tensor of fc op."); - AddOutput("Out", - "(Tensor) The output tensor of fully connected operator. " - "The format of output tensor is also NCHW, " - "where N is batch size, C is the number of channels, " - "H is the height of the feature, " - "and W is the width of the feature."); + AddOutput("Out", "(Tensor) The output tensor of fully connected operator. "); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false);
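
A note on the gradient math behind the MKLDNN backward kernels registered in this series: once the input is flattened to two dimensions, both gradients are plain matrix products. Below is a minimal numpy sketch of that math, assuming the weights act as a plain (C*H*W, OC) matrix; it deliberately omits the MKLDNN-specific weight-buffer reinterpretation used by fully_connected_naive in the test file, and the helper name is illustrative rather than taken from the patch.

import numpy as np


def fc_backward_naive(x, w, d_out):
    # x: (N, C, H, W) input, w: (C*H*W, OC) weights, d_out: (N, OC)
    # gradient of the output, matching the forward out = x2d.dot(w).
    n = x.shape[0]
    x2d = x.reshape(n, -1)                 # flatten input to (N, C*H*W)
    d_w = x2d.T.dot(d_out)                 # weight gradient, (C*H*W, OC)
    d_x = d_out.dot(w.T).reshape(x.shape)  # input gradient, back to NCHW
    return d_x, d_w


# Shape self-check against the forward definition.
x = np.random.rand(2, 3, 4, 4).astype("float32")
w = np.random.rand(3 * 4 * 4, 5).astype("float32")
d_out = np.ones((2, 5), dtype="float32")
d_x, d_w = fc_backward_naive(x, w, d_out)
assert d_x.shape == x.shape and d_w.shape == w.shape

When with_bias is set, the bias gradient is additionally the column sum d_out.sum(axis=0), which corresponds to the diff_bias output of mkldnn::inner_product_backward_weights.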