From b969116988a793718c9cce0bbe98bf84c0215412 Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Fri, 16 Nov 2018 07:49:36 +0000
Subject: [PATCH 01/13] fxi avg pool trt bug and fix cpplint

---
 .../inference/tensorrt/convert/CMakeLists.txt |   2 +-
 .../inference/tensorrt/convert/pool2d_op.cc   | 146 +++++++++++-------
 .../tensorrt/convert/test_pool2d_op.cc        |  16 +-
 .../inference/tensorrt/plugin/CMakeLists.txt  |   3 +-
 .../tensorrt/plugin/avg_pool_op_plugin.cu     |  62 ++++++++
 .../tensorrt/plugin/avg_pool_op_plugin.h      | 109 +++++++++++++
 .../tensorrt/plugin/split_op_plugin.cu        |   7 +-
 .../tensorrt/plugin/split_op_plugin.h         |  27 ++--
 paddle/fluid/operators/math/pooling.cu        |  36 +++++
 paddle/fluid/operators/math/pooling.h         |  12 ++
 10 files changed, 339 insertions(+), 81 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
index ed4c398cee..396ba510c8 100644
--- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -18,7 +18,7 @@ nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc
 nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine conv_op SERIAL)
 nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc
-        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op SERIAL)
+        DEPS ${FLUID_CORE_MODULES} tensorrt_engine pool_op  tensorrt_plugin SERIAL)
 nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc
         DEPS ${FLUID_CORE_MODULES} tensorrt_engine elementwise_add_op SERIAL)
 nv_test(test_trt_softmax_op SRCS test_softmax_op.cc softmax_op.cc
diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index 4885002084..db8e7f8438 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -13,25 +13,57 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+void DealCeilMode(const nvinfer1::Dims &input_shape, std::vector<int> ksize,
+                  std::vector<int> strides, std::vector<int> paddings,
+                  nvinfer1::DimsHW *pre_pad, nvinfer1::DimsHW *post_pad,
+                  int input_dims) {
+  int input_height = input_shape.d[input_dims - 2];
+  int input_width = input_shape.d[input_dims - 1];
+  int floor_h_output_size =
+      (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+  int ceil_h_output_size =
+      (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
+          strides[0] +
+      1;
+
+  int floor_w_output_size =
+      (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+  int ceil_w_output_size =
+      (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) / strides[1] +
+      1;
+  if (floor_h_output_size != ceil_h_output_size) {
+    post_pad->h() = strides[0] - 1;
+  }
+
+  if (floor_w_output_size != ceil_w_output_size) {
+    post_pad->w() = strides[1] - 1;
+  }
+}
+
 /*
  * Pool2dOp, IPoolingLayer in TRT. This Layer doesn't has weights.
  */
 class Pool2dOpConverter : public OpConverter {
  public:
-  void operator()(const framework::proto::OpDesc& op,
-                  const framework::Scope& scope, bool test_mode) override {
-    VLOG(3)
+  void operator()(const framework::proto::OpDesc &op,
+                  const framework::Scope &scope, bool test_mode) override {
+    VLOG(40)
         << "convert a fluid pool2d op to tensorrt pool2d layer without bias";
     framework::OpDesc op_desc(op, nullptr);
     // Declare inputs
     PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
     PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
-    auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]);
+    nvinfer1::Dims input_shape = input1->getDimensions();
+    int input_dims = input_shape.nbDims;
+
+    PADDLE_ENFORCE_EQ(input_dims, 3UL);
 
     bool global_pooling = boost::get<bool>(op_desc.GetAttr("global_pooling"));
     std::string pool_type =
@@ -44,23 +76,6 @@ class Pool2dOpConverter : public OpConverter {
         boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
     bool ceil_mode = boost::get<bool>(op_desc.GetAttr("ceil_mode"));
 
-    nvinfer1::Dims input_shape = input1->getDimensions();
-    int nbDims = input_shape.nbDims;
-    nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
-    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
-    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
-
-    if (global_pooling == true) {
-      nv_ksize.d[0] = input_shape.d[nbDims - 2];
-      nv_ksize.d[1] = input_shape.d[nbDims - 1];
-      nv_strides.h() = 1;
-      nv_strides.w() = 1;
-      nv_paddings.h() = 0;
-      nv_paddings.w() = 0;
-    }
-
-    PADDLE_ENFORCE_EQ(input1->getDimensions().nbDims, 3UL);
-
     nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX;
     if (pool_type == "max") {
       nv_pool_type = nvinfer1::PoolingType::kMAX;
@@ -70,48 +85,71 @@ class Pool2dOpConverter : public OpConverter {
       PADDLE_THROW("TensorRT unsupported pooling type!");
     }
 
-    if (ceil_mode) {
-      nvinfer1::DimsHW pre_pad(0, 0);
-      nvinfer1::DimsHW post_pad(0, 0);
-      int input_height = input_shape.d[nbDims - 2];
-      int input_width = input_shape.d[nbDims - 1];
-      int floor_h_output_size =
-          (input_height - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
-      int ceil_h_output_size =
-          (input_height - ksize[0] + 2 * paddings[0] + strides[0] - 1) /
-              strides[0] +
-          1;
-
-      int floor_w_output_size =
-          (input_width - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
-      int ceil_w_output_size =
-          (input_width - ksize[1] + 2 * paddings[1] + strides[1] - 1) /
-              strides[1] +
-          1;
-      if (floor_h_output_size != ceil_h_output_size) {
-        post_pad.h() = strides[0] - 1;
+    nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]);
+    nvinfer1::DimsHW nv_strides(strides[0], strides[1]);
+    nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]);
+
+    nvinfer1::ILayer *layer = nullptr;
+
+    if (global_pooling == true) {
+      nv_ksize.d[0] = input_shape.d[input_dims - 2];
+      nv_ksize.d[1] = input_shape.d[input_dims - 1];
+      auto *layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Pooling, *const_cast<nvinfer1::ITensor *>(input1),
+          nv_pool_type, nv_ksize);
+      PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created.");
+      auto output_name = op_desc.Output("Out")[0];
+      layer->setName(("pool2d (Output: " + output_name + ")").c_str());
+      layer->getOutput(0)->setName(output_name.c_str());
+      engine_->SetITensor(output_name, layer->getOutput(0));
+      if (test_mode ||
+          output_name == "patch_6_pool1.avg_pool.output.1.tmp_0702") {
+        engine_->DeclareOutput(output_name);
       }
+      return;
+    }
 
-      if (floor_w_output_size != ceil_w_output_size) {
-        post_pad.w() = strides[1] - 1;
+    if (pool_type == "max") {
+      nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]);
+      nvinfer1::DimsHW post_pad(paddings[0], paddings[1]);
+      if (ceil_mode) {
+        // If ceil mode is true, we will pad the appropriate size to the input.
+        DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad,
+                     input_dims);
+        auto *pad_layer = TRT_ENGINE_ADD_LAYER(
+            engine_, Padding, *const_cast<nvinfer1::ITensor *>(input1), pre_pad,
+            post_pad);
+        PADDLE_ENFORCE_NOT_NULL(
+            pad_layer, "pad layer in poolOp converter could not be created.");
+        input1 = pad_layer->getOutput(0);
+      }
+      auto *pool_layer = TRT_ENGINE_ADD_LAYER(
+          engine_, Pooling, *const_cast<nvinfer1::ITensor *>(input1),
+          nv_pool_type, nv_ksize);
+      PADDLE_ENFORCE_NOT_NULL(pool_layer, "pool layer could not be created.");
+      pool_layer->setStride(nv_strides);
+      pool_layer->setPadding(nv_paddings);
+      layer = pool_layer;
+    } else {
+      // Average pooling needs to exclude the padding pixels from the average
+      // mean.
+      // It is not supported well by TRT, we use a plugin here.
+      std::vector<int> input_shape_v;
+      for (int i = 0; i < input_dims; i++) {
+        input_shape_v.push_back(input_shape.d[i]);
       }
-      auto* layer = TRT_ENGINE_ADD_LAYER(
-          engine_, Padding, *const_cast<nvinfer1::ITensor*>(input1), pre_pad,
-          post_pad);
-      input1 = layer->getOutput(0);
+      AvgPoolPlugin *plugin =
+          new AvgPoolPlugin(ceil_mode, ksize, strides, paddings, input_shape_v);
+      auto *avg_pool_layer = engine_->AddPlugin(&input1, 1, plugin);
+      layer = avg_pool_layer;
     }
-    auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling,
-                                       *const_cast<nvinfer1::ITensor*>(input1),
-                                       nv_pool_type, nv_ksize);
-    PADDLE_ENFORCE_NOT_NULL(layer, "pool layer could not be created.");
-    layer->setStride(nv_strides);
-    layer->setPadding(nv_paddings);
 
     auto output_name = op_desc.Output("Out")[0];
     layer->setName(("pool2d (Output: " + output_name + ")").c_str());
     layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
-    if (test_mode) {
+    if (test_mode ||
+        output_name == "patch_6_pool1.avg_pool.output.1.tmp_0702") {
       engine_->DeclareOutput(output_name);
     }
   }
diff --git a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
index ee597f8465..bded833505 100644
--- a/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/test_pool2d_op.cc
@@ -20,20 +20,21 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-void test_pool2d(bool global_pooling, bool ceil_mode) {
+void test_pool2d(bool global_pooling, bool ceil_mode,
+                 std::string pool_type = "max") {
   framework::Scope scope;
   std::unordered_set<std::string> parameters;
   TRTConvertValidation validator(5, parameters, scope, 1 << 15);
 
   // The ITensor's Dims should not contain the batch size.
   // So, the ITensor's Dims of input and output should be C * H * W.
-  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 13, 14));
+  validator.DeclInputVar("pool2d-X", nvinfer1::Dims3(3, 6, 7));
   if (global_pooling)
     validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 1, 1));
   else if (ceil_mode)
-    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 7));
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 4));
   else
-    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 6, 6));
+    validator.DeclOutputVar("pool2d-Out", nvinfer1::Dims3(3, 3, 3));
 
   // Prepare Op description
   framework::OpDesc desc;
@@ -41,10 +42,10 @@ void test_pool2d(bool global_pooling, bool ceil_mode) {
   desc.SetInput("X", {"pool2d-X"});
   desc.SetOutput("Out", {"pool2d-Out"});
 
-  std::vector<int> ksize({3, 3});
+  std::vector<int> ksize({2, 2});
   std::vector<int> strides({2, 2});
   std::vector<int> paddings({0, 0});
-  std::string pooling_t = "max";
+  std::string pooling_t = pool_type;
 
   desc.SetAttr("pooling_type", pooling_t);
   desc.SetAttr("ksize", ksize);
@@ -63,7 +64,8 @@ void test_pool2d(bool global_pooling, bool ceil_mode) {
 TEST(Pool2dOpConverter, normal) { test_pool2d(false, false); }
 TEST(Pool2dOpConverter, test_global_pooling) { test_pool2d(true, false); }
 
-TEST(Pool2dOpConverter, test_ceil_mode) { test_pool2d(false, true); }
+TEST(Pool2dOpConverter, max_ceil_test) { test_pool2d(false, true); }
+TEST(Pool2dOpConverter, avg_ceil_test) { test_pool2d(false, true, "avg"); }
 
 }  // namespace tensorrt
 }  // namespace inference
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 71b7a55161..c246f8341f 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -1 +1,2 @@
-nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu DEPS enforce)
+nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu
+avg_pool_op_plugin.cu DEPS enforce pooling)
diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
new file mode 100644
index 0000000000..e440f1c313
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
@@ -0,0 +1,62 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
+#include "paddle/fluid/operators/math/pooling.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
+    int index, const nvinfer1::Dims* inputDims, int nbInputs) {
+  assert(nbInputs == 1);
+  assert(index == 0);
+  assert(inputDims[0].nbDims == 3);
+  nvinfer1::Dims const& input_dims = inputDims[0];
+
+  nvinfer1::Dims output_dims = input_dims;
+
+  output_dims.d[1] = output_shape_[1];
+  output_dims.d[2] = output_shape_[2];
+  return output_dims;
+}
+
+int AvgPoolPlugin::enqueue(int batchSize, const void* const* inputs,
+                           void** outputs, void* workspace,
+                           cudaStream_t stream) {
+  auto const& input_dims = this->getInputDims(0);
+  int input_size = 0;
+  float const* idata = reinterpret_cast<float const*>(inputs[0]);
+  float** odatas = reinterpret_cast<float**>(outputs);
+
+  paddle::operators::math::AvgPool<float> pool_process;
+  paddle::operators::math::Pool2dDirectCUDAFunctor<
+      paddle::operators::math::AvgPool<float>, float>
+      pool2d_forward;
+
+  std::vector<int> input_shape = input_shape_;
+  std::vector<int> output_shape = output_shape_;
+  input_shape.insert(input_shape.begin(), batchSize);
+  output_shape.insert(output_shape.begin(), batchSize);
+
+  pool2d_forward(idata, input_shape, output_shape, ksize_, strides_, paddings_,
+                 pool_process, true, odatas[0], stream);
+
+  return cudaGetLastError() != cudaSuccess;
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
new file mode 100644
index 0000000000..e83fd38858
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
@@ -0,0 +1,109 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <cassert>
+#include <vector>
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+class AvgPoolPlugin : public PluginTensorRT {
+ private:
+  bool ceil_mode_;
+  std::vector<int> ksize_;
+  std::vector<int> strides_;
+  std::vector<int> paddings_;
+  std::vector<int> input_shape_;
+  std::vector<int> output_shape_;
+
+ protected:
+  size_t getSerializationSize() override {
+    return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
+           SerializedSize(strides_) + SerializedSize(paddings_) +
+           SerializedSize(input_shape_) + getBaseSerializationSize();
+  }
+
+  // TRT will call this func when we need to serialize the configuration of
+  // tensorrt.
+  // It should not be called by users.
+  void serialize(void *buffer) override {
+    serializeBase(buffer);
+    SerializeValue(&buffer, ceil_mode_);
+    SerializeValue(&buffer, ksize_);
+    SerializeValue(&buffer, strides_);
+    SerializeValue(&buffer, paddings_);
+    SerializeValue(&buffer, input_shape_);
+  }
+
+ public:
+  AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
+                std::vector<int> strides, std::vector<int> paddings,
+                std::vector<int> input_shape)
+      : ceil_mode_(ceil_mode),
+        ksize_(ksize),
+        strides_(strides),
+        paddings_(paddings),
+        input_shape_(input_shape) {
+    int output_h, output_w;
+    output_shape_ = input_shape_;
+    if (!ceil_mode_) {
+      output_h =
+          (input_shape[1] - ksize_[0] + 2 * paddings_[0]) / strides_[0] + 1;
+      output_w =
+          (input_shape[2] - ksize_[1] + 2 * paddings_[1]) / strides_[1] + 1;
+    } else {
+      output_h =
+          (input_shape[1] - ksize_[0] + 2 * paddings_[0] + strides_[0] - 1) /
+              strides_[0] +
+          1;
+      output_w =
+          (input_shape[2] - ksize_[1] + 2 * paddings_[1] + strides_[1] - 1) /
+              strides_[1] +
+          1;
+    }
+    output_shape_[1] = output_h;
+    output_shape_[2] = output_w;
+  }
+
+  // It was used for tensorrt deserialization.
+  // It should not be called by users.
+  AvgPoolPlugin(void const *serialData, size_t serialLength) {
+    deserializeBase(serialData, serialLength);
+    DeserializeValue(&serialData, &serialLength, &ceil_mode_);
+    DeserializeValue(&serialData, &serialLength, &ksize_);
+    DeserializeValue(&serialData, &serialLength, &strides_);
+    DeserializeValue(&serialData, &serialLength, &paddings_);
+    DeserializeValue(&serialData, &serialLength, &input_shape_);
+  }
+
+  AvgPoolPlugin *clone() const override {
+    return new AvgPoolPlugin(ceil_mode_, ksize_, strides_, paddings_,
+                             input_shape_);
+  }
+
+  const char *getPluginType() const override { return "avg_pool"; }
+  int getNbOutputs() const override { return 1; }
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
+                                     int nbInputDims) override;
+  int initialize() override { return 0; }
+  int enqueue(int batchSize, const void *const *inputs, void **outputs,
+              void *workspace, cudaStream_t stream) override;
+};
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index bd6a44dcc1..14c286ff90 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -12,7 +12,6 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include <stdio.h>
 #include <cassert>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
 
@@ -76,6 +75,6 @@ int SplitPlugin::enqueue(int batchSize, const void* const* inputs,
   return cudaGetLastError() != cudaSuccess;
 }
 
-}  // tensorrt
-}  // inference
-}  // paddle
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
index 7281e40c33..1b4ac34d31 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -13,7 +13,7 @@
 // limitations under the License.
 
 #pragma once
-
+#include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
 namespace paddle {
@@ -27,7 +27,7 @@ class SplitPlugin : public PluginTensorRT {
   std::vector<int> segment_offsets_;
 
  protected:
-  virtual size_t getSerializationSize() override {
+  size_t getSerializationSize() override {
     return SerializedSize(axis_) + SerializedSize(output_length_) +
            getBaseSerializationSize();
   }
@@ -35,7 +35,7 @@ class SplitPlugin : public PluginTensorRT {
   // TRT will call this func when we need to serialize the configuration of
   // tensorrt.
   // It should not be called by users.
-  virtual void serialize(void *buffer) override {
+  void serialize(void *buffer) override {
     serializeBase(buffer);
     SerializeValue(&buffer, axis_);
     SerializeValue(&buffer, output_length_);
@@ -59,16 +59,15 @@ class SplitPlugin : public PluginTensorRT {
     return new SplitPlugin(axis_, output_length_);
   }
 
-  virtual const char *getPluginType() const override { return "split"; }
-  virtual int getNbOutputs() const override { return output_length_.size(); }
-  virtual nvinfer1::Dims getOutputDimensions(int index,
-                                             const nvinfer1::Dims *inputs,
-                                             int nbInputDims) override;
-  virtual int initialize() override;
-  virtual int enqueue(int batchSize, const void *const *inputs, void **outputs,
-                      void *workspace, cudaStream_t stream) override;
+  const char *getPluginType() const override { return "split"; }
+  int getNbOutputs() const override { return output_length_.size(); }
+  nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
+                                     int nbInputDims) override;
+  int initialize() override;
+  int enqueue(int batchSize, const void *const *inputs, void **outputs,
+              void *workspace, cudaStream_t stream) override;
 };
 
-}  // tensorrt
-}  // inference
-}  // paddle
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu
index a689eb4224..cdc79e207a 100644
--- a/paddle/fluid/operators/math/pooling.cu
+++ b/paddle/fluid/operators/math/pooling.cu
@@ -153,6 +153,37 @@ __global__ void KernelMaxPool2DGrad(
   }
 }
 
+template <typename PoolProcess, typename T>
+void Pool2dDirectCUDAFunctor<PoolProcess, T>::operator()(
+    const T* input, const std::vector<int>& input_shape,
+    const std::vector<int>& output_shape, const std::vector<int>& ksize,
+    const std::vector<int>& strides, const std::vector<int>& paddings,
+    PoolProcess pool_compute, bool exclusive, T* output, cudaStream_t stream) {
+  const int batch_size = input_shape[0];
+  const int input_channels = input_shape[1];
+  const int input_height = input_shape[2];
+  const int input_width = input_shape[3];
+  const int output_channels = output_shape[1];
+  const int output_height = output_shape[2];
+  const int output_width = output_shape[3];
+  const int ksize_height = ksize[0];
+  const int ksize_width = ksize[1];
+  const int stride_height = strides[0];
+  const int stride_width = strides[1];
+  const int padding_height = paddings[0];
+  const int padding_width = paddings[1];
+
+  int nthreads = batch_size * output_channels * output_height * output_width;
+  int blocks = (nthreads + 1024 - 1) / 1024;
+  dim3 threads(1024, 1);
+  dim3 grid(blocks, 1);
+
+  KernelPool2D<PoolProcess, T><<<grid, threads, 0, stream>>>(
+      nthreads, input, input_channels, input_height, input_width, output_height,
+      output_width, ksize_height, ksize_width, stride_height, stride_width,
+      padding_height, padding_width, pool_compute, exclusive, output);
+}
+
 /*
  * All tensors are in NCHW format.
  * Ksize, strides, paddings are two elements. These two elements represent
@@ -291,6 +322,11 @@ class MaxPool2dGradFunctor<platform::CUDADeviceContext, T> {
   }
 };
 
+template class Pool2dDirectCUDAFunctor<paddle::operators::math::MaxPool<float>,
+                                       float>;
+template class Pool2dDirectCUDAFunctor<paddle::operators::math::AvgPool<float>,
+                                       float>;
+
 template class MaxPool2dGradFunctor<platform::CUDADeviceContext, float>;
 template class MaxPool2dGradFunctor<platform::CUDADeviceContext, double>;
 
diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
index 0f64e321bf..fa732f96d4 100644
--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -82,6 +82,18 @@ class AvgPoolGrad {
  * This is different from average pooling. So we rewrite the max_pool_grad:
  * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
  */
+
+template <typename PoolProcess, typename T>
+class Pool2dDirectCUDAFunctor {
+ public:
+  void operator()(const T* input, const std::vector<int>& input_shape,
+                  const std::vector<int>& output_shape,
+                  const std::vector<int>& ksize,
+                  const std::vector<int>& strides,
+                  const std::vector<int>& paddings, PoolProcess pool_compute,
+                  bool exclusive, T* output, cudaStream_t stream);
+};
+
 template <typename DeviceContext, typename PoolProcess, typename T>
 class Pool2dFunctor {
  public:

From 8f9a8c455a2ec22f5f67cc464d5b6a82cafbfb57 Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Fri, 16 Nov 2018 08:14:04 +0000
Subject: [PATCH 02/13] delete unused test code.

test=develop
---
 paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
index db8e7f8438..2cfd0f6905 100644
--- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc
@@ -102,8 +102,7 @@ class Pool2dOpConverter : public OpConverter {
       layer->setName(("pool2d (Output: " + output_name + ")").c_str());
       layer->getOutput(0)->setName(output_name.c_str());
       engine_->SetITensor(output_name, layer->getOutput(0));
-      if (test_mode ||
-          output_name == "patch_6_pool1.avg_pool.output.1.tmp_0702") {
+      if (test_mode) {
         engine_->DeclareOutput(output_name);
       }
       return;
@@ -148,8 +147,7 @@ class Pool2dOpConverter : public OpConverter {
     layer->setName(("pool2d (Output: " + output_name + ")").c_str());
     layer->getOutput(0)->setName(output_name.c_str());
     engine_->SetITensor(output_name, layer->getOutput(0));
-    if (test_mode ||
-        output_name == "patch_6_pool1.avg_pool.output.1.tmp_0702") {
+    if (test_mode) {
       engine_->DeclareOutput(output_name);
     }
   }

From 9b64aac41ffa89cd742c9a926591a4607b3c15ed Mon Sep 17 00:00:00 2001
From: nhzlx <zlx_hg@163.com>
Date: Fri, 16 Nov 2018 09:54:36 +0000
Subject: [PATCH 03/13] add macro for pool2dDirectCUDAFunctor

test=develop
---
 paddle/fluid/operators/math/pooling.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h
index fa732f96d4..923babd4c2 100644
--- a/paddle/fluid/operators/math/pooling.h
+++ b/paddle/fluid/operators/math/pooling.h
@@ -82,7 +82,7 @@ class AvgPoolGrad {
  * This is different from average pooling. So we rewrite the max_pool_grad:
  * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
  */
-
+#ifdef PADDLE_WITH_CUDA
 template <typename PoolProcess, typename T>
 class Pool2dDirectCUDAFunctor {
  public:
@@ -93,6 +93,7 @@ class Pool2dDirectCUDAFunctor {
                   const std::vector<int>& paddings, PoolProcess pool_compute,
                   bool exclusive, T* output, cudaStream_t stream);
 };
+#endif
 
 template <typename DeviceContext, typename PoolProcess, typename T>
 class Pool2dFunctor {

From b3364d40350c95e7fc804f79dfac42057590c108 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 21 Nov 2018 10:03:50 +0800
Subject: [PATCH 04/13] fix(Macos): fix compile on macos

test=develop
---
 paddle/fluid/memory/allocation/best_fit_allocator_test.cc | 1 +
 paddle/fluid/memory/allocation/best_fit_allocator_test.cu | 1 +
 2 files changed, 2 insertions(+)

diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
index 4122b3d709..20748a23a1 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/memory/allocation/best_fit_allocator.h"
+#include <random>
 #include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
index 50aecda97a..f7f17e1d36 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
+++ b/paddle/fluid/memory/allocation/best_fit_allocator_test.cu
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include <random>
 #include <thread>  // NOLINT
 #include <vector>
 #include "gtest/gtest.h"

From 175b847f6dd900456dc0f0a39cb1eb3394431ea6 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 21 Nov 2018 12:24:25 +0800
Subject: [PATCH 05/13] Add API example for logical ops and clip ops

test=develop
---
 python/paddle/fluid/layers/nn.py | 250 ++++++++++++++++++-------------
 1 file changed, 149 insertions(+), 101 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 99acd7e308..7b0a3e2c82 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -726,11 +726,11 @@ def dynamic_gru(input,
             create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
         bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
-            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates 
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
             the bias in the update gate, reset gate and candidate calculations.
-            If it is set to False, no bias will be applied to the update gate, 
-            reset gate and candidate calculations. If it is set to None or one 
-            attribute of ParamAttr, dynamic_gru will create ParamAttr as 
+            If it is set to False, no bias will be applied to the update gate,
+            reset gate and candidate calculations. If it is set to None or one
+            attribute of ParamAttr, dynamic_gru will create ParamAttr as
             bias_attr. If the Initializer of the bias_attr is not set, the bias
             is initialized zero. Default: None.
         is_reverse(bool): Whether to compute reversed GRU, default
@@ -847,11 +847,11 @@ def gru_unit(input,
             create ParamAttr as param_attr. If the Initializer of the param_attr
             is not set, the parameter is initialized with Xavier. Default: None.
         bias_attr (ParamAttr|bool|None): The parameter attribute for the bias
-            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates 
+            of GRU. Note that the bias with :math:`(1 \\times 3D)` concatenates
             the bias in the update gate, reset gate and candidate calculations.
-            If it is set to False, no bias will be applied to the update gate, 
-            reset gate and candidate calculations. If it is set to None or one 
-            attribute of ParamAttr, gru_unit will create ParamAttr as 
+            If it is set to False, no bias will be applied to the update gate,
+            reset gate and candidate calculations. If it is set to None or one
+            attribute of ParamAttr, gru_unit will create ParamAttr as
             bias_attr. If the Initializer of the bias_attr is not set, the bias
             is initialized zero. Default: None.
         activation (string): The activation type for cell (actNode).
@@ -1064,9 +1064,9 @@ def dropout(x,
                                            inference: out = input
                                            (make is a tensor same shape with input, value is 0 or 1
                                             ratio of 0 is dropout_prob)
-                                           dropout op can be removed from the program. 
+                                           dropout op can be removed from the program.
                                            the program will be efficient
-                                        
+
 
 
     Returns:
@@ -2149,7 +2149,7 @@ def pool2d(input,
         ceil_mode (bool): ${ceil_mode_comment}
         name (str|None): A name for this layer(optional). If set None, the
                         layer will be named automatically.
-        exclusive (bool): Whether to exclude padding points in average pooling 
+        exclusive (bool): Whether to exclude padding points in average pooling
                           mode, default is true
 
     Returns:
@@ -2240,7 +2240,7 @@ def pool3d(input,
         ceil_mode (bool): ${ceil_mode_comment}
         name (str): A name for this layer(optional). If set None, the layer
             will be named automatically.
-        exclusive (bool): Whether to exclude padding points in average pooling 
+        exclusive (bool): Whether to exclude padding points in average pooling
                           mode, default is true
 
     Returns:
@@ -4342,7 +4342,7 @@ def nce(input,
         sampler (str): The sampler used to sample class from negtive classes.
                        It can be 'uniform', 'log_uniform' or 'custom_dist'.
                        default: 'uniform'.
-        custom_dist (Variable): A tensor with shape [num_total_classes]. 
+        custom_dist (Variable): A tensor with shape [num_total_classes].
                        It is used when sampler is set to 'custom_dist'.
                        custom_dist[i] is the probsbility of i-th class to be sampled.
                        default: None.
@@ -4385,7 +4385,7 @@ def nce(input,
                           num_neg_samples=3,
                           sampler="custom_dist",
                           custom_dist=dist)
-            
+
     """
     helper = LayerHelper('nce', **locals())
     assert isinstance(input, Variable)
@@ -4556,9 +4556,9 @@ def transpose(x, perm, name=None):
     Examples:
         .. code-block:: python
 
-            # use append_batch_size=False to avoid prepending extra 
+            # use append_batch_size=False to avoid prepending extra
             # batch size in shape
-            x = fluid.layers.data(name='x', shape=[5, 10, 15], 
+            x = fluid.layers.data(name='x', shape=[5, 10, 15],
                             dtype='float32', append_batch_size=False)
             x_transposed = layers.transpose(x, perm=[1, 0, 2])
     """
@@ -4835,7 +4835,7 @@ def softmax_with_cross_entropy(logits,
     3) If numeric_stable_mode is True, softmax is calculated first by:
 
     .. math::
-        
+
         max_j = \\max_{i=0}^{K}{\\text{logit}_i}
 
         log\\_max\\_sum_j = \\log\\sum_{i=0}^{K}\\exp(logit_i - max_j)
@@ -4858,18 +4858,18 @@ def softmax_with_cross_entropy(logits,
         numeric_stable_mode (bool): A flag to indicate whether to use a more
                                     numerically stable algorithm. Only valid
                                     when soft_label is False and GPU is used.
-                                    When soft_label is True or CPU is used, 
-                                    the algorithm is always numerically stable. 
-                                    Note that the speed may be slower when use 
+                                    When soft_label is True or CPU is used,
+                                    the algorithm is always numerically stable.
+                                    Note that the speed may be slower when use
                                     stable algorithm. Default: False
-        return_softmax (bool): A flag indicating whether to return the softmax 
+        return_softmax (bool): A flag indicating whether to return the softmax
                                along with the cross entropy loss. Default: False
 
     Returns:
-        Variable or Tuple of two Variables: Return the cross entropy loss if 
-                              `return_softmax` is False, otherwise the tuple 
-                              (loss, softmax), where the cross entropy loss is 
-                              a 2-D tensor with shape [N x 1], and softmax is a 
+        Variable or Tuple of two Variables: Return the cross entropy loss if
+                              `return_softmax` is False, otherwise the tuple
+                              (loss, softmax), where the cross entropy loss is
+                              a 2-D tensor with shape [N x 1], and softmax is a
                               2-D tensor with shape [N x K].
 
     Examples:
@@ -5756,20 +5756,20 @@ def image_resize(input,
                          Default: None
         name(str|None): A name for this layer(optional). If set None, the layer
                         will be named automatically.
-        resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST' 
+        resample(str): The resample method. It supports 'BILINEAR' and 'NEAREST'
                        currently.
                        Default: 'BILINEAR'
-        actual_shape(Variable): An optional input to specify output shape 
-                                dynamically. If provided, image resize  
-                                according to this given shape rather than 
+        actual_shape(Variable): An optional input to specify output shape
+                                dynamically. If provided, image resize
+                                according to this given shape rather than
                                 :attr:`out_shape` and :attr:`scale` specifying
-                                shape. That is to say actual_shape has the 
-                                highest priority. It is recommended to use 
-                                actual_shape instead of :attr:`out_shape` if you 
-                                want to specify output shape dynamically. When 
-                                using actual_shape to specify output shape, one of 
-                                :attr:`out_shape` and :attr:`scale` should also be 
-                                set, otherwise errors would be occured in graph 
+                                shape. That is to say actual_shape has the
+                                highest priority. It is recommended to use
+                                actual_shape instead of :attr:`out_shape` if you
+                                want to specify output shape dynamically. When
+                                using actual_shape to specify output shape, one of
+                                :attr:`out_shape` and :attr:`scale` should also be
+                                set, otherwise errors would be occured in graph
                                 constructing stage.
                                 Default: None
 
@@ -5780,7 +5780,7 @@ def image_resize(input,
     Raises:
         TypeError: out_shape should be a list or tuple or Variable.
         TypeError: actual_shape should either be Variable or None.
-        ValueError: The 'resample' of image_resize can only be 'BILINEAR' 
+        ValueError: The 'resample' of image_resize can only be 'BILINEAR'
                     or 'NEAREST' currently.
         ValueError: One of out_shape and scale must not be None.
         ValueError: out_shape length should be 2.
@@ -5852,17 +5852,17 @@ def resize_bilinear(input,
                     name=None,
                     actual_shape=None):
     """
-    Resize input by performing bilinear interpolation based on given 
-    output shape which specified by actual_shape, out_shape and scale 
+    Resize input by performing bilinear interpolation based on given
+    output shape which specified by actual_shape, out_shape and scale
     in priority order.
 
-    Bilinear interpolation is an extension of linear interpolation for 
-    interpolating functions of two variables (e.g. H-direction and 
-    W-direction in this op) on a rectilinear 2D grid. The key idea is 
-    to perform linear interpolation first in one direction, and then 
+    Bilinear interpolation is an extension of linear interpolation for
+    interpolating functions of two variables (e.g. H-direction and
+    W-direction in this op) on a rectilinear 2D grid. The key idea is
+    to perform linear interpolation first in one direction, and then
     again in the other direction.
 
-    For details of bilinear interpolation, please refer to Wikipedia: 
+    For details of bilinear interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Bilinear_interpolation
 
     Args:
@@ -5875,17 +5875,17 @@ def resize_bilinear(input,
              a higher priority than scale. Default: None.
 
         name(str|None): The output variable name.
-        actual_shape(Variable): An optional input to specify output shape 
-                                dynamically. If provided, image resize  
-                                according to this given shape rather than 
+        actual_shape(Variable): An optional input to specify output shape
+                                dynamically. If provided, image resize
+                                according to this given shape rather than
                                 :attr:`out_shape` and :attr:`scale` specifying
-                                shape. That is to say actual_shape has the 
-                                highest priority. It is recommended to use 
-                                actual_shape instead of :attr:`out_shape` if you 
-                                want to specify output shape dynamically. When 
-                                using actual_shape to specify output shape, one of 
-                                :attr:`out_shape` and :attr:`scale` should also be 
-                                set, otherwise errors would be occured in graph 
+                                shape. That is to say actual_shape has the
+                                highest priority. It is recommended to use
+                                actual_shape instead of :attr:`out_shape` if you
+                                want to specify output shape dynamically. When
+                                using actual_shape to specify output shape, one of
+                                :attr:`out_shape` and :attr:`scale` should also be
+                                set, otherwise errors would be occured in graph
                                 constructing stage.
                                 Default: None
 
@@ -5909,11 +5909,11 @@ def resize_nearest(input,
                    actual_shape=None):
     """
     Resize input by performing nearest neighbor interpolation in both the
-    3rd dimention(in height direction) and the 4th dimention(in width 
-    direction) based on given output shape which specified by actual_shape, 
+    3rd dimention(in height direction) and the 4th dimention(in width
+    direction) based on given output shape which specified by actual_shape,
     out_shape and scale in priority order.
 
-    For details of nearest neighbor interpolation, please refer to Wikipedia: 
+    For details of nearest neighbor interpolation, please refer to Wikipedia:
     https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation
 
     Args:
@@ -5926,17 +5926,17 @@ def resize_nearest(input,
              a higher priority than scale. Default: None.
 
         name(str|None): The output variable name.
-        actual_shape(Variable): An optional input to specify output shape 
-                                dynamically. If provided, image resize  
-                                according to this given shape rather than 
+        actual_shape(Variable): An optional input to specify output shape
+                                dynamically. If provided, image resize
+                                according to this given shape rather than
                                 :attr:`out_shape` and :attr:`scale` specifying
-                                shape. That is to say actual_shape has the 
-                                highest priority. It is recommended to use 
-                                actual_shape instead of :attr:`out_shape` if you 
-                                want to specify output shape dynamically. When 
-                                using actual_shape to specify output shape, one of 
-                                :attr:`out_shape` and :attr:`scale` should also be 
-                                set, otherwise errors would be occured in graph 
+                                shape. That is to say actual_shape has the
+                                highest priority. It is recommended to use
+                                actual_shape instead of :attr:`out_shape` if you
+                                want to specify output shape dynamically. When
+                                using actual_shape to specify output shape, one of
+                                :attr:`out_shape` and :attr:`scale` should also be
+                                set, otherwise errors would be occured in graph
                                 constructing stage.
                                 Default: None
 
@@ -6446,15 +6446,15 @@ def affine_grid(theta, out_shape, name=None):
                         [x_14, x_15, x_16]]
                        [[x_21, x_22, x_23]
                         [x_24, x_25, x_26]]]
-      
+
               out_shape = [2, 3, 5, 5]
-      
+
           Step 1:
-      
+
               Generate normalized coordinates according to out_shape.
               The values of the normalized coordinates are in the interval between -1 and 1.
               The shape of the normalized coordinates is [2, H, W] as below:
-      
+
               C = [[[-1.  -1.  -1.  -1.  -1. ]
                     [-0.5 -0.5 -0.5 -0.5 -0.5]
                     [ 0.   0.   0.   0.   0. ]
@@ -7702,6 +7702,15 @@ def logical_and(x, y, out=None, name=None):
 
     Returns:
         out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            left = fluid.layers.data(
+                name='left', shape=[1], dtype='int32')
+            right = fluid.layers.data(
+                name='right', shape=[1], dtype='int32')
+            result = fluid.layers.logical_and(x=left, y=right)
     """
 
     return _logical_op(
@@ -7721,6 +7730,15 @@ def logical_or(x, y, out=None, name=None):
 
     Returns:
         out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            left = fluid.layers.data(
+                name='left', shape=[1], dtype='int32')
+            right = fluid.layers.data(
+                name='right', shape=[1], dtype='int32')
+            result = fluid.layers.logical_or(x=left, y=right)
     """
 
     return _logical_op(
@@ -7740,6 +7758,15 @@ def logical_xor(x, y, out=None, name=None):
 
     Returns:
         out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            left = fluid.layers.data(
+                name='left', shape=[1], dtype='int32')
+            right = fluid.layers.data(
+                name='right', shape=[1], dtype='int32')
+            result = fluid.layers.logical_xor(x=left, y=right)
     """
 
     return _logical_op(
@@ -7758,6 +7785,13 @@ def logical_not(x, out=None, name=None):
 
     Returns:
         out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            left = fluid.layers.data(
+                name='left', shape=[1], dtype='int32')
+            result = fluid.layers.logical_not(x=left)
     """
 
     return _logical_op(
@@ -7777,6 +7811,13 @@ def clip(x, min, max, name=None):
 
     Returns:
         out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            input = fluid.layers.data(
+                name='data', shape=[1], dtype='float32')
+            reward = fluid.layers.clip(x=input, min=-1.0, max=1.0)
     """
 
     helper = LayerHelper("clip", **locals())
@@ -7809,6 +7850,13 @@ def clip_by_norm(x, max_norm, name=None):
 
     Returns:
         out(${out_type}): ${out_comment}
+
+    Examples:
+        .. code-block:: python
+
+            input = fluid.layers.data(
+                name='data', shape=[1], dtype='float32')
+            reward = fluid.layers.clip_by_norm(x=input, max_norm=1.0)
     """
 
     helper = LayerHelper("clip_by_norm", **locals())
@@ -7954,19 +8002,19 @@ def maxout(x, groups, name=None):
 def space_to_depth(x, blocksize, name=None):
     """
     Gives a blocksize to space_to_depth the input LoDtensor with Layout: [batch, channel, height, width]
-    
-    This op rearranges blocks of spatial data, into depth. More specifically, this op outputs a copy of the 
-    input LoDtensor where values from the height and width dimensions are moved to the channel dimension. 
+
+    This op rearranges blocks of spatial data, into depth. More specifically, this op outputs a copy of the
+    input LoDtensor where values from the height and width dimensions are moved to the channel dimension.
     The attr blocksize indicates the input block size.
-    
-    space_to_depth will reorgnize the elements of input with shape[batch, channel, height, width] according 
+
+    space_to_depth will reorgnize the elements of input with shape[batch, channel, height, width] according
     to blocksize to construct output with shape [batch, channel * blocksize * blocksize, height/blocksize, width/blocksize]:
-    
-    space_to_depth is used to This operation is useful for resizing the activations between convolutions 
+
+    space_to_depth is used to This operation is useful for resizing the activations between convolutions
     (but keeping all data)
 
     - Non-overlapping blocks of size block_size x block size are rearranged into depth at each location.
-    - The depth of the output tensor is block_size * block_size * input channel 
+    - The depth of the output tensor is block_size * block_size * input channel
     - The Y, X coordinates within each block of the input become the high order component of the output channel index
     - channel should be divisible by square of blocksize
     - height, width should be divsible by blocksize
@@ -8013,7 +8061,7 @@ def space_to_depth(x, blocksize, name=None):
 
 @templatedoc()
 def sequence_reverse(x, name=None):
-    """ 
+    """
     ${comment}
 
     Args:
@@ -8080,21 +8128,21 @@ def affine_channel(x, scale=None, bias=None, data_layout='NCHW', name=None):
 
 
 def similarity_focus(input, axis, indexes, name=None):
-    """  
+    """
     SimilarityFocus Operator
 
     Generate a similarity focus mask with the same shape of input using the following method:
-    1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding 
-       to the axis according to the indexes. For example, if axis=1 and indexes=[a], 
-       it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X 
+    1. Extract the 3-D tensor(here the first dimension is BatchSize) corresponding
+       to the axis according to the indexes. For example, if axis=1 and indexes=[a],
+       it will get the matrix T=X[:, a, :, :]. In this case, if the shape of input X
        is (BatchSize, A, B, C), the shape of tensor T is (BatchSize, B, C).
-    2. For each index, find the largest numbers in the tensor T, so that the same 
-       row and same column has at most one number(what it means is that if the 
-       largest number has been found in the i-th row and the j-th column, then 
-       the numbers in the i-th row or j-th column will be skipped. And then the 
-       next largest number will be selected from the remaining numbers. Obviously 
-       there will be min(B, C) numbers), and mark the corresponding position of the 
-       3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for 
+    2. For each index, find the largest numbers in the tensor T, so that the same
+       row and same column has at most one number(what it means is that if the
+       largest number has been found in the i-th row and the j-th column, then
+       the numbers in the i-th row or j-th column will be skipped. And then the
+       next largest number will be selected from the remaining numbers. Obviously
+       there will be min(B, C) numbers), and mark the corresponding position of the
+       3-D similarity focus mask as 1, otherwise as 0. Do elementwise-or for
        each index.
     3. Broadcast the 3-D similarity focus mask to the same shape of input X.
 
@@ -8150,16 +8198,16 @@ def similarity_focus(input, axis, indexes, name=None):
                               [1.0, 0.0]]]]
 
     Args:
-        input(Variable): The input tensor variable(default float). It should 
+        input(Variable): The input tensor variable(default float). It should
             be a 4-D tensor with shape [BatchSize, A, B, C].
         axis(int): Indicating the dimension to be selected. It can only be
             1, 2 or 3.
         indexes(list): Indicating the indexes of the selected dimension.
 
     Returns:
-        Variable: A tensor variable with the same shape and same type 
+        Variable: A tensor variable with the same shape and same type
             as the input.
-        
+
     Examples:
         .. code-block:: python
             data = fluid.layers.data(
@@ -8262,12 +8310,12 @@ def hash(input, hash_size, num_hash=1, name=None):
 @templatedoc()
 def grid_sampler(x, grid, name=None):
     """
-    This operation samples input X by using bilinear interpolation based on 
+    This operation samples input X by using bilinear interpolation based on
     flow field grid, which is usually gennerated by affine_grid. The grid of
-    shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates 
-    with shape [N, H, W] each, where grid_x is indexing the 4th dimension 
-    (in width dimension) of input data x and grid_y is indexng the 3rd 
-    dimention (in height dimension), finally results is the bilinear 
+    shape [N, H, W, 2] is the concatenation of (grid_x, grid_y) coordinates
+    with shape [N, H, W] each, where grid_x is indexing the 4th dimension
+    (in width dimension) of input data x and grid_y is indexng the 3rd
+    dimention (in height dimension), finally results is the bilinear
     interpolation value of 4 nearest corner points.
 
     Step 1:
@@ -8277,7 +8325,7 @@ def grid_sampler(x, grid, name=None):
     grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1)
 
     Step 2:
-    Indices input data X with grid (x, y) in each [H, W] area, and bilinear 
+    Indices input data X with grid (x, y) in each [H, W] area, and bilinear
     interpolate point value by 4 nearest points.
 
       wn ------- y_n ------- en
@@ -8314,7 +8362,7 @@ def grid_sampler(x, grid, name=None):
         name (str, default None): The name of this layer.
 
     Returns:
-        out(Variable): Output of shape [N, C, H, W] data samples input X 
+        out(Variable): Output of shape [N, C, H, W] data samples input X
         using bilnear interpolation based on input grid.
 
     Exmples:

From cda60311f94aea91f8abd0394446d12095d1a8a7 Mon Sep 17 00:00:00 2001
From: Dang Qingqing <dangqingqing@baidu.com>
Date: Tue, 20 Nov 2018 13:45:33 +0800
Subject: [PATCH 06/13] Fix compling with cuDNN v5

test=develop
---
 paddle/fluid/operators/CMakeLists.txt              | 9 ++++++---
 paddle/fluid/operators/conv_fusion_op.cu.cc        | 4 ++++
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 ++++
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 975c3bfc33..ca5b30e7b8 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -36,15 +36,18 @@ endif()
 
 register_operators(EXCLUDES warpctc_op conv_fusion_op)
 
-# warpctc_cudnn need cudnn 7 above
+# warpctc_op needs cudnn 7 above
 if (WITH_GPU)
     if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
         op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
     else()
         op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
     endif()
-  op_library(conv_fusion_op)
-  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
+    # conv_fusion_op needs cudnn 7 above
+    if (NOT ${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+        op_library(conv_fusion_op)
+        file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n")
+    endif()
 else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc
index bd1041ce08..2c09ee7394 100644
--- a/paddle/fluid/operators/conv_fusion_op.cu.cc
+++ b/paddle/fluid/operators/conv_fusion_op.cu.cc
@@ -22,6 +22,7 @@ DECLARE_bool(cudnn_exhaustive_search);
 namespace paddle {
 namespace operators {
 
+#if CUDNN_VERSION >= 7001
 using Tensor = framework::Tensor;
 using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
 using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
@@ -178,10 +179,13 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel<T> {
     workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes);
   }
 };
+#endif
 
 }  // namespace operators
 }  // namespace paddle
 
+#if CUDNN_VERSION >= 7001
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel<float>,
                         ops::CUDNNConvFusionOpKernel<double>);
+#endif
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 1513eca514..7101506f99 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -23,6 +23,10 @@ if(NOT WITH_DISTRIBUTE)
     LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)
 
+if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
+endif()
+
 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185

From 53760bb111c703f319ea3492c6ede13384095584 Mon Sep 17 00:00:00 2001
From: minqiyang <minqiyang@baidu.com>
Date: Wed, 21 Nov 2018 13:29:51 +0800
Subject: [PATCH 07/13] Change requirements to support python 3.7

test=develop
---
 python/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/requirements.txt b/python/requirements.txt
index 84cf440397..2f81d85df0 100644
--- a/python/requirements.txt
+++ b/python/requirements.txt
@@ -1,5 +1,5 @@
 requests==2.9.2
-numpy>=1.12,<=1.14 #TODO:change to ">=1.12" when numpy fix bug in 1.15 and higher version
+numpy>=1.12
 protobuf==3.1
 recordio>=0.1.0
 matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib

From 3edd32d07083473f7900329bf68a6263ff9b06d3 Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Wed, 21 Nov 2018 13:53:54 +0800
Subject: [PATCH 08/13] fix(Compile): fix depends error when compile op using
 cub

some operators depend on cub and xxhash by header. The dependency should be declared explicitly rather than declared to pybind.

test=develop
---
 paddle/fluid/operators/CMakeLists.txt | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 975c3bfc33..9a98ba6d9d 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -34,7 +34,12 @@ if (WITH_GPU AND TENSORRT_FOUND)
     add_subdirectory(tensorrt)
 endif()
 
-register_operators(EXCLUDES warpctc_op conv_fusion_op)
+SET(OP_HEADER_DEPS xxhash)
+if (WITH_GPU)
+    SET(OP_HEADER_DEPS ${OP_HEADER_DEPS} cub)
+endif()
+
+register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS})
 
 # warpctc_cudnn need cudnn 7 above
 if (WITH_GPU)
@@ -49,14 +54,14 @@ else()
     op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 endif()
 
-set(COMMON_OP_DEPS "")
+set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
 if (NOT WIN32)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 endif()
 if (WITH_GPU)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub)
+  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv)
 endif()
 
 # FIXME(typhoonzero): operator deps may not needed.

From 6c0e09cb1d64a014873db47cae1eeca0264e561c Mon Sep 17 00:00:00 2001
From: dengkaipeng <dengkaipeng@baidu.com>
Date: Tue, 20 Nov 2018 16:52:24 +0800
Subject: [PATCH 09/13] change interpolate unittest to serial. test=develop

---
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 29e4ca04a7..46dd2ef110 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -75,10 +75,12 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
 list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
+list(REMOVE_ITEM TEST_OPS test_interpolate_op)
 foreach(TEST_OP ${TEST_OPS})
     py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
+py_test_modules(test_interpolate_op MODULES test_interpolate_op SERIAL)
 if(WITH_DISTRIBUTE)
     py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
     set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)

From fd290c2580cfcaa5c80e41deb1d8fc6a4028099c Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 21 Nov 2018 22:11:19 +0800
Subject: [PATCH 10/13] fix mac compile of analysis test=develop

---
 paddle/fluid/inference/analysis/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/analysis/CMakeLists.txt b/paddle/fluid/inference/analysis/CMakeLists.txt
index 0c73778b20..4bd3f93ef7 100644
--- a/paddle/fluid/inference/analysis/CMakeLists.txt
+++ b/paddle/fluid/inference/analysis/CMakeLists.txt
@@ -35,4 +35,4 @@ function(inference_analysis_test TARGET)
   endif()
 endfunction(inference_analysis_test)
 
-inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS paddle_inference_api)
+inference_analysis_test(test_analyzer SRCS analyzer_tester.cc EXTRA_DEPS reset_tensor_array paddle_inference_api)

From 533c5d580369e9605e9f0080c26c337c25301c3b Mon Sep 17 00:00:00 2001
From: Yu Yang <yuyang18@baidu.com>
Date: Thu, 22 Nov 2018 13:00:29 +0800
Subject: [PATCH 11/13] fix(Cpu): fix cpu compile and unittest

test=develop
---
 paddle/fluid/pybind/pybind.cc                      | 4 ++++
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 4 +++-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 5ef5bf4d6c..358340b897 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -46,6 +46,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
@@ -95,6 +96,9 @@ bool IsCompiledWithDIST() {
 }
 
 PYBIND11_PLUGIN(core) {
+  // Not used, just make sure cpu_info.cc is linked.
+  paddle::platform::CpuTotalPhysicalMemory();
+
   paddle::memory::allocation::UseAllocatorStrategyGFlag();
   py::module m("core", "C++ core of PaddlePaddle");
 
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 79fa99d002..4fa69191ad 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -23,7 +23,9 @@ if(NOT WITH_DISTRIBUTE)
     LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)
 
-if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+if (NOT ${WITH_GPU})
+    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
+elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
     LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
 endif()
 

From d9a1f3e58e89c3f3cc38fb8edc830ac38578c733 Mon Sep 17 00:00:00 2001
From: wopeizl <wopeizl@icloud.com>
Date: Thu, 22 Nov 2018 13:33:26 +0800
Subject: [PATCH 12/13] Windows/online  (#14474)

* add recordio support

* disable the openblas multi-thread on windows since no support
adjust the python script

* code style

* code style
test=develop

* add create_recordio_file_reader back

* fix code style
test=develop

* fix the gtest.cmake on windows

* fix cc_test on windows

* fix the win build
test=develop

* remove fused compile support on windows
test=develop

* add the jit support
test=develop

* add the jit support, test=develop

* add the jit support, test=develop

* add the jit back
fix compile error on windows

* rollback test=develop

* test case fix

* disable DSO by default on windows

* exclude warpctc_op on windows

* exclude the dynload_warpctc out on windows
test=develop

* fix the scripts error
test=develop

* disable avx on windows by default
test=develop

* re-organize the cmake file

* disable mkl on windows by default

* add warp_ctc back

* fix the dependency

* fix the dependency

* fix the build issue on windows

* remove unsupported flag on windows

* code style

* code style
test=develop

* fix issue

* add profiler, parallel_executor back

* clean up the pre-definitions on windows

* fix build issue

* test=develop
---
 CMakeLists.txt                                |  21 +-
 cmake/external/gtest.cmake                    |   4 +
 cmake/external/snappy.cmake                   |  12 +-
 cmake/external/snappystream.cmake             |  61 +--
 cmake/generic.cmake                           |   3 +
 cmake/operators.cmake                         |   4 +-
 cmake/simd.cmake                              |  73 ++--
 paddle/fluid/CMakeLists.txt                   |   6 +-
 paddle/fluid/framework/CMakeLists.txt         |  15 +-
 .../fast_threaded_ssa_graph_executor.h        |   2 +-
 paddle/fluid/framework/eigen.h                |   5 -
 paddle/fluid/framework/op_registry.h          |   5 -
 paddle/fluid/framework/operator.cc            |   2 -
 paddle/fluid/framework/operator.h             |   2 -
 paddle/fluid/inference/api/api_impl.h         |   6 -
 .../fluid/memory/allocation/cpu_allocator.h   |   6 +
 paddle/fluid/operators/CMakeLists.txt         |  12 +-
 .../fluid/operators/hierarchical_sigmoid_op.h |   2 +-
 paddle/fluid/operators/math/CMakeLists.txt    |  35 +-
 .../math/detail/activation_functions.h        |   1 +
 paddle/fluid/operators/math/matrix_bit_code.h |   3 +-
 .../operators/reader/create_py_reader_op.cc   |   2 +-
 paddle/fluid/operators/roi_align_op.cc        |   6 +-
 paddle/fluid/operators/roi_pool_op.cc         |   6 +-
 paddle/fluid/operators/space_to_depth_op.cc   |   2 +-
 paddle/fluid/platform/CMakeLists.txt          |  12 +-
 paddle/fluid/platform/cpu_helper.cc           |   7 +
 paddle/fluid/platform/device_tracer.h         |  12 +-
 paddle/fluid/platform/dynload/cudnn.h         |   2 -
 paddle/fluid/platform/enforce.h               |  70 +---
 paddle/fluid/platform/init.cc                 |   7 -
 paddle/fluid/platform/init.h                  |   3 -
 paddle/fluid/platform/port.h                  |  35 +-
 paddle/fluid/platform/profiler.cc             |   2 +-
 paddle/fluid/platform/profiler.h              |  10 -
 .../fluid/platform/stream_callback_manager.h  |  13 +-
 paddle/fluid/pybind/CMakeLists.txt            |   8 +-
 paddle/fluid/pybind/pybind.cc                 |  21 +-
 python/paddle/fluid/__init__.py               |   5 +-
 python/paddle/fluid/contrib/inferencer.py     |   4 +-
 python/paddle/fluid/contrib/trainer.py        |   3 +-
 python/paddle/fluid/layers/io.py              | 118 +++---
 python/paddle/fluid/layers/nn.py              | 368 +++++++++---------
 python/paddle/fluid/layers/ops.py             |  41 +-
 44 files changed, 483 insertions(+), 554 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c62cc9bfd7..bc2ac2cd93 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -130,6 +130,21 @@ if (APPLE OR WIN32)
         "Disable MKL for building on mac and windows" FORCE)
 endif()
 
+if (WIN32)
+    set(WITH_AVX OFF CACHE STRING
+            "Disable AVX when compiling for Windows" FORCE)
+    set(WITH_DSO OFF CACHE STRING
+            "Disable DSO when compiling for Windows" FORCE)
+    set(WITH_MKL OFF CACHE STRING
+            "Disable MKL when compiling for Windows" FORCE)
+    set(WITH_DISTRIBUTE OFF CACHE STRING
+            "Disable DISTRIBUTE when compiling for Windows" FORCE)
+    set(WITH_C_API OFF CACHE STRING
+            "Disable C_API when compiling for Windows" FORCE)
+    set(WITH_FLUID_ONLY ON CACHE STRING
+            "Enable FLUID_ONLY when compiling for Windows" FORCE)
+endif()
+
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
 
@@ -190,11 +205,11 @@ include(external/pybind11)  # download pybind11
 include(external/cares)
 include(external/cub)
 include(external/xxhash)    # download xxhash
-
-if (NOT WIN32)
-# there is no official support of snappystream, warpctc, nccl, cupti in windows
 include(external/snappy)    # download snappy
 include(external/snappystream) # download snappystream
+
+if (NOT WIN32)
+# there is no official support of warpctc, nccl, cupti in windows
 include(external/warpctc)   # download, build, install warpctc
 include(cupti)
 endif (NOT WIN32)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index d335298742..4fe9c13fb7 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -50,7 +50,11 @@ IF(WITH_TESTING)
         CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                         -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
                         -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                        -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                        -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                         -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                        -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                        -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
                         -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
                         -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                         -DBUILD_GMOCK=ON
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
index af09ed4d5d..b30403d2d8 100644
--- a/cmake/external/snappy.cmake
+++ b/cmake/external/snappy.cmake
@@ -24,7 +24,11 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
 set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
 set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE)
 
-set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+if (WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib")
+else(WIN32)
+    set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
+endif (WIN32)
 
 ExternalProject_Add(
     extern_snappy
@@ -34,8 +38,12 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
                     -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
                     -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                    -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                    -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                    -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                    -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
                     -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
                     -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake
index 6df636d7fa..1ec79462c1 100644
--- a/cmake/external/snappystream.cmake
+++ b/cmake/external/snappystream.cmake
@@ -18,36 +18,45 @@ ENDIF()
 
 include (ExternalProject)
 
-# NOTE: snappy is needed when linking with recordio
-
 set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
 set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
 set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE)
 
-set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
-
-ExternalProject_Add(
-        extern_snappystream
-        GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
-        GIT_TAG "0.2.8"
-        PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
-        UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-                        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-                        -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-                        -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-                        -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-                        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-                        -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
-                        ${EXTERNAL_OPTIONAL_ARGS}
-                        CMAKE_CACHE_ARGS
-                        -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-                        -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-                        -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
-        DEPENDS snappy
-)
+if(WIN32)
+    # Fix me, VS2015 come without VLA support
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib")
+    MESSAGE(WARNING, "In windows, snappystream has no compile support for windows,
+    please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR})
+else(WIN32)
+    set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
+
+    ExternalProject_Add(
+            extern_snappystream
+            GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
+            GIT_TAG "0.2.8"
+            PREFIX          ${SNAPPYSTREAM_SOURCES_DIR}
+            UPDATE_COMMAND  ""
+            CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+                            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+                            -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+                            -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG}
+                            -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE}
+                            -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+                            -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE}
+                            -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG}
+                            -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
+                            -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
+                            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+                            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
+                            -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
+                            ${EXTERNAL_OPTIONAL_ARGS}
+                            CMAKE_CACHE_ARGS
+                            -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
+                            -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
+                            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
+            DEPENDS snappy
+    )
+endif(WIN32)
 
 add_library(snappystream STATIC IMPORTED GLOBAL)
 set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES})
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index e21f89c7c5..111627a932 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -351,6 +351,9 @@ function(cc_test TARGET_NAME)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
     target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
+    if(WIN32)
+      target_link_libraries(${TARGET_NAME} shlwapi)
+    endif(WIN32)
     add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index ba9c266d13..17107e0698 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -84,9 +84,7 @@ function(op_library TARGET)
     endif()
     if (WIN32)
     # remove windows unsupported op, because windows has no nccl, no warpctc such ops.
-    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op"
-     "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op"
-      "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op")
+    foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op")
         if ("${TARGET}" STREQUAL "${windows_unsupport_op}")
           return()
         endif()
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index 566dc75fda..86096d4fea 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -57,43 +57,46 @@ int main()
     return 0;
 }" SSE3_FOUND)
 
-# Check AVX
-set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
-set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
-    __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
-    __m256 result = _mm256_add_ps (a, b);
-    return 0;
-}" AVX_FOUND)
+# disable AVX by default on windows
+if(NOT WIN32)
+    # Check AVX
+    set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG})
+    set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
+        __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
+        __m256 result = _mm256_add_ps (a, b);
+        return 0;
+    }" AVX_FOUND)
 
-# Check AVX 2
-set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
-set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
-    __m256i result = _mm256_abs_epi32 (a);
-    return 0;
-}" AVX2_FOUND)
+    # Check AVX 2
+    set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG})
+    set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
+        __m256i result = _mm256_abs_epi32 (a);
+        return 0;
+    }" AVX2_FOUND)
 
-# Check AVX512F
-set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
-set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
-CHECK_CXX_SOURCE_RUNS("
-#include <immintrin.h>
-int main()
-{
-    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
-                                  13, -5, 6, -7, 9, 2, -6, 3);
-    __m512i result = _mm512_abs_epi32 (a);
-    return 0;
-}" AVX512F_FOUND)
+    # Check AVX512F
+    set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+    set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+    CHECK_CXX_SOURCE_RUNS("
+    #include <immintrin.h>
+    int main()
+    {
+        __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                      13, -5, 6, -7, 9, 2, -6, 3);
+        __m512i result = _mm512_abs_epi32 (a);
+        return 0;
+    }" AVX512F_FOUND)
+endif(NOT WIN32)
 
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt
index abadda3adb..6b526f0103 100644
--- a/paddle/fluid/CMakeLists.txt
+++ b/paddle/fluid/CMakeLists.txt
@@ -3,13 +3,9 @@ add_subdirectory(platform)
 add_subdirectory(framework)
 add_subdirectory(operators)
 add_subdirectory(string)
-
-add_subdirectory(pybind)
-if (NOT WIN32)
 add_subdirectory(recordio)
-endif(NOT WIN32)
+add_subdirectory(pybind)
 
 # NOTE: please add subdirectory inference at last.
 add_subdirectory(inference)
-
 add_subdirectory(train)
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index cb9057672c..43e1bc6b2e 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -31,9 +31,7 @@ function(windows_symbolic TARGET)
 endfunction()
 
 add_subdirectory(ir)
-if (NOT WIN32)
 add_subdirectory(details)
-endif (NOT WIN32)
 # ddim lib
 proto_library(framework_proto SRCS framework.proto)
 
@@ -68,11 +66,7 @@ if(WITH_GPU)
 else()
   cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor)
 endif()
-if (NOT WIN32)
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
-else()
-  cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version)
-endif (NOT WIN32)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version)
 
 cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
@@ -122,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 
-if (NOT WIN32)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
     shape_inference data_transform lod_tensor profiler)
-else()
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform lod_tensor)
-endif(NOT WIN32)
 
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context)
 
@@ -183,12 +172,10 @@ else()
   cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
 endif()
 
-if (NOT WIN32)
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS
         threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
         graph build_strategy
         fast_threaded_ssa_graph_executor)
-endif() # NOT WIN32
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
index 949616f02d..c3a8b85423 100644
--- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h
@@ -13,9 +13,9 @@
 // limitations under the License.
 
 #pragma once
+#include <ThreadPool.h>
 #include <string>
 #include <vector>
-#include "ThreadPool.h"
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/details/exception_holder.h"
 #include "paddle/fluid/framework/details/execution_strategy.h"
diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h
index 2b265a773f..5bafa4345f 100644
--- a/paddle/fluid/framework/eigen.h
+++ b/paddle/fluid/framework/eigen.h
@@ -13,11 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-// logging.h and windows.h conflict
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-// solve static linking error in windows
-// https://github.com/google/glog/issues/301
-#define GOOGLE_GLOG_DLL_DECL
 
 #include "paddle/fluid/framework/tensor.h"
 #include "unsupported/Eigen/CXX11/Tensor"
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index ef2eb334a4..0e6e74293c 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -23,11 +23,6 @@ limitations under the License. */
 #include <unordered_map>
 #include <unordered_set>
 
-#if defined(_WIN32)
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#define GOOGLE_GLOG_DLL_DECL
-#endif
-
 #include "glog/logging.h"  // For VLOG()
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/details/op_registry.h"
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 2b35943d09..1ec170b6f6 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL
 
 #include <gflags/gflags.h>
 #include <glog/logging.h>
diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h
index 6918e030bf..ef83833217 100644
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -20,8 +20,6 @@ limitations under the License. */
 #include <tuple>
 #include <unordered_map>
 #include <vector>
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL
 
 #include "glog/logging.h"  // For VLOG
 #include "paddle/fluid/framework/attribute.h"
diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h
index 4e4ab47ca9..9dfa48d501 100644
--- a/paddle/fluid/inference/api/api_impl.h
+++ b/paddle/fluid/inference/api/api_impl.h
@@ -14,12 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-// logging.h and windows.h conflict
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-// solve static linking error in windows
-// https://github.com/google/glog/issues/301
-#define GOOGLE_GLOG_DLL_DECL
-
 #include <glog/logging.h>
 #include <map>
 #include <memory>
diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h
index 9e0044c47a..26d3643f4e 100644
--- a/paddle/fluid/memory/allocation/cpu_allocator.h
+++ b/paddle/fluid/memory/allocation/cpu_allocator.h
@@ -15,6 +15,12 @@
 #pragma once
 #include "paddle/fluid/memory/allocation/allocator.h"
 
+#ifdef _WIN32
+#define posix_memalign_free _aligned_free
+#define posix_memalign(p, a, s) \
+  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
+#endif
+
 namespace paddle {
 namespace memory {
 namespace allocation {
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 81c9239486..de4f23515d 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE)
     add_subdirectory(distributed_ops)
 endif()
 
-if (NOT WIN32)
-    add_subdirectory(reader)
-endif()
+add_subdirectory(reader)
 
 if (NOT WIN32)
     add_subdirectory(nccl)
@@ -42,7 +40,7 @@ endif()
 register_operators(EXCLUDES warpctc_op conv_fusion_op DEPS ${OP_HEADER_DEPS})
 
 # warpctc_op needs cudnn 7 above
-if (WITH_GPU)
+if (WITH_GPU AND NOT WIN32)
     if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
         op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc)
     else()
@@ -59,10 +57,12 @@ endif()
 
 set(COMMON_OP_DEPS ${OP_HEADER_DEPS})
 
-set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor)
 if (NOT WIN32)
-  set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
+    set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
 endif()
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler)
+set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions)
 if (WITH_GPU)
   set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv)
 endif()
diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h
index 64096a717b..79980cda53 100644
--- a/paddle/fluid/operators/hierarchical_sigmoid_op.h
+++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h
@@ -111,7 +111,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
     auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
     auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});
+    Eigen::array<int, 2> bcast{1, static_cast<int>(pre_out_grad.dims()[1])};
 
     // softrelu derivative
     pre_out_grad_mat.device(place) =
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 83ee9f6c51..63363086ad 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -1,6 +1,4 @@
-if (NOT WIN32)
-    add_subdirectory(detail)
-endif(NOT WIN32)
+add_subdirectory(detail)
 
 function(math_library TARGET)
     # math_library is a function to create math library.
@@ -43,10 +41,8 @@ math_library(depthwise_conv)
 math_library(im2col)
 math_library(sampler)
 
-if (NOT WIN32) # windows do not support avx functions yet.
-    math_library(gru_compute DEPS activation_functions math_function)
-    math_library(lstm_compute DEPS activation_functions)
-endif (NOT WIN32)
+math_library(gru_compute DEPS activation_functions math_function)
+math_library(lstm_compute DEPS activation_functions)
 
 cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
 math_library(math_function DEPS blas)
@@ -58,9 +54,9 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
-if (NOT WIN32)
-    math_library(matrix_bit_code)
-endif (NOT WIN32)
+
+math_library(matrix_bit_code)
+
 math_library(unpooling)
 math_library(vol2col)
 
@@ -76,13 +72,12 @@ if(WITH_GPU)
 endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
-if (NOT WIN32)
-    set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc)
-    set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
-    if(WITH_XBYAK)
-        list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
-        list(APPEND JIT_KERNEL_DEPS xbyak)
-    endif()
-    cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
-    cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
-endif (NOT WIN32)
+
+set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc jit_kernel_layer_norm.cc)
+set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
+if(WITH_XBYAK)
+    list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
+    list(APPEND JIT_KERNEL_DEPS xbyak)
+endif()
+cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
+cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h
index b127fbe8c8..2b3d38d95a 100644
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"
 
diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h
index 07854c8358..c329b8b611 100644
--- a/paddle/fluid/operators/math/matrix_bit_code.h
+++ b/paddle/fluid/operators/math/matrix_bit_code.h
@@ -67,7 +67,7 @@ inline constexpr size_t FindLastSet(size_t x) {
              : (std::is_same<size_t, unsigned long>::value  // NOLINT
                     ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0)
                     : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0));
-
+}
 #else
 // windows don't have built-in clz, ctz function
 template <typename T>
@@ -92,7 +92,6 @@ inline int clz(const T& value) {
 
 inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); }
 #endif  // !_WIN32
-}
 
 struct SimpleCode {
   SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {}
diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc
index 0f31ca1a94..901a92ab5b 100644
--- a/paddle/fluid/operators/reader/create_py_reader_op.cc
+++ b/paddle/fluid/operators/reader/create_py_reader_op.cc
@@ -74,7 +74,7 @@ class CreatePyReaderOpMaker : public FileReaderMakerBase {
              "Name of the `LoDTensorBlockingQueueHolder` variable");
 
     AddComment(R"DOC(
-			Create PyReader to support LoDTensor data feeding in Python side.
+      Create PyReader to support LoDTensor data feeding in Python side.
       )DOC");
   }
 };
diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc
index c57a34c3a7..79f189222e 100644
--- a/paddle/fluid/operators/roi_align_op.cc
+++ b/paddle/fluid/operators/roi_align_op.cc
@@ -35,10 +35,10 @@ class ROIAlignOp : public framework::OperatorWithKernel {
                    "The format of input tensor is NCHW.");
     PADDLE_ENFORCE(rois_dims.size() == 2,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], …].");
+                   "given as [[x1, y1, x2, y2], ...].");
     PADDLE_ENFORCE(rois_dims[1] == 4,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], …].");
+                   "given as [[x1, y1, x2, y2], ...].");
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
     float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
@@ -103,7 +103,7 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor), "
              "ROIs (Regions of Interest) to pool over. "
              "should be a 2-D LoDTensor of shape (num_rois, 4)"
-             "given as [[x1, y1, x2, y2], …]. "
+             "given as [[x1, y1, x2, y2], ...]. "
              "(x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates.");
     AddOutput("Out",
diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc
index 043ea680d1..3f6b2e46c7 100644
--- a/paddle/fluid/operators/roi_pool_op.cc
+++ b/paddle/fluid/operators/roi_pool_op.cc
@@ -40,10 +40,10 @@ class ROIPoolOp : public framework::OperatorWithKernel {
                    "The format of input tensor is NCHW.");
     PADDLE_ENFORCE(rois_dims.size() == 2,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], …].");
+                   "given as [[x1, y1, x2, y2], ...].");
     PADDLE_ENFORCE(rois_dims[1] == kROISize,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], …].");
+                   "given as [[x1, y1, x2, y2], ...].");
 
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
@@ -110,7 +110,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor), "
              "ROIs (Regions of Interest) to pool over. "
              "should be a 2-D LoDTensor of shape (num_rois, 4)"
-             "given as [[x1, y1, x2, y2], …]. "
+             "given as [[x1, y1, x2, y2], ...]. "
              "Where batch_id is the id of the data, "
              "(x1, y1) is the top left coordinates, and "
              "(x2, y2) is the bottom right coordinates.");
diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc
index c047bc78ee..b579244673 100644
--- a/paddle/fluid/operators/space_to_depth_op.cc
+++ b/paddle/fluid/operators/space_to_depth_op.cc
@@ -86,7 +86,7 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker {
         .GreaterThan(1);
     AddComment(R"DOC(
         reorg operator used in Yolo v2.
-        The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize,   
+        The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize,
 
         Reshape Input(X) into the shape according to Attr(blocksize). The
         data in Input(X) are unchanged.
diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt
index 0d0613e1a4..93cb5eb2dc 100644
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -1,4 +1,3 @@
-if (NOT WIN32)
 proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto)
 py_proto_compile(profiler_py_proto SRCS profiler.proto)
 
@@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _
 
 add_dependencies(profiler_py_proto profiler_py_proto_init)
 
+if (NOT WIN32)
 add_custom_command(TARGET profiler_py_proto POST_BUILD
         COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
         COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
         COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+else(NOT WIN32)
+string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/")
+add_custom_command(TARGET profiler_py_proto POST_BUILD
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler
+        COMMAND copy /Y *.py ${proto_dstpath}
+        COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler."
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif(NOT WIN32)
 
 if(WITH_GPU)
@@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context)
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
 
-
-if (NOT WIN32)
 cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
 cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
 cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
-endif(NOT WIN32)
 
 nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
 cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor)
diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc
index 234a04b5c2..f2d691b293 100644
--- a/paddle/fluid/platform/cpu_helper.cc
+++ b/paddle/fluid/platform/cpu_helper.cc
@@ -29,6 +29,13 @@ namespace platform {
 
 void SetNumThreads(int num_threads) {
 #ifdef PADDLE_USE_OPENBLAS
+// windows has no support for openblas multi-thread
+// please refer to: https://github.com/PaddlePaddle/Paddle/issues/7234
+#ifdef _WIN32
+  if (num_threads > 1) {
+    num_threads = 1;
+  }
+#endif
   int real_num_threads = num_threads > 1 ? num_threads : 1;
   openblas_set_num_threads(real_num_threads);
 #elif defined(PADDLE_WITH_MKLML)
diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h
index f59fc40b71..eaf047d474 100644
--- a/paddle/fluid/platform/device_tracer.h
+++ b/paddle/fluid/platform/device_tracer.h
@@ -13,17 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
 
-#if !defined(_WIN32)
-#include <sys/time.h>
-#else
-#include <windows.h>
-#endif  // !_WIN32
-
-#include <time.h>
 #include <chrono>  // NOLINT
 #include <string>
 
 #include "paddle/fluid/platform/dynload/cupti.h"
+#include "paddle/fluid/platform/port.h"
 #include "paddle/fluid/platform/profiler.pb.h"
 
 namespace paddle {
@@ -32,15 +26,11 @@ namespace platform {
 ///////////////////////
 // WARN: Under Development. Don't depend on it yet.
 //////////////////////
-#if !defined(_WIN32)
 inline uint64_t PosixInNsec() {
   struct timeval tv;
   gettimeofday(&tv, nullptr);
   return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
 }
-#else
-inline uint64_t PosixInNsec() { return static_cast<uint64_t>(0); }
-#endif  // !_WIN32
 
 // DeviceTracer performs the following tasks:
 // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h
index 065b940b9c..1a83ac7780 100644
--- a/paddle/fluid/platform/dynload/cudnn.h
+++ b/paddle/fluid/platform/dynload/cudnn.h
@@ -13,8 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL
 #include <glog/logging.h>
 
 #include <cudnn.h>
diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h
index a251bfcd99..a85972bdb7 100644
--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -18,12 +18,6 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif               // __GNUC__
 
-#if defined(_WIN32)
-#define NOMINMAX  // msvc max/min macro conflict with std::min/max
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#define GOOGLE_GLOG_DLL_DECL
-#endif
-
 #ifdef PADDLE_WITH_CUDA
 #include <cublas_v2.h>
 #include <cudnn.h>
@@ -127,14 +121,14 @@ struct EOFException : public std::exception {
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
 #else
 // there is no equivalent intrinsics in msvc.
-#define UNLIKELY(condition) (condition == 0)
+#define UNLIKELY(condition) (condition)
 #endif
 
 #if !defined(_WIN32)
 #define LIKELY(condition) __builtin_expect(static_cast<bool>(condition), 1)
 #else
 // there is no equivalent intrinsics in msvc.
-#define LIKELY(condition) (condition != 0)
+#define LIKELY(condition) (condition)
 #endif
 
 template <typename... Args>
@@ -248,7 +242,6 @@ inline void throw_on_error(T e) {
   throw_on_error(e, "");
 }
 
-#if !defined(_WIN32)
 #define PADDLE_THROW(...)                                              \
   do {                                                                 \
     throw ::paddle::platform::EnforceNotMet(                           \
@@ -272,17 +265,6 @@ inline void throw_on_error(T e) {
 #define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__);
 #endif  // REPLACE_ENFORCE_GLOG
 
-#else  // !_WIN32
-// disable enforce, caused by the varardic macro exception error
-#define PADDLE_THROW(x)                                      \
-  do {                                                       \
-    throw std::make_exception_ptr(                           \
-        std::runtime_error("Windows disable the enforce.")); \
-  } while (false)
-
-#define PADDLE_ENFORCE(x, ...) x
-#endif  // !_WIN32
-
 #define PADDLE_THROW_EOF()                                                     \
   do {                                                                         \
     throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \
@@ -302,20 +284,6 @@ inline void throw_on_error(T e) {
  *    extra messages is also supported, for example:
  *    PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
  */
-#if !defined(_WIN32)
-#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
-#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
-#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
-#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
-#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
-#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
-  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
-
 #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                  \
   do {                                                       \
     if (UNLIKELY(nullptr == (__VAL))) {                      \
@@ -335,27 +303,19 @@ inline void throw_on_error(T e) {
                    paddle::string::Sprintf("" __VA_ARGS__));            \
     }                                                                   \
   } while (0)
-#else
-#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1))
-#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1))
-#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1))
-#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1))
-#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1))
-#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1))
-
-#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
-  do {                                                                 \
-    if (!((__VAL0)__CMP(__VAL1))) {                                    \
-      PADDLE_THROW("Windows disable the enforce. Enforce failed.");    \
-    }                                                                  \
-  } while (0)
-#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...)                       \
-  do {                                                             \
-    if (nullptr == (__VAL1)) {                                     \
-      PADDLE_THROW("Windows disable the enforce. Enforce failed"); \
-    }                                                              \
-  } while (0)
-#endif  // !_WIN32
+
+#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__)
+#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__)
+#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__)
+#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__)
+#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__)
+#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
+  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index e07e9d3825..0ccef6c6a8 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -117,13 +117,6 @@ void InitDevices(bool init_p2p, const std::vector<int> devices) {
   places.emplace_back(platform::CPUPlace());
   platform::DeviceContextPool::Init(places);
 
-// windows has no support for openblas multi-thread
-#ifdef _WIN32
-  if (FLAGS_paddle_num_threads > 1) {
-    FLAGS_paddle_num_threads = 1;
-  }
-#endif
-
 #ifndef PADDLE_WITH_MKLDNN
   platform::SetNumThreads(FLAGS_paddle_num_threads);
 #endif
diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h
index 992ca5e6f6..0e30594672 100644
--- a/paddle/fluid/platform/init.h
+++ b/paddle/fluid/platform/init.h
@@ -16,9 +16,6 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
-#define GLOG_NO_ABBREVIATED_SEVERITIES
-#define GOOGLE_GLOG_DLL_DECL
-
 #include "gflags/gflags.h"
 #include "glog/logging.h"
 
diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h
index 8823e97b0b..ad070171df 100644
--- a/paddle/fluid/platform/port.h
+++ b/paddle/fluid/platform/port.h
@@ -17,6 +17,7 @@
 #include <cstdio>
 #include <stdexcept>
 
+#include <time.h>
 #include <memory>
 #include <string>
 
@@ -27,8 +28,13 @@
 #include <dlfcn.h>     //  dladdr
 #include <execinfo.h>  // backtrace
 #include <sys/stat.h>
+#include <sys/time.h>
 #include <algorithm>  // std::accumulate
 #else
+#define NOMINMAX  // msvc max/min macro conflict with std::min/max
+// solve static linking error in windows
+// https://github.com/google/glog/issues/301
+#define GOOGLE_GLOG_DLL_DECL
 #include <io.h>  // _popen, _pclose
 #include <stdio.h>
 #include <windows.h>
@@ -57,6 +63,25 @@ static void *dlopen(const char *filename, int flag) {
   return reinterpret_cast<void *>(hModule);
 }
 
+static int gettimeofday(struct timeval *tp, void *tzp) {
+  time_t clock;
+  struct tm tm;
+  SYSTEMTIME wtm;
+
+  GetLocalTime(&wtm);
+  tm.tm_year = wtm.wYear - 1900;
+  tm.tm_mon = wtm.wMonth - 1;
+  tm.tm_mday = wtm.wDay;
+  tm.tm_hour = wtm.wHour;
+  tm.tm_min = wtm.wMinute;
+  tm.tm_sec = wtm.wSecond;
+  tm.tm_isdst = -1;
+  clock = mktime(&tm);
+  tp->tv_sec = clock;
+  tp->tv_usec = wtm.wMilliseconds * 1000;
+
+  return (0);
+}
 #endif  // !_WIN32
 
 static void ExecShellCommand(const std::string &cmd, std::string *message) {
@@ -132,10 +157,12 @@ static void MkDir(const char *path) {
     }
   }
 #else
-  CreateDirectory(path, NULL);
-  auto errorno = GetLastError();
-  if (errorno != ERROR_ALREADY_EXISTS) {
-    throw std::runtime_error(path_error);
+  BOOL return_value = CreateDirectory(path, NULL);
+  if (!return_value) {
+    auto errorno = GetLastError();
+    if (errorno != ERROR_ALREADY_EXISTS) {
+      throw std::runtime_error(path_error);
+    }
   }
 #endif  // !_WIN32
 }
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 56bf9e31a3..998242fb4a 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/fluid/platform/port.h"
 
-#include <sys/time.h>
 #include <algorithm>
 #include <iomanip>
 #include <limits>
diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h
index e8eae874af..f5d3490634 100644
--- a/paddle/fluid/platform/profiler.h
+++ b/paddle/fluid/platform/profiler.h
@@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
 
 void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
 
-#if !defined(_WIN32)
 struct RecordEvent {
   // dev_ctx can be set to nullptr if device is cpu.
   RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
@@ -106,15 +105,6 @@ struct RecordBlock {
   std::string name_;
   uint64_t start_ns_;
 };
-#else
-// windows do not support profiler temporarily.
-struct RecordEvent {
-  RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {}
-};
-struct RecordBlock {
-  explicit RecordBlock(int block_id) {}
-};
-#endif
 
 // Return the event list of all threads. Assumed the returned value calls
 // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h
index 0e88a439cf..11c68f3449 100644
--- a/paddle/fluid/platform/stream_callback_manager.h
+++ b/paddle/fluid/platform/stream_callback_manager.h
@@ -45,16 +45,15 @@ class StreamCallbackManager {
   inline void AddCallback(Callback &&callback) const {
     auto *stream_callback_context =
         new StreamCallbackContext(this, std::forward<Callback>(callback));
-    PADDLE_ENFORCE(
 #if CUDA_VERSION >= 10000
-        cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc,
-                           stream_callback_context)
+    PADDLE_ENFORCE(cudaLaunchHostFunc(stream_,
+                                      StreamCallbackManager::StreamCallbackFunc,
+                                      stream_callback_context));  // NOLINT
 #else
-        cudaStreamAddCallback(stream_,
-                              StreamCallbackManager::StreamCallbackFunc,
-                              stream_callback_context, 0)
+    PADDLE_ENFORCE(cudaStreamAddCallback(
+        stream_, StreamCallbackManager::StreamCallbackFunc,
+        stream_callback_context, 0));  // NOLINT
 #endif
-            );  // NOLINT
   }
 
   void Wait() const { thread_pool_.reset(new ThreadPool(1)); }
diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt
index 6417da077e..fb6ee2f4a5 100644
--- a/paddle/fluid/pybind/CMakeLists.txt
+++ b/paddle/fluid/pybind/CMakeLists.txt
@@ -1,10 +1,6 @@
 
-set(PYBIND_DEPS pybind python proto_desc memory executor prune  feed_fetch_method pass_builder)
-set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc)
-if(NOT WIN32)
-  list(APPEND PYBIND_DEPS parallel_executor profiler)
-  list(APPEND PYBIND_SRCS recordio.cc)
-endif(NOT WIN32)
+set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler)
+set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc)
 if(WITH_PYTHON)
   if(WITH_AMD_GPU)
     hip_library(paddle_pybind SHARED
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 5ef5bf4d6c..6cc3a1739a 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -21,13 +21,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#if defined(_WIN32)
-#define NOMINMAX
-#define GLOG_NO_ABBREVIATED_SEVERITIES  // msvc conflict logging with windows.h
-#define GOOGLE_GLOG_DLL_DECL
-#include <Windows.h>
-#endif
-
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/feed_fetch_method.h"
 #include "paddle/fluid/framework/framework.pb.h"
@@ -36,9 +29,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
-#ifndef _WIN32
 #include "paddle/fluid/framework/parallel_executor.h"
-#endif
 #include "paddle/fluid/framework/prune.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/selected_rows.h"
@@ -359,22 +350,16 @@ All parameter, weight, gradient are variables in Paddle.
              return self.GetMutable<platform::Communicator>();
            },
            py::return_value_policy::reference)
-
 #endif
-#ifndef _WIN32
       .def("get_reader",
            [](Variable &self) -> framework::ReaderHolder * {
              PADDLE_ENFORCE(self.IsType<framework::ReaderHolder>());
              return self.GetMutable<framework::ReaderHolder>();
            },
-           py::return_value_policy::reference)
-#endif
-      ;  // NOLINT
+           py::return_value_policy::reference);
 
-#if !defined(_WIN32)
   py::class_<framework::ReaderHolder>(m, "Reader", "")
       .def("reset", &framework::ReaderHolder::ResetAll);
-#endif
 
   using LoDTensorBlockingQueue =
       ::paddle::operators::reader::LoDTensorBlockingQueue;
@@ -643,7 +628,6 @@ All parameter, weight, gradient are variables in Paddle.
 #endif
 #endif
 
-#ifndef _WIN32
   py::enum_<platform::ProfilerState>(m, "ProfilerState", py::arithmetic())
       .value("kDisabled", platform::ProfilerState::kDisabled)
       .value("kCPU", platform::ProfilerState::kCPU)
@@ -664,7 +648,6 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("is_profiler_enabled", platform::IsProfileEnabled);
   m.def("reset_profiler", platform::ResetProfiler);
-#endif
 
   py::class_<ir::Pass, std::shared_ptr<ir::Pass>> pass(m, "Pass");
   pass.def(py::init())
@@ -693,7 +676,6 @@ All parameter, weight, gradient are variables in Paddle.
       .def("remove_pass",
            [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); });
 
-#ifndef _WIN32
   // -- python binds for parallel executor.
   py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
   py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
@@ -921,7 +903,6 @@ All parameter, weight, gradient are variables in Paddle.
       });
 
   BindRecordIOWriter(&m);
-#endif
   return m.ptr();
 }
 }  // namespace pybind
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index f2f49f813a..543acf2d34 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -115,9 +115,8 @@ def __bootstrap__():
         'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope',
         'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb',
         'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb',
-        'allocator_strategy', 'reader_queue_speed_test_mode',
-        'print_sub_graph_dir'
+        "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy',
+        'reader_queue_speed_test_mode', 'print_sub_graph_dir'
     ]
     if os.name != 'nt':
         read_env_flags.append('warpctc_dir')
diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py
index b966ae01d0..b8d5f4ffea 100644
--- a/python/paddle/fluid/contrib/inferencer.py
+++ b/python/paddle/fluid/contrib/inferencer.py
@@ -15,15 +15,13 @@
 from __future__ import print_function
 
 import contextlib
-import os
 
 from .. import core
 
 from .. import executor
 from .. import framework
 from .. import io
-if os.name != 'nt':
-    from .. import parallel_executor
+from .. import parallel_executor
 from .. import unique_name
 from .trainer import check_and_get_place
 
diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py
index 096821a5ba..8569e486f9 100644
--- a/python/paddle/fluid/contrib/trainer.py
+++ b/python/paddle/fluid/contrib/trainer.py
@@ -28,8 +28,7 @@ from .. import framework
 from .. import io
 # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module
 from .. import optimizer as opt_module
-if os.name != 'nt':
-    from .. import parallel_executor
+from .. import parallel_executor
 from ..transpiler import distribute_transpiler
 
 __all__ = [
diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py
index a9075045a2..3f47053961 100644
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
@@ -347,72 +347,70 @@ def _copy_reader_create_op_(block, op):
     return new_op
 
 
-if os.name != 'nt':
-
-    @templatedoc(op_type='create_recordio_file_reader')
-    def open_recordio_file(filename,
-                           shapes,
-                           lod_levels,
-                           dtypes,
-                           pass_num=1,
-                           for_parallel=True):
-        """
-        ${comment}
-
-        Args:
-           filename(${filename_type}): ${filename_comment}.
-           shapes(list): List of tuples which declaring data shapes.
-           lod_levels(${lod_levels_type}): ${lod_levels_comment}.
-           dtypes(list): List of strs which declaring data type.
-           pass_num(int): Number of passes to run.
-           for_parallel(Bool): Set it as True if you are going to run
-                subsequent operators in parallel.
-
-        Returns:
-           ${out_comment}.
-
-        Examples:
-
-            >>> import paddle.fluid as fluid
-            >>> reader = fluid.layers.io.open_recordio_file(
-            >>>                               filename='./data.recordio',
-            >>>                               shapes=[(3,224,224), (1)],
-            >>>                               lod_levels=[0, 0],
-            >>>                               dtypes=['float32', 'int64'])
-            >>> # Via the reader, we can use 'read_file' layer to get data:
-            >>> image, label = fluid.layers.io.read_file(reader)
-        """
-        dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
-        shape_concat = []
-        ranks = []
+@templatedoc(op_type='create_recordio_file_reader')
+def open_recordio_file(filename,
+                       shapes,
+                       lod_levels,
+                       dtypes,
+                       pass_num=1,
+                       for_parallel=True):
+    """
+    ${comment}
 
-        for shape in shapes:
-            shape_concat.extend(shape)
-            ranks.append(len(shape))
+    Args:
+       filename(${filename_type}): ${filename_comment}.
+       shapes(list): List of tuples which declaring data shapes.
+       lod_levels(${lod_levels_type}): ${lod_levels_comment}.
+       dtypes(list): List of strs which declaring data type.
+       pass_num(int): Number of passes to run.
+       for_parallel(Bool): Set it as True if you are going to run
+            subsequent operators in parallel.
 
-        var_name = unique_name('open_recordio_file')
+    Returns:
+       ${out_comment}.
 
-        startup_blk = default_startup_program().current_block()
-        startup_var = startup_blk.create_var(name=var_name)
-        startup_blk.append_op(
-            type='create_recordio_file_reader',
-            outputs={'Out': [startup_var]},
-            attrs={
-                'shape_concat': shape_concat,
-                'lod_levels': lod_levels,
-                'filename': filename,
-                'ranks': ranks
-            })
+    Examples:
 
-        startup_var.desc.set_dtypes(dtypes)
-        startup_var.persistable = True
-        main_prog_var = _copy_reader_var_(
-            default_main_program().current_block(), startup_var)
+        >>> import paddle.fluid as fluid
+        >>> reader = fluid.layers.io.open_recordio_file(
+        >>>                               filename='./data.recordio',
+        >>>                               shapes=[(3,224,224), (1)],
+        >>>                               lod_levels=[0, 0],
+        >>>                               dtypes=['float32', 'int64'])
+        >>> # Via the reader, we can use 'read_file' layer to get data:
+        >>> image, label = fluid.layers.io.read_file(reader)
+    """
+    dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes]
+    shape_concat = []
+    ranks = []
 
-        if pass_num > 1:
-            main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
+    for shape in shapes:
+        shape_concat.extend(shape)
+        ranks.append(len(shape))
+
+    var_name = unique_name('open_recordio_file')
+
+    startup_blk = default_startup_program().current_block()
+    startup_var = startup_blk.create_var(name=var_name)
+    startup_blk.append_op(
+        type='create_recordio_file_reader',
+        outputs={'Out': [startup_var]},
+        attrs={
+            'shape_concat': shape_concat,
+            'lod_levels': lod_levels,
+            'filename': filename,
+            'ranks': ranks
+        })
 
-        return monkey_patch_reader_methods(main_prog_var)
+    startup_var.desc.set_dtypes(dtypes)
+    startup_var.persistable = True
+    main_prog_var = _copy_reader_var_(default_main_program().current_block(),
+                                      startup_var)
+
+    if pass_num > 1:
+        main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num)
+
+    return monkey_patch_reader_methods(main_prog_var)
 
 
 def random_data_generator(low, high, shapes, lod_levels, for_parallel=True):
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index 7b0a3e2c82..e0cc09a4c7 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -343,128 +343,126 @@ def embedding(input,
     return tmp
 
 
-if os.name != 'nt':
+@templatedoc(op_type="lstm")
+def dynamic_lstm(input,
+                 size,
+                 h_0=None,
+                 c_0=None,
+                 param_attr=None,
+                 bias_attr=None,
+                 use_peepholes=True,
+                 is_reverse=False,
+                 gate_activation='sigmoid',
+                 cell_activation='tanh',
+                 candidate_activation='tanh',
+                 dtype='float32',
+                 name=None):
+    """
+    ${comment}
 
-    @templatedoc(op_type="lstm")
-    def dynamic_lstm(input,
-                     size,
-                     h_0=None,
-                     c_0=None,
-                     param_attr=None,
-                     bias_attr=None,
-                     use_peepholes=True,
-                     is_reverse=False,
-                     gate_activation='sigmoid',
-                     cell_activation='tanh',
-                     candidate_activation='tanh',
-                     dtype='float32',
-                     name=None):
-        """
-        ${comment}
-
-        Args:
-            input (Variable): ${input_comment}
-            size (int): 4 * hidden size.
-            h_0(Variable): The initial hidden state is an optional input, default is zero.
-                           This is a tensor with shape (N x D), where N is the
-                           batch size and D is the hidden size.
-            c_0(Variable): The initial cell state is an optional input, default is zero.
-                           This is a tensor with shape (N x D), where N is the
-                           batch size. `h_0` and `c_0` can be NULL but only at the same time.
-            param_attr(ParamAttr|None): The parameter attribute for the learnable
-                                   hidden-hidden weights.
-
-                                   - Weights = {:math:`W_{ch}, W_{ih}, \
-                                                    W_{fh}, W_{oh}`}
-                                   - The shape is (D x 4D), where D is the hidden
-                                     size.
-
-                                   If it is set to None or one attribute of ParamAttr,
-                                   dynamic_lstm will create ParamAttr as param_attr.
-                                   If the Initializer of the param_attr is not set, the
-                                   parameter is initialized with Xavier. Default: None.
-            bias_attr (ParamAttr|None): The bias attribute for the learnable bias
-                                  weights, which contains two parts, input-hidden
-                                  bias weights and peephole connections weights if
-                                  setting `use_peepholes` to `True`.
-
-                                  1. `use_peepholes = False`
-                                     - Biases = {:math:`b_c, b_i, b_f, b_o`}.
-                                     - The shape is (1 x 4D).
-                                  2. `use_peepholes = True`
-                                     - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
-                                                     W_{fc}, W_{oc}`}.
-                                     - The shape is (1 x 7D).
-
-                                  If it is set to None or one attribute of ParamAttr,
-                                  dynamic_lstm will create ParamAttr as bias_attr.
-                                  If the Initializer of the bias_attr is not set,
-                                  the bias is initialized zero. Default: None.
-            use_peepholes (bool): ${use_peepholes_comment}
-            is_reverse (bool): ${is_reverse_comment}
-            gate_activation (str): ${gate_activation_comment}
-            cell_activation (str): ${cell_activation_comment}
-            candidate_activation (str): ${candidate_activation_comment}
-            dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
-            name (str|None): A name for this layer(optional). If set None, the layer
-                             will be named automatically.
-
-        Returns:
-            tuple: The hidden state, and cell state of LSTM. The shape of both \
-            is (T x D), and lod is the same with the `input`.
-
-        Examples:
-            .. code-block:: python
-
-                hidden_dim = 512
-                forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
-                                               bias_attr=False)
-                forward, _ = fluid.layers.dynamic_lstm(
-                    input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
-        """
-        assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
-        helper = LayerHelper('lstm', **locals())
-        size = size // 4
-        weight = helper.create_parameter(
-            attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
-        bias_size = [1, 7 * size]
-        if not use_peepholes:
-            bias_size[1] = 4 * size
-        bias = helper.create_parameter(
-            attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+    Args:
+        input (Variable): ${input_comment}
+        size (int): 4 * hidden size.
+        h_0(Variable): The initial hidden state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size and D is the hidden size.
+        c_0(Variable): The initial cell state is an optional input, default is zero.
+                       This is a tensor with shape (N x D), where N is the
+                       batch size. `h_0` and `c_0` can be NULL but only at the same time.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+                               hidden-hidden weights.
 
-        hidden = helper.create_variable_for_type_inference(dtype)
-        cell = helper.create_variable_for_type_inference(dtype)
-        batch_gate = helper.create_variable_for_type_inference(dtype)
-        batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
-        inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
-        batch_size = input.shape[0]
-        if h_0:
-            assert h_0.shape == (batch_size, size), \
-                'The shape of h0 should be (batch_size, %d)' % size
-            inputs['H0'] = h_0
-        if c_0:
-            assert c_0.shape == (batch_size, size), \
-                'The shape of c0 should be (batch_size, %d)' % size
-            inputs['C0'] = c_0
+                               - Weights = {:math:`W_{ch}, W_{ih}, \
+                                                W_{fh}, W_{oh}`}
+                               - The shape is (D x 4D), where D is the hidden
+                                 size.
 
-        helper.append_op(
-            type='lstm',
-            inputs=inputs,
-            outputs={
-                'Hidden': hidden,
-                'Cell': cell,
-                'BatchGate': batch_gate,
-                'BatchCellPreAct': batch_cell_pre_act
-            },
-            attrs={
-                'use_peepholes': use_peepholes,
-                'is_reverse': is_reverse,
-                'gate_activation': gate_activation,
-                'cell_activation': cell_activation,
-                'candidate_activation': candidate_activation
-            })
-        return hidden, cell
+                               If it is set to None or one attribute of ParamAttr,
+                               dynamic_lstm will create ParamAttr as param_attr.
+                               If the Initializer of the param_attr is not set, the
+                               parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr|None): The bias attribute for the learnable bias
+                              weights, which contains two parts, input-hidden
+                              bias weights and peephole connections weights if
+                              setting `use_peepholes` to `True`.
+
+                              1. `use_peepholes = False`
+                                 - Biases = {:math:`b_c, b_i, b_f, b_o`}.
+                                 - The shape is (1 x 4D).
+                              2. `use_peepholes = True`
+                                 - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \
+                                                 W_{fc}, W_{oc}`}.
+                                 - The shape is (1 x 7D).
+
+                              If it is set to None or one attribute of ParamAttr,
+                              dynamic_lstm will create ParamAttr as bias_attr.
+                              If the Initializer of the bias_attr is not set,
+                              the bias is initialized zero. Default: None.
+        use_peepholes (bool): ${use_peepholes_comment}
+        is_reverse (bool): ${is_reverse_comment}
+        gate_activation (str): ${gate_activation_comment}
+        cell_activation (str): ${cell_activation_comment}
+        candidate_activation (str): ${candidate_activation_comment}
+        dtype (str): Data type. Choices = ["float32", "float64"], default "float32".
+        name (str|None): A name for this layer(optional). If set None, the layer
+                         will be named automatically.
+
+    Returns:
+        tuple: The hidden state, and cell state of LSTM. The shape of both \
+        is (T x D), and lod is the same with the `input`.
+
+    Examples:
+        .. code-block:: python
+
+            hidden_dim = 512
+            forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4,
+                                           bias_attr=False)
+            forward, _ = fluid.layers.dynamic_lstm(
+                input=forward_proj, size=hidden_dim * 4, use_peepholes=False)
+    """
+    assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp."
+    helper = LayerHelper('lstm', **locals())
+    size = size // 4
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype)
+    bias_size = [1, 7 * size]
+    if not use_peepholes:
+        bias_size[1] = 4 * size
+    bias = helper.create_parameter(
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+
+    hidden = helper.create_variable_for_type_inference(dtype)
+    cell = helper.create_variable_for_type_inference(dtype)
+    batch_gate = helper.create_variable_for_type_inference(dtype)
+    batch_cell_pre_act = helper.create_variable_for_type_inference(dtype)
+    inputs = {'Input': input, 'Weight': weight, 'Bias': bias}
+    batch_size = input.shape[0]
+    if h_0:
+        assert h_0.shape == (batch_size, size), \
+            'The shape of h0 should be (batch_size, %d)' % size
+        inputs['H0'] = h_0
+    if c_0:
+        assert c_0.shape == (batch_size, size), \
+            'The shape of c0 should be (batch_size, %d)' % size
+        inputs['C0'] = c_0
+
+    helper.append_op(
+        type='lstm',
+        inputs=inputs,
+        outputs={
+            'Hidden': hidden,
+            'Cell': cell,
+            'BatchGate': batch_gate,
+            'BatchCellPreAct': batch_cell_pre_act
+        },
+        attrs={
+            'use_peepholes': use_peepholes,
+            'is_reverse': is_reverse,
+            'gate_activation': gate_activation,
+            'cell_activation': cell_activation,
+            'candidate_activation': candidate_activation
+        })
+    return hidden, cell
 
 
 def dynamic_lstmp(input,
@@ -963,43 +961,39 @@ def linear_chain_crf(input, label, param_attr=None):
     return log_likelihood
 
 
-if os.name != 'nt':
-
-    @templatedoc()
-    def crf_decoding(input, param_attr, label=None):
-        """
-        ${comment}
+@templatedoc()
+def crf_decoding(input, param_attr, label=None):
+    """
+    ${comment}
 
-        Args:
-            input(${emission_type}): ${emission_comment}
+    Args:
+        input(${emission_type}): ${emission_comment}
 
-            param_attr(ParamAttr): The parameter attribute for training.
+        param_attr(ParamAttr): The parameter attribute for training.
 
-            label(${label_type}): ${label_comment}
+        label(${label_type}): ${label_comment}
 
-        Returns:
-            Variable: ${viterbi_path_comment}
+    Returns:
+        Variable: ${viterbi_path_comment}
 
-        Examples:
-            .. code-block:: python
+    Examples:
+        .. code-block:: python
 
-               crf_decode = layers.crf_decoding(
-                    input=hidden, param_attr=ParamAttr(name="crfw"))
-        """
-        helper = LayerHelper('crf_decoding', **locals())
-        transition = helper.get_parameter(param_attr.name)
-        viterbi_path = helper.create_variable_for_type_inference(
-            dtype=helper.input_dtype())
-        helper.append_op(
-            type='crf_decoding',
-            inputs={
-                "Emission": [input],
+           crf_decode = layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
+    """
+    helper = LayerHelper('crf_decoding', **locals())
+    transition = helper.get_parameter(param_attr.name)
+    viterbi_path = helper.create_variable_for_type_inference(
+        dtype=helper.input_dtype())
+    helper.append_op(
+        type='crf_decoding',
+        inputs={"Emission": [input],
                 "Transition": transition,
-                "Label": label
-            },
-            outputs={"ViterbiPath": [viterbi_path]})
+                "Label": label},
+        outputs={"ViterbiPath": [viterbi_path]})
 
-        return viterbi_path
+    return viterbi_path
 
 
 @templatedoc()
@@ -5593,48 +5587,42 @@ def label_smooth(label,
     return smooth_label
 
 
-if os.name != 'nt':
-
-    @templatedoc()
-    def roi_pool(input,
-                 rois,
-                 pooled_height=1,
-                 pooled_width=1,
-                 spatial_scale=1.0):
-        """
-        ${comment}
-
-        Args:
-            input (Variable): ${x_comment}
-            rois (Variable): ROIs (Regions of Interest) to pool over.
-            pooled_height (integer): ${pooled_height_comment} Default: 1
-            pooled_width (integer): ${pooled_width_comment} Default: 1
-            spatial_scale (float): ${spatial_scale_comment} Default: 1.0
-
-        Returns:
-            Variable: ${out_comment}.
-
-        Examples:
-            .. code-block:: python
-
-                pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
-        """
-        helper = LayerHelper('roi_pool', **locals())
-        dtype = helper.input_dtype()
-        pool_out = helper.create_variable_for_type_inference(dtype)
-        argmaxes = helper.create_variable_for_type_inference(dtype='int32')
-        helper.append_op(
-            type="roi_pool",
-            inputs={"X": input,
-                    "ROIs": rois},
-            outputs={"Out": pool_out,
-                     "Argmax": argmaxes},
-            attrs={
-                "pooled_height": pooled_height,
-                "pooled_width": pooled_width,
-                "spatial_scale": spatial_scale
-            })
-        return pool_out
+@templatedoc()
+def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0):
+    """
+    ${comment}
+
+    Args:
+        input (Variable): ${x_comment}
+        rois (Variable): ROIs (Regions of Interest) to pool over.
+        pooled_height (integer): ${pooled_height_comment} Default: 1
+        pooled_width (integer): ${pooled_width_comment} Default: 1
+        spatial_scale (float): ${spatial_scale_comment} Default: 1.0
+
+    Returns:
+        Variable: ${out_comment}.
+
+    Examples:
+        .. code-block:: python
+
+            pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0)
+    """
+    helper = LayerHelper('roi_pool', **locals())
+    dtype = helper.input_dtype()
+    pool_out = helper.create_variable_for_type_inference(dtype)
+    argmaxes = helper.create_variable_for_type_inference(dtype='int32')
+    helper.append_op(
+        type="roi_pool",
+        inputs={"X": input,
+                "ROIs": rois},
+        outputs={"Out": pool_out,
+                 "Argmax": argmaxes},
+        attrs={
+            "pooled_height": pooled_height,
+            "pooled_width": pooled_width,
+            "spatial_scale": spatial_scale
+        })
+    return pool_out
 
 
 @templatedoc()
diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py
index 66eb1229aa..6c18af7283 100644
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
@@ -100,26 +100,27 @@ Examples:
     >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3)
 """
 
-if os.name != 'nt':
-    __all__ += ['cumsum']
-
-    _cum_sum_ = generate_layer_fn('cumsum')
-
-    def cumsum(x, axis=None, exclusive=None, reverse=None):
-        locals_var = locals().keys()
-        kwargs = dict()
-        for name in locals_var:
-            val = locals()[name]
-            if val is not None:
-                kwargs[name] = val
-        return _cum_sum_(**kwargs)
-
-    cumsum.__doc__ = _cum_sum_.__doc__ + """
-    Examples:
-    
-        >>> data = fluid.layers.data(name="input", shape=[32, 784])
-        >>> result = fluid.layers.cumsum(data, axis=0)
-    """
+__all__ += ['cumsum']
+
+_cum_sum_ = generate_layer_fn('cumsum')
+
+
+def cumsum(x, axis=None, exclusive=None, reverse=None):
+    locals_var = locals().keys()
+    kwargs = dict()
+    for name in locals_var:
+        val = locals()[name]
+        if val is not None:
+            kwargs[name] = val
+    return _cum_sum_(**kwargs)
+
+
+cumsum.__doc__ = _cum_sum_.__doc__ + """
+Examples:
+
+    >>> data = fluid.layers.data(name="input", shape=[32, 784])
+    >>> result = fluid.layers.cumsum(data, axis=0)
+"""
 
 __all__ += ['thresholded_relu']
 

From ae7d22862be83c5ca5ed2d820a11fd8ab523766d Mon Sep 17 00:00:00 2001
From: Dun <randonlang@gmail.com>
Date: Thu, 22 Nov 2018 15:42:28 +0800
Subject: [PATCH 13/13] Group Norm (#13843)

Add group normalization operator.
---
 AUTHORS.md                                    |   1 +
 paddle/fluid/API.spec                         |   1 +
 paddle/fluid/operators/group_norm_op.cc       | 162 ++++++++++
 paddle/fluid/operators/group_norm_op.cu       | 292 ++++++++++++++++++
 paddle/fluid/operators/group_norm_op.h        | 197 ++++++++++++
 python/paddle/fluid/layers/nn.py              |  79 +++++
 .../paddle/fluid/tests/unittests/op_test.py   |  10 +-
 .../tests/unittests/test_group_norm_op.py     | 143 +++++++++
 8 files changed, 880 insertions(+), 5 deletions(-)
 create mode 100644 paddle/fluid/operators/group_norm_op.cc
 create mode 100644 paddle/fluid/operators/group_norm_op.cu
 create mode 100644 paddle/fluid/operators/group_norm_op.h
 create mode 100644 python/paddle/fluid/tests/unittests/test_group_norm_op.py

diff --git a/AUTHORS.md b/AUTHORS.md
index 54a1097b50..deafa64120 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -25,6 +25,7 @@
 | kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
+| cjld | Dun Liang |
 | lipeng-unisound | Peng Li |
 | liuyuan | Yuan Liu |
 | livc | Zhao Li |
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index da8941c351..541c4db1fa 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -103,6 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
+paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None))
 paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
diff --git a/paddle/fluid/operators/group_norm_op.cc b/paddle/fluid/operators/group_norm_op.cc
new file mode 100644
index 0000000000..6322659b67
--- /dev/null
+++ b/paddle/fluid/operators/group_norm_op.cc
@@ -0,0 +1,162 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/group_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+class GroupNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
+                   "Output(Mean) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
+                   "Output(Variance) of GroupNormOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    auto channel_num = x_dim[1];
+    auto batch_size = x_dim[0];
+    auto groups = ctx->Attrs().Get<int>("groups");
+    PADDLE_ENFORCE_LE(
+        groups, channel_num,
+        "'groups' must be less equal than the number of channels.");
+    PADDLE_ENFORCE_GE(groups, 1, "'groups' must be greater equal than 1.");
+
+    if (ctx->HasInput("Scale")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], channel_num);
+    }
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], channel_num);
+    }
+
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("Mean", {batch_size, groups});
+    ctx->SetOutputDim("Variance", {batch_size, groups});
+    ctx->ShareLoD("X", "Y");
+  }
+};
+
+class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "The input tensor.");
+    AddInput("Scale",
+             "Scale is a 1-dimensional tensor of size C"
+             "that is applied to the output.")
+        .AsDispensable();
+    AddInput("Bias",
+             "Bias is a 1-dimensional tensor of size C "
+             "that is applied to the output")
+        .AsDispensable();
+    AddOutput("Y", "Result after normalization.");
+    AddOutput("Mean", "Mean of each group.").AsIntermediate();
+    AddOutput("Variance", "Variance of each group.").AsIntermediate();
+
+    AddAttr<float>("epsilon",
+                   "Constant for numerical stability [default 1e-5].")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 1.0f,
+                         "'epsilon' should be between 0.0 and 1.0.");
+        });
+    AddAttr<int>("groups", "The number of groups that divided from channels.")
+        .AddCustomChecker([](const int &groups) {
+          PADDLE_ENFORCE_GT(groups, 0, "'groups' should be greater than zero.");
+        });
+
+    AddComment(R"DOC(
+Group Normalization
+
+Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
+)DOC");
+  }
+};
+
+class GroupNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // check input
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"),
+                   "Input(Mean) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"),
+                   "Input(Variance) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) of GroupNormOp should not be null.");
+
+    // check output
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
+    }
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+    if (var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *t = nullptr;
+    if (var->IsType<Tensor>()) {
+      t = &var->Get<Tensor>();
+    } else if (var->IsType<LoDTensor>()) {
+      t = &var->Get<LoDTensor>();
+    }
+    if (t == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    return framework::OpKernelType(framework::ToDataType(t->type()),
+                                   ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);
+REGISTER_OP_CPU_KERNEL(
+    group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);
+REGISTER_OP_CPU_KERNEL(
+    group_norm_grad,
+    ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu
new file mode 100644
index 0000000000..2717463022
--- /dev/null
+++ b/paddle/fluid/operators/group_norm_op.cu
@@ -0,0 +1,292 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cub/cub.cuh>
+#include "paddle/fluid/operators/group_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+__global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C,
+                                              int imsize, int groups,
+                                              int group_size, T* mean, T* var) {
+  int gid = blockIdx.y;
+  int cid = blockIdx.x;
+  int bid = blockIdx.z;
+  int number = min(group_size, static_cast<int>(C - gid * group_size));
+  int ccid = gid * group_size + cid;
+  if (ccid >= C) return;
+  T x_mean = 0, x_var = 0;
+  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
+    T val = x[(bid * C + ccid) * imsize + imid];
+    x_mean += val;
+    x_var += val * val;
+  }
+  x_mean /= number * imsize;
+  x_var /= number * imsize;
+  __shared__ T s_mem[2];
+  if (threadIdx.x == 0) {
+    s_mem[0] = s_mem[1] = 0;
+  }
+  __syncthreads();
+  paddle::platform::CudaAtomicAdd(&s_mem[0], x_mean);
+  paddle::platform::CudaAtomicAdd(&s_mem[1], x_var);
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    paddle::platform::CudaAtomicAdd(&mean[bid * groups + gid], s_mem[0]);
+    paddle::platform::CudaAtomicAdd(&var[bid * groups + gid], s_mem[1]);
+  }
+}
+
+template <typename T>
+__global__ void GroupNormForward(const T* x, const T* mean, const T* var,
+                                 const T* scale, const T* bias, int N, int C,
+                                 int imsize, int groups, int group_size,
+                                 T epsilon, T* y, T* real_var) {
+  int gid = blockIdx.y;
+  int cid = blockIdx.x;
+  int bid = blockIdx.z;
+  int ccid = gid * group_size + cid;
+  if (ccid >= C) return;
+  T x_mean = mean[bid * groups + gid];
+  T x_var = var[bid * groups + gid];
+  x_var = x_var - x_mean * x_mean;
+  T var_inv = 1.0 / sqrt(x_var + epsilon);
+  if (cid == 0 && threadIdx.x == 0) real_var[bid * groups + gid] = x_var;
+  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
+    T val = x[(bid * C + ccid) * imsize + imid];
+    val = (val - x_mean) * var_inv;
+    if (scale) val *= scale[gid * group_size + cid];
+    if (bias) val += bias[gid * group_size + cid];
+    y[(bid * C + ccid) * imsize + imid] = val;
+  }
+}
+
+template <typename T>
+class GroupNormKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* x = ctx.Input<Tensor>("X");
+
+    auto* y = ctx.Output<Tensor>("Y");
+    auto* mean = ctx.Output<Tensor>("Mean");
+    auto* var = ctx.Output<Tensor>("Variance");
+    const auto groups = ctx.Attr<int>("groups");
+
+    const auto x_dims = x->dims();
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+
+    y->mutable_data<T>(ctx.GetPlace());
+    mean->mutable_data<T>(ctx.GetPlace());
+    var->mutable_data<T>(ctx.GetPlace());
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    Tensor temp_var;
+    temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
+
+    set_zero(dev_ctx, mean, static_cast<T>(0));
+    set_zero(dev_ctx, &temp_var, static_cast<T>(0));
+
+    auto* x_data = x->data<T>();
+    auto* y_data = y->data<T>();
+    auto* mean_data = mean->data<T>();
+    auto* var_data = var->data<T>();
+    auto* temp_var_data = temp_var.data<T>();
+
+    const T* scale_data = nullptr;
+    if (scale) scale_data = scale->data<T>();
+    const T* bias_data = nullptr;
+    if (bias) bias_data = bias->data<T>();
+
+    int imsize = x_dims[2] * x_dims[3];
+    int block_size = std::min(512, imsize);
+    dim3 grid(group_size, groups, x_dims[0]);
+    dim3 threads(block_size, 1, 1);
+    GroupNormForwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        x_data, x_dims[0], x_dims[1], imsize, groups, group_size, mean_data,
+        temp_var_data);
+    GroupNormForward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        x_data, mean_data, temp_var_data, scale_data, bias_data, x_dims[0],
+        x_dims[1], imsize, groups, group_size, epsilon, y_data, var_data);
+  }
+};
+
+template <typename T>
+__global__ void GroupNormBackwardGetMeanAndVar(
+    const T* x, const T* mean, const T* var, const T* scale, const T* d_y,
+    int N, int C, int imsize, int groups, int group_size, T epsilon, T* d_x,
+    T* d_mean, T* d_var, T* d_scale, T* d_bias) {
+  int gid = blockIdx.y;
+  int cid = blockIdx.x;
+  int bid = blockIdx.z;
+  int number = min(group_size, static_cast<int>(C - gid * group_size));
+  int ccid = gid * group_size + cid;
+  if (ccid >= C) return;
+  T x_mean = mean[bid * groups + gid];
+  T x_var = var[bid * groups + gid];
+  T var_inv = 1.0 / sqrt(x_var + epsilon);
+  T d_var_inv = 0, d_x_mean = 0;
+  T d_mean_data = 0, d_var_data = 0, d_scale_data = 0, d_bias_data = 0;
+
+  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
+    T tmp = x[(bid * C + ccid) * imsize + imid];
+    T val = (tmp - x_mean) * var_inv;
+    T dval = d_y[(bid * C + ccid) * imsize + imid];
+    if (d_bias) d_bias_data += dval;
+    if (d_scale) d_scale_data += val * dval;
+    if (scale) dval = dval * scale[ccid];
+    d_var_data += (tmp - x_mean) * dval;
+    T d_tmp = dval * var_inv;
+    if (d_x) d_x[(bid * C + ccid) * imsize + imid] = d_tmp;
+    d_mean_data -= d_tmp;
+  }
+
+  __shared__ T s_mem[4];
+  if (threadIdx.x == 0) {
+    s_mem[0] = s_mem[1] = 0;
+    if (d_scale) s_mem[2] = 0;
+    if (d_bias) s_mem[3] = 0;
+  }
+  __syncthreads();
+  paddle::platform::CudaAtomicAdd(&s_mem[0], d_mean_data);
+  paddle::platform::CudaAtomicAdd(&s_mem[1], d_var_data);
+  if (d_scale) paddle::platform::CudaAtomicAdd(&s_mem[2], d_scale_data);
+  if (d_bias) paddle::platform::CudaAtomicAdd(&s_mem[3], d_bias_data);
+  __syncthreads();
+  if (threadIdx.x == 0) {
+    paddle::platform::CudaAtomicAdd(&d_mean[bid * groups + gid], s_mem[0]);
+    paddle::platform::CudaAtomicAdd(&d_var[bid * groups + gid], s_mem[1]);
+    if (d_scale) paddle::platform::CudaAtomicAdd(&d_scale[ccid], s_mem[2]);
+    if (d_bias) paddle::platform::CudaAtomicAdd(&d_bias[ccid], s_mem[3]);
+  }
+}
+
+template <typename T>
+__global__ void GroupNormBackward(const T* x, const T* mean, const T* var,
+                                  const T* d_mean, const T* d_var, int N, int C,
+                                  int imsize, int groups, int group_size,
+                                  T epsilon, T* d_x) {
+  int gid = blockIdx.y;
+  int cid = blockIdx.x;
+  int bid = blockIdx.z;
+  int number = min(group_size, static_cast<int>(C - gid * group_size));
+  int ccid = gid * group_size + cid;
+  if (ccid >= C) return;
+  T x_mean = mean[bid * groups + gid];
+  T x_var = var[bid * groups + gid];
+  T d_x_mean = d_mean[bid * groups + gid];
+  T d_var_inv = d_var[bid * groups + gid];
+
+  T d_x_var =
+      -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
+  d_x_mean -= 2 * d_x_var * x_mean;
+  d_x_var /= number * imsize;
+  d_x_mean /= number * imsize;
+  for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) {
+    T tmp = x[(bid * C + ccid) * imsize + imid];
+    if (d_x)
+      d_x[(bid * C + ccid) * imsize + imid] += d_x_mean + tmp * 2 * d_x_var;
+  }
+}
+
+template <typename T>
+class GroupNormGradKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* var = ctx.Input<Tensor>("Variance");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto groups = ctx.Attr<int>("groups");
+
+    // init output
+    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    const auto& x_dims = x->dims();
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+
+    T* d_x_data = nullptr;
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      d_x_data = d_x->data<T>();
+    }
+    math::SetConstant<platform::CUDADeviceContext, T> set_zero;
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    Tensor temp_var;
+    temp_var.mutable_data<T>(var->dims(), ctx.GetPlace());
+    set_zero(dev_ctx, &temp_var, static_cast<T>(0));
+    T* temp_var_data = temp_var.data<T>();
+
+    Tensor temp_mean;
+    temp_mean.mutable_data<T>(var->dims(), ctx.GetPlace());
+    set_zero(dev_ctx, &temp_mean, static_cast<T>(0));
+    T* temp_mean_data = temp_mean.data<T>();
+
+    auto* x_data = x->data<T>();
+    auto* y_data = d_y->data<T>();
+    auto* mean_data = mean->data<T>();
+    auto* var_data = var->data<T>();
+    T* d_scale_data = nullptr;
+    if (d_scale) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_scale, static_cast<T>(0));
+      d_scale_data = d_scale->data<T>();
+    }
+    T* d_bias_data = nullptr;
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_bias, static_cast<T>(0));
+      d_bias_data = d_bias->data<T>();
+    }
+
+    const T* scale_data = nullptr;
+    if (scale) scale_data = scale->data<T>();
+
+    int imsize = x_dims[2] * x_dims[3];
+    int block_size = std::min(512, imsize);
+    dim3 grid(group_size, groups, x_dims[0]);
+    dim3 threads(block_size, 1, 1);
+    GroupNormBackwardGetMeanAndVar<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        x_data, mean_data, var_data, scale_data, y_data, x_dims[0], x_dims[1],
+        imsize, groups, group_size, epsilon, d_x_data, temp_mean_data,
+        temp_var_data, d_scale_data, d_bias_data);
+    GroupNormBackward<T><<<grid, threads, 0, dev_ctx.stream()>>>(
+        x_data, mean_data, var_data, temp_mean_data, temp_var_data, x_dims[0],
+        x_dims[1], imsize, groups, group_size, epsilon, d_x_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    group_norm,
+    ops::GroupNormKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GroupNormKernel<paddle::platform::CUDADeviceContext, double>);
+REGISTER_OP_CUDA_KERNEL(
+    group_norm_grad,
+    ops::GroupNormGradKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::GroupNormGradKernel<paddle::platform::CUDADeviceContext, double>);
diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h
new file mode 100644
index 0000000000..3d6c6a46a9
--- /dev/null
+++ b/paddle/fluid/operators/group_norm_op.h
@@ -0,0 +1,197 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+template <typename DeviceContext, typename T>
+class GroupNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* x = ctx.Input<Tensor>("X");
+
+    auto* y = ctx.Output<Tensor>("Y");
+    auto* mean = ctx.Output<Tensor>("Mean");
+    auto* var = ctx.Output<Tensor>("Variance");
+    const auto groups = ctx.Attr<int>("groups");
+
+    const auto x_dims = x->dims();
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+
+    y->mutable_data<T>(ctx.GetPlace());
+    mean->mutable_data<T>(ctx.GetPlace());
+    var->mutable_data<T>(ctx.GetPlace());
+
+    auto* x_data = x->data<T>();
+    auto* y_data = y->data<T>();
+    auto* mean_data = mean->data<T>();
+    auto* var_data = var->data<T>();
+
+    const T* scale_data = nullptr;
+    if (scale) scale_data = scale->data<T>();
+    const T* bias_data = nullptr;
+    if (bias) bias_data = bias->data<T>();
+
+    int imsize = x_dims[2] * x_dims[3];
+    auto* iter_x_data = x_data;
+    auto* iter_y_data = y_data;
+    for (int bid = 0; bid < x_dims[0]; bid++)
+      for (int gid = 0; gid < groups; gid++) {
+        T x_mean = 0, x_var = 0;
+        int number = std::min(group_size,
+                              static_cast<int>(x_dims[1] - gid * group_size));
+        auto* tmp = iter_x_data;
+        for (int cid = 0; cid < number; cid++) {
+          for (int imid = 0; imid < imsize; imid++, iter_x_data++) {
+            x_mean += iter_x_data[0];
+            x_var += iter_x_data[0] * iter_x_data[0];
+          }
+        }
+        x_mean /= number * imsize;
+        x_var /= number * imsize;
+        x_var = x_var - x_mean * x_mean;
+        T var_inv = 1.0 / sqrt(x_var + epsilon);
+        mean_data[bid * groups + gid] = x_mean;
+        var_data[bid * groups + gid] = x_var;
+        for (int cid = 0; cid < number; cid++) {
+          for (int imid = 0; imid < imsize; imid++, tmp++, iter_y_data++) {
+            T val = (tmp[0] - x_mean) * var_inv;
+            if (scale_data) val *= scale_data[gid * group_size + cid];
+            if (bias_data) val += bias_data[gid * group_size + cid];
+            iter_y_data[0] = val;
+          }
+        }
+      }
+  }
+};
+
+template <typename DeviceContext, typename T>
+class GroupNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* var = ctx.Input<Tensor>("Variance");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto groups = ctx.Attr<int>("groups");
+
+    // init output
+    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    const auto& x_dims = x->dims();
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+
+    // TODO(liangdun): need to check d_x is null
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    T* d_x_data = nullptr;
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_x, static_cast<T>(0));
+      d_x_data = d_x->data<T>();
+    }
+
+    auto* x_data = x->data<T>();
+    auto* y_data = d_y->data<T>();
+    auto* mean_data = mean->data<T>();
+    auto* var_data = var->data<T>();
+    T* d_scale_data = nullptr;
+    if (d_scale) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_scale, static_cast<T>(0));
+      d_scale_data = d_scale->data<T>();
+    }
+    T* d_bias_data = nullptr;
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_bias, static_cast<T>(0));
+      d_bias_data = d_bias->data<T>();
+    }
+
+    const T* scale_data = nullptr;
+    if (scale) scale_data = scale->data<T>();
+
+    int imsize = x_dims[2] * x_dims[3];
+    auto* iter_x_data = x_data;
+    auto* iter_d_x_data = d_x_data;
+    auto* iter_y_data = y_data;
+    for (int bid = 0; bid < x_dims[0]; bid++)
+      for (int gid = 0; gid < groups; gid++) {
+        T x_mean = mean_data[bid * groups + gid];
+        T x_var = var_data[bid * groups + gid];
+        T var_inv = 1.0 / sqrt(x_var + epsilon);
+        int number = std::min(group_size,
+                              static_cast<int>(x_dims[1] - gid * group_size));
+        auto* tmp = iter_x_data;
+        auto* tmp2 = iter_d_x_data;
+        T d_var_inv = 0, d_x_mean = 0;
+        for (int cid = 0; cid < number; cid++) {
+          for (int imid = 0; imid < imsize;
+               imid++, tmp++, iter_y_data++, iter_d_x_data++) {
+            T val = (tmp[0] - x_mean) * var_inv;
+            T dval = iter_y_data[0];
+            if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
+            if (d_scale_data)
+              d_scale_data[gid * group_size + cid] += val * dval;
+            if (scale_data) dval = scale_data[gid * group_size + cid] * dval;
+
+            d_var_inv += (tmp[0] - x_mean) * dval;
+            T d_tmp = dval * var_inv;
+            if (d_x_data) iter_d_x_data[0] += d_tmp;
+            d_x_mean -= d_tmp;
+          }
+        }
+
+        T d_x_var =
+            -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
+        d_x_mean -= 2 * d_x_var * x_mean;
+        d_x_var /= number * imsize;
+        d_x_mean /= number * imsize;
+
+        iter_d_x_data = tmp2;
+
+        if (d_x_data) {
+          for (int cid = 0; cid < number; cid++) {
+            for (int imid = 0; imid < imsize;
+                 imid++, iter_x_data++, iter_d_x_data++) {
+              iter_d_x_data[0] += d_x_mean;
+              iter_d_x_data[0] += iter_x_data[0] * 2 * d_x_var;
+            }
+          }
+        }
+      }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index e0cc09a4c7..ccd9175b64 100644
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -85,6 +85,7 @@ __all__ = [
     'row_conv',
     'multiplex',
     'layer_norm',
+    'group_norm',
     'softmax_with_cross_entropy',
     'smooth_l1',
     'one_hot',
@@ -2547,6 +2548,84 @@ def layer_norm(input,
     return helper.append_activation(layer_norm_out)
 
 
+@templatedoc()
+def group_norm(input,
+               groups,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               act=None,
+               data_layout='NCHW',
+               name=None):
+    """
+    **Group Normalization Layer**
+
+    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`
+
+    Args:
+        input(Variable): The input tensor variable.
+        groups(int): The number of groups that divided from channels.
+        epsilon(float): The small value added to the variance to prevent
+            division by zero.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            scale :math:`g`. If it is set to False, no scale will be added to the output units.
+            If it is set to None, the bias is initialized one. Default: None.
+        bias_attr(ParamAttr|None): The parameter attribute for the learnable
+            bias :math:`b`. If it is set to False, no bias will be added to the output units.
+            If it is set to None, the bias is initialized zero. Default: None.
+        act(str): Activation to be applied to the output of group normalizaiton.
+        data_layout(string|NCHW): Only NCHW is supported.
+        name (str): The name of this layer. It is optional.
+
+    Returns:
+        Variable: A tensor variable which is the result after applying group normalization on the input.
+
+    Examples:
+
+        >>> data = fluid.layers.data(name='data', shape=[8, 32, 32],
+        >>>                          dtype='float32')
+        >>> x = fluid.layers.group_norm(input=data, groups=4)
+    """
+    helper = LayerHelper('group_norm', **locals())
+    dtype = helper.input_dtype()
+
+    # create intput and parameters
+    inputs = {'X': input}
+    input_shape = input.shape
+    if data_layout != 'NCHW':
+        raise ValueError("unsupported data layout:" + data_layout)
+    param_shape = [input_shape[1]]
+    if param_attr:
+        scale = helper.create_parameter(
+            attr=helper.param_attr,
+            shape=param_shape,
+            dtype=dtype,
+            default_initializer=Constant(1.0))
+        inputs['Scale'] = scale
+    if bias_attr:
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+        inputs['Bias'] = bias
+
+    # create output
+    mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    group_norm_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="group_norm",
+        inputs=inputs,
+        outputs={
+            "Y": group_norm_out,
+            "Mean": mean_out,
+            "Variance": variance_out,
+        },
+        attrs={"epsilon": epsilon,
+               "groups": groups})
+
+    return helper.append_activation(group_norm_out)
+
+
 def conv2d_transpose(input,
                      num_filters,
                      output_size=None,
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index c195a28e45..271b9c740f 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -381,8 +381,8 @@ class OpTest(unittest.TestCase):
             outs.sort(key=len)
             checker(outs)
 
-    def __assert_is_close(self, numeric_grads, analytic_grads, names,
-                          max_relative_error, msg_prefix):
+    def _assert_is_close(self, numeric_grads, analytic_grads, names,
+                         max_relative_error, msg_prefix):
 
         for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
             abs_a = np.abs(a)
@@ -451,9 +451,9 @@ class OpTest(unittest.TestCase):
         analytic_grads = self._get_gradient(inputs_to_check, place,
                                             output_names, no_grad_set)
 
-        self.__assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
-                               max_relative_error,
-                               "Gradient Check On %s" % str(place))
+        self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
+                              max_relative_error,
+                              "Gradient Check On %s" % str(place))
 
     @staticmethod
     def _numpy_to_lod_tensor(np_value, lod, place):
diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op.py b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
new file mode 100644
index 0000000000..0b6d039f05
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
@@ -0,0 +1,143 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from operator import mul
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test import OpTest
+
+from testsuite import create_op
+
+
+def group_norm_naive(x, scale, bias, epsilon, groups):
+    N, C, H, W = x.shape
+    G = groups
+    x = x.reshape((N * G, -1))
+    mean = np.mean(x, axis=1, keepdims=True)
+    var = np.var(x, axis=1, keepdims=True)
+    output = (x - mean) / np.sqrt(var + epsilon)
+    output = output.reshape((N, C, H, W)) * scale.reshape(
+        (-1, 1, 1)) + bias.reshape((-1, 1, 1))
+    return output, mean.reshape((N, G)), var.reshape((N, G))
+
+
+class TestGroupNormOp(OpTest):
+    def setUp(self):
+        self.op_type = "group_norm"
+        self.data_format = "NCHW"
+        self.dtype = np.float32
+        self.shape = (2, 4, 3, 3)
+        self.attrs = {'epsilon': 1e-5, 'groups': 2}
+        self.compare_between_place = False
+        self.init_test_case()
+
+        input = np.random.random(self.shape).astype(self.dtype)
+        scale = np.random.random([self.shape[1]]).astype(self.dtype)
+        bias = np.random.random([self.shape[1]]).astype(self.dtype)
+        output, mean, var = group_norm_naive(
+            input, scale, bias, self.attrs['epsilon'], self.attrs['groups'])
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(input),
+            'Scale': OpTest.np_dtype_to_fluid_dtype(scale),
+            'Bias': OpTest.np_dtype_to_fluid_dtype(bias)
+        }
+        self.outputs = {'Y': output, 'Mean': mean, 'Variance': var}
+
+    def test_check_output(self):
+        atol = 1e-4
+        place = core.CPUPlace()
+        self.check_output_with_place(place, atol=atol)
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=atol)
+
+    def do_compare_between_place(self):
+        if not core.is_compiled_with_cuda(): return
+        place = core.CPUPlace()
+        place2 = core.CUDAPlace(0)
+        self.scope = core.Scope()
+        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
+        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
+        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
+        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
+                            op_attrs)
+        inputs_to_check = set(['X', 'Scale', 'Bias'])
+        output_names = 'Y'
+        cpu_grads = self._get_gradient(inputs_to_check, place, output_names,
+                                       None)
+        gpu_grads = self._get_gradient(inputs_to_check, place2, output_names,
+                                       None)
+        self._assert_is_close(cpu_grads, gpu_grads, inputs_to_check, 0.005,
+                              "Gradient Check On %s" % str(place))
+
+    def test_check_grad(self):
+        if self.compare_between_place:
+            self.do_compare_between_place()
+            return
+        place = core.CPUPlace()
+        self.check_grad_with_place(
+            place, set(['X', 'Scale', 'Bias']), 'Y', max_relative_error=0.01)
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place,
+                set(['X', 'Scale', 'Bias']),
+                'Y',
+                max_relative_error=0.01)
+
+    def init_test_case(self):
+        pass
+
+
+class TestGroupNormOp1(TestGroupNormOp):
+    def init_test_case(self):
+        self.attrs['groups'] = 1
+
+
+class TestGroupNormOp2(TestGroupNormOp):
+    def init_test_case(self):
+        self.attrs['groups'] = 4
+
+
+class TestGroupNormOpBigEps1(TestGroupNormOp):
+    def init_test_case(self):
+        self.attrs['groups'] = 1
+        self.attrs['epsilon'] = 0.5
+
+
+class TestGroupNormOpBigEps2(TestGroupNormOp):
+    def init_test_case(self):
+        self.attrs['groups'] = 4
+        self.attrs['epsilon'] = 0.5
+
+
+class TestGroupNormOpBigEps3(TestGroupNormOp):
+    def init_test_case(self):
+        self.attrs['epsilon'] = 0.5
+
+
+class TestGroupNormOpLargeData(TestGroupNormOp):
+    def init_test_case(self):
+        self.shape = (2, 32, 64, 64)
+        self.attrs['groups'] = 8
+        self.compare_between_place = True
+
+
+if __name__ == '__main__':
+    unittest.main()