Merge remote-tracking branch 'upstream/develop' into windows/build

7 years ago · dfbac60398
parent 7c8c9dc9bf dd6fd4c747
commit dfbac60398
16 changed files with 1447 additions and 24 deletions
--- a/AUTHORS.md
+++ b/AUTHORS.md
@ -25,6 +25,7 @@
 | kexinzhao | Ke-Xin Zhao |
 | kuke | Yi-Bing Liu |
 | lcy-seso | Ying Cao |
+| cjld | Dun Liang |
 | lipeng-unisound | Peng Li |
 | liuyuan | Yuan Liu |
 | livc | Zhao Li |
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@ -103,6 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
+paddle.fluid.layers.group_norm ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None))
 paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, False, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
--- a/paddle/fluid/operators/group_norm_op.cc
+++ b/paddle/fluid/operators/group_norm_op.cc
@ -0,0 +1,162 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/group_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+// Forward operator for group normalization. It validates the inputs and
+// attributes and infers the output shapes; the computation itself is done by
+// the kernels registered for "group_norm".
+class GroupNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires X, Y, Mean and Variance to be present, X to be 4-D, and
+  // 1 <= groups <= x_dim[1]. The optional Scale/Bias inputs must be 1-D with
+  // x_dim[1] elements. Y receives the shape and LoD of X; Mean and Variance
+  // receive the shape {x_dim[0], groups}.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"),
+                   "Output(Y) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mean"),
+                   "Output(Mean) of GroupNormOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Variance"),
+                   "Output(Variance) of GroupNormOp should not be null.");
+
+    auto x_dim = ctx->GetInputDim("X");
+    // The CPU kernels read x_dims[0] through x_dims[3]; any other rank would
+    // either index out of range or silently ignore part of the data, so it is
+    // rejected here before x_dim is indexed.
+    PADDLE_ENFORCE_EQ(x_dim.size(), 4UL,
+                      "Input(X) of GroupNormOp should be a 4-D tensor.");
+    auto channel_num = x_dim[1];
+    auto batch_size = x_dim[0];
+    auto groups = ctx->Attrs().Get<int>("groups");
+    PADDLE_ENFORCE_LE(
+        groups, channel_num,
+        "'groups' must be less equal than the number of channels.");
+    PADDLE_ENFORCE_GE(groups, 1, "'groups' must be greater equal than 1.");
+
+    // Scale and Bias are dispensable; when given they are per-channel vectors.
+    if (ctx->HasInput("Scale")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], channel_num);
+    }
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], channel_num);
+    }
+
+    ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
+    ctx->SetOutputDim("Mean", {batch_size, groups});
+    ctx->SetOutputDim("Variance", {batch_size, groups});
+    ctx->ShareLoD("X", "Y");
+  }
+};
+
+// Declares the inputs, outputs and attributes of the group_norm operator.
+class GroupNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  // Inputs: X (required), Scale and Bias (both dispensable).
+  // Outputs: Y, plus the intermediate outputs Mean and Variance.
+  // Attributes: epsilon (float in [0, 1], default 1e-5) and groups (int > 0).
+  void Make() override {
+    AddInput("X", "The input tensor.");
+    // The two literals below are concatenated by the compiler, so the first
+    // one must end with a space to keep the words apart.
+    AddInput("Scale",
+             "Scale is a 1-dimensional tensor of size C "
+             "that is applied to the output.")
+        .AsDispensable();
+    AddInput("Bias",
+             "Bias is a 1-dimensional tensor of size C "
+             "that is applied to the output")
+        .AsDispensable();
+    AddOutput("Y", "Result after normalization.");
+    AddOutput("Mean", "Mean of each group.").AsIntermediate();
+    AddOutput("Variance", "Variance of each group.").AsIntermediate();
+
+    AddAttr<float>("epsilon",
+                   "Constant for numerical stability [default 1e-5].")
+        .SetDefault(1e-5)
+        .AddCustomChecker([](const float &epsilon) {
+          PADDLE_ENFORCE(epsilon >= 0.0f && epsilon <= 1.0f,
+                         "'epsilon' should be between 0.0 and 1.0.");
+        });
+    AddAttr<int>("groups", "The number of groups that divided from channels.")
+        .AddCustomChecker([](const int &groups) {
+          PADDLE_ENFORCE_GT(groups, 0, "'groups' should be greater than zero.");
+        });
+
+    AddComment(R"DOC(
+Group Normalization
+
+Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
+)DOC");
+  }
+};
+
+// Gradient operator for group normalization: checks the inputs it needs and
+// sizes whichever gradient outputs were requested.
+class GroupNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires X, Mean, Variance and Y@GRAD. Each of X@GRAD, Scale@GRAD and
+  // Bias@GRAD that is present gets the shape of the matching forward input.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // check input
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of GroupNormGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"),
+                   "Input(Mean) of GroupNormGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"),
+                   "Input(Variance) of GroupNormGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) of GroupNormGradOp should not be null.");
+
+    // check output
+    if (ctx->HasOutput(framework::GradVarName("X"))) {
+      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Scale"))) {
+      ctx->SetOutputDim(framework::GradVarName("Scale"),
+                        ctx->GetInputDim("Scale"));
+    }
+    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+      ctx->SetOutputDim(framework::GradVarName("Bias"),
+                        ctx->GetInputDim("Bias"));
+    }
+  }
+
+ protected:
+  // The kernel data type follows the element type of Y@GRAD rather than X.
+  // Throws when Y@GRAD is missing or holds neither a Tensor nor a LoDTensor.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto *var = ctx.InputVar(framework::GradVarName("Y"));
+    if (var == nullptr) {
+      PADDLE_THROW("can't find Y@GRAD");
+    }
+    const Tensor *t = nullptr;
+    if (var->IsType<Tensor>()) {
+      t = &var->Get<Tensor>();
+    } else if (var->IsType<LoDTensor>()) {
+      t = &var->Get<LoDTensor>();
+    }
+    if (t == nullptr) {
+      // The variable exists but has an unsupported type; say so instead of
+      // repeating the "not found" message used above.
+      PADDLE_THROW("Y@GRAD should be a Tensor or a LoDTensor");
+    }
+    return framework::OpKernelType(framework::ToDataType(t->type()),
+                                   ctx.GetPlace());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// Forward operator, registered together with its proto maker and
+// DefaultGradOpDescMaker<true>.
+REGISTER_OPERATOR(group_norm, ops::GroupNormOp, ops::GroupNormOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+// Gradient operator.
+REGISTER_OPERATOR(group_norm_grad, ops::GroupNormGradOp);
+// CPU kernels for the forward op, instantiated for float and double.
+REGISTER_OP_CPU_KERNEL(
+    group_norm, ops::GroupNormKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GroupNormKernel<paddle::platform::CPUDeviceContext, double>);
+// CPU kernels for the gradient op, instantiated for float and double.
+REGISTER_OP_CPU_KERNEL(
+    group_norm_grad,
+    ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::GroupNormGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/group_norm_op.cu
+++ b/paddle/fluid/operators/group_norm_op.cu
--- a/paddle/fluid/operators/group_norm_op.h
+++ b/paddle/fluid/operators/group_norm_op.h
@ -0,0 +1,197 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <algorithm>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using DataLayout = framework::DataLayout;
+
+// Forward kernel of group_norm. For every (batch, group) pair it computes the
+// mean and variance over the group's channels and spatial positions, writes
+// them to Mean/Variance, and writes the normalized (optionally scaled and
+// shifted) values to Y.
+//
+// X is read through x_dims[0..3] and walked linearly, batch-major then
+// channel-major, through raw pointers.
+template <typename DeviceContext, typename T>
+class GroupNormKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* bias = ctx.Input<Tensor>("Bias");
+    auto* x = ctx.Input<Tensor>("X");
+
+    auto* y = ctx.Output<Tensor>("Y");
+    auto* mean = ctx.Output<Tensor>("Mean");
+    auto* var = ctx.Output<Tensor>("Variance");
+    const auto groups = ctx.Attr<int>("groups");
+
+    const auto x_dims = x->dims();
+    // Channels per group, rounded up (ceil(x_dims[1] / groups)).
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+
+    y->mutable_data<T>(ctx.GetPlace());
+    mean->mutable_data<T>(ctx.GetPlace());
+    var->mutable_data<T>(ctx.GetPlace());
+
+    auto* x_data = x->data<T>();
+    auto* y_data = y->data<T>();
+    auto* mean_data = mean->data<T>();
+    auto* var_data = var->data<T>();
+
+    // Scale and Bias are optional; a null pointer means "not supplied".
+    const T* scale_data = nullptr;
+    if (scale) scale_data = scale->data<T>();
+    const T* bias_data = nullptr;
+    if (bias) bias_data = bias->data<T>();
+
+    // Number of elements per channel.
+    int imsize = x_dims[2] * x_dims[3];
+    // Both cursors advance monotonically across the whole tensor.
+    auto* iter_x_data = x_data;
+    auto* iter_y_data = y_data;
+    for (int bid = 0; bid < x_dims[0]; bid++)
+      for (int gid = 0; gid < groups; gid++) {
+        T x_mean = 0, x_var = 0;
+        // Channels actually present in this group; the last group is smaller
+        // when x_dims[1] is not a multiple of group_size.
+        int number = std::min(group_size,
+                              static_cast<int>(x_dims[1] - gid * group_size));
+        // Remember where the group starts so the second pass can re-read it.
+        auto* tmp = iter_x_data;
+        // Pass 1: accumulate sum(x) and sum(x^2) over the group.
+        for (int cid = 0; cid < number; cid++) {
+          for (int imid = 0; imid < imsize; imid++, iter_x_data++) {
+            x_mean += iter_x_data[0];
+            x_var += iter_x_data[0] * iter_x_data[0];
+          }
+        }
+        x_mean /= number * imsize;
+        x_var /= number * imsize;
+        // Variance as E[x^2] - E[x]^2.
+        x_var = x_var - x_mean * x_mean;
+        T var_inv = 1.0 / sqrt(x_var + epsilon);
+        mean_data[bid * groups + gid] = x_mean;
+        var_data[bid * groups + gid] = x_var;
+        // Pass 2: normalize, then apply the per-channel scale and bias.
+        for (int cid = 0; cid < number; cid++) {
+          for (int imid = 0; imid < imsize; imid++, tmp++, iter_y_data++) {
+            T val = (tmp[0] - x_mean) * var_inv;
+            if (scale_data) val *= scale_data[gid * group_size + cid];
+            if (bias_data) val += bias_data[gid * group_size + cid];
+            iter_y_data[0] = val;
+          }
+        }
+      }
+  }
+};
+
+// Backward kernel of group_norm. Given X, the saved Mean/Variance, the
+// optional Scale and Y@GRAD, it fills whichever of X@GRAD, Scale@GRAD and
+// Bias@GRAD were requested.
+//
+// Per (batch, group) the work is done in two passes over the group:
+//   1. accumulate Bias@GRAD, Scale@GRAD and the partial derivatives with
+//      respect to the group's mean and inverse standard deviation, while
+//      adding the direct term dy * scale / std to X@GRAD;
+//   2. add the mean and variance contributions to every X@GRAD element.
+template <typename DeviceContext, typename T>
+class GroupNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    auto* x = ctx.Input<Tensor>("X");
+    auto* mean = ctx.Input<Tensor>("Mean");
+    auto* var = ctx.Input<Tensor>("Variance");
+    auto* scale = ctx.Input<Tensor>("Scale");
+    auto* d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto groups = ctx.Attr<int>("groups");
+
+    // init output
+    auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    const auto& x_dims = x->dims();
+    // Channels per group, rounded up (ceil(x_dims[1] / groups)).
+    const int group_size = (x_dims[1] - 1) / groups + 1;
+
+    // Every requested gradient is allocated and zeroed because the loops
+    // below accumulate into it. A null data pointer means "not requested".
+    math::SetConstant<DeviceContext, T> set_zero;
+    auto& dev_ctx = ctx.template device_context<DeviceContext>();
+    T* d_x_data = nullptr;
+    if (d_x) {
+      d_x->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_x, static_cast<T>(0));
+      d_x_data = d_x->data<T>();
+    }
+
+    auto* x_data = x->data<T>();
+    // Note: despite the name, y_data points at Y@GRAD.
+    auto* y_data = d_y->data<T>();
+    auto* mean_data = mean->data<T>();
+    auto* var_data = var->data<T>();
+    T* d_scale_data = nullptr;
+    if (d_scale) {
+      d_scale->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_scale, static_cast<T>(0));
+      d_scale_data = d_scale->data<T>();
+    }
+    T* d_bias_data = nullptr;
+    if (d_bias) {
+      d_bias->mutable_data<T>(ctx.GetPlace());
+      set_zero(dev_ctx, d_bias, static_cast<T>(0));
+      d_bias_data = d_bias->data<T>();
+    }
+
+    const T* scale_data = nullptr;
+    if (scale) scale_data = scale->data<T>();
+
+    // Number of elements per channel.
+    int imsize = x_dims[2] * x_dims[3];
+    auto* iter_x_data = x_data;
+    auto* iter_d_x_data = d_x_data;
+    auto* iter_y_data = y_data;
+    for (int bid = 0; bid < x_dims[0]; bid++)
+      for (int gid = 0; gid < groups; gid++) {
+        T x_mean = mean_data[bid * groups + gid];
+        T x_var = var_data[bid * groups + gid];
+        T var_inv = 1.0 / sqrt(x_var + epsilon);
+        // Channels actually present in this group.
+        int number = std::min(group_size,
+                              static_cast<int>(x_dims[1] - gid * group_size));
+        // tmp walks X during pass 1; tmp2 remembers where this group's
+        // X@GRAD slice starts so pass 2 can revisit it.
+        auto* tmp = iter_x_data;
+        auto* tmp2 = iter_d_x_data;
+        T d_var_inv = 0, d_x_mean = 0;
+        for (int cid = 0; cid < number; cid++) {
+          for (int imid = 0; imid < imsize; imid++, tmp++, iter_y_data++) {
+            T val = (tmp[0] - x_mean) * var_inv;
+            T dval = iter_y_data[0];
+            if (d_bias_data) d_bias_data[gid * group_size + cid] += dval;
+            if (d_scale_data)
+              d_scale_data[gid * group_size + cid] += val * dval;
+            if (scale_data) dval = scale_data[gid * group_size + cid] * dval;
+
+            d_var_inv += (tmp[0] - x_mean) * dval;
+            T d_tmp = dval * var_inv;
+            // The X@GRAD cursor is only touched when X@GRAD exists, so a null
+            // pointer is never incremented.
+            if (d_x_data) {
+              iter_d_x_data[0] += d_tmp;
+              iter_d_x_data++;
+            }
+            d_x_mean -= d_tmp;
+          }
+        }
+
+        // Chain rule through var_inv = (var + eps)^(-1/2) and through
+        // var = E[x^2] - mean^2 (hence the -2 * mean term).
+        T d_x_var =
+            -1.0 / (2 * (x_var + epsilon) * sqrt(x_var + epsilon)) * d_var_inv;
+        d_x_mean -= 2 * d_x_var * x_mean;
+        d_x_var /= number * imsize;
+        d_x_mean /= number * imsize;
+
+        iter_d_x_data = tmp2;
+
+        if (d_x_data) {
+          for (int cid = 0; cid < number; cid++) {
+            for (int imid = 0; imid < imsize;
+                 imid++, iter_x_data++, iter_d_x_data++) {
+              iter_d_x_data[0] += d_x_mean;
+              iter_d_x_data[0] += iter_x_data[0] * 2 * d_x_var;
+            }
+          }
+        } else {
+          // Without X@GRAD the pass above is skipped, but the X cursor must
+          // still move past this group; otherwise every later group would
+          // re-read the first group's data when computing Scale@GRAD.
+          iter_x_data = tmp;
+        }
+      }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/math/detail/activation_functions.h
+++ b/paddle/fluid/operators/math/detail/activation_functions.h
@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <math.h>
 #include <string>
+
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/hostdevice.h"

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@ -37,6 +37,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/allocator_strategy.h"
 #include "paddle/fluid/operators/activation_op.h"
 #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h"
+#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/init.h"
 #include "paddle/fluid/platform/place.h"
@ -86,6 +87,9 @@ bool IsCompiledWithDIST() {
 }

 PYBIND11_PLUGIN(core) {
+  // Not used, just make sure cpu_info.cc is linked.
+  paddle::platform::CpuTotalPhysicalMemory();
+
  paddle::memory::allocation::UseAllocatorStrategyGFlag();
  py::module m("core", "C++ core of PaddlePaddle");

--- a/python/paddle/fluid/contrib/utils/init.py
+++ b/python/paddle/fluid/contrib/utils/init.py
@ -0,0 +1,20 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+from . import hdfs_utils
+from .hdfs_utils import *
+
+__all__ = hdfs_utils.__all__
--- a/python/paddle/fluid/contrib/utils/hdfs_utils.py
+++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@ -85,6 +85,7 @@ __all__ = [
    'row_conv',
    'multiplex',
    'layer_norm',
+    'group_norm',
    'softmax_with_cross_entropy',
    'smooth_l1',
    'one_hot',
@ -2547,6 +2548,84 @@ def layer_norm(input,
    return helper.append_activation(layer_norm_out)


+@templatedoc()
+def group_norm(input,
+               groups,
+               epsilon=1e-05,
+               param_attr=None,
+               bias_attr=None,
+               act=None,
+               data_layout='NCHW',
+               name=None):
+    """
+    **Group Normalization Layer**
+
+    Refer to `Group Normalization <https://arxiv.org/abs/1803.08494>`_
+
+    Args:
+        input(Variable): The input tensor variable.
+        groups(int): The number of groups that divided from channels.
+        epsilon(float): The small value added to the variance to prevent
+            division by zero.
+        param_attr(ParamAttr|None): The parameter attribute for the learnable
+            scale :math:`g`. A scale parameter of shape [C] (initialized to
+            one by default) is created only when this argument is truthy; with
+            None or False no scale is applied to the output. Default: None.
+        bias_attr(ParamAttr|None): The parameter attribute for the learnable
+            bias :math:`b`. A bias parameter of shape [C] is created only when
+            this argument is truthy; with None or False no bias is added to
+            the output. Default: None.
+        act(str): Activation to be applied to the output of group normalization.
+        data_layout(str): Only 'NCHW' is supported.
+        name (str): The name of this layer. It is optional.
+
+    Returns:
+        Variable: A tensor variable which is the result after applying group normalization on the input.
+
+    Raises:
+        ValueError: If data_layout is not 'NCHW'.
+
+    Examples:
+
+        >>> data = fluid.layers.data(name='data', shape=[8, 32, 32],
+        >>>                          dtype='float32')
+        >>> x = fluid.layers.group_norm(input=data, groups=4)
+    """
+    helper = LayerHelper('group_norm', **locals())
+    dtype = helper.input_dtype()
+
+    # create input and parameters
+    inputs = {'X': input}
+    input_shape = input.shape
+    if data_layout != 'NCHW':
+        raise ValueError("unsupported data layout:" + data_layout)
+    # Scale and bias hold one value per channel (axis 1 of the input).
+    param_shape = [input_shape[1]]
+    if param_attr:
+        scale = helper.create_parameter(
+            attr=helper.param_attr,
+            shape=param_shape,
+            dtype=dtype,
+            default_initializer=Constant(1.0))
+        inputs['Scale'] = scale
+    if bias_attr:
+        bias = helper.create_parameter(
+            attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
+        inputs['Bias'] = bias
+
+    # create output
+    # Mean and Variance are by-products of the op; no gradient flows to them.
+    mean_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    variance_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
+    group_norm_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="group_norm",
+        inputs=inputs,
+        outputs={
+            "Y": group_norm_out,
+            "Mean": mean_out,
+            "Variance": variance_out,
+        },
+        attrs={"epsilon": epsilon,
+               "groups": groups})
+
+    return helper.append_activation(group_norm_out)
+
+
 def conv2d_transpose(input,
                     num_filters,
                     output_size=None,
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@ -23,11 +23,11 @@ if(NOT WITH_DISTRIBUTE)
    LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification)
 endif(NOT WITH_DISTRIBUTE)

-if(WITH_GPU)
-    if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-        LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
-    endif()
-endif(WITH_GPU)
+# test_conv2d_fusion_op is dropped from TEST_OPS on builds without GPU support
+# and on GPU builds whose cuDNN major version is below 7.
+if (NOT ${WITH_GPU})
+    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
+elseif(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
+    LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
+endif()

 list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290
 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184
@ -81,10 +81,12 @@ list(REMOVE_ITEM TEST_OPS test_dist_se_resnext)
 list(REMOVE_ITEM TEST_OPS test_dist_transformer)
 list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer)
 list(REMOVE_ITEM TEST_OPS test_image_classification_resnet)
+list(REMOVE_ITEM TEST_OPS test_interpolate_op)
 foreach(TEST_OP ${TEST_OPS})
    py_test_modules(${TEST_OP} MODULES ${TEST_OP})
 endforeach(TEST_OP)
 py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
+py_test_modules(test_interpolate_op MODULES test_interpolate_op SERIAL)
 if(WITH_DISTRIBUTE)
    py_test_modules(test_dist_train MODULES test_dist_train SERIAL)
    set_tests_properties(test_listen_and_serv_op PROPERTIES TIMEOUT 20)
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@ -381,8 +381,8 @@ class OpTest(unittest.TestCase):
            outs.sort(key=len)
            checker(outs)

-    def __assert_is_close(self, numeric_grads, analytic_grads, names,
-                          max_relative_error, msg_prefix):
+    def _assert_is_close(self, numeric_grads, analytic_grads, names,
+                         max_relative_error, msg_prefix):

        for a, b, name in six.moves.zip(numeric_grads, analytic_grads, names):
            abs_a = np.abs(a)
@ -451,9 +451,9 @@ class OpTest(unittest.TestCase):
        analytic_grads = self._get_gradient(inputs_to_check, place,
                                            output_names, no_grad_set)

-        self.__assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
-                               max_relative_error,
-                               "Gradient Check On %s" % str(place))
+        self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
+                              max_relative_error,
+                              "Gradient Check On %s" % str(place))

    @staticmethod
    def _numpy_to_lod_tensor(np_value, lod, place):
--- a/python/paddle/fluid/tests/unittests/test_group_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_group_norm_op.py
@ -0,0 +1,143 @@
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+
+from operator import mul
+import paddle.fluid.core as core
+import paddle.fluid as fluid
+from op_test import OpTest
+
+from testsuite import create_op
+
+
+def group_norm_naive(x, scale, bias, epsilon, groups):
+    """NumPy reference implementation of group normalization.
+
+    Args:
+        x: 4-D array unpacked as (N, C, H, W).
+        scale: array reshaped to (-1, 1, 1) and multiplied into the output;
+            expected to hold one value per channel.
+        bias: array reshaped to (-1, 1, 1) and added to the output; expected
+            to hold one value per channel.
+        epsilon: value added to the variance before the square root.
+        groups: number of groups G; N * C * H * W must be divisible by N * G
+            for the reshape to succeed.
+
+    Returns:
+        Tuple (output, mean, var) where output has shape (N, C, H, W) and
+        mean and var have shape (N, G).
+    """
+    N, C, H, W = x.shape
+    G = groups
+    # One row per (sample, group); statistics are taken along each row.
+    x = x.reshape((N * G, -1))
+    mean = np.mean(x, axis=1, keepdims=True)
+    var = np.var(x, axis=1, keepdims=True)
+    output = (x - mean) / np.sqrt(var + epsilon)
+    output = output.reshape((N, C, H, W)) * scale.reshape(
+        (-1, 1, 1)) + bias.reshape((-1, 1, 1))
+    return output, mean.reshape((N, G)), var.reshape((N, G))
+
+
+class TestGroupNormOp(OpTest):
+    """Output and gradient checks for the group_norm operator.
+
+    Subclasses customize a case by overriding init_test_case, which may change
+    self.shape, self.attrs and self.compare_between_place.
+    """
+
+    def setUp(self):
+        self.op_type = "group_norm"
+        self.data_format = "NCHW"
+        self.dtype = np.float32
+        self.shape = (2, 4, 3, 3)
+        self.attrs = {'epsilon': 1e-5, 'groups': 2}
+        self.compare_between_place = False
+        # Let the subclass adjust the defaults before any data is generated.
+        self.init_test_case()
+
+        # Random input plus per-channel scale and bias; the expected outputs
+        # come from the NumPy reference implementation.
+        input = np.random.random(self.shape).astype(self.dtype)
+        scale = np.random.random([self.shape[1]]).astype(self.dtype)
+        bias = np.random.random([self.shape[1]]).astype(self.dtype)
+        output, mean, var = group_norm_naive(
+            input, scale, bias, self.attrs['epsilon'], self.attrs['groups'])
+
+        self.inputs = {
+            'X': OpTest.np_dtype_to_fluid_dtype(input),
+            'Scale': OpTest.np_dtype_to_fluid_dtype(scale),
+            'Bias': OpTest.np_dtype_to_fluid_dtype(bias)
+        }
+        self.outputs = {'Y': output, 'Mean': mean, 'Variance': var}
+
+    def test_check_output(self):
+        # Checked on CPU always, and on CUDA device 0 when compiled with CUDA.
+        atol = 1e-4
+        place = core.CPUPlace()
+        self.check_output_with_place(place, atol=atol)
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_output_with_place(place, atol=atol)
+
+    def do_compare_between_place(self):
+        # Compares the analytic gradients computed on CPU with those computed
+        # on CUDA device 0 (no numeric gradient involved). Does nothing when
+        # Paddle is not compiled with CUDA.
+        if not core.is_compiled_with_cuda(): return
+        place = core.CPUPlace()
+        place2 = core.CUDAPlace(0)
+        self.scope = core.Scope()
+        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
+        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
+        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
+        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
+                            op_attrs)
+        inputs_to_check = set(['X', 'Scale', 'Bias'])
+        output_names = 'Y'
+        cpu_grads = self._get_gradient(inputs_to_check, place, output_names,
+                                       None)
+        gpu_grads = self._get_gradient(inputs_to_check, place2, output_names,
+                                       None)
+        # 0.005 is passed as the maximum relative error.
+        self._assert_is_close(cpu_grads, gpu_grads, inputs_to_check, 0.005,
+                              "Gradient Check On %s" % str(place))
+
+    def test_check_grad(self):
+        # Cases that set compare_between_place use the CPU-vs-GPU comparison
+        # in place of the gradient check below.
+        if self.compare_between_place:
+            self.do_compare_between_place()
+            return
+        place = core.CPUPlace()
+        self.check_grad_with_place(
+            place, set(['X', 'Scale', 'Bias']), 'Y', max_relative_error=0.01)
+        if core.is_compiled_with_cuda():
+            place = core.CUDAPlace(0)
+            self.check_grad_with_place(
+                place,
+                set(['X', 'Scale', 'Bias']),
+                'Y',
+                max_relative_error=0.01)
+
+    def init_test_case(self):
+        # Hook for subclasses; the base case keeps the defaults from setUp.
+        pass
+
+
+class TestGroupNormOp1(TestGroupNormOp):
+    # Single group: all channels of a sample are normalized together.
+    def init_test_case(self):
+        self.attrs['groups'] = 1
+
+
+class TestGroupNormOp2(TestGroupNormOp):
+    # groups equals the default channel count (4): one channel per group.
+    def init_test_case(self):
+        self.attrs['groups'] = 4
+
+
+class TestGroupNormOpBigEps1(TestGroupNormOp):
+    # Single group combined with a large epsilon (0.5).
+    def init_test_case(self):
+        self.attrs['groups'] = 1
+        self.attrs['epsilon'] = 0.5
+
+
+class TestGroupNormOpBigEps2(TestGroupNormOp):
+    # One channel per group combined with a large epsilon (0.5).
+    def init_test_case(self):
+        self.attrs['groups'] = 4
+        self.attrs['epsilon'] = 0.5
+
+
+class TestGroupNormOpBigEps3(TestGroupNormOp):
+    # Default group count (2) with a large epsilon (0.5).
+    def init_test_case(self):
+        self.attrs['epsilon'] = 0.5
+
+
+class TestGroupNormOpLargeData(TestGroupNormOp):
+    # Larger input; test_check_grad compares CPU and GPU gradients for this
+    # case because compare_between_place is set.
+    def init_test_case(self):
+        self.shape = (2, 32, 64, 64)
+        self.attrs['groups'] = 8
+        self.compare_between_place = True
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
@ -36,17 +36,21 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf
    tar xzf protobuf-cpp-3.1.0.tar.gz && \
    cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz

-RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
+RUN wget https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt -O /root/requirements.txt

 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \
    go get github.com/Masterminds/glide && \
    rm -rf /root/requirements.txt

 RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
-    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python

 RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
    cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh
@ -9,12 +9,12 @@ set -ex
 # remove others to expedite build and reduce docker image size. The original
 # manylinux docker image project builds many python versions.
 # NOTE We added back 3.5.1, since auditwheel requires python 3.3+
-CPYTHON_VERSIONS="2.7.11 3.5.1"
+CPYTHON_VERSIONS="3.7.0 3.6.0 3.5.1 2.7.11"

 # openssl version to build, with expected sha256 hash of .tar.gz
 # archive
-OPENSSL_ROOT=openssl-1.0.2l
-OPENSSL_HASH=ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c
+OPENSSL_ROOT=openssl-1.1.0i
+OPENSSL_HASH=ebbfc844a8c8cc0ea5dc10b86c9ce97f401837f3fa08c17b2cdadc118253cf99
 EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
 DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
 PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
@ -25,7 +25,7 @@ AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969

 # Dependencies for compiling Python that we want to remove from
 # the final image after compiling Python
-PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel"
+PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel libffi-devel"

 # Libraries that are allowed as part of the manylinux1 profile
 MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel freetype-devel libpng-devel"
@ -61,7 +61,7 @@ yum -y install bzip2 make git patch unzip bison yasm diffutils \

 wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz && tar xzf cmake-3.5.2.tar.gz && \
 cd cmake-3.5.2 && ./bootstrap && \
-make -j4 && make install && cd .. && rm cmake-3.5.2.tar.gz
+make -j8 && make install && cd .. && rm cmake-3.5.2.tar.gz


 # Install newest autoconf
@ -77,11 +77,13 @@ mkdir -p /opt/python
 build_cpythons $CPYTHON_VERSIONS

 PY35_BIN=/opt/python/cp35-cp35m/bin
+PY36_BIN=/opt/python/cp36-cp36m/bin
+PY37_BIN=/opt/python/cp37-cp37m/bin
 # NOTE Since our custom manylinux image builds pythons with shared
 # libpython, we need to add libpython's dir to LD_LIBRARY_PATH before running
 # python.
 ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}"
-LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib"
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib:$(dirname ${PY36_BIN})/lib:$(dirname ${PY37_BIN})/lib"

 # Our openssl doesn't know how to find the system CA trust store
 #   (https://github.com/pypa/manylinux/issues/53)
@ -119,9 +121,8 @@ ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel
 # final image
 yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
    avahi freetype bitstream-vera-fonts \
-    ${PYTHON_COMPILE_DEPS}  > /dev/null 2>&1
-yum -y install ${MANYLINUX1_DEPS}
-yum -y clean all > /dev/null 2>&1
+    ${PYTHON_COMPILE_DEPS}  > /dev/null 2>&1 || true
+yum -y install ${MANYLINUX1_DEPS} && yum -y clean all > /dev/null 2>&1 || true
 yum list installed
 # we don't need libpython*.a, and they're many megabytes
 find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
--- a/tools/manylinux1/build_scripts/build_utils.sh
+++ b/tools/manylinux1/build_scripts/build_utils.sh
@ -52,9 +52,17 @@ function do_cpython_build {

    # NOTE --enable-shared for generating libpython shared library needed for
    # linking of some of the nupic.core test executables.
-    CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
-    make -j2 > /dev/null
-    make install > /dev/null
+    if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then
+        # NOTE python 3.7 should be installed via make altinstall rather than
+        # make install, and we should specify the location of ssl
+        CFLAGS="-Wformat" ./configure --prefix=${prefix} --with-openssl=/usr/local/ssl --enable-shared $unicode_flags > /dev/null
+        make -j8 > /dev/null
+        make altinstall > /dev/null
+    else
+        CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
+        make -j8 > /dev/null
+        make install > /dev/null
+    fi
    popd
    echo "ZZZ looking for libpython"
    find / -name 'libpython*.so*'
@ -64,6 +72,9 @@ function do_cpython_build {
    if [ -e ${prefix}/bin/python3 ]; then
        ln -s python3 ${prefix}/bin/python
    fi
+    if [ -e ${prefix}/bin/python3.7 ]; then
+        ln -s python3.7 ${prefix}/bin/python
+    fi
    # NOTE Make libpython shared library visible to python calls below
    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel